better handling of whitespace

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2311 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 4149939c02
commit b3f7e62e03

@ -143,7 +143,7 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer
serverByteBuffer[] sbbs = httpTemplate.splitQuotations(sbb);
sbb = new serverByteBuffer();
for (int i = 0; i < sbbs.length; i++) {
if (sbbs[i].isWhitespace()) {
if (sbbs[i].isWhitespace(true)) {
sbb.append(sbbs[i]);
} else if ((sbbs[i].byteAt(0) == httpTemplate.hash) ||
(sbbs[i].startsWith(httpTemplate.dpdpa))) {
@ -151,8 +151,8 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer
sbb.append(sbbs[i]);
} else {
// this is a text fragment, generate gettext quotation
int ws = sbbs[i].whitespaceStart();
int we = sbbs[i].whitespaceEnd();
int ws = sbbs[i].whitespaceStart(true);
int we = sbbs[i].whitespaceEnd(true);
sbb.append(sbbs[i].getBytes(0, ws));
sbb.append('_');
sbb.append('(');

@ -290,26 +290,44 @@ public final class serverByteBuffer extends OutputStream {
return trim(l, r);
}
public boolean isWhitespace() {
public boolean isWhitespace(boolean includeNonLetterBytes) {
// Returns true if every byte in buffer[offset .. offset + length - 1] counts as whitespace;
// an empty range (length == 0) also yields true.
// - includeNonLetterBytes == false: a byte counts as whitespace when its value is <= 32;
//   per the original note, this is the case where trim() would result in an empty serverByteBuffer.
// - includeNonLetterBytes == true: every byte that is not an ASCII digit or letter
//   (0-9, A-Z, a-z) counts as whitespace as well, so punctuation-only content returns true.
// NOTE(review): the bytes are compared as signed values, so 0x80-0xFF are negative and are
// treated as whitespace in both modes -- confirm this is intended for non-ASCII text.
for (int i = 0; i < length; i++) {
if (buffer[offset + i] > 32) return false;
if (includeNonLetterBytes) {
byte b;
for (int i = 0; i < length; i++) {
b = buffer[offset + i];
// the first ASCII digit or letter proves there is real content
if (((b >= '0') && (b <= '9')) || ((b >= 'A') && (b <= 'Z')) || ((b >= 'a') && (b <= 'z'))) return false;
}
} else {
// strict mode: any byte value above 32 (space) is content
for (int i = 0; i < length; i++) if (buffer[offset + i] > 32) return false;
}
return true;
}
public int whitespaceStart() {
public int whitespaceStart(boolean includeNonLetterBytes) {
// Returns the number of whitespace bytes at the beginning of the text, i.e. the index
// (relative to offset) of the first byte that counts as content.
// Returns length when no content byte is found at all.
// - includeNonLetterBytes == false: content is any byte with a value > 32.
// - includeNonLetterBytes == true: content is only an ASCII digit or letter (0-9, A-Z, a-z);
//   all other bytes are skipped as if they were whitespace.
// NOTE(review): bytes are compared as signed values, so 0x80-0xFF are negative and are
// skipped in both modes -- confirm this is intended for non-ASCII text.
for (int i = 0; i < length; i++) {
if (buffer[offset + i] > 32) return i;
if (includeNonLetterBytes) {
byte b;
for (int i = 0; i < length; i++) {
b = buffer[offset + i];
// stop at the first ASCII digit or letter
if (((b >= '0') && (b <= '9')) || ((b >= 'A') && (b <= 'Z')) || ((b >= 'a') && (b <= 'z'))) return i;
}
} else {
// strict mode: stop at the first byte value above 32 (space)
for (int i = 0; i < length; i++) if (buffer[offset + i] > 32) return i;
}
return length;
}
public int whitespaceEnd() {
public int whitespaceEnd(boolean includeNonLetterBytes) {
// Returns the position (relative to offset) where the whitespace at the end of the text
// begins, i.e. the index of the last content byte plus one.
// Returns 0 when no content byte is found at all.
// - includeNonLetterBytes == false: content is any byte with a value > 32.
// - includeNonLetterBytes == true: content is only an ASCII digit or letter (0-9, A-Z, a-z);
//   all other trailing bytes are treated as if they were whitespace.
// NOTE(review): bytes are compared as signed values, so 0x80-0xFF are negative and are
// treated as whitespace in both modes -- confirm this is intended for non-ASCII text.
for (int i = length - 1; i >= 0; i--) {
if (buffer[offset + i] > 32) return i + 1;
if (includeNonLetterBytes) {
byte b;
// scan backwards from the last byte
for (int i = length - 1; i >= 0; i--) {
b = buffer[offset + i];
// the last ASCII digit or letter marks the end of the content
if (((b >= '0') && (b <= '9')) || ((b >= 'A') && (b <= 'Z')) || ((b >= 'a') && (b <= 'z'))) return i + 1;
}
} else {
// strict mode: scan backwards for the last byte value above 32 (space)
for (int i = length - 1; i >= 0; i--) if (buffer[offset + i] > 32) return i + 1;
}
return 0;
}

Loading…
Cancel
Save