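Better handling of whitespace: isWhitespace, whitespaceStart and whitespaceEnd now take a flag that, when true, treats every byte that is not an ASCII letter or digit as whitespace; the content transformer now passes true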

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2311 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 4149939c02
commit b3f7e62e03

@ -143,7 +143,7 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer
serverByteBuffer[] sbbs = httpTemplate.splitQuotations(sbb); serverByteBuffer[] sbbs = httpTemplate.splitQuotations(sbb);
sbb = new serverByteBuffer(); sbb = new serverByteBuffer();
for (int i = 0; i < sbbs.length; i++) { for (int i = 0; i < sbbs.length; i++) {
if (sbbs[i].isWhitespace()) { if (sbbs[i].isWhitespace(true)) {
sbb.append(sbbs[i]); sbb.append(sbbs[i]);
} else if ((sbbs[i].byteAt(0) == httpTemplate.hash) || } else if ((sbbs[i].byteAt(0) == httpTemplate.hash) ||
(sbbs[i].startsWith(httpTemplate.dpdpa))) { (sbbs[i].startsWith(httpTemplate.dpdpa))) {
@ -151,8 +151,8 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer
sbb.append(sbbs[i]); sbb.append(sbbs[i]);
} else { } else {
// this is a text fragment, generate gettext quotation // this is a text fragment, generate gettext quotation
int ws = sbbs[i].whitespaceStart(); int ws = sbbs[i].whitespaceStart(true);
int we = sbbs[i].whitespaceEnd(); int we = sbbs[i].whitespaceEnd(true);
sbb.append(sbbs[i].getBytes(0, ws)); sbb.append(sbbs[i].getBytes(0, ws));
sbb.append('_'); sbb.append('_');
sbb.append('('); sbb.append('(');

@ -290,26 +290,44 @@ public final class serverByteBuffer extends OutputStream {
return trim(l, r); return trim(l, r);
} }
public boolean isWhitespace() { public boolean isWhitespace(boolean includeNonLetterBytes) {
// returns true, if trim() would result in an empty serverByteBuffer // returns true, if trim() would result in an empty serverByteBuffer
for (int i = 0; i < length; i++) { if (includeNonLetterBytes) {
if (buffer[offset + i] > 32) return false; byte b;
for (int i = 0; i < length; i++) {
b = buffer[offset + i];
if (((b >= '0') && (b <= '9')) || ((b >= 'A') && (b <= 'Z')) || ((b >= 'a') && (b <= 'z'))) return false;
}
} else {
for (int i = 0; i < length; i++) if (buffer[offset + i] > 32) return false;
} }
return true; return true;
} }
public int whitespaceStart() { public int whitespaceStart(boolean includeNonLetterBytes) {
// returns number of whitespace bytes at the beginning of text // returns number of whitespace bytes at the beginning of text
for (int i = 0; i < length; i++) { if (includeNonLetterBytes) {
if (buffer[offset + i] > 32) return i; byte b;
for (int i = 0; i < length; i++) {
b = buffer[offset + i];
if (((b >= '0') && (b <= '9')) || ((b >= 'A') && (b <= 'Z')) || ((b >= 'a') && (b <= 'z'))) return i;
}
} else {
for (int i = 0; i < length; i++) if (buffer[offset + i] > 32) return i;
} }
return length; return length;
} }
public int whitespaceEnd() { public int whitespaceEnd(boolean includeNonLetterBytes) {
// returns position of whitespace at the end of text // returns position of whitespace at the end of text
for (int i = length - 1; i >= 0; i--) { if (includeNonLetterBytes) {
if (buffer[offset + i] > 32) return i + 1; byte b;
for (int i = length - 1; i >= 0; i--) {
b = buffer[offset + i];
if (((b >= '0') && (b <= '9')) || ((b >= 'A') && (b <= 'Z')) || ((b >= 'a') && (b <= 'z'))) return i + 1;
}
} else {
for (int i = length - 1; i >= 0; i--) if (buffer[offset + i] > 32) return i + 1;
} }
return 0; return 0;
} }

Loading…
Cancel
Save