From 90f7241b59b254fd02cb523e8ff9ca06cb85f268 Mon Sep 17 00:00:00 2001
From: orbiter
Date: Fri, 15 Sep 2006 23:52:26 +0000
Subject: [PATCH] serverByteBuffer.trim() can now recognize utf-8 characters

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2602 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 source/de/anomic/server/serverByteBuffer.java | 38 ++++++++++++++++++-
 1 file changed, 36 insertions(+), 2 deletions(-)

diff --git a/source/de/anomic/server/serverByteBuffer.java b/source/de/anomic/server/serverByteBuffer.java
index 08beaf5d0..2a9d6cbbc 100644
--- a/source/de/anomic/server/serverByteBuffer.java
+++ b/source/de/anomic/server/serverByteBuffer.java
@@ -294,11 +294,45 @@ public final class serverByteBuffer extends OutputStream {
     }
 
     public serverByteBuffer trim() {
-        int l = 0; while ((l < length) && (buffer[offset + l] <= 32)) l++;
-        int r = length; while ((r > 0) && (buffer[offset + r - 1] <= 32)) r--;
+        // strips leading and trailing bytes in the range 0..32 (ASCII control
+        // characters and space). bytes are signed in Java, so they are masked
+        // with 0xFF before the comparison; otherwise every byte of a multi-byte
+        // utf-8 sequence (0x80..0xFF) would be negative and be cut off as well
+        int l = 0;
+        while ((l < length) && ((buffer[offset + l] & 0xFF) <= 32)) {
+            l++;
+        }
+        int r = length;
+        while ((r > 0) && ((buffer[offset + r - 1] & 0xFF) <= 32)) {
+            r--;
+        }
         if (l > r) r = l;
         return trim(l, r);
     }
+
+    public int isUTF8char(int start) {
+        // checks whether a well-formed utf-8 sequence begins at position start (0 <= start < length):
+        // - ASCII equivalence range; (first) byte begins with 0
+        // - first byte begins with 110, the following byte begins with 10
+        // - first byte begins with 1110, the following two bytes begin with 10
+        // - first byte begins with 11110, the following three bytes begin with 10
+        // returns the length of the detected sequence (1..4), or -1 if there is none (or start is out of range)
+        if ((start < 0) || (start >= length)) return -1;
+        if ((buffer[offset + start] & 0x80) == 0) return 1;
+        if ((start < length - 1) &&
+            ((buffer[offset + start    ] & 0xE0) == 0xC0) &&
+            ((buffer[offset + start + 1] & 0xC0) == 0x80)) return 2;
+        if ((start < length - 2) &&
+            ((buffer[offset + start    ] & 0xF0) == 0xE0) &&
+            ((buffer[offset + start + 1] & 0xC0) == 0x80) &&
+            ((buffer[offset + start + 2] & 0xC0) == 0x80)) return 3;
+        if ((start < length - 3) &&
+            ((buffer[offset + start    ] & 0xF8) == 0xF0) &&
+            ((buffer[offset + start + 1] & 0xC0) == 0x80) &&
+            ((buffer[offset + start + 2] & 0xC0) == 0x80) &&
+            ((buffer[offset + start + 3] & 0xC0) == 0x80)) return 4;
+        return -1;
+    }
 
     public boolean isWhitespace(boolean includeNonLetterBytes) {
         // returns true, if trim() would result in an empty serverByteBuffer