From 9544c47684976072e28829e4f4efdefdeaaf82e0 Mon Sep 17 00:00:00 2001 From: orbiter Date: Tue, 10 Jan 2006 16:48:59 +0000 Subject: [PATCH] added some UTF-8 handling. hope this will help somehow.. for sure not THE solution to our UTF-8 problem git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1308 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/MessageSend_p.java | 9 +++++- htroot/Wiki.java | 16 ++++++---- htroot/htdocsdefault/dir.java | 6 ++-- htroot/yacy/message.java | 9 +++++- source/dbtest.java | 1 + source/de/anomic/data/wikiBoard.java | 8 ++--- source/de/anomic/data/wikiCode.java | 7 +++- .../htmlFilter/htmlFilterAbstractScraper.java | 2 +- .../htmlFilter/htmlFilterContentScraper.java | 2 +- .../htmlFilterContentTransformer.java | 8 ++++- source/de/anomic/http/httpd.java | 2 +- .../anomic/kelondro/kelondroAbstractRA.java | 2 +- source/de/anomic/kelondro/kelondroArray.java | 32 +++++++++++++++++-- .../anomic/kelondro/kelondroBase64Order.java | 21 +++++++++--- .../de/anomic/kelondro/kelondroHashtable.java | 2 +- .../anomic/kelondro/kelondroNaturalOrder.java | 4 +++ .../de/anomic/kelondro/kelondroRecords.java | 2 +- .../anomic/plasma/parser/odt/odtParser.java | 4 +-- .../anomic/plasma/parser/pdf/pdfParser.java | 2 +- source/de/anomic/plasma/plasmaCrawlEURL.java | 18 +++++------ source/de/anomic/plasma/plasmaCrawlLURL.java | 22 ++++++------- source/de/anomic/plasma/plasmaCrawlNURL.java | 14 ++++---- .../de/anomic/plasma/plasmaCrawlStacker.java | 20 ++++++------ .../anomic/plasma/plasmaSwitchboardQueue.java | 16 +++++----- .../anomic/plasma/plasmaWordConnotation.java | 2 +- .../plasma/plasmaWordIndexAssortment.java | 3 +- .../anomic/plasma/plasmaWordIndexCache.java | 4 +-- source/de/anomic/server/serverFileUtils.java | 2 +- source/de/anomic/soap/httpdSoapHandler.java | 2 +- source/de/anomic/soap/httpdSoapService.java | 2 +- source/de/anomic/tools/cryptbig.java | 2 +- source/de/anomic/tools/gzip.java | 2 +- source/de/anomic/yacy/yacyClient.java | 8 +++-- 
source/yacy.java | 2 +- 34 files changed, 168 insertions(+), 90 deletions(-) diff --git a/htroot/MessageSend_p.java b/htroot/MessageSend_p.java index 0f83a3bef..f597b2572 100644 --- a/htroot/MessageSend_p.java +++ b/htroot/MessageSend_p.java @@ -44,6 +44,7 @@ // javac -classpath .:../Classes MessageSend_p.java // if the shell's current path is HTROOT +import java.io.UnsupportedEncodingException; import java.text.SimpleDateFormat; import java.util.Date; import java.util.HashMap; @@ -128,7 +129,13 @@ public class MessageSend_p { if (messagesize < 1000) messagesize = 1000; // debug if (subject.length() > 100) subject = subject.substring(0, 100); if (message.length() > messagesize) message = message.substring(0, messagesize); - HashMap result = yacyClient.postMessage(hash, subject, message.getBytes()); + byte[] mb; + try { + mb = message.getBytes("UTF-8"); + } catch (UnsupportedEncodingException e) { + mb = message.getBytes(); + } + HashMap result = yacyClient.postMessage(hash, subject, mb); body += "

Your message has been sent. The target peer responded:

"; body += "

" + result.get("response") + "

"; } catch (NumberFormatException e) { diff --git a/htroot/Wiki.java b/htroot/Wiki.java index 9191d6ef5..3d9853ffa 100644 --- a/htroot/Wiki.java +++ b/htroot/Wiki.java @@ -73,7 +73,7 @@ public class Wiki { } - public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) { + public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) throws IOException { plasmaSwitchboard switchboard = (plasmaSwitchboard) env; serverObjects prop = new serverObjects(); if (post == null) { @@ -94,11 +94,15 @@ public class Wiki { } } - if (post.containsKey("submit")) { - // store a new page - switchboard.wikiDB.write(switchboard.wikiDB.newEntry(pagename, author, ip, - post.get("reason", "edit"), - post.get("content", "").getBytes())); + if (post.containsKey("submit")) { + // store a new page + byte[] content; + try { + content = post.get("content", "").getBytes("UTF-8"); + } catch (UnsupportedEncodingException e) { + content = post.get("content", "").getBytes(); + } + switchboard.wikiDB.write(switchboard.wikiDB.newEntry(pagename, author, ip, post.get("reason", "edit"), content)); // create a news message HashMap map = new HashMap(); map.put("page", pagename); diff --git a/htroot/htdocsdefault/dir.java b/htroot/htdocsdefault/dir.java index b7a88c667..84abf9060 100644 --- a/htroot/htdocsdefault/dir.java +++ b/htroot/htdocsdefault/dir.java @@ -173,7 +173,7 @@ public class dir { try { serverFileUtils.write(binary, newfile); String md5s = serverCodings.encodeMD5Hex(newfile); - serverFileUtils.write((md5s + "\n" + description).getBytes(), newfilemd5); // generate md5 + serverFileUtils.write((md5s + "\n" + description).getBytes("UTF-8"), newfilemd5); // generate md5 // index file info if (post.get("indexing", "").equals("on")) { @@ -262,7 +262,7 @@ public class dir { // generate md5 on-the-fly md5s = serverCodings.encodeMD5Hex(f); description = ""; - serverFileUtils.write((md5s + "\n" + description).getBytes(), fmd5); + 
serverFileUtils.write((md5s + "\n" + description).getBytes("UTF-8"), fmd5); } } catch (IOException e) { md5s = ""; @@ -478,7 +478,7 @@ public class dir { public static void deletePhrase(plasmaSwitchboard switchboard, String urlstring, String phrase, String descr) { try { final String urlhash = plasmaURL.urlHash(new URL(urlstring)); - final Set words = plasmaCondenser.getWords(("yacyshare " + phrase + " " + descr).getBytes()); + final Set words = plasmaCondenser.getWords(("yacyshare " + phrase + " " + descr).getBytes("UTF-8")); switchboard.removeReferences(urlhash, words); switchboard.urlPool.loadedURL.remove(urlhash); } catch (Exception e) { diff --git a/htroot/yacy/message.java b/htroot/yacy/message.java index 869c684d8..0a92208c5 100644 --- a/htroot/yacy/message.java +++ b/htroot/yacy/message.java @@ -49,6 +49,7 @@ import java.io.File; import java.io.IOException; import java.io.PrintWriter; +import java.io.UnsupportedEncodingException; import java.text.SimpleDateFormat; import java.util.Date; import de.anomic.data.messageBoard; @@ -125,11 +126,17 @@ public final class message { // save message messageBoard.entry msgEntry = null; + byte[] mb; + try { + mb = message.getBytes("UTF-8"); + } catch (UnsupportedEncodingException e) { + mb = message.getBytes(); + } sb.messageDB.write(msgEntry = sb.messageDB.newEntry( "remote", otherSeed.get(yacySeed.NAME, "anonymous"), otherSeed.hash, yacyCore.seedDB.mySeed.getName(), yacyCore.seedDB.mySeed.hash, - subject, message.getBytes())); + subject, mb)); messageForwardingViaEmail(ss, msgEntry); diff --git a/source/dbtest.java b/source/dbtest.java index b87ccb594..aae0e1f33 100644 --- a/source/dbtest.java +++ b/source/dbtest.java @@ -24,6 +24,7 @@ public class dbtest { public final static int keylength = 12; public final static int valuelength = 223; // sum of all data length as defined in plasmaURL + //public final static long buffer = 0; public final static long buffer = 8192 * 1024; // 8 MB buffer public static byte[] 
dummyvalue1 = new byte[valuelength]; public static byte[] dummyvalue2 = new byte[valuelength]; diff --git a/source/de/anomic/data/wikiBoard.java b/source/de/anomic/data/wikiBoard.java index c0390a1c8..29ce2aacc 100644 --- a/source/de/anomic/data/wikiBoard.java +++ b/source/de/anomic/data/wikiBoard.java @@ -149,7 +149,7 @@ public class wikiBoard { return author; } - public entry newEntry(String subject, String author, String ip, String reason, byte[] page) { + public entry newEntry(String subject, String author, String ip, String reason, byte[] page) throws IOException { return new entry(normalize(subject), author, ip, reason, page); } @@ -158,17 +158,17 @@ public class wikiBoard { String key; Map record; - public entry(String subject, String author, String ip, String reason, byte[] page) { + public entry(String subject, String author, String ip, String reason, byte[] page) throws IOException { record = new HashMap(); key = subject; if (key.length() > keyLength) key = key.substring(0, keyLength); record.put("date", dateString()); if ((author == null) || (author.length() == 0)) author = "anonymous"; - record.put("author", kelondroBase64Order.enhancedCoder.encode(author.getBytes())); + record.put("author", kelondroBase64Order.enhancedCoder.encode(author.getBytes("UTF-8"))); if ((ip == null) || (ip.length() == 0)) ip = ""; record.put("ip", ip); if ((reason == null) || (reason.length() == 0)) reason = ""; - record.put("reason", kelondroBase64Order.enhancedCoder.encode(reason.getBytes())); + record.put("reason", kelondroBase64Order.enhancedCoder.encode(reason.getBytes("UTF-8"))); if (page == null) record.put("page", ""); else diff --git a/source/de/anomic/data/wikiCode.java b/source/de/anomic/data/wikiCode.java index 247f2cc2a..59a2d73d1 100644 --- a/source/de/anomic/data/wikiCode.java +++ b/source/de/anomic/data/wikiCode.java @@ -50,6 +50,7 @@ import java.io.BufferedReader; import java.io.ByteArrayInputStream; import java.io.IOException; import 
java.io.InputStreamReader; +import java.io.UnsupportedEncodingException; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverCore; @@ -73,7 +74,11 @@ public class wikiCode { } public String transform(String content){ - return transform(content.getBytes(), sb); + try { + return transform(content.getBytes("UTF-8"), sb); + } catch (UnsupportedEncodingException e) { + return transform(content.getBytes(), sb); + } } public String transform(byte[] content){ return transform(content, sb); diff --git a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java index 584d5605d..7e92d08d8 100644 --- a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java @@ -1 +1 @@ -// htmlFilterAbstractScraper.java // --------------------------- // (C) by Michael Peter Christen; mc@anomic.de // first published on http://www.anomic.de // Frankfurt, Germany, 2004 // last major change: 18.02.2004 // // You agree that the Author(s) is (are) not responsible for cost, // loss of data or any harm that may be caused by usage of this softare or // this documentation. The usage of this software is on your own risk. The // installation and usage (starting/running) of this software may allow other // people or application to access your computer and any attached devices and // is highly dependent on the configuration of the software which must be // done by the user of the software;the author(s) is (are) also // not responsible for proper configuration and usage of the software, even // if provoked by documentation provided together with the software. 
// // THE SOFTWARE THAT FOLLOWS AS ART OF PROGRAMMING BELOW THIS SECTION // IS PUBLISHED UNDER THE GPL AS DOCUMENTED IN THE FILE gpl.txt ASIDE THIS // FILE AND AS IN http://www.gnu.org/licenses/gpl.txt // ANY CHANGES TO THIS FILE ACCORDING TO THE GPL CAN BE DONE TO THE // LINES THAT FOLLOWS THIS COPYRIGHT NOTICE HERE, BUT CHANGES MUST NOT // BE DONE ABOVE OR INSIDE THE COPYRIGHT NOTICE. A RE-DISTRIBUTION // MUST CONTAIN THE INTACT AND UNCHANGED COPYRIGHT NOTICE. // CONTRIBUTIONS AND CHANGES TO THE PROGRAM CODE SHOULD BE MARKED AS SUCH. package de.anomic.htmlFilter; import java.util.HashMap; import java.util.Properties; import java.util.TreeSet; import de.anomic.server.serverByteBuffer; public abstract class htmlFilterAbstractScraper implements htmlFilterScraper { public static final byte lb = (byte) '<'; public static final byte rb = (byte) '>'; public static final byte sl = (byte) '/'; private TreeSet tags0; private TreeSet tags1; // define a translation table for html character codings private static HashMap trans = new HashMap(300); static { trans.put(""", "\""); //Anführungszeichen oben trans.put("&", "&"); //Ampersand-Zeichen, kaufmännisches Und trans.put("<", "<"); //öffnende spitze Klammer trans.put(">", ">"); //schließende spitze Klammer trans.put(" ", " "); //Erzwungenes Leerzeichen trans.put("¡", "!"); //umgekehrtes Ausrufezeichen trans.put("¢", " cent "); //Cent-Zeichen trans.put("£", " pound "); //Pfund-Zeichen trans.put("¤", " currency "); //Währungs-Zeichen trans.put("¥", " yen "); //Yen-Zeichen trans.put("¦", " "); //durchbrochener Strich trans.put("§", " paragraph "); //Paragraph-Zeichen trans.put("¨", " "); //Pünktchen oben trans.put("©", " copyright "); //Copyright-Zeichen trans.put("ª", " "); //Ordinal-Zeichen weiblich trans.put("«", " "); //angewinkelte Anführungszeichen links trans.put("¬", " not "); //Verneinungs-Zeichen trans.put("­", "-"); //kurzer Trennstrich trans.put("®", " trademark "); //Registriermarke-Zeichen trans.put("¯", " "); 
//Überstrich trans.put("°", " degree "); //Grad-Zeichen trans.put("±", " +/- "); //Plusminus-Zeichen trans.put("²", " square "); //Hoch-2-Zeichen trans.put("³", " 3 "); //Hoch-3-Zeichen trans.put("´", " "); //Acute-Zeichen trans.put("µ", " micro "); //Mikro-Zeichen trans.put("¶", " paragraph "); //Absatz-Zeichen trans.put("·", " "); //Mittelpunkt trans.put("¸", " "); //Häkchen unten trans.put("¹", " "); //Hoch-1-Zeichen trans.put("º", " degree "); //Ordinal-Zeichen männlich trans.put("»", " "); //angewinkelte Anführungszeichen rechts trans.put("¼", " quarter "); //ein Viertel trans.put("½", " half "); //ein Halb trans.put("¾", " 3/4 "); //drei Viertel trans.put("¿", "?"); //umgekehrtes Fragezeichen trans.put("À", "A"); //A mit Accent grave trans.put("Á", "A"); //A mit Accent acute trans.put("Â", "A"); //A mit Circumflex trans.put("Ã", "A"); //A mit Tilde trans.put("Ä", "Ae"); //A Umlaut trans.put("Å", "A"); //A mit Ring trans.put("Æ", "A"); //A mit legiertem E trans.put("Ç", "C"); //C mit Häkchen trans.put("È", "E"); //E mit Accent grave trans.put("É", "E"); //E mit Accent acute trans.put("Ê", "E"); //E mit Circumflex trans.put("Ë", "E"); //E Umlaut trans.put("Ì", "I"); //I mit Accent grave trans.put("Í", "I"); //I mit Accent acute trans.put("Î", "I"); //I mit Circumflex trans.put("Ï", "I"); //I Umlaut trans.put("Ð", "D"); //Eth (isländisch) trans.put("Ñ", "N"); //N mit Tilde trans.put("Ò", "O"); //O mit Accent grave trans.put("Ó", "O"); //O mit Accent acute trans.put("Ô", "O"); //O mit Circumflex trans.put("Õ", "O"); //O mit Tilde trans.put("Ö", "Oe"); //O Umlaut trans.put("×", " times "); //Mal-Zeichen trans.put("Ø", "O"); //O mit Schrägstrich trans.put("Ù", "U"); //U mit Accent grave trans.put("Ú", "U"); //U mit Accent acute trans.put("Û", "U"); //U mit Circumflex trans.put("Ü", "Ue"); //U Umlaut trans.put("Ý", "Y"); //Y mit Accent acute trans.put("Þ", "P"); //THORN (isländisch) trans.put("ß", "ss"); //scharfes S trans.put("à", "a"); //a mit Accent grave 
trans.put("á", "a"); //a mit Accent acute trans.put("â", "a"); //a mit Circumflex trans.put("ã", "a"); //a mit Tilde trans.put("ä", "ae"); //a Umlaut trans.put("å", "a"); //a mit Ring trans.put("æ", "a"); //a mit legiertem e trans.put("ç", "c"); //c mit Häkchen trans.put("è", "e"); //e mit Accent grave trans.put("é", "e"); //e mit Accent acute trans.put("ê", "e"); //e mit Circumflex trans.put("ë", "e"); //e Umlaut trans.put("ì", "i"); //i mit Accent grave trans.put("í", "i"); //i mit Accent acute trans.put("î", "i"); //i mit Circumflex trans.put("ï", "i"); //i Umlaut trans.put("ð", "d"); //eth (isländisch) trans.put("ñ", "n"); //n mit Tilde trans.put("ò", "o"); //o mit Accent grave trans.put("ó", "o"); //o mit Accent acute trans.put("ô", "o"); //o mit Circumflex trans.put("õ", "o"); //o mit Tilde trans.put("ö", "oe"); //o Umlaut trans.put("÷", "%"); //Divisions-Zeichen trans.put("ø", "o"); //o mit Schrägstrich trans.put("ù", "u"); //u mit Accent grave trans.put("ú", "u"); //u mit Accent acute trans.put("û", "u"); //u mit Circumflex trans.put("ü", "ue"); //u Umlaut trans.put("ý", "y"); //y mit Accent acute trans.put("þ", "p"); //thorn (isländisch) trans.put("ÿ", "y"); //y Umlaut trans.put("Α", " Alpha "); //Alpha groß trans.put("α", " alpha "); //alpha klein trans.put("Β", " Beta "); //Beta groß trans.put("β", " beta "); //beta klein trans.put("Γ", " Gamma "); //Gamma groß trans.put("γ", " gamma "); //gamma klein trans.put("Δ", " Delta "); //Delta groß trans.put("δ", " delta "); //delta klein trans.put("Ε", " Epsilon "); //Epsilon groß trans.put("ε", " epsilon "); //epsilon klein trans.put("Ζ", " Zeta "); //Zeta groß trans.put("ζ", " zeta "); //zeta klein trans.put("Η", " Eta "); //Eta groß trans.put("η", " eta "); //eta klein trans.put("Θ", " Theta "); //Theta groß trans.put("θ", " theta "); //theta klein trans.put("Ι", " Iota "); //Iota groß trans.put("ι", " iota "); //iota klein trans.put("Κ", " Kappa "); //Kappa groß trans.put("κ", " kappa "); //kappa klein 
trans.put("Λ", " Lambda "); //Lambda groß trans.put("λ", " lambda "); //lambda klein trans.put("Μ", " Mu "); //Mu groß trans.put("μ", " mu "); //mu klein trans.put("Ν", " Nu "); //Nu groß trans.put("ν", " nu "); //nu klein trans.put("Ξ", " Xi "); //Xi groß trans.put("ξ", " xi "); //xi klein trans.put("Ο", " Omicron "); //Omicron groß trans.put("ο", " omicron "); //omicron klein trans.put("Π", " Pi "); //Pi groß trans.put("π", " pi "); //pi klein trans.put("Ρ", " Rho "); //Rho groß trans.put("ρ", " rho "); //rho klein trans.put("Σ", " Sigma "); //Sigma groß trans.put("ς", " sigma "); //sigmaf klein trans.put("σ", " sigma "); //sigma klein trans.put("Τ", " Tau "); //Tau groß trans.put("τ", " tau "); //tau klein trans.put("Υ", " Ypsilon "); //Upsilon groß trans.put("υ", " ypsilon "); //upsilon klein trans.put("Φ", " Phi "); //Phi groß trans.put("φ", " phi "); //phi klein trans.put("Χ", " Chi "); //Chi groß trans.put("χ", " chi "); //chi klein trans.put("Ψ", " Psi "); //Psi groß trans.put("ψ", " psi "); //psi klein trans.put("Ω", " Omega "); //Omega groß trans.put("ω", " omega "); //omega klein trans.put("ϑ", " theta "); //theta Symbol trans.put("ϒ", " ypsilon "); //upsilon mit Haken trans.put("ϖ", " pi "); //pi Symbol trans.put("∀", " for all "); //für alle trans.put("∂", " part of "); //teilweise trans.put("∃", " exists "); //existiert trans.put("∅", " null "); //leer trans.put("∇", " nabla "); //nabla trans.put("∈", " element of "); //Element von trans.put("∉", " not element of "); //kein Element von trans.put("∋", " contains "); //enthält als Element trans.put("∏", " product "); //Produkt trans.put("∑", " sum "); //Summe trans.put("−", " minus "); //minus trans.put("∗", " times "); //Asterisk trans.put("√", " sqare root "); //Quadratwurzel trans.put("∝", " proportional to "); //proportional zu trans.put("∞", " unlimited "); //unendlich trans.put("∠", " angle "); //Winkel trans.put("∧", " and "); //und trans.put("∨", " or "); //oder trans.put("∩", " "); 
//Schnittpunkt trans.put("∪", " unity "); //Einheit trans.put("∫", " integral "); //Integral trans.put("∴", " cause "); //deshalb trans.put("∼", " similar to "); //ähnlich wie trans.put("≅", " equal "); //annähernd gleich trans.put("≈", " equal "); //beinahe gleich trans.put("≠", " not equal "); //ungleich trans.put("≡", " identical "); //identisch mit trans.put("≤", " smaller or equal than "); //kleiner gleich trans.put("≥", " greater or equal than "); //größer gleich trans.put("⊂", " subset of "); //Untermenge von trans.put("⊃", " superset of "); //Obermenge von trans.put("⊄", " not subset of "); //keine Untermenge von trans.put("⊆", ""); //Untermenge von oder gleich mit trans.put("⊇", ""); //Obermenge von oder gleich mit trans.put("⊕", ""); //Direktsumme trans.put("⊗", ""); //Vektorprodukt trans.put("⊥", ""); //senkrecht zu trans.put("⋅", ""); //Punkt-Operator trans.put("◊", ""); //Raute trans.put("⌈", ""); //links oben trans.put("⌉", ""); //rechts oben trans.put("⌊", ""); //links unten trans.put("⌋", ""); //rechts unten trans.put("⟨", ""); //spitze Klammer links trans.put("⟩", ""); //spitze Klammer rechts trans.put("←", ""); //Pfeil links trans.put("↑", ""); //Pfeil oben trans.put("→", ""); //Pfeil rechts trans.put("↓", ""); //Pfeil unten trans.put("↔", ""); //Pfeil links/rechts trans.put("↵", ""); //Pfeil unten-Knick-links trans.put("⇐", ""); //Doppelpfeil links trans.put("⇑", ""); //Doppelpfeil oben trans.put("⇒", ""); //Doppelpfeil rechts trans.put("⇓", ""); //Doppelpfeil unten trans.put("⇔", ""); //Doppelpfeil links/rechts trans.put("•", ""); //Bullet-Zeichen trans.put("…", ""); //Horizontale Ellipse trans.put("′", ""); //Minutenzeichen trans.put("‾", ""); //Überstrich trans.put("⁄", ""); //Bruchstrich trans.put("℘", ""); //Weierstrass p trans.put("ℑ", ""); //Zeichen für "imaginär" trans.put("ℜ", ""); //Zeichen für "real" trans.put("™", ""); //Trademark-Zeichen trans.put("€", ""); //Euro-Zeichen trans.put("ℵ", ""); //Alef-Symbol trans.put("♠", ""); 
//Pik-Zeichen trans.put("♣", ""); //Kreuz-Zeichen trans.put("♥", ""); //Herz-Zeichen trans.put("♦", ""); //Karo-Zeichen trans.put(" ", ""); //Leerzeichen Breite n trans.put(" ", ""); //Leerzeichen Breite m trans.put(" ", ""); //Schmales Leerzeichen trans.put("‌", ""); //null breiter Nichtverbinder trans.put("‍", ""); //null breiter Verbinder trans.put("‎", ""); //links-nach-rechts-Zeichen trans.put("‏", ""); //rechts-nach-links-Zeichen trans.put("–", ""); //Gedankenstrich Breite n trans.put("—", ""); //Gedankenstrich Breite m trans.put("‘", ""); //einfaches Anführungszeichen links trans.put("’", ""); //einfaches Anführungszeichen rechts trans.put("‚", ""); //einfaches low-9-Zeichen trans.put("“", ""); //doppeltes Anführungszeichen links trans.put("”", ""); //doppeltes Anführungszeichen rechts trans.put("„", ""); //doppeltes low-9-Zeichen rechts trans.put("†", ""); //Kreuz trans.put("‡", ""); //Doppelkreuz trans.put("‰", ""); //zu tausend trans.put("‹", ""); //angewinkeltes einzelnes Anf.zeichen links trans.put("›", ""); //angewinkeltes einzelnes Anf.zeichen rechts } public htmlFilterAbstractScraper(TreeSet tags0, TreeSet tags1) { this.tags0 = tags0; this.tags1 = tags1; } public boolean isTag0(String tag) { return (tags0 != null) && (tags0.contains(tag)); } public boolean isTag1(String tag) { return (tags1 != null) && (tags1.contains(tag)); } //the 'missing' method that shall be implemented: public abstract void scrapeText(byte[] text); // the other methods must take into account to construct the return value correctly public void scrapeTag0(String tagname, Properties tagopts) { } public void scrapeTag1(String tagname, Properties tagopts, byte[] text) { } // string conversions private static String code_iso8859s(int c) { switch (c & 0xff) { // german umlaute and ligaturen case 0xc4: return "AE"; case 0xd6: return "OE"; case 0xdc: return "UE"; case 0xe4: return "ae"; case 0xf6: return "oe"; case 0xfc: return "ue"; case 0xdf: return "ss"; // accent on letters; i.e. 
french characters case 0xc0: case 0xc1: case 0xc2: case 0xc3: case 0xc5: return "A"; case 0xc6: return "AE"; case 0xc7: return "C"; case 0xc8: case 0xc9: case 0xca: return "E"; case 0xcc: case 0xcd: case 0xce: case 0xcf: return "I"; case 0xd0: return "D"; case 0xd1: return "N"; case 0xd2: case 0xd3: case 0xd4: case 0xd5: case 0xd8: return "O"; case 0xd7: return "x"; case 0xd9: case 0xda: case 0xdb: return "U"; case 0xdd: return "Y"; case 0xde: return "p"; case 0xe0: case 0xe1: case 0xe2: case 0xe3: case 0xe5: return "a"; case 0xe6: return "ae"; case 0xe7: return "c"; case 0xe8: case 0xe9: case 0xea: return "e"; case 0xec: case 0xed: case 0xee: case 0xef: return "i"; case 0xf0: return "d"; case 0xf1: return "n"; case 0xf2: case 0xf3: case 0xf4: case 0xf5: case 0xf8: return "o"; case 0xf7: return "%"; case 0xf9: case 0xfa: case 0xfb: return "u"; case 0xfd: case 0xff: return "y"; case 0xfe: return "p"; // special characters case 0xa4: return " euro "; default: return null; } } public static serverByteBuffer convertUmlaute(serverByteBuffer bb) { serverByteBuffer t = new serverByteBuffer(bb.length() + 20); int b0, b1, b2; String z; int i = 0; while (i < bb.length()) { b0 = bb.byteAt(i) & 0xff; // check utf-8 encoding if ((b0 < 128) || (i + 1 == bb.length())) { t.append(b0); i++; } else { b1 = bb.byteAt(i + 1) & 0xff; if (b1 > 0x3f) { z = code_iso8859s(b0); i++; } else if ((b0 > 0xbf) && (b0 < 0xe0)) { z = code_iso8859s(((b0 & 0x1f) << 0x6) | (b1 & 0x3f)); i += 2; } else { if (i + 2 >= bb.length()) { z = null; i++; } else { b2 = bb.byteAt(i + 2) & 0xff; if (b2 > 0x3f) { z = code_iso8859s(b0); i++; } else { z = code_iso8859s(((b0 & 0xf) << 0xc) | ((b1 & 0x3f) << 0x6) | (b2 & 0x3f)); i += 3; } } } if (z == null) t.append(b0); else t.append(z); } } return t; } private static byte[] transscript(byte[] code) { String t = (String) trans.get(new String(code)); if (t == null) return new byte[0]; return t.getBytes(); } protected static serverByteBuffer 
transscriptAll(serverByteBuffer bb) { int p0 = 0, p1; byte[] t; while ((p0 = bb.indexOf((byte) '&', p0)) >= 0) { p1 = bb.indexOf((byte) ';', p0); if (p1 >= 0) { t = transscript(bb.getBytes(p0, p1 + 1)); bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length() + p0 - p1 + t.length).append(t).append(bb.getBytes(p1 + 1)); } else { bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length()).append(bb.getBytes(p0 + 1)); } } t = null; return bb; } protected static serverByteBuffer stripAllTags(serverByteBuffer bb) { int p0 = 0, p1; while ((p0 = bb.indexOf(lb, p0)) >= 0) { p1 = bb.indexOf(rb, p0); if (p1 >= 0) { bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length() + p0 - p1 + 1).trim().append((byte) 32).append(new serverByteBuffer(bb.getBytes(p1 + 1)).trim()); } else { bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length()).trim().append(new serverByteBuffer(bb.getBytes(p0 + 1)).trim()); } } return bb.trim(); } public static serverByteBuffer stripAll(serverByteBuffer bb) { //return stripAllTags(s); return convertUmlaute(transscriptAll(stripAllTags(bb))); } public void close() { // free resources tags0 = null; tags1 = null; } public void finalize() { close(); } } \ No newline at end of file +// htmlFilterAbstractScraper.java // --------------------------- // (C) by Michael Peter Christen; mc@anomic.de // first published on http://www.anomic.de // Frankfurt, Germany, 2004 // last major change: 18.02.2004 // // You agree that the Author(s) is (are) not responsible for cost, // loss of data or any harm that may be caused by usage of this softare or // this documentation. The usage of this software is on your own risk. 
The // installation and usage (starting/running) of this software may allow other // people or application to access your computer and any attached devices and // is highly dependent on the configuration of the software which must be // done by the user of the software;the author(s) is (are) also // not responsible for proper configuration and usage of the software, even // if provoked by documentation provided together with the software. // // THE SOFTWARE THAT FOLLOWS AS ART OF PROGRAMMING BELOW THIS SECTION // IS PUBLISHED UNDER THE GPL AS DOCUMENTED IN THE FILE gpl.txt ASIDE THIS // FILE AND AS IN http://www.gnu.org/licenses/gpl.txt // ANY CHANGES TO THIS FILE ACCORDING TO THE GPL CAN BE DONE TO THE // LINES THAT FOLLOWS THIS COPYRIGHT NOTICE HERE, BUT CHANGES MUST NOT // BE DONE ABOVE OR INSIDE THE COPYRIGHT NOTICE. A RE-DISTRIBUTION // MUST CONTAIN THE INTACT AND UNCHANGED COPYRIGHT NOTICE. // CONTRIBUTIONS AND CHANGES TO THE PROGRAM CODE SHOULD BE MARKED AS SUCH. package de.anomic.htmlFilter; import java.io.UnsupportedEncodingException; import java.util.HashMap; import java.util.Properties; import java.util.TreeSet; import de.anomic.server.serverByteBuffer; public abstract class htmlFilterAbstractScraper implements htmlFilterScraper { public static final byte lb = (byte) '<'; public static final byte rb = (byte) '>'; public static final byte sl = (byte) '/'; private TreeSet tags0; private TreeSet tags1; // define a translation table for html character codings private static HashMap trans = new HashMap(300); static { trans.put(""", "\""); //Anführungszeichen oben trans.put("&", "&"); //Ampersand-Zeichen, kaufmännisches Und trans.put("<", "<"); //öffnende spitze Klammer trans.put(">", ">"); //schließende spitze Klammer trans.put(" ", " "); //Erzwungenes Leerzeichen trans.put("¡", "!"); //umgekehrtes Ausrufezeichen trans.put("¢", " cent "); //Cent-Zeichen trans.put("£", " pound "); //Pfund-Zeichen trans.put("¤", " currency "); //Währungs-Zeichen trans.put("¥", 
" yen "); //Yen-Zeichen trans.put("¦", " "); //durchbrochener Strich trans.put("§", " paragraph "); //Paragraph-Zeichen trans.put("¨", " "); //Pünktchen oben trans.put("©", " copyright "); //Copyright-Zeichen trans.put("ª", " "); //Ordinal-Zeichen weiblich trans.put("«", " "); //angewinkelte Anführungszeichen links trans.put("¬", " not "); //Verneinungs-Zeichen trans.put("­", "-"); //kurzer Trennstrich trans.put("®", " trademark "); //Registriermarke-Zeichen trans.put("¯", " "); //Überstrich trans.put("°", " degree "); //Grad-Zeichen trans.put("±", " +/- "); //Plusminus-Zeichen trans.put("²", " square "); //Hoch-2-Zeichen trans.put("³", " 3 "); //Hoch-3-Zeichen trans.put("´", " "); //Acute-Zeichen trans.put("µ", " micro "); //Mikro-Zeichen trans.put("¶", " paragraph "); //Absatz-Zeichen trans.put("·", " "); //Mittelpunkt trans.put("¸", " "); //Häkchen unten trans.put("¹", " "); //Hoch-1-Zeichen trans.put("º", " degree "); //Ordinal-Zeichen männlich trans.put("»", " "); //angewinkelte Anführungszeichen rechts trans.put("¼", " quarter "); //ein Viertel trans.put("½", " half "); //ein Halb trans.put("¾", " 3/4 "); //drei Viertel trans.put("¿", "?"); //umgekehrtes Fragezeichen trans.put("À", "A"); //A mit Accent grave trans.put("Á", "A"); //A mit Accent acute trans.put("Â", "A"); //A mit Circumflex trans.put("Ã", "A"); //A mit Tilde trans.put("Ä", "Ae"); //A Umlaut trans.put("Å", "A"); //A mit Ring trans.put("Æ", "A"); //A mit legiertem E trans.put("Ç", "C"); //C mit Häkchen trans.put("È", "E"); //E mit Accent grave trans.put("É", "E"); //E mit Accent acute trans.put("Ê", "E"); //E mit Circumflex trans.put("Ë", "E"); //E Umlaut trans.put("Ì", "I"); //I mit Accent grave trans.put("Í", "I"); //I mit Accent acute trans.put("Î", "I"); //I mit Circumflex trans.put("Ï", "I"); //I Umlaut trans.put("Ð", "D"); //Eth (isländisch) trans.put("Ñ", "N"); //N mit Tilde trans.put("Ò", "O"); //O mit Accent grave trans.put("Ó", "O"); //O mit Accent acute trans.put("Ô", "O"); //O mit 
Circumflex trans.put("Õ", "O"); //O mit Tilde trans.put("Ö", "Oe"); //O Umlaut trans.put("×", " times "); //Mal-Zeichen trans.put("Ø", "O"); //O mit Schrägstrich trans.put("Ù", "U"); //U mit Accent grave trans.put("Ú", "U"); //U mit Accent acute trans.put("Û", "U"); //U mit Circumflex trans.put("Ü", "Ue"); //U Umlaut trans.put("Ý", "Y"); //Y mit Accent acute trans.put("Þ", "P"); //THORN (isländisch) trans.put("ß", "ss"); //scharfes S trans.put("à", "a"); //a mit Accent grave trans.put("á", "a"); //a mit Accent acute trans.put("â", "a"); //a mit Circumflex trans.put("ã", "a"); //a mit Tilde trans.put("ä", "ae"); //a Umlaut trans.put("å", "a"); //a mit Ring trans.put("æ", "a"); //a mit legiertem e trans.put("ç", "c"); //c mit Häkchen trans.put("è", "e"); //e mit Accent grave trans.put("é", "e"); //e mit Accent acute trans.put("ê", "e"); //e mit Circumflex trans.put("ë", "e"); //e Umlaut trans.put("ì", "i"); //i mit Accent grave trans.put("í", "i"); //i mit Accent acute trans.put("î", "i"); //i mit Circumflex trans.put("ï", "i"); //i Umlaut trans.put("ð", "d"); //eth (isländisch) trans.put("ñ", "n"); //n mit Tilde trans.put("ò", "o"); //o mit Accent grave trans.put("ó", "o"); //o mit Accent acute trans.put("ô", "o"); //o mit Circumflex trans.put("õ", "o"); //o mit Tilde trans.put("ö", "oe"); //o Umlaut trans.put("÷", "%"); //Divisions-Zeichen trans.put("ø", "o"); //o mit Schrägstrich trans.put("ù", "u"); //u mit Accent grave trans.put("ú", "u"); //u mit Accent acute trans.put("û", "u"); //u mit Circumflex trans.put("ü", "ue"); //u Umlaut trans.put("ý", "y"); //y mit Accent acute trans.put("þ", "p"); //thorn (isländisch) trans.put("ÿ", "y"); //y Umlaut trans.put("Α", " Alpha "); //Alpha groß trans.put("α", " alpha "); //alpha klein trans.put("Β", " Beta "); //Beta groß trans.put("β", " beta "); //beta klein trans.put("Γ", " Gamma "); //Gamma groß trans.put("γ", " gamma "); //gamma klein trans.put("Δ", " Delta "); //Delta groß trans.put("δ", " delta "); //delta klein 
trans.put("Ε", " Epsilon "); //Epsilon groß trans.put("ε", " epsilon "); //epsilon klein trans.put("Ζ", " Zeta "); //Zeta groß trans.put("ζ", " zeta "); //zeta klein trans.put("Η", " Eta "); //Eta groß trans.put("η", " eta "); //eta klein trans.put("Θ", " Theta "); //Theta groß trans.put("θ", " theta "); //theta klein trans.put("Ι", " Iota "); //Iota groß trans.put("ι", " iota "); //iota klein trans.put("Κ", " Kappa "); //Kappa groß trans.put("κ", " kappa "); //kappa klein trans.put("Λ", " Lambda "); //Lambda groß trans.put("λ", " lambda "); //lambda klein trans.put("Μ", " Mu "); //Mu groß trans.put("μ", " mu "); //mu klein trans.put("Ν", " Nu "); //Nu groß trans.put("ν", " nu "); //nu klein trans.put("Ξ", " Xi "); //Xi groß trans.put("ξ", " xi "); //xi klein trans.put("Ο", " Omicron "); //Omicron groß trans.put("ο", " omicron "); //omicron klein trans.put("Π", " Pi "); //Pi groß trans.put("π", " pi "); //pi klein trans.put("Ρ", " Rho "); //Rho groß trans.put("ρ", " rho "); //rho klein trans.put("Σ", " Sigma "); //Sigma groß trans.put("ς", " sigma "); //sigmaf klein trans.put("σ", " sigma "); //sigma klein trans.put("Τ", " Tau "); //Tau groß trans.put("τ", " tau "); //tau klein trans.put("Υ", " Ypsilon "); //Upsilon groß trans.put("υ", " ypsilon "); //upsilon klein trans.put("Φ", " Phi "); //Phi groß trans.put("φ", " phi "); //phi klein trans.put("Χ", " Chi "); //Chi groß trans.put("χ", " chi "); //chi klein trans.put("Ψ", " Psi "); //Psi groß trans.put("ψ", " psi "); //psi klein trans.put("Ω", " Omega "); //Omega groß trans.put("ω", " omega "); //omega klein trans.put("ϑ", " theta "); //theta Symbol trans.put("ϒ", " ypsilon "); //upsilon mit Haken trans.put("ϖ", " pi "); //pi Symbol trans.put("∀", " for all "); //für alle trans.put("∂", " part of "); //teilweise trans.put("∃", " exists "); //existiert trans.put("∅", " null "); //leer trans.put("∇", " nabla "); //nabla trans.put("∈", " element of "); //Element von trans.put("∉", " not element of "); //kein Element 
von trans.put("∋", " contains "); //enthält als Element trans.put("∏", " product "); //Produkt trans.put("∑", " sum "); //Summe trans.put("−", " minus "); //minus trans.put("∗", " times "); //Asterisk trans.put("√", " sqare root "); //Quadratwurzel trans.put("∝", " proportional to "); //proportional zu trans.put("∞", " unlimited "); //unendlich trans.put("∠", " angle "); //Winkel trans.put("∧", " and "); //und trans.put("∨", " or "); //oder trans.put("∩", " "); //Schnittpunkt trans.put("∪", " unity "); //Einheit trans.put("∫", " integral "); //Integral trans.put("∴", " cause "); //deshalb trans.put("∼", " similar to "); //ähnlich wie trans.put("≅", " equal "); //annähernd gleich trans.put("≈", " equal "); //beinahe gleich trans.put("≠", " not equal "); //ungleich trans.put("≡", " identical "); //identisch mit trans.put("≤", " smaller or equal than "); //kleiner gleich trans.put("≥", " greater or equal than "); //größer gleich trans.put("⊂", " subset of "); //Untermenge von trans.put("⊃", " superset of "); //Obermenge von trans.put("⊄", " not subset of "); //keine Untermenge von trans.put("⊆", ""); //Untermenge von oder gleich mit trans.put("⊇", ""); //Obermenge von oder gleich mit trans.put("⊕", ""); //Direktsumme trans.put("⊗", ""); //Vektorprodukt trans.put("⊥", ""); //senkrecht zu trans.put("⋅", ""); //Punkt-Operator trans.put("◊", ""); //Raute trans.put("⌈", ""); //links oben trans.put("⌉", ""); //rechts oben trans.put("⌊", ""); //links unten trans.put("⌋", ""); //rechts unten trans.put("⟨", ""); //spitze Klammer links trans.put("⟩", ""); //spitze Klammer rechts trans.put("←", ""); //Pfeil links trans.put("↑", ""); //Pfeil oben trans.put("→", ""); //Pfeil rechts trans.put("↓", ""); //Pfeil unten trans.put("↔", ""); //Pfeil links/rechts trans.put("↵", ""); //Pfeil unten-Knick-links trans.put("⇐", ""); //Doppelpfeil links trans.put("⇑", ""); //Doppelpfeil oben trans.put("⇒", ""); //Doppelpfeil rechts trans.put("⇓", ""); //Doppelpfeil unten trans.put("⇔", ""); 
//Doppelpfeil links/rechts trans.put("•", ""); //Bullet-Zeichen trans.put("…", ""); //Horizontale Ellipse trans.put("′", ""); //Minutenzeichen trans.put("‾", ""); //Überstrich trans.put("⁄", ""); //Bruchstrich trans.put("℘", ""); //Weierstrass p trans.put("ℑ", ""); //Zeichen für "imaginär" trans.put("ℜ", ""); //Zeichen für "real" trans.put("™", ""); //Trademark-Zeichen trans.put("€", ""); //Euro-Zeichen trans.put("ℵ", ""); //Alef-Symbol trans.put("♠", ""); //Pik-Zeichen trans.put("♣", ""); //Kreuz-Zeichen trans.put("♥", ""); //Herz-Zeichen trans.put("♦", ""); //Karo-Zeichen trans.put(" ", ""); //Leerzeichen Breite n trans.put(" ", ""); //Leerzeichen Breite m trans.put(" ", ""); //Schmales Leerzeichen trans.put("‌", ""); //null breiter Nichtverbinder trans.put("‍", ""); //null breiter Verbinder trans.put("‎", ""); //links-nach-rechts-Zeichen trans.put("‏", ""); //rechts-nach-links-Zeichen trans.put("–", ""); //Gedankenstrich Breite n trans.put("—", ""); //Gedankenstrich Breite m trans.put("‘", ""); //einfaches Anführungszeichen links trans.put("’", ""); //einfaches Anführungszeichen rechts trans.put("‚", ""); //einfaches low-9-Zeichen trans.put("“", ""); //doppeltes Anführungszeichen links trans.put("”", ""); //doppeltes Anführungszeichen rechts trans.put("„", ""); //doppeltes low-9-Zeichen rechts trans.put("†", ""); //Kreuz trans.put("‡", ""); //Doppelkreuz trans.put("‰", ""); //zu tausend trans.put("‹", ""); //angewinkeltes einzelnes Anf.zeichen links trans.put("›", ""); //angewinkeltes einzelnes Anf.zeichen rechts } public htmlFilterAbstractScraper(TreeSet tags0, TreeSet tags1) { this.tags0 = tags0; this.tags1 = tags1; } public boolean isTag0(String tag) { return (tags0 != null) && (tags0.contains(tag)); } public boolean isTag1(String tag) { return (tags1 != null) && (tags1.contains(tag)); } //the 'missing' method that shall be implemented: public abstract void scrapeText(byte[] text); // the other methods must take into account to construct the return value 
correctly public void scrapeTag0(String tagname, Properties tagopts) { } public void scrapeTag1(String tagname, Properties tagopts, byte[] text) { } // string conversions private static String code_iso8859s(int c) { switch (c & 0xff) { // german umlaute and ligaturen case 0xc4: return "AE"; case 0xd6: return "OE"; case 0xdc: return "UE"; case 0xe4: return "ae"; case 0xf6: return "oe"; case 0xfc: return "ue"; case 0xdf: return "ss"; // accent on letters; i.e. french characters case 0xc0: case 0xc1: case 0xc2: case 0xc3: case 0xc5: return "A"; case 0xc6: return "AE"; case 0xc7: return "C"; case 0xc8: case 0xc9: case 0xca: return "E"; case 0xcc: case 0xcd: case 0xce: case 0xcf: return "I"; case 0xd0: return "D"; case 0xd1: return "N"; case 0xd2: case 0xd3: case 0xd4: case 0xd5: case 0xd8: return "O"; case 0xd7: return "x"; case 0xd9: case 0xda: case 0xdb: return "U"; case 0xdd: return "Y"; case 0xde: return "p"; case 0xe0: case 0xe1: case 0xe2: case 0xe3: case 0xe5: return "a"; case 0xe6: return "ae"; case 0xe7: return "c"; case 0xe8: case 0xe9: case 0xea: return "e"; case 0xec: case 0xed: case 0xee: case 0xef: return "i"; case 0xf0: return "d"; case 0xf1: return "n"; case 0xf2: case 0xf3: case 0xf4: case 0xf5: case 0xf8: return "o"; case 0xf7: return "%"; case 0xf9: case 0xfa: case 0xfb: return "u"; case 0xfd: case 0xff: return "y"; case 0xfe: return "p"; // special characters case 0xa4: return " euro "; default: return null; } } public static serverByteBuffer convertUmlaute(serverByteBuffer bb) { serverByteBuffer t = new serverByteBuffer(bb.length() + 20); int b0, b1, b2; String z; int i = 0; while (i < bb.length()) { b0 = bb.byteAt(i) & 0xff; // check utf-8 encoding if ((b0 < 128) || (i + 1 == bb.length())) { t.append(b0); i++; } else { b1 = bb.byteAt(i + 1) & 0xff; if (b1 > 0x3f) { z = code_iso8859s(b0); i++; } else if ((b0 > 0xbf) && (b0 < 0xe0)) { z = code_iso8859s(((b0 & 0x1f) << 0x6) | (b1 & 0x3f)); i += 2; } else { if (i + 2 >= bb.length()) { z = null; i++; } 
else { b2 = bb.byteAt(i + 2) & 0xff; if (b2 > 0x3f) { z = code_iso8859s(b0); i++; } else { z = code_iso8859s(((b0 & 0xf) << 0xc) | ((b1 & 0x3f) << 0x6) | (b2 & 0x3f)); i += 3; } } } if (z == null) t.append(b0); else t.append(z); } } return t; } private static byte[] transscript(byte[] code) { String t; try { t = (String) trans.get(new String(code, "UTF-8")); } catch (UnsupportedEncodingException e) { t = null; } if (t == null) return new byte[0]; return t.getBytes(); } protected static serverByteBuffer transscriptAll(serverByteBuffer bb) { int p0 = 0, p1; byte[] t; while ((p0 = bb.indexOf((byte) '&', p0)) >= 0) { p1 = bb.indexOf((byte) ';', p0); if (p1 >= 0) { t = transscript(bb.getBytes(p0, p1 + 1)); bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length() + p0 - p1 + t.length).append(t).append(bb.getBytes(p1 + 1)); } else { bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length()).append(bb.getBytes(p0 + 1)); } } t = null; return bb; } protected static serverByteBuffer stripAllTags(serverByteBuffer bb) { int p0 = 0, p1; while ((p0 = bb.indexOf(lb, p0)) >= 0) { p1 = bb.indexOf(rb, p0); if (p1 >= 0) { bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length() + p0 - p1 + 1).trim().append((byte) 32).append(new serverByteBuffer(bb.getBytes(p1 + 1)).trim()); } else { bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length()).trim().append(new serverByteBuffer(bb.getBytes(p0 + 1)).trim()); } } return bb.trim(); } public static serverByteBuffer stripAll(serverByteBuffer bb) { //return stripAllTags(s); return convertUmlaute(transscriptAll(stripAllTags(bb))); } public void close() { // free resources tags0 = null; tags1 = null; } public void finalize() { close(); } } \ No newline at end of file diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java index ffd66871e..3f987fcf5 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java +++ 
b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java @@ -230,7 +230,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen } public byte[] getText() { - return content.getBytes(); + return content.getBytes(); } public Map getAnchors() { diff --git a/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java b/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java index d619dd021..5cd92349b 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java @@ -46,6 +46,7 @@ package de.anomic.htmlFilter; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; +import java.io.UnsupportedEncodingException; import java.text.Collator; import java.util.ArrayList; import java.util.Locale; @@ -117,7 +118,12 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer private boolean hit(byte[] text) { if (text == null || bluelist == null) return false; - String lc = new String(text).toLowerCase(); + String lc; + try { + lc = new String(text, "UTF-8").toLowerCase(); + } catch (UnsupportedEncodingException e) { + lc = new String(text).toLowerCase(); + } for (int i = 0; i < bluelist.size(); i++) { if (lc.indexOf((String) bluelist.get(i)) >= 0) return true; } diff --git a/source/de/anomic/http/httpd.java b/source/de/anomic/http/httpd.java index bd41bb86e..5d616bff3 100644 --- a/source/de/anomic/http/httpd.java +++ b/source/de/anomic/http/httpd.java @@ -710,7 +710,7 @@ public final class httpd implements serverHandler { bout.close(); bout = null; } - int argc = parseArgs(args, new String(buffer)); + int argc = parseArgs(args, new String(buffer, "UTF-8")); buffer = null; return argc; } diff --git a/source/de/anomic/kelondro/kelondroAbstractRA.java b/source/de/anomic/kelondro/kelondroAbstractRA.java index e48a76d84..4663de6de 100644 --- a/source/de/anomic/kelondro/kelondroAbstractRA.java +++ 
b/source/de/anomic/kelondro/kelondroAbstractRA.java @@ -152,7 +152,7 @@ abstract class kelondroAbstractRA implements kelondroRA { return new String(bb, 0, bbsize); } if (c == cr) continue; - if (c == lf) return new String(bb, 0, bbsize); + if (c == lf) return new String(bb, 0, bbsize, "UTF-8"); // append to bb if (bbsize == bb.length) { diff --git a/source/de/anomic/kelondro/kelondroArray.java b/source/de/anomic/kelondro/kelondroArray.java index b31616b45..37ac43676 100644 --- a/source/de/anomic/kelondro/kelondroArray.java +++ b/source/de/anomic/kelondro/kelondroArray.java @@ -98,7 +98,6 @@ public class kelondroArray extends kelondroRecords { return getNode(new Handle(index)).getValues(); } - public synchronized int seti(int index, int value) throws IOException { int before = getHandle(index).hashCode(); setHandle(index, new Handle(value)); @@ -109,13 +108,28 @@ public class kelondroArray extends kelondroRecords { return getHandle(index).hashCode(); } + public synchronized int add(byte[][] row) throws IOException { + if (row.length != columns()) + throw new IllegalArgumentException("add: wrong row length " + row.length + "; must be " + columns()); + + Node n = newNode(); + n.commit(CP_LOW); + int index = n.handle().hashCode(); + set(index, row); + return index; + } + + public synchronized void remove(int index) throws IOException { + deleteNode(new Handle(index)); + } + public void print() throws IOException { System.out.println("PRINTOUT of table, length=" + size()); byte[][] row; for (int i = 0; i < size(); i++) { System.out.print("row " + i + ": "); row = get(i); - for (int j = 0; j < columns(); j++) System.out.print(((row[j] == null) ? "NULL" : new String(row[j])) + ", "); + for (int j = 0; j < columns(); j++) System.out.print(((row[j] == null) ? 
"NULL" : new String(row[j], "UTF-8")) + ", "); System.out.println(); } System.out.println("EndOfTable"); @@ -160,6 +174,20 @@ public class kelondroArray extends kelondroRecords { fm.set(Integer.parseInt(args[2]), row); fm.close(); } else + if ((args.length == 3) && (args[0].equals("-a"))) { + // add + kelondroArray fm = new kelondroArray(new File(args[1])); + byte[][] row = new byte[][] { args[2].getBytes() }; + int index = fm.add(row); + System.out.println("Added to row " + index); + fm.close(); + } else + if ((args.length == 3) && (args[0].equals("-d"))) { + // delete + kelondroArray fm = new kelondroArray(new File(args[1])); + fm.remove(Integer.parseInt(args[2])); + fm.close(); + } else if ((args.length == 1) && (args[0].equals("-test"))) { File testfile = new File("test.array"); if (testfile.exists()) testfile.delete(); diff --git a/source/de/anomic/kelondro/kelondroBase64Order.java b/source/de/anomic/kelondro/kelondroBase64Order.java index c586ceb58..a81e4fa0b 100644 --- a/source/de/anomic/kelondro/kelondroBase64Order.java +++ b/source/de/anomic/kelondro/kelondroBase64Order.java @@ -52,6 +52,19 @@ public class kelondroBase64Order extends kelondroAbstractOrder implements kelond private static final char[] alpha_standard = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/".toCharArray(); private static final char[] alpha_enhanced = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_".toCharArray(); + private static final byte[] ahpla_standard = new byte[256]; + private static final byte[] ahpla_enhanced = new byte[256]; + + static { + for (int i = 0; i < 256; i++) { + ahpla_standard[i] = -1; + ahpla_enhanced[i] = -1; + } + for (int i = 0; i < alpha_standard.length; i++) { + ahpla_standard[alpha_standard[i]] = (byte) i; + ahpla_enhanced[alpha_enhanced[i]] = (byte) i; + } + } public static final kelondroBase64Order standardCoder = new kelondroBase64Order(true); public static final kelondroBase64Order enhancedCoder = new 
kelondroBase64Order(false); @@ -59,15 +72,14 @@ public class kelondroBase64Order extends kelondroAbstractOrder implements kelond final boolean rfc1113compliant; private final char[] alpha; - private final byte[] ahpla = new byte[256]; + private final byte[] ahpla; public kelondroBase64Order(boolean rfc1113compliant) { // if we choose not to be rfc1113compliant, // then we get shorter base64 results which are also filename-compatible this.rfc1113compliant = rfc1113compliant; alpha = (rfc1113compliant) ? alpha_standard : alpha_enhanced; - for (int i = 0; i < 256; i++) ahpla[i] = -1; - for (int i = 0; i < alpha.length; i++) ahpla[alpha[i]] = (byte) i; + ahpla = (rfc1113compliant) ? ahpla_standard : ahpla_enhanced; } public char encodeByte(byte b) { @@ -143,7 +155,8 @@ public class kelondroBase64Order extends kelondroAbstractOrder implements kelond public String decodeString(String in) { try { - return new String(decode(in), "ISO-8859-1"); + //return new String(decode(in), "ISO-8859-1"); + return new String(decode(in), "UTF-8"); } catch (java.io.UnsupportedEncodingException e) { System.out.println("internal error in base64: " + e.getMessage()); return null; diff --git a/source/de/anomic/kelondro/kelondroHashtable.java b/source/de/anomic/kelondro/kelondroHashtable.java index 6dd82c473..62f60fb75 100644 --- a/source/de/anomic/kelondro/kelondroHashtable.java +++ b/source/de/anomic/kelondro/kelondroHashtable.java @@ -230,7 +230,7 @@ public class kelondroHashtable { rowNumber = hash.node(); if (rowNumber >= hashArray.size()) return new Object[]{new Integer(rowNumber), null}; row = hashArray.get(rowNumber); - rowKey = (int) kelondroBase64Order.enhancedCoder.decodeLong(new String(row[0])); + rowKey = (int) kelondroBase64Order.enhancedCoder.decodeLong(new String(row[0], "UTF-8")); if (rowKey == 0) return new Object[]{new Integer(rowNumber), null}; hash.rehash(); } while (rowKey != hash.key()); diff --git a/source/de/anomic/kelondro/kelondroNaturalOrder.java 
b/source/de/anomic/kelondro/kelondroNaturalOrder.java index 276029d5c..d7b29cb7b 100644 --- a/source/de/anomic/kelondro/kelondroNaturalOrder.java +++ b/source/de/anomic/kelondro/kelondroNaturalOrder.java @@ -68,6 +68,10 @@ public class kelondroNaturalOrder extends kelondroAbstractOrder implements kelon // two arrays are also equal if one array is a subset of the other's array // with filled-up char(0)-values public int compare(byte[] a, byte[] b) { + return compares(a, b); + } + + public static final int compares(byte[] a, byte[] b) { int i = 0; final int al = a.length; final int bl = b.length; diff --git a/source/de/anomic/kelondro/kelondroRecords.java b/source/de/anomic/kelondro/kelondroRecords.java index d006034e0..ea97298c2 100644 --- a/source/de/anomic/kelondro/kelondroRecords.java +++ b/source/de/anomic/kelondro/kelondroRecords.java @@ -790,7 +790,7 @@ public class kelondroRecords { if (h == null) s = s + ":hNULL"; else s = s + ":h" + h.toString(); } byte[][] content = getValues(); - for (int i = 0; i < content.length; i++) s = s + ":" + ((content[i] == null) ? "NULL" : (new String(content[i])).trim()); + for (int i = 0; i < content.length; i++) s = s + ":" + ((content[i] == null) ? 
"NULL" : (new String(content[i], "UTF-8")).trim()); } catch (IOException e) { s = s + ":***LOAD ERROR***:" + e.getMessage(); } diff --git a/source/de/anomic/plasma/parser/odt/odtParser.java b/source/de/anomic/plasma/parser/odt/odtParser.java index 3b9708472..f029df9d7 100644 --- a/source/de/anomic/plasma/parser/odt/odtParser.java +++ b/source/de/anomic/plasma/parser/odt/odtParser.java @@ -130,11 +130,11 @@ public class odtParser extends AbstractParser implements Parser { if (docShortTitle != null) { docLongTitle = docShortTitle; } else if (docContent.length <= 80) { - docLongTitle = new String(docContent); + docLongTitle = new String(docContent, "UTF-8"); } else { byte[] title = new byte[80]; System.arraycopy(docContent, 0, title, 0, 80); - docLongTitle = new String(title); + docLongTitle = new String(title, "UTF-8"); } docLongTitle. replaceAll("\r\n"," "). diff --git a/source/de/anomic/plasma/parser/pdf/pdfParser.java b/source/de/anomic/plasma/parser/pdf/pdfParser.java index adb468347..09e6fd572 100644 --- a/source/de/anomic/plasma/parser/pdf/pdfParser.java +++ b/source/de/anomic/plasma/parser/pdf/pdfParser.java @@ -127,7 +127,7 @@ public class pdfParser extends AbstractParser implements Parser { out = null; if ((docTitle == null) || (docTitle.length() == 0)) { - docTitle = ((contents.length > 80)? new String(contents, 0, 80):new String(contents)). + docTitle = ((contents.length > 80)? new String(contents, 0, 80, "UTF-8"):new String(contents, "UTF-8")). replaceAll("\r\n"," "). replaceAll("\n"," "). replaceAll("\r"," "). 
diff --git a/source/de/anomic/plasma/plasmaCrawlEURL.java b/source/de/anomic/plasma/plasmaCrawlEURL.java index 6f79bbf41..3a55e8ca6 100644 --- a/source/de/anomic/plasma/plasmaCrawlEURL.java +++ b/source/de/anomic/plasma/plasmaCrawlEURL.java @@ -171,15 +171,15 @@ public class plasmaCrawlEURL extends plasmaURL { this.hash = hash; byte[][] entry = urlHashCache.get(hash.getBytes()); if (entry != null) { - this.referrer = new String(entry[1]); - this.initiator = new String(entry[2]); - this.executor = new String(entry[3]); - this.url = new URL(new String(entry[4]).trim()); - this.name = new String(entry[5]).trim(); - this.initdate = new Date(86400000 * kelondroBase64Order.enhancedCoder.decodeLong(new String(entry[6]))); - this.trydate = new Date(86400000 * kelondroBase64Order.enhancedCoder.decodeLong(new String(entry[7]))); - this.trycount = (int) kelondroBase64Order.enhancedCoder.decodeLong(new String(entry[8])); - this.failreason = new String(entry[9]); + this.referrer = new String(entry[1], "UTF-8"); + this.initiator = new String(entry[2], "UTF-8"); + this.executor = new String(entry[3], "UTF-8"); + this.url = new URL(new String(entry[4], "UTF-8").trim()); + this.name = new String(entry[5], "UTF-8").trim(); + this.initdate = new Date(86400000 * kelondroBase64Order.enhancedCoder.decodeLong(new String(entry[6], "UTF-8"))); + this.trydate = new Date(86400000 * kelondroBase64Order.enhancedCoder.decodeLong(new String(entry[7], "UTF-8"))); + this.trycount = (int) kelondroBase64Order.enhancedCoder.decodeLong(new String(entry[8], "UTF-8")); + this.failreason = new String(entry[9], "UTF-8"); this.flags = new bitfield(entry[10]); return; } diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java index 22bcb772e..f85ea0b75 100644 --- a/source/de/anomic/plasma/plasmaCrawlLURL.java +++ b/source/de/anomic/plasma/plasmaCrawlLURL.java @@ -475,18 +475,18 @@ public final class plasmaCrawlLURL extends plasmaURL { if (entry == null) throw 
new IOException("url hash " + urlHash + " not found in LURL"); try { if (entry != null) { - this.url = new URL(new String(entry[1]).trim()); - this.descr = (entry[2] == null) ? this.url.toString() : new String(entry[2]).trim(); - this.moddate = new Date(86400000 * kelondroBase64Order.enhancedCoder.decodeLong(new String(entry[3]))); - this.loaddate = new Date(86400000 * kelondroBase64Order.enhancedCoder.decodeLong(new String(entry[4]))); - this.referrerHash = (entry[5] == null) ? dummyHash : new String(entry[5]); - this.copyCount = (int) kelondroBase64Order.enhancedCoder.decodeLong(new String(entry[6])); - this.flags = new String(entry[7]); - this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(new String(entry[8])); - this.language = new String(entry[9]); + this.url = new URL(new String(entry[1], "UTF-8").trim()); + this.descr = (entry[2] == null) ? this.url.toString() : new String(entry[2], "UTF-8").trim(); + this.moddate = new Date(86400000 * kelondroBase64Order.enhancedCoder.decodeLong(new String(entry[3], "UTF-8"))); + this.loaddate = new Date(86400000 * kelondroBase64Order.enhancedCoder.decodeLong(new String(entry[4], "UTF-8"))); + this.referrerHash = (entry[5] == null) ? 
dummyHash : new String(entry[5], "UTF-8"); + this.copyCount = (int) kelondroBase64Order.enhancedCoder.decodeLong(new String(entry[6], "UTF-8")); + this.flags = new String(entry[7], "UTF-8"); + this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(new String(entry[8], "UTF-8")); + this.language = new String(entry[9], "UTF-8"); this.doctype = (char) entry[10][0]; - this.size = kelondroBase64Order.enhancedCoder.decodeLong(new String(entry[11])); - this.wordCount = (int) kelondroBase64Order.enhancedCoder.decodeLong(new String(entry[12])); + this.size = kelondroBase64Order.enhancedCoder.decodeLong(new String(entry[11], "UTF-8")); + this.wordCount = (int) kelondroBase64Order.enhancedCoder.decodeLong(new String(entry[12], "UTF-8")); this.snippet = null; return; } diff --git a/source/de/anomic/plasma/plasmaCrawlNURL.java b/source/de/anomic/plasma/plasmaCrawlNURL.java index e559bb077..817490f74 100644 --- a/source/de/anomic/plasma/plasmaCrawlNURL.java +++ b/source/de/anomic/plasma/plasmaCrawlNURL.java @@ -189,43 +189,43 @@ public class plasmaCrawlNURL extends plasmaURL { Iterator i; try { i = coreStack.iterator(); - while (i.hasNext()) stackIndex.add(new String((byte[]) i.next())); + while (i.hasNext()) stackIndex.add(new String((byte[]) i.next(), "UTF-8")); } catch (Exception e) { coreStack.reset(); } try { i = limitStack.iterator(); - while (i.hasNext()) stackIndex.add(new String((byte[]) i.next())); + while (i.hasNext()) stackIndex.add(new String((byte[]) i.next(), "UTF-8")); } catch (Exception e) { limitStack.reset(); } try { i = overhangStack.iterator(); - while (i.hasNext()) stackIndex.add(new String((byte[]) i.next())); + while (i.hasNext()) stackIndex.add(new String((byte[]) i.next(), "UTF-8")); } catch (Exception e) { overhangStack.reset(); } try { i = remoteStack.iterator(); - while (i.hasNext()) stackIndex.add(new String((byte[]) i.next())); + while (i.hasNext()) stackIndex.add(new String((byte[]) i.next(), "UTF-8")); } catch (Exception e) { 
remoteStack.reset(); } try { i = imageStack.iterator(); - while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey())); + while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey(), "UTF-8")); } catch (Exception e) { imageStack = kelondroStack.reset(imageStack); } try { i = movieStack.iterator(); - while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey())); + while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey(), "UTF-8")); } catch (Exception e) { movieStack = kelondroStack.reset(movieStack); } try { i = musicStack.iterator(); - while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey())); + while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey(), "UTF-8")); } catch (Exception e) { musicStack = kelondroStack.reset(musicStack); } diff --git a/source/de/anomic/plasma/plasmaCrawlStacker.java b/source/de/anomic/plasma/plasmaCrawlStacker.java index 1dc1a71f1..9eac20cef 100644 --- a/source/de/anomic/plasma/plasmaCrawlStacker.java +++ b/source/de/anomic/plasma/plasmaCrawlStacker.java @@ -401,17 +401,17 @@ public final class plasmaCrawlStacker { try { this.urlHash = urlHash; - this.initiator = new String(entryBytes[1]); - this.url = new String(entryBytes[2]).trim(); - this.referrerHash = (entryBytes[3]==null) ? plasmaURL.dummyHash : new String(entryBytes[3]); - this.name = (entryBytes[4] == null) ? "" : new String(entryBytes[4]).trim(); - this.loaddate = new Date(86400000 * kelondroBase64Order.enhancedCoder.decodeLong(new String(entryBytes[5]))); - this.profileHandle = (entryBytes[6] == null) ? 
null : new String(entryBytes[6]).trim(); - this.depth = (int) kelondroBase64Order.enhancedCoder.decodeLong(new String(entryBytes[7])); - this.anchors = (int) kelondroBase64Order.enhancedCoder.decodeLong(new String(entryBytes[8])); - this.forkfactor = (int) kelondroBase64Order.enhancedCoder.decodeLong(new String(entryBytes[9])); + this.initiator = new String(entryBytes[1], "UTF-8"); + this.url = new String(entryBytes[2], "UTF-8").trim(); + this.referrerHash = (entryBytes[3]==null) ? plasmaURL.dummyHash : new String(entryBytes[3], "UTF-8"); + this.name = (entryBytes[4] == null) ? "" : new String(entryBytes[4], "UTF-8").trim(); + this.loaddate = new Date(86400000 * kelondroBase64Order.enhancedCoder.decodeLong(new String(entryBytes[5], "UTF-8"))); + this.profileHandle = (entryBytes[6] == null) ? null : new String(entryBytes[6], "UTF-8").trim(); + this.depth = (int) kelondroBase64Order.enhancedCoder.decodeLong(new String(entryBytes[7], "UTF-8")); + this.anchors = (int) kelondroBase64Order.enhancedCoder.decodeLong(new String(entryBytes[8], "UTF-8")); + this.forkfactor = (int) kelondroBase64Order.enhancedCoder.decodeLong(new String(entryBytes[9], "UTF-8")); this.flags = new bitfield(entryBytes[10]); - this.handle = Integer.parseInt(new String(entryBytes[11])); + this.handle = Integer.parseInt(new String(entryBytes[11], "UTF-8")); } catch (Exception e) { e.printStackTrace(); throw new IllegalStateException(); diff --git a/source/de/anomic/plasma/plasmaSwitchboardQueue.java b/source/de/anomic/plasma/plasmaSwitchboardQueue.java index dfc6af4e1..7c5dd713e 100644 --- a/source/de/anomic/plasma/plasmaSwitchboardQueue.java +++ b/source/de/anomic/plasma/plasmaSwitchboardQueue.java @@ -201,21 +201,21 @@ public class plasmaSwitchboardQueue { this.referrerURL = null; } - public Entry(byte[][] row) { - long ims = (row[2] == null) ? 0 : kelondroBase64Order.enhancedCoder.decodeLong(new String(row[2])); + public Entry(byte[][] row) throws IOException { + long ims = (row[2] == null) ? 
0 : kelondroBase64Order.enhancedCoder.decodeLong(new String(row[2], "UTF-8")); byte flags = (row[3] == null) ? 0 : row[3][0]; try { - this.url = new URL(new String(row[0])); + this.url = new URL(new String(row[0], "UTF-8")); } catch (MalformedURLException e) { this.url = null; } - this.referrerHash = (row[1] == null) ? null : new String(row[1]); + this.referrerHash = (row[1] == null) ? null : new String(row[1], "UTF-8"); this.ifModifiedSince = (ims == 0) ? null : new Date(ims); this.flags = ((flags & 1) == 1) ? (byte) 1 : (byte) 0; - this.initiator = (row[4] == null) ? null : new String(row[4]); - this.depth = (int) kelondroBase64Order.enhancedCoder.decodeLong(new String(row[5])); - this.profileHandle = new String(row[6]); - this.anchorName = (row[7] == null) ? null : (new String(row[7])).trim(); + this.initiator = (row[4] == null) ? null : new String(row[4], "UTF-8"); + this.depth = (int) kelondroBase64Order.enhancedCoder.decodeLong(new String(row[5], "UTF-8")); + this.profileHandle = new String(row[6], "UTF-8"); + this.anchorName = (row[7] == null) ? 
null : (new String(row[7], "UTF-8")).trim(); this.profileEntry = null; this.responseHeader = null; diff --git a/source/de/anomic/plasma/plasmaWordConnotation.java b/source/de/anomic/plasma/plasmaWordConnotation.java index 77dfb1c40..81bb96025 100644 --- a/source/de/anomic/plasma/plasmaWordConnotation.java +++ b/source/de/anomic/plasma/plasmaWordConnotation.java @@ -73,7 +73,7 @@ public class plasmaWordConnotation { //reference = reference.toLowerCase(); byte[][] record = refDB.get(word, reference.getBytes()); long c; - if (record == null) c = 0; else c = kelondroBase64Order.enhancedCoder.decodeLong(new String(record[1])); + if (record == null) c = 0; else c = kelondroBase64Order.enhancedCoder.decodeLong(new String(record[1], "UTF-8")); record[1] = kelondroBase64Order.enhancedCoder.encodeLong(c++, countlength).getBytes(); refDB.put(word, record); } diff --git a/source/de/anomic/plasma/plasmaWordIndexAssortment.java b/source/de/anomic/plasma/plasmaWordIndexAssortment.java index 7466f86ed..f6aea3800 100644 --- a/source/de/anomic/plasma/plasmaWordIndexAssortment.java +++ b/source/de/anomic/plasma/plasmaWordIndexAssortment.java @@ -200,8 +200,7 @@ public final class plasmaWordIndexAssortment { for (int i = 0; i < assortmentLength; i++) { container.add( new plasmaWordIndexEntry[] { new plasmaWordIndexEntry( - new String(row[3 + 2 * i]), new String( - row[4 + 2 * i])) }, updateTime); + new String(row[3 + 2 * i]), new String(row[4 + 2 * i])) }, updateTime); } return container; } diff --git a/source/de/anomic/plasma/plasmaWordIndexCache.java b/source/de/anomic/plasma/plasmaWordIndexCache.java index 1aa645af2..f06f0b036 100644 --- a/source/de/anomic/plasma/plasmaWordIndexCache.java +++ b/source/de/anomic/plasma/plasmaWordIndexCache.java @@ -193,9 +193,9 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { // get out one entry row = dumpArray.get(i); if ((row[0] == null) || (row[1] == null) || (row[2] == null) || (row[3] == null) || (row[4] == 
null)) continue; - wordHash = new String(row[0]); + wordHash = new String(row[0], "UTF-8"); creationTime = kelondroRecords.bytes2long(row[2]); - wordEntry = new plasmaWordIndexEntry(new String(row[3]), new String(row[4])); + wordEntry = new plasmaWordIndexEntry(new String(row[3], "UTF-8"), new String(row[4], "UTF-8")); // store to cache addEntry(wordHash, wordEntry, creationTime); urlCount++; diff --git a/source/de/anomic/server/serverFileUtils.java b/source/de/anomic/server/serverFileUtils.java index 8869e7baf..3b2c42324 100644 --- a/source/de/anomic/server/serverFileUtils.java +++ b/source/de/anomic/server/serverFileUtils.java @@ -241,7 +241,7 @@ public final class serverFileUtils { public static Set loadSet(File file, String sep, boolean tree) throws IOException { Set set = (tree) ? (Set) new TreeSet() : (Set) new HashSet(); byte[] b = read(file); - StringTokenizer st = new StringTokenizer(new String(b), sep); + StringTokenizer st = new StringTokenizer(new String(b, "UTF-8"), sep); while (st.hasMoreTokens()) { set.add(st.nextToken()); } diff --git a/source/de/anomic/soap/httpdSoapHandler.java b/source/de/anomic/soap/httpdSoapHandler.java index 42aeb55bc..eba277248 100644 --- a/source/de/anomic/soap/httpdSoapHandler.java +++ b/source/de/anomic/soap/httpdSoapHandler.java @@ -468,7 +468,7 @@ public final class httpdSoapHandler extends httpdAbstractHandler implements http if (templates[i].endsWith(".template")) try { //System.out.println("TEMPLATE " + templates[i].substring(0, templates[i].length() - 9) + ": " + new String(buf, 0, c)); result.put(templates[i].substring(0, templates[i].length() - 9), - new String(serverFileUtils.read(new File(path, templates[i])))); + new String(serverFileUtils.read(new File(path, templates[i])), "UTF-8")); } catch (Exception e) {} } return result; diff --git a/source/de/anomic/soap/httpdSoapService.java b/source/de/anomic/soap/httpdSoapService.java index bc1987c8b..fffa1b5bf 100644 --- a/source/de/anomic/soap/httpdSoapService.java 
+++ b/source/de/anomic/soap/httpdSoapService.java @@ -232,7 +232,7 @@ public class httpdSoapService // convert it into a byte array and send it back as result byte[] result = o.toByteArray(); - return new String(result); + return new String(result, "UTF-8"); } catch (Exception e) { throw new AxisFault(e.getMessage()); } diff --git a/source/de/anomic/tools/cryptbig.java b/source/de/anomic/tools/cryptbig.java index 1c0f686f3..d1d999c7e 100644 --- a/source/de/anomic/tools/cryptbig.java +++ b/source/de/anomic/tools/cryptbig.java @@ -164,7 +164,7 @@ public class cryptbig { if (b64dec == null) return null; // error in input string (inconsistency) byte[] dec = decryptArray(b64dec); if (dec == null) return null; - return new String(dec, "UTF8"); + return new String(dec, "UTF-8"); } catch (UnsupportedEncodingException e) { } return null; diff --git a/source/de/anomic/tools/gzip.java b/source/de/anomic/tools/gzip.java index e1a94807c..31216247c 100644 --- a/source/de/anomic/tools/gzip.java +++ b/source/de/anomic/tools/gzip.java @@ -105,7 +105,7 @@ public class gzip { copy(fout, fin, 128); fin.close(); fout.close(); - return new String(fout.toByteArray(), "UTF8"); + return new String(fout.toByteArray(), "UTF-8"); } catch (IOException e) { System.err.println("ERROR: IO trouble"); return null; diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index b8f7c0cb0..1e4af3f96 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -44,6 +44,7 @@ package de.anomic.yacy; import java.io.IOException; +import java.io.UnsupportedEncodingException; import java.net.URL; import java.util.ArrayList; import java.util.Date; @@ -550,8 +551,11 @@ public final class yacyClient { post.put("youare", targetHash); post.put("subject", subject); post.put(yacySeed.MYTIME, yacyCore.universalDateShortString(new Date())); - post.put("message", new String(message)); - + try { + post.put("message", new String(message, "UTF-8")); 
+ } catch (UnsupportedEncodingException e) { + post.put("message", new String(message)); + } // get target address String address = targetAddress(targetHash); diff --git a/source/yacy.java b/source/yacy.java index d0989b2ce..03f316e53 100644 --- a/source/yacy.java +++ b/source/yacy.java @@ -1165,7 +1165,7 @@ public final class yacy { entry = (plasmaCrawlLURL.Entry) eiter.next(); if ((entry != null) && (entry.url() != null)) { if (html) { - bos.write(("" + entry.descr() + "
").getBytes()); + bos.write(("" + entry.descr() + "
").getBytes("UTF-8")); bos.write(serverCore.crlf); } else { bos.write(entry.url().toString().getBytes());