*) some TODO markers for UTF-8 problem

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2586 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 19 years ago
parent b5904705ab
commit b0e8ff6eda

@ -148,14 +148,14 @@ public class rssParser extends AbstractParser implements Parser {
anchors.put(itemURL.toString(),itemTitle); anchors.put(itemURL.toString(),itemTitle);
if ((text.length() != 0) && (text.byteAt(text.length() - 1) != 32)) text.append((byte) 32); if ((text.length() != 0) && (text.byteAt(text.length() - 1) != 32)) text.append((byte) 32);
text.append(new serverByteBuffer(htmlFilterAbstractScraper.stripAll(new serverByteBuffer(itemDescr.getBytes()))).trim()).append((byte) ' '); text.append(new serverByteBuffer(htmlFilterAbstractScraper.stripAll(new serverByteBuffer(itemDescr.getBytes("UTF-8")))).trim()).append((byte) ' '); // TODO: this does not work for utf-8
String itemContent = item.getElementValue("content"); String itemContent = item.getElementValue("content");
if ((itemContent != null) && (itemContent.length() > 0)) { if ((itemContent != null) && (itemContent.length() > 0)) {
htmlFilterContentScraper scraper = new htmlFilterContentScraper(itemURL); htmlFilterContentScraper scraper = new htmlFilterContentScraper(itemURL);
OutputStream os = new htmlFilterOutputStream(null, scraper, null, false); OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
serverFileUtils.copy(new ByteArrayInputStream(itemContent.getBytes()), os); serverFileUtils.copy(new ByteArrayInputStream(itemContent.getBytes("UTF-8")), os);
String itemHeadline = scraper.getTitle(); String itemHeadline = scraper.getTitle();
if ((itemHeadline != null) && (itemHeadline.length() > 0)) { if ((itemHeadline != null) && (itemHeadline.length() > 0)) {

@ -146,6 +146,7 @@ import de.anomic.kelondro.kelondroMapTable;
import de.anomic.plasma.dbImport.dbImportManager; import de.anomic.plasma.dbImport.dbImportManager;
import de.anomic.plasma.urlPattern.plasmaURLPattern; import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverAbstractSwitch; import de.anomic.server.serverAbstractSwitch;
import de.anomic.server.serverByteBuffer;
import de.anomic.server.serverCodings; import de.anomic.server.serverCodings;
import de.anomic.server.serverDate; import de.anomic.server.serverDate;
import de.anomic.server.serverInstantThread; import de.anomic.server.serverInstantThread;
@ -1700,6 +1701,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
storageEndTime = System.currentTimeMillis(); storageEndTime = System.currentTimeMillis();
if (log.isInfo()) { if (log.isInfo()) {
// TODO: UTF-8 docDescription seems not to be displayed correctly because
// of string concatenation
log.logInfo("*Indexed " + words + " words in URL " + entry.url() + log.logInfo("*Indexed " + words + " words in URL " + entry.url() +
" [" + entry.urlHash() + "]" + " [" + entry.urlHash() + "]" +
"\n\tDescription: " + docDescription + "\n\tDescription: " + docDescription +

Loading…
Cancel
Save