*) Bugfix: re-enabled inheritance of serverCharBuffer from the Writer class

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2618 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 19 years ago
parent 97d2a08ef1
commit ad7f600f25

File diff suppressed because one or more lines are too long

@ -43,6 +43,9 @@
package de.anomic.htmlFilter;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.nio.charset.Charset;
@ -57,7 +60,9 @@ import java.util.Properties;
import java.util.TreeSet;
import de.anomic.net.URL;
import de.anomic.server.serverByteBuffer;
import de.anomic.server.serverCharBuffer;
import de.anomic.server.serverFileUtils;
public class htmlFilterContentScraper extends htmlFilterAbstractScraper implements htmlFilterScraper {
@ -113,6 +118,23 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
this.content = new serverCharBuffer(1024);
}
/**
 * Placeholder for re-encoding the scraped content from one charset to another.
 * <p>
 * Currently a no-op: the whole implementation below is commented out, so the
 * content buffer is left untouched and no exception is raised by this body.
 *
 * @param oldCharset charset name the disabled code would use to turn the content back into bytes (currently unused)
 * @param newCharset charset name the disabled code would use to decode those bytes again (currently unused)
 * @throws UnsupportedEncodingException declared for the disabled implementation; the current empty body never throws it
 */
public void transformCharset(String oldCharset, String newCharset) throws UnsupportedEncodingException {
// Disabled implementation, kept for reference: it would encode the current content
// with oldCharset, decode the resulting bytes with newCharset into a fresh buffer,
// and then replace this.content with that buffer.
// // convert the content back to the old bytearray
// ByteArrayInputStream temp = new ByteArrayInputStream(new String(this.content.getChars()).getBytes(oldCharset));
//
// // create a reader with the new charset
// serverCharBuffer newContent = new serverCharBuffer(this.content.length());
// try {
// InputStreamReader reader = new InputStreamReader(temp,newCharset);
// serverFileUtils.copy(reader, newContent);
// reader.close();
// } catch (IOException e) {
// // ignore this
// }
//
// this.content = newContent;
}
public void scrapeText(char[] newtext) {
// System.out.println("SCRAPE: " + new String(newtext));
if ((content.length() != 0) && (content.charAt(content.length() - 1) != 32)) content.append(32);
@ -248,8 +270,12 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
}
/**
 * Convenience overload that delegates to {@code getText(String)} with the
 * charset name {@code "UTF-8"}.
 *
 * @return whatever {@code getText("UTF-8")} returns
 */
public byte[] getText() {
    return getText("UTF-8");
}
public byte[] getText(String charSet) {
try {
return content.toString().getBytes("UTF-8");
return content.toString().getBytes(charSet);
} catch (UnsupportedEncodingException e) {
return content.toString().getBytes();
}

@ -46,6 +46,7 @@ package de.anomic.htmlFilter;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.text.Collator;
import java.util.ArrayList;
import java.util.Locale;
@ -112,6 +113,7 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer
}
private static char[] genBlueLetters(int length) {
try {
serverCharBuffer bb = new serverCharBuffer(" <FONT COLOR=#0000FF>".toCharArray());
length = length / 2;
if (length > 10) length = 7;
@ -120,6 +122,10 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer
}
bb.append("</FONT> ");
return bb.getChars();
} catch (IOException e) {
// ignore this.
return null;
}
}
private boolean bluelistHit(char[] text) {

@ -117,6 +117,7 @@ public final class htmlFilterWriter extends Writer {
}
public static char[] genTag0raw(String tagname, boolean opening, char[] tagopts) {
try {
serverCharBuffer bb = new serverCharBuffer(tagname.length() + tagopts.length + 3);
bb.append('<');
if (!opening) {
@ -130,9 +131,14 @@ public final class htmlFilterWriter extends Writer {
}
bb.append('>');
return bb.getChars();
} catch (IOException e) {
// ignore this
return null;
}
}
public static char[] genTag1raw(String tagname, char[] tagopts, char[] text) {
try {
serverCharBuffer bb = new serverCharBuffer(2 * tagname.length() + tagopts.length + text.length + 5);
bb.append('<').append(tagname);
if (tagopts.length > 0) {
@ -144,9 +150,14 @@ public final class htmlFilterWriter extends Writer {
bb.append(text);
bb.append('<').append('/').append(tagname).append('>');
return bb.getChars();
} catch (IOException e) {
// ignore this
return null;
}
}
public static char[] genTag0(String tagname, Properties tagopts, char quotechar) {
try {
char[] tagoptsx = (tagopts.size() == 0) ? null : genOpts(tagopts, quotechar);
serverCharBuffer bb = new serverCharBuffer(tagname.length() + ((tagoptsx == null) ? 0 : (tagoptsx.length + 1)) + tagname.length() + 2);
bb.append('<').append(tagname);
@ -156,17 +167,27 @@ public final class htmlFilterWriter extends Writer {
}
bb.append('>');
return bb.getChars();
} catch (IOException e) {
// ignore this
return null;
}
}
/**
 * Builds a complete element: the opening tag produced by {@code genTag0},
 * followed by the given text and the closing tag {@code </tagname>}.
 *
 * @param tagname   name of the tag; used for the closing tag and passed to {@code genTag0}
 * @param tagopts   tag properties, passed through to {@code genTag0}
 * @param text      characters placed between the opening and the closing tag
 * @param quotechar passed through to {@code genTag0}
 * @return the assembled characters, or {@code null} if an {@code IOException} is caught while building them
 */
public static char[] genTag1(String tagname, Properties tagopts, char[] text, char quotechar) {
try {
// NOTE(review): genTag0 returns null when it catches an IOException; in that case
// gt0.length below would throw a NullPointerException - confirm whether that can happen.
char[] gt0 = genTag0(tagname, tagopts, quotechar);
// Second constructor argument is presumably the initial capacity (TODO confirm):
// opening tag + text + tagname + 3 for the '<', '/' and '>' of the closing tag.
serverCharBuffer cb = new serverCharBuffer(gt0, gt0.length + text.length + tagname.length() + 3);
cb.append(text).append('<').append('/').append(tagname).append('>');
return cb.getChars();
} catch (IOException e) {
// ignore this
return null;
}
}
// a helper method for pretty-printing of properties for html tags
public static char[] genOpts(Properties prop, char quotechar) {
try {
Enumeration e = prop.propertyNames();
serverCharBuffer bb = new serverCharBuffer(prop.size() * 40);
String key;
@ -178,6 +199,10 @@ public final class htmlFilterWriter extends Writer {
}
if (bb.length() > 0) return bb.getChars(1);
return bb.getChars();
}catch (IOException e) {
// ignore this
return null;
}
}
private char[] filterTag(String tag, boolean opening, char[] content, char quotechar) {

@ -598,6 +598,14 @@ public final class plasmaParser {
public plasmaParserDocument transformScraper(URL location, String mimeType, String charSet, htmlFilterContentScraper scraper) {
try {
if (scraper.getMetas().containsKey("content-type")) {
String newCharset = (String) scraper.getMetas().get("content-type");
if (!charSet.equals(newCharset)) {
// TODO: transformation of content needed
this.theLogger.logFine("Charset transformation needed from '" + charSet + "' to '" + newCharset + "'");
}
}
String[] sections = new String[scraper.getHeadlines(1).length + scraper.getHeadlines(2).length + scraper.getHeadlines(3).length + scraper.getHeadlines(4).length];
int p = 0;
for (int i = 1; i <= 4; i++) for (int j = 0; j < scraper.getHeadlines(i).length; j++) sections[p++] = scraper.getHeadlines(i)[j];

@ -50,7 +50,7 @@ import java.io.IOException;
import java.io.Writer;
import java.util.Properties;
public final class serverCharBuffer /* extends Writer */ {
public final class serverCharBuffer extends Writer {
public static final char singlequote = '\'';
public static final char doublequote = '"';
@ -164,10 +164,10 @@ public final class serverCharBuffer /* extends Writer */ {
length += le;
}
/**
 * Appends a single character by delegating to {@code write(b)}.
 *
 * @param b the character to append
 * @return this buffer, so that calls can be chained
 */
public serverCharBuffer append(char b) {
write(b);
return this;
}
// public serverCharBuffer append(char b) {
// write(b);
// return this;
// }
public serverCharBuffer append(int i) {
write((char) (i));

Loading…
Cancel
Save