*) Bugfix: re-enabled inheritance of serverCharBuffer from the Writer class

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2618 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 19 years ago
parent 97d2a08ef1
commit ad7f600f25

File diff suppressed because one or more lines are too long

@ -43,6 +43,9 @@
package de.anomic.htmlFilter; package de.anomic.htmlFilter;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException; import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.nio.charset.Charset; import java.nio.charset.Charset;
@ -57,7 +60,9 @@ import java.util.Properties;
import java.util.TreeSet; import java.util.TreeSet;
import de.anomic.net.URL; import de.anomic.net.URL;
import de.anomic.server.serverByteBuffer;
import de.anomic.server.serverCharBuffer; import de.anomic.server.serverCharBuffer;
import de.anomic.server.serverFileUtils;
public class htmlFilterContentScraper extends htmlFilterAbstractScraper implements htmlFilterScraper { public class htmlFilterContentScraper extends htmlFilterAbstractScraper implements htmlFilterScraper {
@ -113,6 +118,23 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
this.content = new serverCharBuffer(1024); this.content = new serverCharBuffer(1024);
} }
/**
 * Placeholder for re-decoding the scraped content with a different charset.
 *
 * The draft implementation below is commented out, so this method currently
 * does nothing and leaves <code>this.content</code> untouched. The draft
 * encodes the content to bytes using <code>oldCharset</code> and reads those
 * bytes back through a reader that uses <code>newCharset</code>.
 *
 * @param oldCharset charset name the draft uses to turn the content back into bytes (unused while disabled)
 * @param newCharset charset name the draft uses to decode those bytes again (unused while disabled)
 * @throws UnsupportedEncodingException declared for the disabled draft; the empty body never throws it
 */
public void transformCharset(String oldCharset, String newCharset) throws UnsupportedEncodingException {
// // convert the content back to the old bytearray
// ByteArrayInputStream temp = new ByteArrayInputStream(new String(this.content.getChars()).getBytes(oldCharset));
//
// // create a reader with the new charset
// serverCharBuffer newContent = new serverCharBuffer(this.content.length());
// try {
// InputStreamReader reader = new InputStreamReader(temp,newCharset);
// serverFileUtils.copy(reader, newContent);
// reader.close();
// } catch (IOException e) {
// // ignore this
// }
//
// this.content = newContent;
}
public void scrapeText(char[] newtext) { public void scrapeText(char[] newtext) {
// System.out.println("SCRAPE: " + new String(newtext)); // System.out.println("SCRAPE: " + new String(newtext));
if ((content.length() != 0) && (content.charAt(content.length() - 1) != 32)) content.append(32); if ((content.length() != 0) && (content.charAt(content.length() - 1) != 32)) content.append(32);
@ -246,10 +268,14 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
for (int j = 0; j < headlines[i - 1].size(); j++) s[j] = (String) headlines[i - 1].get(j); for (int j = 0; j < headlines[i - 1].size(); j++) s[j] = (String) headlines[i - 1].get(j);
return s; return s;
} }
public byte[] getText() { public byte[] getText() {
return this.getText("UTF-8");
}
public byte[] getText(String charSet) {
try { try {
return content.toString().getBytes("UTF-8"); return content.toString().getBytes(charSet);
} catch (UnsupportedEncodingException e) { } catch (UnsupportedEncodingException e) {
return content.toString().getBytes(); return content.toString().getBytes();
} }

@ -46,6 +46,7 @@ package de.anomic.htmlFilter;
import java.io.BufferedReader; import java.io.BufferedReader;
import java.io.File; import java.io.File;
import java.io.FileReader; import java.io.FileReader;
import java.io.IOException;
import java.text.Collator; import java.text.Collator;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Locale; import java.util.Locale;
@ -112,14 +113,19 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer
} }
private static char[] genBlueLetters(int length) { private static char[] genBlueLetters(int length) {
serverCharBuffer bb = new serverCharBuffer(" <FONT COLOR=#0000FF>".toCharArray()); try {
length = length / 2; serverCharBuffer bb = new serverCharBuffer(" <FONT COLOR=#0000FF>".toCharArray());
if (length > 10) length = 7; length = length / 2;
while (length-- > 0) { if (length > 10) length = 7;
bb.append('X'); while (length-- > 0) {
bb.append('X');
}
bb.append("</FONT> ");
return bb.getChars();
} catch (IOException e) {
// ignore this.
return null;
} }
bb.append("</FONT> ");
return bb.getChars();
} }
private boolean bluelistHit(char[] text) { private boolean bluelistHit(char[] text) {

@ -117,67 +117,92 @@ public final class htmlFilterWriter extends Writer {
} }
public static char[] genTag0raw(String tagname, boolean opening, char[] tagopts) { public static char[] genTag0raw(String tagname, boolean opening, char[] tagopts) {
serverCharBuffer bb = new serverCharBuffer(tagname.length() + tagopts.length + 3); try {
bb.append('<'); serverCharBuffer bb = new serverCharBuffer(tagname.length() + tagopts.length + 3);
if (!opening) { bb.append('<');
bb.append('/'); if (!opening) {
} bb.append('/');
bb.append(tagname); }
if (tagopts.length > 0) { bb.append(tagname);
// if (tagopts[0] == (byte) 32) if (tagopts.length > 0) {
bb.append(tagopts); // if (tagopts[0] == (byte) 32)
// else bb.append((byte) 32).append(tagopts); bb.append(tagopts);
// else bb.append((byte) 32).append(tagopts);
}
bb.append('>');
return bb.getChars();
} catch (IOException e) {
// ignore this
return null;
} }
bb.append('>');
return bb.getChars();
} }
public static char[] genTag1raw(String tagname, char[] tagopts, char[] text) { public static char[] genTag1raw(String tagname, char[] tagopts, char[] text) {
serverCharBuffer bb = new serverCharBuffer(2 * tagname.length() + tagopts.length + text.length + 5); try {
bb.append('<').append(tagname); serverCharBuffer bb = new serverCharBuffer(2 * tagname.length() + tagopts.length + text.length + 5);
if (tagopts.length > 0) { bb.append('<').append(tagname);
// if (tagopts[0] == (byte) 32) if (tagopts.length > 0) {
bb.append(tagopts); // if (tagopts[0] == (byte) 32)
// else bb.append((byte) 32).append(tagopts); bb.append(tagopts);
// else bb.append((byte) 32).append(tagopts);
}
bb.append('>');
bb.append(text);
bb.append('<').append('/').append(tagname).append('>');
return bb.getChars();
} catch (IOException e) {
// ignore this
return null;
} }
bb.append('>');
bb.append(text);
bb.append('<').append('/').append(tagname).append('>');
return bb.getChars();
} }
public static char[] genTag0(String tagname, Properties tagopts, char quotechar) { public static char[] genTag0(String tagname, Properties tagopts, char quotechar) {
char[] tagoptsx = (tagopts.size() == 0) ? null : genOpts(tagopts, quotechar); try {
serverCharBuffer bb = new serverCharBuffer(tagname.length() + ((tagoptsx == null) ? 0 : (tagoptsx.length + 1)) + tagname.length() + 2); char[] tagoptsx = (tagopts.size() == 0) ? null : genOpts(tagopts, quotechar);
bb.append('<').append(tagname); serverCharBuffer bb = new serverCharBuffer(tagname.length() + ((tagoptsx == null) ? 0 : (tagoptsx.length + 1)) + tagname.length() + 2);
if (tagoptsx != null) { bb.append('<').append(tagname);
bb.append(32); if (tagoptsx != null) {
bb.append(tagoptsx); bb.append(32);
} bb.append(tagoptsx);
bb.append('>'); }
return bb.getChars(); bb.append('>');
return bb.getChars();
} catch (IOException e) {
// ignore this
return null;
}
} }
public static char[] genTag1(String tagname, Properties tagopts, char[] text, char quotechar) { public static char[] genTag1(String tagname, Properties tagopts, char[] text, char quotechar) {
char[] gt0 = genTag0(tagname, tagopts, quotechar); try {
serverCharBuffer cb = new serverCharBuffer(gt0, gt0.length + text.length + tagname.length() + 3); char[] gt0 = genTag0(tagname, tagopts, quotechar);
cb.append(text).append('<').append('/').append(tagname).append('>'); serverCharBuffer cb = new serverCharBuffer(gt0, gt0.length + text.length + tagname.length() + 3);
return cb.getChars(); cb.append(text).append('<').append('/').append(tagname).append('>');
return cb.getChars();
} catch (IOException e) {
// ignore this
return null;
}
} }
// a helper method for pretty-printing of properties for html tags // a helper method for pretty-printing of properties for html tags
public static char[] genOpts(Properties prop, char quotechar) { public static char[] genOpts(Properties prop, char quotechar) {
Enumeration e = prop.propertyNames(); try {
serverCharBuffer bb = new serverCharBuffer(prop.size() * 40); Enumeration e = prop.propertyNames();
String key; serverCharBuffer bb = new serverCharBuffer(prop.size() * 40);
while (e.hasMoreElements()) { String key;
key = (String) e.nextElement(); while (e.hasMoreElements()) {
bb.append(32).append(key).append('=').append(quotechar); key = (String) e.nextElement();
bb.append(prop.getProperty(key)); bb.append(32).append(key).append('=').append(quotechar);
bb.append(quotechar); bb.append(prop.getProperty(key));
bb.append(quotechar);
}
if (bb.length() > 0) return bb.getChars(1);
return bb.getChars();
}catch (IOException e) {
// ignore this
return null;
} }
if (bb.length() > 0) return bb.getChars(1);
return bb.getChars();
} }
private char[] filterTag(String tag, boolean opening, char[] content, char quotechar) { private char[] filterTag(String tag, boolean opening, char[] content, char quotechar) {

@ -598,6 +598,14 @@ public final class plasmaParser {
public plasmaParserDocument transformScraper(URL location, String mimeType, String charSet, htmlFilterContentScraper scraper) { public plasmaParserDocument transformScraper(URL location, String mimeType, String charSet, htmlFilterContentScraper scraper) {
try { try {
if (scraper.getMetas().containsKey("content-type")) {
String newCharset = (String) scraper.getMetas().get("content-type");
if (!charSet.equals(newCharset)) {
// TODO: transformation of content needed
this.theLogger.logFine("Charset transformation needed from '" + charSet + "' to '" + newCharset + "'");
}
}
String[] sections = new String[scraper.getHeadlines(1).length + scraper.getHeadlines(2).length + scraper.getHeadlines(3).length + scraper.getHeadlines(4).length]; String[] sections = new String[scraper.getHeadlines(1).length + scraper.getHeadlines(2).length + scraper.getHeadlines(3).length + scraper.getHeadlines(4).length];
int p = 0; int p = 0;
for (int i = 1; i <= 4; i++) for (int j = 0; j < scraper.getHeadlines(i).length; j++) sections[p++] = scraper.getHeadlines(i)[j]; for (int i = 1; i <= 4; i++) for (int j = 0; j < scraper.getHeadlines(i).length; j++) sections[p++] = scraper.getHeadlines(i)[j];

@ -50,7 +50,7 @@ import java.io.IOException;
import java.io.Writer; import java.io.Writer;
import java.util.Properties; import java.util.Properties;
public final class serverCharBuffer /* extends Writer */ { public final class serverCharBuffer extends Writer {
public static final char singlequote = '\''; public static final char singlequote = '\'';
public static final char doublequote = '"'; public static final char doublequote = '"';
@ -164,10 +164,10 @@ public final class serverCharBuffer /* extends Writer */ {
length += le; length += le;
} }
public serverCharBuffer append(char b) { // public serverCharBuffer append(char b) {
write(b); // write(b);
return this; // return this;
} // }
public serverCharBuffer append(int i) { public serverCharBuffer append(int i) {
write((char) (i)); write((char) (i));

Loading…
Cancel
Save