parser refactoring & hacks

pull/1/head
Michael Peter Christen 13 years ago
parent 8a82609360
commit de903a53a0

@ -914,6 +914,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
this.port == other.port;
}
@Override
public int compareTo(final MultiProtocolURI h) {
    // The ordering is the natural String ordering of both toString() results.
    final String self = toString();
    final String other = h.toString();
    return self.compareTo(other);
}
@ -1842,7 +1843,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
* Please call isHTTP(), isHTTPS() and isFTP() before using this class
*/
public java.net.URL getURL() throws MalformedURLException {
// Only http, https and ftp URIs are converted; everything else is rejected by the guards below.
// NOTE(review): this text looks like a rendered commit diff whose +/- markers were stripped, so the
// two consecutive guards are presumably the removed and the added version of the same line.
// Read literally, the first guard throws UnsupportedOperationException before the second guard
// can ever throw MalformedURLException - confirm which of the two the real file keeps.
if (!(isHTTP() || isHTTPS() || isFTP())) throw new UnsupportedOperationException();
if (!(isHTTP() || isHTTPS() || isFTP())) throw new MalformedURLException();
// the java.net.URL is built from the normal form string of this URI
return new java.net.URL(this.toNormalform(false, true));
}
@ -1850,8 +1851,8 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
* create a standard java File.
* Please call isFile() before using this class
*/
public java.io.File getFSFile() {
// NOTE(review): this text looks like a rendered commit diff whose +/- markers were stripped; the
// header and guard directly below appear twice, presumably as the removed version (throws
// UnsupportedOperationException) followed by the added version (declares and throws
// MalformedURLException). Confirm against the repository which pair is current.
if (!isFile()) throw new UnsupportedOperationException();
public java.io.File getFSFile() throws MalformedURLException {
if (!isFile()) throw new MalformedURLException();
// substring(7) drops the first seven characters of the normal form - presumably the "file://"
// prefix; TODO confirm
return new java.io.File(this.toNormalform(false, true).substring(7));
}
@ -1861,7 +1862,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
* @throws MalformedURLException
*/
public SmbFile getSmbFile() throws MalformedURLException {
// NOTE(review): this text looks like a rendered commit diff whose +/- markers were stripped, so the
// two consecutive isSMB() guards are presumably the removed and the added version of the same line.
// Read literally, the first guard throws UnsupportedOperationException before the second guard
// can ever throw MalformedURLException - confirm which of the two the real file keeps.
if (!isSMB()) throw new UnsupportedOperationException();
if (!isSMB()) throw new MalformedURLException();
// the normal form is passed through unescape() before it is handed to the SmbFile constructor
final String url = unescape(this.toNormalform(false, true));
return new SmbFile(url);
}

@ -281,9 +281,9 @@ public final class TextParser {
assert !parsers.isEmpty();
Document[] docs = null;
final HashMap<Parser, Parser.Failure> failedParser = new HashMap<Parser, Parser.Failure>();
if (MemoryControl.request(sourceArray.length * 6, false)) {
for (final Parser parser: parsers) {
final Map<Parser, Parser.Failure> failedParser = new HashMap<Parser, Parser.Failure>();
for (final Parser parser: parsers) {
if (MemoryControl.request(sourceArray.length * 6, false)) {
ByteArrayInputStream bis;
if (mimeType.equals("text/plain") && parser.getName().equals("HTML Parser")) {
// a hack to simulate html files .. is needed for NOLOAD queues. This throws their data into virtual text/plain messages.

@ -36,6 +36,9 @@ import net.yacy.kelondro.util.MemoryControl;
public abstract class AbstractScraper implements Scraper {
protected static final String EMPTY_STRING = new String();
public static final char sp = ' ';
public static final char lb = '<';
public static final char rb = '>';
public static final char sl = '/';
@ -53,20 +56,25 @@ public abstract class AbstractScraper implements Scraper {
this.tags1 = tags1;
}
@Override
public boolean isTag0(final String tag) {
    // While tags0 is unset (null) no tag can match.
    if (this.tags0 == null) {
        return false;
    }
    // The lookup uses the lower-cased tag name.
    final String lowered = tag.toLowerCase();
    return this.tags0.contains(lowered);
}
@Override
public boolean isTag1(final String tag) {
    // While tags1 is unset (null) no tag can match.
    if (this.tags1 == null) {
        return false;
    }
    // The lookup uses the lower-cased tag name.
    final String lowered = tag.toLowerCase();
    return this.tags1.contains(lowered);
}
// The 'missing' method that every concrete scraper has to implement.
// It receives a character array of text together with a String named insideTag
// (presumably the name of the enclosing tag - confirm against the callers).
@Override
public abstract void scrapeText(char[] text, String insideTag);
// The other methods must be taken into account as well to construct the return value correctly.
// scrapeTag0 receives a tag name and its options as Properties.
@Override
public abstract void scrapeTag0(String tagname, Properties tagopts);
// scrapeTag1 receives a tag name, its options and additionally a character array of text.
@Override
public abstract void scrapeTag1(String tagname, Properties tagopts, char[] text);
protected static String stripAllTags(final char[] s) {
@ -76,7 +84,7 @@ public abstract class AbstractScraper implements Scraper {
for (final char c : s) {
if (c == lb) {
bc++;
r.append(' ');
if (r.length() > 0 && r.charAt(r.length() - 1) != sp) r.append(sp);
} else if (c == rb) {
bc--;
} else if (bc <= 0) {
@ -86,16 +94,42 @@ public abstract class AbstractScraper implements Scraper {
return r.toString().trim();
}
/**
 * Normalizes the whitespace of a line: every character below the space character is treated
 * as a space, runs of spaces are collapsed into a single space and the result is trimmed.
 *
 * @param s the line to clean
 * @return the cleaned line, or EMPTY_STRING when MemoryControl does not grant the requested memory
 */
protected final static String cleanLine(final String s) {
    final int length = s.length();
    if (!MemoryControl.request(length * 2, false)) return EMPTY_STRING;

    final StringBuilder cleaned = new StringBuilder(length);
    // starts as true so that leading spaces are not copied at all
    boolean previousWasSpace = true;
    for (int pos = 0; pos < length; pos++) {
        final char raw = s.charAt(pos);
        // control characters count as plain spaces
        final char current = (raw < ' ') ? ' ' : raw;
        final boolean isSpace = current == ' ';
        // copy everything except a space that directly follows another space
        if (!isSpace || !previousWasSpace) cleaned.append(current);
        previousWasSpace = isSpace;
    }
    return cleaned.toString().trim();
}
public static String stripAll(final char[] s) {
    // first run stripAllTags over the input, then hand the result to CharacterCoding.html2unicode
    final String withoutTags = stripAllTags(s);
    return CharacterCoding.html2unicode(withoutTags);
}
@Override
public void close() {
    // release the references to both tag collections
    this.tags1 = null;
    this.tags0 = null;
}
public static void main(String[] args) {
    // manual check: run stripAllTags over two script elements and print the result in single quotes
    final char[] markup = "<script src=\"navigation.js\" type=\"text/javascript\"></script>\\n <script src=\"../js/prototype.js\" type=\"text/javascript\"></script>".toCharArray();
    final String stripped = stripAllTags(markup);
    System.out.println("'" + stripped + "'");
}
}

@ -55,11 +55,9 @@ import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.ISO639;
import net.yacy.kelondro.util.MemoryControl;
public class ContentScraper extends AbstractScraper implements Scraper {
private static final String EMPTY_STRING = new String();
public static final int MAX_DOCSIZE = 40 * 1024 * 1024;
private final char degree = '\u00B0';
@ -364,7 +362,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
}
} else if (tagname.equalsIgnoreCase("area")) {
final String areatitle = cleanLine(tagopts.getProperty("title",EMPTY_STRING));
final String areatitle = cleanLine(tagopts.getProperty("title", EMPTY_STRING));
//String alt = tagopts.getProperty("alt",EMPTY_STRING);
final String href = tagopts.getProperty("href", EMPTY_STRING);
if (href.length() > 0) {
@ -539,26 +537,6 @@ public class ContentScraper extends AbstractScraper implements Scraper {
return line;
}
/**
 * Cleans a line: characters below the space character become spaces, consecutive spaces are
 * reduced to one and leading/trailing whitespace is removed.
 *
 * @param s the line to clean
 * @return the cleaned line, or EMPTY_STRING when MemoryControl does not grant the requested memory
 */
private final static String cleanLine(final String s) {
    final boolean granted = MemoryControl.request(s.length() * 2, false);
    if (!granted) {
        return EMPTY_STRING;
    }
    final StringBuilder out = new StringBuilder(s.length());
    // 'last' holds the previously seen (already normalized) character
    char last = ' ';
    int index = 0;
    while (index < s.length()) {
        char ch = s.charAt(index++);
        if (ch < ' ') {
            ch = ' ';
        }
        // a space is only copied when the character before it was not a space
        final boolean repeatedSpace = (ch == ' ') && (last == ' ');
        if (!repeatedSpace) {
            out.append(ch);
        }
        last = ch;
    }
    final String collapsed = out.toString();
    return collapsed.trim();
}
public String getTitle() {
// construct a title string, even if the document has no title
@ -902,12 +880,14 @@ public class ContentScraper extends AbstractScraper implements Scraper {
System.out.println("TEXT :" + this.content.toString());
}
@Override
public void registerHtmlFilterEventListener(final ScraperListener listener) {
    // a null listener is ignored
    if (listener == null) return;
    this.htmlFilterEventListeners.add(ScraperListener.class, listener);
}
@Override
public void deregisterHtmlFilterEventListener(final ScraperListener listener) {
if (listener != null) {
this.htmlFilterEventListeners.remove(ScraperListener.class, listener);

@ -266,6 +266,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
this.resultStatus = ResultClass.SOURCE_WEB;
}
// parse the document to get all sentences; available for snippet computation
Document document = null;
try {
document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());

Loading…
Cancel
Save