pass parsed swf content through htmlscraper

SWF may contain a subset of HTML tags which should not appear as text.
Especially the <font> tag may totally screw up the metadata servlet if not filtered out.
pull/41/merge
reger 9 years ago
parent 18ecf57792
commit 1f18653de0

@ -30,15 +30,13 @@ package net.yacy.document.parser;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.AnchorURL;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper; import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.ContentScraper;
import pt.tumba.parser.swf.SWF2HTML; import pt.tumba.parser.swf.SWF2HTML;
public class swfParser extends AbstractParser implements Parser { public class swfParser extends AbstractParser implements Parser {
@ -70,8 +68,10 @@ public class swfParser extends AbstractParser implements Parser {
try { try {
final SWF2HTML swf2html = new SWF2HTML(); final SWF2HTML swf2html = new SWF2HTML();
String contents = ""; String contents = "";
ContentScraper htmlscraper=null;
try { try {
contents = swf2html.convertSWFToHTML(source); contents = swf2html.convertSWFToHTML(source);
htmlscraper = htmlParser.parseToScraper(location, charset, scraper, timezoneOffset, contents, 100);
} catch (final NegativeArraySizeException e) { } catch (final NegativeArraySizeException e) {
throw new Parser.Failure(e.getMessage(), location); throw new Parser.Failure(e.getMessage(), location);
} catch (final IOException e) { } catch (final IOException e) {
@ -79,6 +79,7 @@ public class swfParser extends AbstractParser implements Parser {
} catch (final Exception e) { } catch (final Exception e) {
throw new Parser.Failure(e.getMessage(), location); throw new Parser.Failure(e.getMessage(), location);
} }
/*
String url = null; String url = null;
String urlnr = null; String urlnr = null;
final String linebreak = System.getProperty("line.separator"); final String linebreak = System.getProperty("line.separator");
@ -88,12 +89,6 @@ public class swfParser extends AbstractParser implements Parser {
int urlEnd = 0; int urlEnd = 0;
int p0 = 0; int p0 = 0;
//getting rid of HTML-Tags
p0 = contents.indexOf("<html><body>",0);
contents = contents.substring(p0+12);
p0 = contents.indexOf("</body></html>",0);
contents = contents.substring(0,p0);
//extracting urls //extracting urls
while ((urlStart = contents.indexOf("http://",urlEnd)) >= 0){ while ((urlStart = contents.indexOf("http://",urlEnd)) >= 0){
urlEnd = contents.indexOf(linebreak,urlStart); urlEnd = contents.indexOf(linebreak,urlStart);
@ -104,31 +99,28 @@ public class swfParser extends AbstractParser implements Parser {
anchors.add(u); anchors.add(u);
contents = contents.substring(0,urlStart)+contents.substring(urlEnd); contents = contents.substring(0,urlStart)+contents.substring(urlEnd);
} }
*/
// As the result of parsing this function must return a plasmaParserDocument object // As the result of parsing this function must return a plasmaParserDocument object
return new Document[]{new Document( return new Document[]{new Document(
location, // url of the source document location, // url of the source document
mimeType, // the documents mime type mimeType, // the documents mime type
StandardCharsets.UTF_8.name(), // charset of the document text StandardCharsets.UTF_8.name(), // charset of the document text
this, this,
null, htmlscraper.getContentLanguages(),
null, //keywords htmlscraper.getKeywords(),
singleList(((contents.length() > 80)? contents.substring(0, 80):contents.trim()). htmlscraper.getTitles(),
replaceAll("\r\n"," "). htmlscraper.getAuthor(),
replaceAll("\n"," "). htmlscraper.getPublisher(),
replaceAll("\r"," "). null, // sections
replaceAll("\t"," ")), // title htmlscraper.getDescriptions(),
null, // TODO: AUTHOR htmlscraper.getLon(), htmlscraper.getLat(),
null, htmlscraper.getText(),
null, // an array of section headlines htmlscraper.getAnchors(),
null, // an abstract htmlscraper.getRSS(),
0.0d, 0.0d, null, // images
contents, // the parsed document text false,
anchors, // a map of extracted anchors htmlscraper.getDate())};
null,
null,
false,
new Date())};
} catch (final Exception e) { } catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof InterruptedException) throw (InterruptedException) e;

Loading…
Cancel
Save