From 1f18653de0b2bb074f26769c4268a83ada40185d Mon Sep 17 00:00:00 2001
From: reger
Date: Thu, 21 Jan 2016 02:55:05 +0100
Subject: [PATCH] pass parsed swf content through htmlscraper

SWF may contain a subset of HTML tags which shouldn't appear as text.
Especially a stray tag may totally screw up the metadata servlet if it is
not filtered out.
---
 .../net/yacy/document/parser/swfParser.java | 54 ++++++++-----------
 1 file changed, 23 insertions(+), 31 deletions(-)

diff --git a/source/net/yacy/document/parser/swfParser.java b/source/net/yacy/document/parser/swfParser.java
index 84cb17b06..a25efa74f 100644
--- a/source/net/yacy/document/parser/swfParser.java
+++ b/source/net/yacy/document/parser/swfParser.java
@@ -30,15 +30,13 @@ package net.yacy.document.parser;
 import java.io.IOException;
 import java.io.InputStream;
 import java.nio.charset.StandardCharsets;
-import java.util.ArrayList;
-import java.util.Date;
-import java.util.List;
 
 import net.yacy.cora.document.id.AnchorURL;
 import net.yacy.document.AbstractParser;
 import net.yacy.document.Document;
 import net.yacy.document.Parser;
 import net.yacy.document.VocabularyScraper;
+import net.yacy.document.parser.html.ContentScraper;
 import pt.tumba.parser.swf.SWF2HTML;
 
 public class swfParser extends AbstractParser implements Parser {
@@ -70,8 +68,10 @@ public class swfParser extends AbstractParser implements Parser {
         try {
             final SWF2HTML swf2html = new SWF2HTML();
             String contents = "";
+            ContentScraper htmlscraper = null;
             try {
                 contents = swf2html.convertSWFToHTML(source);
+                htmlscraper = htmlParser.parseToScraper(location, charset, scraper, timezoneOffset, contents, 100);
             } catch (final NegativeArraySizeException e) {
                 throw new Parser.Failure(e.getMessage(), location);
             } catch (final IOException e) {
@@ -79,6 +79,7 @@
             } catch (final Exception e) {
                 throw new Parser.Failure(e.getMessage(), location);
             }
+            /*
             String url = null;
             String urlnr = null;
             final String linebreak = System.getProperty("line.separator");
@@ -88,12 +89,6 @@
             int urlStart = -1;
             int urlEnd = 0;
             int p0 = 0;
-            //getting rid of HTML-Tags
-            p0 = contents.indexOf("<html><body>",0);
-            contents = contents.substring(p0+12);
-            p0 = contents.indexOf("</body></html>",0);
-            contents = contents.substring(0,p0);
-
             //extracting urls
             while ((urlStart = contents.indexOf("http://",urlEnd)) >= 0){
                 urlEnd = contents.indexOf(linebreak,urlStart);
@@ -104,31 +99,28 @@
                 url = contents.substring(urlStart,urlEnd);
                 urlnr = Integer.toString(++urls).toString();
                 AnchorURL u = new AnchorURL(url);
                 anchors.add(u);
                 contents = contents.substring(0,urlStart)+contents.substring(urlEnd);
             }
+            */
 
             // As the result of parsing this function must return a plasmaParserDocument object
             return new Document[]{new Document(
-                    location, // url of the source document
-                    mimeType, // the documents mime type
-                    StandardCharsets.UTF_8.name(), // charset of the document text
-                    this,
-                    null,
-                    null, //keywords
-                    singleList(((contents.length() > 80)? contents.substring(0, 80):contents.trim()).
-                            replaceAll("\r\n"," ").
-                            replaceAll("\n"," ").
-                            replaceAll("\r"," ").
-                            replaceAll("\t"," ")), // title
-                    null, // TODO: AUTHOR
-                    null,
-                    null, // an array of section headlines
-                    null, // an abstract
-                    0.0d, 0.0d,
-                    contents, // the parsed document text
-                    anchors, // a map of extracted anchors
-                    null,
-                    null,
-                    false,
-                    new Date())};
+                    location, // url of the source document
+                    mimeType, // the documents mime type
+                    StandardCharsets.UTF_8.name(), // charset of the document text
+                    this,
+                    htmlscraper.getContentLanguages(),
+                    htmlscraper.getKeywords(),
+                    htmlscraper.getTitles(),
+                    htmlscraper.getAuthor(),
+                    htmlscraper.getPublisher(),
+                    null, // sections
+                    htmlscraper.getDescriptions(),
+                    htmlscraper.getLon(), htmlscraper.getLat(),
+                    htmlscraper.getText(),
+                    htmlscraper.getAnchors(),
+                    htmlscraper.getRSS(),
+                    null, // images
+                    false,
+                    htmlscraper.getDate())};
         } catch (final Exception e) {
             if (e instanceof InterruptedException) throw (InterruptedException) e;
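
Not applied by this patch, only for illustration: a rough, untested sketch of the flow the hunks
above set up inside swfParser.parse(...). The class and method names (SWF2HTML, htmlParser.parseToScraper,
ContentScraper) are the ones used in the diff; the parameters mirror the existing parse(...) signature,
the helper name is invented, and the YaCy classes are assumed to be on the classpath.

    // Sketch only: convert the Flash input to HTML, then let the regular html scraper
    // parse that HTML so markup emitted by SWF2HTML cannot leak into the document text
    // or the metadata servlet.
    static ContentScraper scrapeSwf(final AnchorURL location, final String charset,
            final VocabularyScraper scraper, final int timezoneOffset,
            final InputStream source) throws Exception {
        // Flash -> HTML string (may contain tags, not just plain text)
        final String html = new SWF2HTML().convertSWFToHTML(source);
        // HTML string -> ContentScraper; the Document is then filled from the scraper's
        // getters (getTitles(), getText(), getAnchors(), getDate(), ...) as the last hunk shows
        return htmlParser.parseToScraper(location, charset, scraper, timezoneOffset, html, 100);
    }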