From 1f18653de0b2bb074f26769c4268a83ada40185d Mon Sep 17 00:00:00 2001
From: reger
Date: Thu, 21 Jan 2016 02:55:05 +0100
Subject: [PATCH] pass parsed swf content through htmlscraper

SWF may contain a subset of HTML tags which shouldn't appear as text.
Especially a stray tag may totally screw up the metadata servlet if it is
not filtered out.
---
 .../net/yacy/document/parser/swfParser.java | 54 ++++++++-----------
 1 file changed, 23 insertions(+), 31 deletions(-)

diff --git a/source/net/yacy/document/parser/swfParser.java b/source/net/yacy/document/parser/swfParser.java
index 84cb17b06..a25efa74f 100644
--- a/source/net/yacy/document/parser/swfParser.java
+++ b/source/net/yacy/document/parser/swfParser.java
@@ -30,15 +30,13 @@ package net.yacy.document.parser;
 import java.io.IOException;
 import java.io.InputStream;
 import java.nio.charset.StandardCharsets;
-import java.util.ArrayList;
-import java.util.Date;
-import java.util.List;
 
 import net.yacy.cora.document.id.AnchorURL;
 import net.yacy.document.AbstractParser;
 import net.yacy.document.Document;
 import net.yacy.document.Parser;
 import net.yacy.document.VocabularyScraper;
+import net.yacy.document.parser.html.ContentScraper;
 import pt.tumba.parser.swf.SWF2HTML;
 
 public class swfParser extends AbstractParser implements Parser {
@@ -70,8 +68,10 @@ public class swfParser extends AbstractParser implements Parser {
         try {
             final SWF2HTML swf2html = new SWF2HTML();
             String contents = "";
+            ContentScraper htmlscraper = null;
             try {
                 contents = swf2html.convertSWFToHTML(source);
+                htmlscraper = htmlParser.parseToScraper(location, charset, scraper, timezoneOffset, contents, 100);
             } catch (final NegativeArraySizeException e) {
                 throw new Parser.Failure(e.getMessage(), location);
             } catch (final IOException e) {
@@ -79,6 +79,7 @@
             } catch (final Exception e) {
                 throw new Parser.Failure(e.getMessage(), location);
             }
+            /*
             String url = null;
             String urlnr = null;
             final String linebreak = System.getProperty("line.separator");
@@ -88,12 +89,6 @@
             int urlStart = -1;
             int urlEnd = 0;
             int p0 = 0;
-            //getting rid of HTML-Tags
-            p0 = contents.indexOf("<html><body>",0);
-            contents = contents.substring(p0+12);
-            p0 = contents.indexOf("</body></html>",0);
-            contents = contents.substring(0,p0);
-
             //extracting urls
             while ((urlStart = contents.indexOf("http://",urlEnd)) >= 0){
                 urlEnd = contents.indexOf(linebreak,urlStart);
@@ -104,31 +99,28 @@
                 url = contents.substring(urlStart,urlEnd);
                 urlnr = Integer.toString(++urls).toString();
                 AnchorURL u = new AnchorURL(url);
                 anchors.add(u);
                 contents = contents.substring(0,urlStart)+contents.substring(urlEnd);
             }
+            */
 
             // As the result of parsing this function must return a plasmaParserDocument object
             return new Document[]{new Document(
-                    location, // url of the source document
-                    mimeType, // the documents mime type
-                    StandardCharsets.UTF_8.name(), // charset of the document text
-                    this,
-                    null,
-                    null, //keywords
-                    singleList(((contents.length() > 80)? contents.substring(0, 80):contents.trim()).
-                            replaceAll("\r\n"," ").
-                            replaceAll("\n"," ").
-                            replaceAll("\r"," ").
-                            replaceAll("\t"," ")), // title
-                    null, // TODO: AUTHOR
-                    null,
-                    null, // an array of section headlines
-                    null, // an abstract
-                    0.0d, 0.0d,
-                    contents, // the parsed document text
-                    anchors, // a map of extracted anchors
-                    null,
-                    null,
-                    false,
-                    new Date())};
+                    location, // url of the source document
+                    mimeType, // the documents mime type
+                    StandardCharsets.UTF_8.name(), // charset of the document text
+                    this,
+                    htmlscraper.getContentLanguages(),
+                    htmlscraper.getKeywords(),
+                    htmlscraper.getTitles(),
+                    htmlscraper.getAuthor(),
+                    htmlscraper.getPublisher(),
+                    null, // sections
+                    htmlscraper.getDescriptions(),
+                    htmlscraper.getLon(), htmlscraper.getLat(),
+                    htmlscraper.getText(),
+                    htmlscraper.getAnchors(),
+                    htmlscraper.getRSS(),
+                    null, // images
+                    false,
+                    htmlscraper.getDate())};
         } catch (final Exception e) {
             if (e instanceof InterruptedException) throw (InterruptedException) e;
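
Not applied by this patch, only for illustration: a rough, untested sketch of the flow the hunks
above set up inside swfParser.parse(...). The class and method names (SWF2HTML, htmlParser.parseToScraper,
ContentScraper) are the ones used in the diff; the parameters mirror the existing parse(...) signature,
the helper name is invented, and the YaCy classes are assumed to be on the classpath.

    // Sketch only: convert the Flash input to HTML, then let the regular html scraper
    // parse that HTML so markup emitted by SWF2HTML cannot leak into the document text
    // or the metadata servlet.
    static ContentScraper scrapeSwf(final AnchorURL location, final String charset,
            final VocabularyScraper scraper, final int timezoneOffset,
            final InputStream source) throws Exception {
        // Flash -> HTML string (may contain tags, not just plain text)
        final String html = new SWF2HTML().convertSWFToHTML(source);
        // HTML string -> ContentScraper; the Document is then filled from the scraper's
        // getters (getTitles(), getText(), getAnchors(), getDate(), ...) as the last hunk shows
        return htmlParser.parseToScraper(location, charset, scraper, timezoneOffset, html, 100);
    }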