From e0dc63202034266fc8442e9a52814155c7589d32 Mon Sep 17 00:00:00 2001 From: Michael Christen Date: Tue, 19 Jun 2018 00:42:23 +0200 Subject: [PATCH] removed transformer it was not used any more --- htroot/Crawler_p.java | 2 +- .../yacy/crawler/FileCrawlStarterTask.java | 2 +- source/net/yacy/data/BookmarkHelper.java | 2 +- .../parser/html/AbstractTransformer.java | 78 --------- .../document/parser/html/ContentScraper.java | 4 +- .../parser/html/ContentTransformer.java | 148 ------------------ .../parser/html/ScraperInputStream.java | 3 +- .../document/parser/html/Transformer.java | 59 ------- .../parser/html/TransformerWriter.java | 42 +---- .../net/yacy/document/parser/htmlParser.java | 4 +- 10 files changed, 16 insertions(+), 328 deletions(-) delete mode 100644 source/net/yacy/document/parser/html/AbstractTransformer.java delete mode 100644 source/net/yacy/document/parser/html/ContentTransformer.java delete mode 100644 source/net/yacy/document/parser/html/Transformer.java diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index 197f0b0d1..3a2c4f377 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -806,7 +806,7 @@ public class Crawler_p { List hyperlinks_from_file; // check if the crawl filter works correctly final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new HashSet(), new VocabularyScraper(), timezoneOffset); - final Writer writer = new TransformerWriter(null, null, scraper, null, false); + final Writer writer = new TransformerWriter(null, null, scraper, false); if((crawlingFileContent == null || crawlingFileContent.isEmpty()) && crawlingFile != null) { /* Let's report here detailed error to help user when he selected a wrong file */ if(!crawlingFile.exists()) { diff --git a/source/net/yacy/crawler/FileCrawlStarterTask.java b/source/net/yacy/crawler/FileCrawlStarterTask.java index 3eabdd81f..2caec9740 100644 --- a/source/net/yacy/crawler/FileCrawlStarterTask.java +++ b/source/net/yacy/crawler/FileCrawlStarterTask.java @@ -143,7 +143,7 @@ public class FileCrawlStarterTask extends Thread { this.profile, true); this.scraper.registerHtmlFilterEventListener(anchorListener); - final Writer writer = new TransformerWriter(null, null, this.scraper, null, false); + final Writer writer = new TransformerWriter(null, null, this.scraper, false); FileInputStream inStream = null; try { diff --git a/source/net/yacy/data/BookmarkHelper.java b/source/net/yacy/data/BookmarkHelper.java index 120d03766..a3836de04 100644 --- a/source/net/yacy/data/BookmarkHelper.java +++ b/source/net/yacy/data/BookmarkHelper.java @@ -137,7 +137,7 @@ public class BookmarkHelper { //load the links final ContentScraper scraper = new ContentScraper(baseURL, 10000, new HashSet(), new VocabularyScraper(), 0); //OutputStream os = new htmlFilterOutputStream(null, scraper, null, false); - final Writer writer = new TransformerWriter(null, null, scraper, null, false); + final Writer writer = new TransformerWriter(null, null, scraper, false); FileUtils.copy(input,writer); writer.close(); links = scraper.getAnchors(); diff --git a/source/net/yacy/document/parser/html/AbstractTransformer.java b/source/net/yacy/document/parser/html/AbstractTransformer.java deleted file mode 100644 index e1850c78d..000000000 --- a/source/net/yacy/document/parser/html/AbstractTransformer.java +++ /dev/null @@ -1,78 +0,0 @@ -// AbstractTransformer.java -// ---------------------------------- -// (C) by Michael Peter Christen; mc@yacy.net -// first published on http://www.anomic.de -// Frankfurt, Germany, 2004 -// -// $LastChangedDate$ -// $LastChangedRevision$ -// $LastChangedBy$ -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -package net.yacy.document.parser.html; - -import java.util.TreeSet; - -@Deprecated -// TODO: delete candidate, because not in use, (noticed 2014-12-02) -public abstract class AbstractTransformer implements Transformer { - - private TreeSet tags0; - private TreeSet tags1; - - public AbstractTransformer(final TreeSet tags0, final TreeSet tags1) { - this.tags0 = tags0; - this.tags1 = tags1; - } - - @Override - public boolean isTag0(final String tag) { - return this.tags0.contains(tag); - } - - @Override - public boolean isTag1(final String tag) { - return this.tags1.contains(tag); - } - - //the 'missing' method that shall be implemented: - @Override - public abstract char[] transformText(char[] text); - /* could be easily implemented as: - { - return text; - } - */ - - // the other methods must take into account to construct the return value correctly - @Override - public char[] transformTag0(final ContentScraper.Tag tag, final char quotechar) { - return TransformerWriter.genTag0(tag.name, tag.opts, quotechar); - } - - @Override - public char[] transformTag1(final ContentScraper.Tag tag, final char quotechar) { - return TransformerWriter.genTag1(tag.name, tag.opts, tag.content.getChars(), quotechar); - } - - @Override - public synchronized void close() { - // free resources - this.tags0 = null; - this.tags1 = null; - } - -} diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 36fb1e34c..9455c988d 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -1603,14 +1603,14 @@ public class ContentScraper extends AbstractScraper implements Scraper { if (page == null) throw new IOException("no content in file " + file.toString()); // scrape document to look up charset - final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page), StandardCharsets.UTF_8.name(), new HashSet(), new VocabularyScraper(), new DigestURL("http://localhost"), null, false, maxLinks, timezoneOffset); + final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page), StandardCharsets.UTF_8.name(), new HashSet(), new VocabularyScraper(), new DigestURL("http://localhost"), false, maxLinks, timezoneOffset); String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset()); htmlFilter.close(); if (charset == null) charset = Charset.defaultCharset().toString(); // scrape content final ContentScraper scraper = new ContentScraper(new DigestURL("http://localhost"), maxLinks, new HashSet(), new VocabularyScraper(), timezoneOffset); - final Writer writer = new TransformerWriter(null, null, scraper, null, false); + final Writer writer = new TransformerWriter(null, null, scraper, false); FileUtils.copy(new ByteArrayInputStream(page), writer, Charset.forName(charset)); writer.close(); return scraper; diff --git a/source/net/yacy/document/parser/html/ContentTransformer.java b/source/net/yacy/document/parser/html/ContentTransformer.java deleted file mode 100644 index d20be732e..000000000 --- a/source/net/yacy/document/parser/html/ContentTransformer.java +++ /dev/null @@ -1,148 +0,0 @@ -// ContentTransformer.java -// --------------------------------- -// (C) by Michael Peter Christen; mc@yacy.net -// first published on http://www.anomic.de -// Frankfurt, Germany, 2004 -// -// $LastChangedDate$ -// $LastChangedRevision$ -// $LastChangedBy$ -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -package net.yacy.document.parser.html; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileReader; -import java.io.IOException; -import java.util.ArrayList; -import java.util.TreeSet; - -import net.yacy.cora.document.encoding.ASCII; -import net.yacy.kelondro.io.CharBuffer; - -@Deprecated -// TODO: delete candidate, because not in use, (noticed 2014-12-02) -public class ContentTransformer extends AbstractTransformer implements Transformer { - - // statics: for initialization of the HTMLFilterAbstractTransformer - private static final TreeSet linkTags0 = new TreeSet(ASCII.insensitiveASCIIComparator); - private static final TreeSet linkTags1 = new TreeSet(ASCII.insensitiveASCIIComparator); - - static { - linkTags0.add("img"); - linkTags0.add("input"); - - linkTags1.add("a"); - } - - private ArrayList bluelist = null; - - public ContentTransformer() { - super(linkTags0, linkTags1); - } - - @Override - public void init(final String initarg) { - if (this.bluelist == null) { - // here, the init arg is used to load a list of blue-listed words - this.bluelist = new ArrayList(); - final File f = new File(initarg); - if (f.canRead()) { - try { - final BufferedReader r = new BufferedReader(new FileReader(f)); - String s; - while ((s = r.readLine()) != null) { - if (!s.isEmpty() && s.charAt(0) != '#') this.bluelist.add(s.toLowerCase()); - } - r.close(); - } catch (final IOException e) { - } - // if (bluelist.isEmpty()) System.out.println("BLUELIST is empty"); - } - } - } - - @Override - public boolean isIdentityTransformer() { - return this.bluelist.isEmpty(); - } - - private static char[] genBlueLetters(int length) { - final CharBuffer bb = new CharBuffer(ContentScraper.MAX_DOCSIZE, " ".toCharArray()); - length = length / 2; - if (length > 10) length = 7; - while (length-- > 0) { - bb.append('X'); - } - bb.append(" "); - final char[] result = bb.getChars(); - bb.close(); - return result; - } - - private boolean bluelistHit(final char[] text) { - if (text == null || this.bluelist == null) return false; - final String lc = new String(text).toLowerCase(); - for (int i = 0; i < this.bluelist.size(); i++) { - if (lc.indexOf(this.bluelist.get(i)) >= 0) return true; - } - return false; - } - - @Override - public char[] transformText(final char[] text) { - if (this.bluelist != null) { - if (bluelistHit(text)) { - // System.out.println("FILTERHIT: " + text); - return genBlueLetters(text.length); - } - return text; - } - return text; - } - - @Override - public char[] transformTag0(final ContentScraper.Tag tag, final char quotechar) { - if (tag.name.equals("img")) { - // check bluelist - if (bluelistHit(tag.opts.getProperty("src", "").toCharArray())) return genBlueLetters(5); - if (bluelistHit(tag.opts.getProperty("alt", "").toCharArray())) return genBlueLetters(5); - - // replace image alternative name - tag.opts.setProperty("alt", new String(transformText(tag.opts.getProperty("alt", "").toCharArray()))); - } - if (tag.name.equals("input") && (tag.opts.getProperty("type") != null && tag.opts.getProperty("type").equals("submit"))) { - // rewrite button name - tag.opts.setProperty("value", new String(transformText(tag.opts.getProperty("value", "").toCharArray()))); - } - return TransformerWriter.genTag0(tag.name, tag.opts, quotechar); - } - - @Override - public char[] transformTag1(final ContentScraper.Tag tag, final char quotechar) { - if (bluelistHit(tag.opts.getProperty("href","").toCharArray())) return genBlueLetters(tag.content.length()); - if (bluelistHit(tag.content.getChars())) return genBlueLetters(tag.content.length()); - return TransformerWriter.genTag1(tag.name, tag.opts, tag.content.getChars(), quotechar); - } - - @Override - public synchronized void close() { - // free resources - super.close(); - } - -} diff --git a/source/net/yacy/document/parser/html/ScraperInputStream.java b/source/net/yacy/document/parser/html/ScraperInputStream.java index 68a2602e6..776891eca 100644 --- a/source/net/yacy/document/parser/html/ScraperInputStream.java +++ b/source/net/yacy/document/parser/html/ScraperInputStream.java @@ -65,7 +65,6 @@ public class ScraperInputStream extends InputStream implements ScraperListener { final Set ignore_class_name, final VocabularyScraper vocabularyScraper, final DigestURL rooturl, - final Transformer transformer, final boolean passbyIfBinarySuspect, final int maxLinks, final int timezoneOffset @@ -82,7 +81,7 @@ public class ScraperInputStream extends InputStream implements ScraperListener { } catch (final UnsupportedEncodingException e) { this.reader = new InputStreamReader(this, StandardCharsets.UTF_8); } - this.writer = new TransformerWriter(null,null,scraper,transformer,passbyIfBinarySuspect); + this.writer = new TransformerWriter(null,null,scraper,passbyIfBinarySuspect); } private static String extractCharsetFromMimetypeHeader(final String mimeType) { diff --git a/source/net/yacy/document/parser/html/Transformer.java b/source/net/yacy/document/parser/html/Transformer.java deleted file mode 100644 index 9b605340e..000000000 --- a/source/net/yacy/document/parser/html/Transformer.java +++ /dev/null @@ -1,59 +0,0 @@ -// Transformer.java -// --------------------------- -// (C) by Michael Peter Christen; mc@yacy.net -// first published on http://www.anomic.de -// Frankfurt, Germany, 2004 -// -// $LastChangedDate$ -// $LastChangedRevision$ -// $LastChangedBy$ -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -package net.yacy.document.parser.html; - -public interface Transformer { - - // the init method is used to initialize the transformer with some values - // i.e. the initarg - String can be the name of a file which may contain - // more specific transformation rules - public void init(String initarg); - - // ask if this transformer will do any transformation whatsoever - // this may return true if the initialization resulted in a status - // that does not allow any transformation - public boolean isIdentityTransformer(); - - // tests, if a given body-less tag (i.e.
shall be supervised) - // only tags that are defined here will be cached and not streamed - public boolean isTag0(String tag); - - // tests if a given tag that may have a body (i.e. ..body.. ) - // shall be supervised - public boolean isTag1(String tag); - - // method that is called with any text between tags - // the returned text replaces the given text - // if the text shall not be changed, it must be returned as called - public char[] transformText(char[] text); - - // method that is called when a body-less tag occurs - public char[] transformTag0(ContentScraper.Tag tag, char quotechar); - - // method that is called when a body-containing text occurs - public char[] transformTag1(ContentScraper.Tag tag, char quotechar); - - public void close(); -} diff --git a/source/net/yacy/document/parser/html/TransformerWriter.java b/source/net/yacy/document/parser/html/TransformerWriter.java index 1bf300e5e..a6d6a9189 100644 --- a/source/net/yacy/document/parser/html/TransformerWriter.java +++ b/source/net/yacy/document/parser/html/TransformerWriter.java @@ -59,7 +59,6 @@ public final class TransformerWriter extends Writer { private CharBuffer buffer; private Stack tagStack; private final Scraper scraper; - private final Transformer transformer; private boolean inSingleQuote; private boolean inDoubleQuote; private boolean inComment; @@ -70,23 +69,20 @@ public final class TransformerWriter extends Writer { final OutputStream outStream, final Charset charSet, final Scraper scraper, - final Transformer transformer, final boolean passbyIfBinarySuspect ) { - this(outStream, charSet, scraper, transformer, passbyIfBinarySuspect, 64); + this(outStream, charSet, scraper, passbyIfBinarySuspect, 64); } public TransformerWriter( final OutputStream outStream, final Charset charSet, final Scraper scraper, - final Transformer transformer, final boolean passbyIfBinarySuspect, final int initialBufferSize ) { this.outStream = outStream; this.scraper = scraper; - this.transformer = transformer; this.buffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, initialBufferSize); this.tagStack = new Stack(); this.inSingleQuote = false; @@ -235,9 +231,6 @@ public final class TransformerWriter extends Writer { if (this.scraper != null && content.length > 0) { this.scraper.scrapeText(content, null); } - if (this.transformer != null) { - return this.transformer.transformText(content); - } return content; } @@ -246,11 +239,7 @@ public final class TransformerWriter extends Writer { if (this.scraper != null) { this.scraper.scrapeText(content, this.tagStack.lastElement()); } - if (this.transformer != null) { - this.tagStack.lastElement().content.append(this.transformer.transformText(content)); - } else { - this.tagStack.lastElement().content.append(content); - } + this.tagStack.lastElement().content.append(content); return new char[0]; } @@ -318,32 +307,21 @@ public final class TransformerWriter extends Writer { // this single tag is collected at once here this.scraper.scrapeTag0(tag); } - if (this.transformer != null && this.transformer.isTag0(tagname)) { - // this single tag is collected at once here - char[] b = this.transformer.transformTag0(tag, quotechar); - return b; - } else if ((this.scraper != null && this.scraper.isTag1(tagname)) || - (this.transformer != null && this.transformer.isTag1(tagname))) { + if (this.scraper != null && this.scraper.isTag1(tagname)) { // ok, start collecting; we don't push this here to the scraper or transformer; we do that when the tag is closed. this.tagStack.push(tag); return new char[0]; - } else { - // we ignore that thing and return it again - return genTag0raw(tagname, true, content); } + // we ignore that thing and return it again + return genTag0raw(tagname, true, content); } private char[] filterTagCloseing(final char quotechar) { char[] ret; ContentScraper.Tag tag = this.tagStack.lastElement(); if (this.scraper != null) this.scraper.scrapeTag1(tag); - if (this.transformer != null) { - ret = this.transformer.transformTag1(tag, quotechar); - } else { - ret = genTag1(tag.name, tag.opts, tag.content.getChars(), quotechar); - } - if ((this.scraper != null && this.scraper.isTag1(tag.name)) || - (this.transformer != null && this.transformer.isTag1(tag.name))) { + ret = genTag1(tag.name, tag.opts, tag.content.getChars(), quotechar); + if (this.scraper != null && this.scraper.isTag1(tag.name)) { // remove the tag from the stack as soon as the tag is processed this.tagStack.pop(); // at this point the characters from the recently processed tag must be attached to the previous tag @@ -360,11 +338,7 @@ public final class TransformerWriter extends Writer { // it's our closing tag! return complete result. char[] ret; if (this.scraper != null) this.scraper.scrapeTag1(this.tagStack.lastElement()); - if (this.transformer != null) { - ret = this.transformer.transformTag1(this.tagStack.lastElement(), quotechar); - } else { - ret = genTag1(this.tagStack.lastElement().name, this.tagStack.lastElement().opts, this.tagStack.lastElement().content.getChars(), quotechar); - } + ret = genTag1(this.tagStack.lastElement().name, this.tagStack.lastElement().opts, this.tagStack.lastElement().content.getChars(), quotechar); this.tagStack.pop(); return ret; } diff --git a/source/net/yacy/document/parser/htmlParser.java b/source/net/yacy/document/parser/htmlParser.java index 0b0ee1296..9b0f5c3de 100644 --- a/source/net/yacy/document/parser/htmlParser.java +++ b/source/net/yacy/document/parser/htmlParser.java @@ -276,7 +276,7 @@ public class htmlParser extends AbstractParser implements Parser { if (charset == null) { ScraperInputStream htmlFilter = null; try { - htmlFilter = new ScraperInputStream(sourceStream, documentCharset, ignore_class_name, vocabularyScraper, location, null, false, maxLinks, timezoneOffset); + htmlFilter = new ScraperInputStream(sourceStream, documentCharset, ignore_class_name, vocabularyScraper, location, false, maxLinks, timezoneOffset); sourceStream = htmlFilter; charset = htmlFilter.detectCharset(); } catch (final IOException e1) { @@ -312,7 +312,7 @@ public class htmlParser extends AbstractParser implements Parser { // parsing the content // for this static method no need to init local this.scraperObject here final ContentScraper scraper = new ContentScraper(location, maxAnchors, maxLinks, ignore_class_name, vocabularyScraper, timezoneOffset); - final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false, Math.max(64, Math.min(4096, sourceStream.available()))); + final TransformerWriter writer = new TransformerWriter(null, null, scraper, false, Math.max(64, Math.min(4096, sourceStream.available()))); try { final long maxChars = (long)(maxBytes * detectedcharsetcontainer[0].newDecoder().averageCharsPerByte()); final Reader sourceReader = new InputStreamReader(sourceStream, detectedcharsetcontainer[0]);