removed transformer

it was not used any more
pull/186/head
Michael Christen 7 years ago
parent 495ca57f61
commit e0dc632020

@ -806,7 +806,7 @@ public class Crawler_p {
List<AnchorURL> hyperlinks_from_file;
// check if the crawl filter works correctly
final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new HashSet<String>(), new VocabularyScraper(), timezoneOffset);
final Writer writer = new TransformerWriter(null, null, scraper, null, false);
final Writer writer = new TransformerWriter(null, null, scraper, false);
if((crawlingFileContent == null || crawlingFileContent.isEmpty()) && crawlingFile != null) {
/* Let's report here detailed error to help user when he selected a wrong file */
if(!crawlingFile.exists()) {

@ -143,7 +143,7 @@ public class FileCrawlStarterTask extends Thread {
this.profile, true);
this.scraper.registerHtmlFilterEventListener(anchorListener);
final Writer writer = new TransformerWriter(null, null, this.scraper, null, false);
final Writer writer = new TransformerWriter(null, null, this.scraper, false);
FileInputStream inStream = null;
try {

@ -137,7 +137,7 @@ public class BookmarkHelper {
//load the links
final ContentScraper scraper = new ContentScraper(baseURL, 10000, new HashSet<String>(), new VocabularyScraper(), 0);
//OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
final Writer writer = new TransformerWriter(null, null, scraper, null, false);
final Writer writer = new TransformerWriter(null, null, scraper, false);
FileUtils.copy(input,writer);
writer.close();
links = scraper.getAnchors();

@ -1,78 +0,0 @@
// AbstractTransformer.java
// ----------------------------------
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.document.parser.html;
import java.util.TreeSet;
/**
 * Base class for {@link Transformer} implementations.
 *
 * It stores the two tag-name sets that answer {@link #isTag0(String)} and
 * {@link #isTag1(String)}, and supplies default tag transformations that
 * delegate to the {@code TransformerWriter.genTag0}/{@code genTag1} helpers.
 * Subclasses only have to provide {@link #transformText(char[])}.
 */
@Deprecated
// TODO: delete candidate, because not in use, (noticed 2014-12-02)
public abstract class AbstractTransformer implements Transformer {

    /** Tag names answered by {@link #isTag0(String)}; set to null by {@link #close()}. */
    private TreeSet<String> singleTags;

    /** Tag names answered by {@link #isTag1(String)}; set to null by {@link #close()}. */
    private TreeSet<String> pairedTags;

    /**
     * @param tags0 the tag names for which {@link #isTag0(String)} returns true
     * @param tags1 the tag names for which {@link #isTag1(String)} returns true
     */
    public AbstractTransformer(final TreeSet<String> tags0, final TreeSet<String> tags1) {
        this.singleTags = tags0;
        this.pairedTags = tags1;
    }

    /**
     * @param tag the tag name to look up
     * @return true if the name is contained in the first tag set
     */
    @Override
    public boolean isTag0(final String tag) {
        final boolean known = this.singleTags.contains(tag);
        return known;
    }

    /**
     * @param tag the tag name to look up
     * @return true if the name is contained in the second tag set
     */
    @Override
    public boolean isTag1(final String tag) {
        final boolean known = this.pairedTags.contains(tag);
        return known;
    }

    /**
     * The one method a concrete transformer has to implement.
     * An identity implementation would simply return {@code text}.
     *
     * @param text the text to transform
     * @return the replacement text
     */
    @Override
    public abstract char[] transformText(char[] text);

    // the remaining methods have to build their return value from the tag themselves

    /**
     * Default handling: hands name and options of the tag to
     * {@code TransformerWriter.genTag0} and returns its result.
     *
     * @param tag the tag to render
     * @param quotechar the quote character passed through to the generator
     * @return the characters produced by {@code TransformerWriter.genTag0}
     */
    @Override
    public char[] transformTag0(final ContentScraper.Tag tag, final char quotechar) {
        final char[] rendered = TransformerWriter.genTag0(tag.name, tag.opts, quotechar);
        return rendered;
    }

    /**
     * Default handling: hands name, options and collected content of the tag
     * to {@code TransformerWriter.genTag1} and returns its result.
     *
     * @param tag the tag to render
     * @param quotechar the quote character passed through to the generator
     * @return the characters produced by {@code TransformerWriter.genTag1}
     */
    @Override
    public char[] transformTag1(final ContentScraper.Tag tag, final char quotechar) {
        final char[] rendered = TransformerWriter.genTag1(tag.name, tag.opts, tag.content.getChars(), quotechar);
        return rendered;
    }

    /**
     * Drops the references to both tag sets.
     * After this call {@link #isTag0(String)} and {@link #isTag1(String)}
     * must not be used any more, because the sets are null.
     */
    @Override
    public synchronized void close() {
        // free resources
        this.pairedTags = null;
        this.singleTags = null;
    }
}

@ -1603,14 +1603,14 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (page == null) throw new IOException("no content in file " + file.toString());
// scrape document to look up charset
final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page), StandardCharsets.UTF_8.name(), new HashSet<String>(), new VocabularyScraper(), new DigestURL("http://localhost"), null, false, maxLinks, timezoneOffset);
final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page), StandardCharsets.UTF_8.name(), new HashSet<String>(), new VocabularyScraper(), new DigestURL("http://localhost"), false, maxLinks, timezoneOffset);
String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset());
htmlFilter.close();
if (charset == null) charset = Charset.defaultCharset().toString();
// scrape content
final ContentScraper scraper = new ContentScraper(new DigestURL("http://localhost"), maxLinks, new HashSet<String>(), new VocabularyScraper(), timezoneOffset);
final Writer writer = new TransformerWriter(null, null, scraper, null, false);
final Writer writer = new TransformerWriter(null, null, scraper, false);
FileUtils.copy(new ByteArrayInputStream(page), writer, Charset.forName(charset));
writer.close();
return scraper;

@ -1,148 +0,0 @@
// ContentTransformer.java
// ---------------------------------
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.document.parser.html;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.TreeSet;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.kelondro.io.CharBuffer;
/**
 * Transformer that replaces text and tags containing "blue-listed" words
 * with a short run of blue {@code X} characters.
 *
 * The bluelist is loaded once by {@link #init(String)} from a text file with
 * one entry per line; empty lines and lines starting with {@code #} are
 * skipped and entries are stored lower-cased. Matching is a plain substring
 * test on the lower-cased input.
 *
 * The supervised tags are {@code img} and {@code input} (body-less) and
 * {@code a} (with body).
 */
@Deprecated
// TODO: delete candidate, because not in use, (noticed 2014-12-02)
public class ContentTransformer extends AbstractTransformer implements Transformer {

    // statics: tag sets handed to the AbstractTransformer constructor
    private static final TreeSet<String> linkTags0 = new TreeSet<String>(ASCII.insensitiveASCIIComparator);
    private static final TreeSet<String> linkTags1 = new TreeSet<String>(ASCII.insensitiveASCIIComparator);

    static {
        linkTags0.add("img");
        linkTags0.add("input");
        linkTags1.add("a");
    }

    /** Lower-cased bluelist entries; null until {@link #init(String)} has been called. */
    private ArrayList<String> bluelist = null;

    public ContentTransformer() {
        super(linkTags0, linkTags1);
    }

    /**
     * Loads the bluelist from the file named by {@code initarg}.
     * Only the first call has an effect. A null argument, an unreadable file
     * or a read error leaves the list empty or partially filled instead of
     * failing.
     *
     * @param initarg path of the bluelist file; may be null
     */
    @Override
    public void init(final String initarg) {
        if (this.bluelist != null) {
            return; // already initialized
        }
        // here, the init arg is used to load a list of blue-listed words
        this.bluelist = new ArrayList<String>();
        if (initarg == null) {
            return; // new File(null) would throw; treat as "no bluelist"
        }
        final File f = new File(initarg);
        if (!f.canRead()) {
            return;
        }
        // try-with-resources closes the reader even when readLine() fails
        try (BufferedReader r = new BufferedReader(new FileReader(f))) {
            String s;
            while ((s = r.readLine()) != null) {
                if (!s.isEmpty() && s.charAt(0) != '#') {
                    this.bluelist.add(s.toLowerCase());
                }
            }
        } catch (final IOException e) {
            // deliberate best effort: keep the entries read so far and carry on
            // without the rest of the list
        }
    }

    /**
     * @return true if no bluelist entry is available, i.e. nothing can ever
     *         match; this includes the case that {@link #init(String)} was
     *         never called
     */
    @Override
    public boolean isIdentityTransformer() {
        return this.bluelist == null || this.bluelist.isEmpty();
    }

    /**
     * Builds the replacement for a blue-listed piece of content: a blue FONT
     * element filled with {@code X} characters.
     *
     * @param length length of the content being replaced; the number of
     *        {@code X} is half of it, or 7 if that half exceeds 10
     * @return the replacement markup
     */
    private static char[] genBlueLetters(int length) {
        final CharBuffer bb = new CharBuffer(ContentScraper.MAX_DOCSIZE, " <FONT COLOR=#0000FF>".toCharArray());
        length = length / 2;
        if (length > 10) length = 7;
        while (length-- > 0) {
            bb.append('X');
        }
        bb.append("</FONT> ");
        final char[] result = bb.getChars();
        bb.close();
        return result;
    }

    /**
     * @param text the characters to inspect; may be null
     * @return true if the lower-cased text contains at least one bluelist
     *         entry; false if text or bluelist is null
     */
    private boolean bluelistHit(final char[] text) {
        if (text == null || this.bluelist == null) return false;
        final String lc = new String(text).toLowerCase();
        for (final String entry : this.bluelist) {
            if (lc.contains(entry)) return true;
        }
        return false;
    }

    /**
     * @param text the text between tags
     * @return the blue replacement if the text hits the bluelist, otherwise
     *         the text itself
     */
    @Override
    public char[] transformText(final char[] text) {
        // bluelistHit already answers false for a missing bluelist or null text
        if (bluelistHit(text)) {
            return genBlueLetters(text.length);
        }
        return text;
    }

    /**
     * Handles body-less tags. An {@code img} whose {@code src} or {@code alt}
     * hits the bluelist is replaced completely; otherwise its {@code alt}
     * text is passed through {@link #transformText(char[])}. For
     * {@code input} tags of type {@code submit} the {@code value} is passed
     * through {@link #transformText(char[])}.
     *
     * @param tag the tag to transform; its options may be modified
     * @param quotechar the quote character passed through to the generator
     * @return the replacement markup or the regenerated tag
     */
    @Override
    public char[] transformTag0(final ContentScraper.Tag tag, final char quotechar) {
        if (tag.name.equals("img")) {
            // check bluelist
            if (bluelistHit(tag.opts.getProperty("src", "").toCharArray())) return genBlueLetters(5);
            if (bluelistHit(tag.opts.getProperty("alt", "").toCharArray())) return genBlueLetters(5);
            // replace image alternative name
            tag.opts.setProperty("alt", new String(transformText(tag.opts.getProperty("alt", "").toCharArray())));
        }
        if (tag.name.equals("input") && "submit".equals(tag.opts.getProperty("type"))) {
            // rewrite button name
            tag.opts.setProperty("value", new String(transformText(tag.opts.getProperty("value", "").toCharArray())));
        }
        return TransformerWriter.genTag0(tag.name, tag.opts, quotechar);
    }

    /**
     * Handles tags with a body. The tag is replaced completely if its
     * {@code href} or its content hits the bluelist; otherwise it is
     * regenerated unchanged.
     *
     * @param tag the tag to transform
     * @param quotechar the quote character passed through to the generator
     * @return the replacement markup or the regenerated tag
     */
    @Override
    public char[] transformTag1(final ContentScraper.Tag tag, final char quotechar) {
        if (bluelistHit(tag.opts.getProperty("href", "").toCharArray())) return genBlueLetters(tag.content.length());
        if (bluelistHit(tag.content.getChars())) return genBlueLetters(tag.content.length());
        return TransformerWriter.genTag1(tag.name, tag.opts, tag.content.getChars(), quotechar);
    }

    /** Releases the resources held by the superclass. */
    @Override
    public synchronized void close() {
        // free resources
        super.close();
    }
}

@ -65,7 +65,6 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
final Set<String> ignore_class_name,
final VocabularyScraper vocabularyScraper,
final DigestURL rooturl,
final Transformer transformer,
final boolean passbyIfBinarySuspect,
final int maxLinks,
final int timezoneOffset
@ -82,7 +81,7 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
} catch (final UnsupportedEncodingException e) {
this.reader = new InputStreamReader(this, StandardCharsets.UTF_8);
}
this.writer = new TransformerWriter(null,null,scraper,transformer,passbyIfBinarySuspect);
this.writer = new TransformerWriter(null,null,scraper,passbyIfBinarySuspect);
}
private static String extractCharsetFromMimetypeHeader(final String mimeType) {

@ -1,59 +0,0 @@
// Transformer.java
// ---------------------------
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.document.parser.html;
/**
 * Contract for objects that rewrite text and tags of a document.
 */
public interface Transformer {

    /**
     * Initializes the transformer with some values.
     *
     * @param initarg an implementation-defined argument, e.g. the name of a
     *        file which may contain more specific transformation rules
     */
    void init(String initarg);

    /**
     * Asks if this transformer will do any transformation whatsoever.
     *
     * @return true if the initialization resulted in a status that does not
     *         allow any transformation
     */
    boolean isIdentityTransformer();

    /**
     * Tests if a given body-less tag (e.g. {@code <br>}) shall be supervised.
     * Only tags that are defined here will be cached and not streamed.
     *
     * @param tag the tag name
     * @return true if the tag shall be supervised
     */
    boolean isTag0(String tag);

    /**
     * Tests if a given tag that may have a body
     * (e.g. {@code <tt> ..body.. </tt>}) shall be supervised.
     *
     * @param tag the tag name
     * @return true if the tag shall be supervised
     */
    boolean isTag1(String tag);

    /**
     * Called with any text between tags; the returned text replaces the
     * given text.
     *
     * @param text the text found between tags
     * @return the replacement; if the text shall not be changed, it must be
     *         returned as passed in
     */
    char[] transformText(char[] text);

    /**
     * Called when a body-less tag occurs.
     *
     * @param tag the tag that occurred
     * @param quotechar the quote character handed over by the caller
     * @return the characters for this tag
     */
    char[] transformTag0(ContentScraper.Tag tag, char quotechar);

    /**
     * Called when a body-containing tag occurs.
     *
     * @param tag the tag that occurred
     * @param quotechar the quote character handed over by the caller
     * @return the characters for this tag
     */
    char[] transformTag1(ContentScraper.Tag tag, char quotechar);

    /**
     * Closes the transformer.
     */
    void close();
}

@ -59,7 +59,6 @@ public final class TransformerWriter extends Writer {
private CharBuffer buffer;
private Stack<ContentScraper.Tag> tagStack;
private final Scraper scraper;
private final Transformer transformer;
private boolean inSingleQuote;
private boolean inDoubleQuote;
private boolean inComment;
@ -70,23 +69,20 @@ public final class TransformerWriter extends Writer {
final OutputStream outStream,
final Charset charSet,
final Scraper scraper,
final Transformer transformer,
final boolean passbyIfBinarySuspect
) {
this(outStream, charSet, scraper, transformer, passbyIfBinarySuspect, 64);
this(outStream, charSet, scraper, passbyIfBinarySuspect, 64);
}
public TransformerWriter(
final OutputStream outStream,
final Charset charSet,
final Scraper scraper,
final Transformer transformer,
final boolean passbyIfBinarySuspect,
final int initialBufferSize
) {
this.outStream = outStream;
this.scraper = scraper;
this.transformer = transformer;
this.buffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, initialBufferSize);
this.tagStack = new Stack<ContentScraper.Tag>();
this.inSingleQuote = false;
@ -235,9 +231,6 @@ public final class TransformerWriter extends Writer {
if (this.scraper != null && content.length > 0) {
this.scraper.scrapeText(content, null);
}
if (this.transformer != null) {
return this.transformer.transformText(content);
}
return content;
}
@ -246,11 +239,7 @@ public final class TransformerWriter extends Writer {
if (this.scraper != null) {
this.scraper.scrapeText(content, this.tagStack.lastElement());
}
if (this.transformer != null) {
this.tagStack.lastElement().content.append(this.transformer.transformText(content));
} else {
this.tagStack.lastElement().content.append(content);
}
this.tagStack.lastElement().content.append(content);
return new char[0];
}
@ -318,32 +307,21 @@ public final class TransformerWriter extends Writer {
// this single tag is collected at once here
this.scraper.scrapeTag0(tag);
}
if (this.transformer != null && this.transformer.isTag0(tagname)) {
// this single tag is collected at once here
char[] b = this.transformer.transformTag0(tag, quotechar);
return b;
} else if ((this.scraper != null && this.scraper.isTag1(tagname)) ||
(this.transformer != null && this.transformer.isTag1(tagname))) {
if (this.scraper != null && this.scraper.isTag1(tagname)) {
// ok, start collecting; we don't push this here to the scraper or transformer; we do that when the tag is closed.
this.tagStack.push(tag);
return new char[0];
} else {
// we ignore that thing and return it again
return genTag0raw(tagname, true, content);
}
// we ignore that thing and return it again
return genTag0raw(tagname, true, content);
}
private char[] filterTagCloseing(final char quotechar) {
char[] ret;
ContentScraper.Tag tag = this.tagStack.lastElement();
if (this.scraper != null) this.scraper.scrapeTag1(tag);
if (this.transformer != null) {
ret = this.transformer.transformTag1(tag, quotechar);
} else {
ret = genTag1(tag.name, tag.opts, tag.content.getChars(), quotechar);
}
if ((this.scraper != null && this.scraper.isTag1(tag.name)) ||
(this.transformer != null && this.transformer.isTag1(tag.name))) {
ret = genTag1(tag.name, tag.opts, tag.content.getChars(), quotechar);
if (this.scraper != null && this.scraper.isTag1(tag.name)) {
// remove the tag from the stack as soon as the tag is processed
this.tagStack.pop();
// at this point the characters from the recently processed tag must be attached to the previous tag
@ -360,11 +338,7 @@ public final class TransformerWriter extends Writer {
// it's our closing tag! return complete result.
char[] ret;
if (this.scraper != null) this.scraper.scrapeTag1(this.tagStack.lastElement());
if (this.transformer != null) {
ret = this.transformer.transformTag1(this.tagStack.lastElement(), quotechar);
} else {
ret = genTag1(this.tagStack.lastElement().name, this.tagStack.lastElement().opts, this.tagStack.lastElement().content.getChars(), quotechar);
}
ret = genTag1(this.tagStack.lastElement().name, this.tagStack.lastElement().opts, this.tagStack.lastElement().content.getChars(), quotechar);
this.tagStack.pop();
return ret;
}

@ -276,7 +276,7 @@ public class htmlParser extends AbstractParser implements Parser {
if (charset == null) {
ScraperInputStream htmlFilter = null;
try {
htmlFilter = new ScraperInputStream(sourceStream, documentCharset, ignore_class_name, vocabularyScraper, location, null, false, maxLinks, timezoneOffset);
htmlFilter = new ScraperInputStream(sourceStream, documentCharset, ignore_class_name, vocabularyScraper, location, false, maxLinks, timezoneOffset);
sourceStream = htmlFilter;
charset = htmlFilter.detectCharset();
} catch (final IOException e1) {
@ -312,7 +312,7 @@ public class htmlParser extends AbstractParser implements Parser {
// parsing the content
// for this static method no need to init local this.scraperObject here
final ContentScraper scraper = new ContentScraper(location, maxAnchors, maxLinks, ignore_class_name, vocabularyScraper, timezoneOffset);
final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false, Math.max(64, Math.min(4096, sourceStream.available())));
final TransformerWriter writer = new TransformerWriter(null, null, scraper, false, Math.max(64, Math.min(4096, sourceStream.available())));
try {
final long maxChars = (long)(maxBytes * detectedcharsetcontainer[0].newDecoder().averageCharsPerByte());
final Reader sourceReader = new InputStreamReader(sourceStream, detectedcharsetcontainer[0]);

Loading…
Cancel
Save