parent
495ca57f61
commit
e0dc632020
@ -1,78 +0,0 @@
|
||||
// AbstractTransformer.java
|
||||
// ----------------------------------
|
||||
// (C) by Michael Peter Christen; mc@yacy.net
|
||||
// first published on http://www.anomic.de
|
||||
// Frankfurt, Germany, 2004
|
||||
//
|
||||
// $LastChangedDate$
|
||||
// $LastChangedRevision$
|
||||
// $LastChangedBy$
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package net.yacy.document.parser.html;
|
||||
|
||||
import java.util.TreeSet;
|
||||
|
||||
@Deprecated
|
||||
// TODO: delete candidate, because not in use, (noticed 2014-12-02)
|
||||
public abstract class AbstractTransformer implements Transformer {
|
||||
|
||||
private TreeSet<String> tags0;
|
||||
private TreeSet<String> tags1;
|
||||
|
||||
public AbstractTransformer(final TreeSet<String> tags0, final TreeSet<String> tags1) {
|
||||
this.tags0 = tags0;
|
||||
this.tags1 = tags1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isTag0(final String tag) {
|
||||
return this.tags0.contains(tag);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isTag1(final String tag) {
|
||||
return this.tags1.contains(tag);
|
||||
}
|
||||
|
||||
//the 'missing' method that shall be implemented:
|
||||
@Override
|
||||
public abstract char[] transformText(char[] text);
|
||||
/* could be easily implemented as:
|
||||
{
|
||||
return text;
|
||||
}
|
||||
*/
|
||||
|
||||
// the other methods must take into account to construct the return value correctly
|
||||
@Override
|
||||
public char[] transformTag0(final ContentScraper.Tag tag, final char quotechar) {
|
||||
return TransformerWriter.genTag0(tag.name, tag.opts, quotechar);
|
||||
}
|
||||
|
||||
@Override
|
||||
public char[] transformTag1(final ContentScraper.Tag tag, final char quotechar) {
|
||||
return TransformerWriter.genTag1(tag.name, tag.opts, tag.content.getChars(), quotechar);
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized void close() {
|
||||
// free resources
|
||||
this.tags0 = null;
|
||||
this.tags1 = null;
|
||||
}
|
||||
|
||||
}
|
@ -1,148 +0,0 @@
|
||||
// ContentTransformer.java
|
||||
// ---------------------------------
|
||||
// (C) by Michael Peter Christen; mc@yacy.net
|
||||
// first published on http://www.anomic.de
|
||||
// Frankfurt, Germany, 2004
|
||||
//
|
||||
// $LastChangedDate$
|
||||
// $LastChangedRevision$
|
||||
// $LastChangedBy$
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package net.yacy.document.parser.html;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import net.yacy.cora.document.encoding.ASCII;
|
||||
import net.yacy.kelondro.io.CharBuffer;
|
||||
|
||||
@Deprecated
|
||||
// TODO: delete candidate, because not in use, (noticed 2014-12-02)
|
||||
public class ContentTransformer extends AbstractTransformer implements Transformer {
|
||||
|
||||
// statics: for initialization of the HTMLFilterAbstractTransformer
|
||||
private static final TreeSet<String> linkTags0 = new TreeSet<String>(ASCII.insensitiveASCIIComparator);
|
||||
private static final TreeSet<String> linkTags1 = new TreeSet<String>(ASCII.insensitiveASCIIComparator);
|
||||
|
||||
static {
|
||||
linkTags0.add("img");
|
||||
linkTags0.add("input");
|
||||
|
||||
linkTags1.add("a");
|
||||
}
|
||||
|
||||
private ArrayList<String> bluelist = null;
|
||||
|
||||
public ContentTransformer() {
|
||||
super(linkTags0, linkTags1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void init(final String initarg) {
|
||||
if (this.bluelist == null) {
|
||||
// here, the init arg is used to load a list of blue-listed words
|
||||
this.bluelist = new ArrayList<String>();
|
||||
final File f = new File(initarg);
|
||||
if (f.canRead()) {
|
||||
try {
|
||||
final BufferedReader r = new BufferedReader(new FileReader(f));
|
||||
String s;
|
||||
while ((s = r.readLine()) != null) {
|
||||
if (!s.isEmpty() && s.charAt(0) != '#') this.bluelist.add(s.toLowerCase());
|
||||
}
|
||||
r.close();
|
||||
} catch (final IOException e) {
|
||||
}
|
||||
// if (bluelist.isEmpty()) System.out.println("BLUELIST is empty");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isIdentityTransformer() {
|
||||
return this.bluelist.isEmpty();
|
||||
}
|
||||
|
||||
private static char[] genBlueLetters(int length) {
|
||||
final CharBuffer bb = new CharBuffer(ContentScraper.MAX_DOCSIZE, " <FONT COLOR=#0000FF>".toCharArray());
|
||||
length = length / 2;
|
||||
if (length > 10) length = 7;
|
||||
while (length-- > 0) {
|
||||
bb.append('X');
|
||||
}
|
||||
bb.append("</FONT> ");
|
||||
final char[] result = bb.getChars();
|
||||
bb.close();
|
||||
return result;
|
||||
}
|
||||
|
||||
private boolean bluelistHit(final char[] text) {
|
||||
if (text == null || this.bluelist == null) return false;
|
||||
final String lc = new String(text).toLowerCase();
|
||||
for (int i = 0; i < this.bluelist.size(); i++) {
|
||||
if (lc.indexOf(this.bluelist.get(i)) >= 0) return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public char[] transformText(final char[] text) {
|
||||
if (this.bluelist != null) {
|
||||
if (bluelistHit(text)) {
|
||||
// System.out.println("FILTERHIT: " + text);
|
||||
return genBlueLetters(text.length);
|
||||
}
|
||||
return text;
|
||||
}
|
||||
return text;
|
||||
}
|
||||
|
||||
@Override
|
||||
public char[] transformTag0(final ContentScraper.Tag tag, final char quotechar) {
|
||||
if (tag.name.equals("img")) {
|
||||
// check bluelist
|
||||
if (bluelistHit(tag.opts.getProperty("src", "").toCharArray())) return genBlueLetters(5);
|
||||
if (bluelistHit(tag.opts.getProperty("alt", "").toCharArray())) return genBlueLetters(5);
|
||||
|
||||
// replace image alternative name
|
||||
tag.opts.setProperty("alt", new String(transformText(tag.opts.getProperty("alt", "").toCharArray())));
|
||||
}
|
||||
if (tag.name.equals("input") && (tag.opts.getProperty("type") != null && tag.opts.getProperty("type").equals("submit"))) {
|
||||
// rewrite button name
|
||||
tag.opts.setProperty("value", new String(transformText(tag.opts.getProperty("value", "").toCharArray())));
|
||||
}
|
||||
return TransformerWriter.genTag0(tag.name, tag.opts, quotechar);
|
||||
}
|
||||
|
||||
@Override
|
||||
public char[] transformTag1(final ContentScraper.Tag tag, final char quotechar) {
|
||||
if (bluelistHit(tag.opts.getProperty("href","").toCharArray())) return genBlueLetters(tag.content.length());
|
||||
if (bluelistHit(tag.content.getChars())) return genBlueLetters(tag.content.length());
|
||||
return TransformerWriter.genTag1(tag.name, tag.opts, tag.content.getChars(), quotechar);
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized void close() {
|
||||
// free resources
|
||||
super.close();
|
||||
}
|
||||
|
||||
}
|
@ -1,59 +0,0 @@
|
||||
// Transformer.java
|
||||
// ---------------------------
|
||||
// (C) by Michael Peter Christen; mc@yacy.net
|
||||
// first published on http://www.anomic.de
|
||||
// Frankfurt, Germany, 2004
|
||||
//
|
||||
// $LastChangedDate$
|
||||
// $LastChangedRevision$
|
||||
// $LastChangedBy$
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package net.yacy.document.parser.html;
|
||||
|
||||
public interface Transformer {
|
||||
|
||||
// the init method is used to initialize the transformer with some values
|
||||
// i.e. the initarg - String can be the name of a file which may contain
|
||||
// more specific transformation rules
|
||||
public void init(String initarg);
|
||||
|
||||
// ask if this transformer will do any transformation whatsoever
|
||||
// this may return true if the initialization resulted in a status
|
||||
// that does not allow any transformation
|
||||
public boolean isIdentityTransformer();
|
||||
|
||||
// tests, if a given body-less tag (i.e. <br> shall be supervised)
|
||||
// only tags that are defined here will be cached and not streamed
|
||||
public boolean isTag0(String tag);
|
||||
|
||||
// tests if a given tag that may have a body (i.e. <tt> ..body.. </tt>)
|
||||
// shall be supervised
|
||||
public boolean isTag1(String tag);
|
||||
|
||||
// method that is called with any text between tags
|
||||
// the returned text replaces the given text
|
||||
// if the text shall not be changed, it must be returned as called
|
||||
public char[] transformText(char[] text);
|
||||
|
||||
// method that is called when a body-less tag occurs
|
||||
public char[] transformTag0(ContentScraper.Tag tag, char quotechar);
|
||||
|
||||
// method that is called when a body-containing text occurs
|
||||
public char[] transformTag1(ContentScraper.Tag tag, char quotechar);
|
||||
|
||||
public void close();
|
||||
}
|
Loading…
Reference in new issue