You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
101 lines
3.4 KiB
101 lines
3.4 KiB
16 years ago
|
// AbstractScraper.java
|
||
18 years ago
|
// ---------------------------
|
||
17 years ago
|
// (C) by Michael Peter Christen; mc@yacy.net
|
||
18 years ago
|
// first published on http://www.anomic.de
|
||
|
// Frankfurt, Germany, 2004
|
||
16 years ago
|
//
|
||
|
// $LastChangedDate$
|
||
|
// $LastChangedRevision$
|
||
|
// $LastChangedBy$
|
||
18 years ago
|
//
|
||
|
// You agree that the Author(s) is (are) not responsible for cost,
|
||
|
// loss of data or any harm that may be caused by usage of this softare or
|
||
|
// this documentation. The usage of this software is on your own risk. The
|
||
|
// installation and usage (starting/running) of this software may allow other
|
||
|
// people or application to access your computer and any attached devices and
|
||
|
// is highly dependent on the configuration of the software which must be
|
||
|
// done by the user of the software;the author(s) is (are) also
|
||
|
// not responsible for proper configuration and usage of the software, even
|
||
|
// if provoked by documentation provided together with the software.
|
||
|
//
|
||
|
// THE SOFTWARE THAT FOLLOWS AS ART OF PROGRAMMING BELOW THIS SECTION
|
||
|
// IS PUBLISHED UNDER THE GPL AS DOCUMENTED IN THE FILE gpl.txt ASIDE THIS
|
||
|
// FILE AND AS IN http://www.gnu.org/licenses/gpl.txt
|
||
|
// ANY CHANGES TO THIS FILE ACCORDING TO THE GPL CAN BE DONE TO THE
|
||
|
// LINES THAT FOLLOWS THIS COPYRIGHT NOTICE HERE, BUT CHANGES MUST NOT
|
||
|
// BE DONE ABOVE OR INSIDE THE COPYRIGHT NOTICE. A RE-DISTRIBUTION
|
||
|
// MUST CONTAIN THE INTACT AND UNCHANGED COPYRIGHT NOTICE.
|
||
|
// CONTRIBUTIONS AND CHANGES TO THE PROGRAM CODE SHOULD BE MARKED AS SUCH.
|
||
|
|
||
15 years ago
|
package net.yacy.document.parser.html;
|
||
18 years ago
|
|
||
17 years ago
|
import java.util.HashSet;
import java.util.Locale;
import java.util.Properties;
|
||
|
|
||
16 years ago
|
public abstract class AbstractScraper implements Scraper {
|
||
18 years ago
|
|
||
|
public static final char lb = '<';
|
||
|
public static final char rb = '>';
|
||
|
public static final char sl = '/';
|
||
|
|
||
17 years ago
|
private HashSet<String> tags0;
|
||
|
private HashSet<String> tags1;
|
||
18 years ago
|
|
||
17 years ago
|
/**
|
||
|
* create a scraper. the tag sets must contain tags in lowercase!
|
||
|
* @param tags0
|
||
|
* @param tags1
|
||
|
*/
|
||
16 years ago
|
public AbstractScraper(final HashSet<String> tags0, final HashSet<String> tags1) {
|
||
18 years ago
|
this.tags0 = tags0;
|
||
|
this.tags1 = tags1;
|
||
|
}
|
||
|
|
||
17 years ago
|
public boolean isTag0(final String tag) {
|
||
17 years ago
|
return (tags0 != null) && (tags0.contains(tag.toLowerCase()));
|
||
18 years ago
|
}
|
||
|
|
||
17 years ago
|
public boolean isTag1(final String tag) {
|
||
17 years ago
|
return (tags1 != null) && (tags1.contains(tag.toLowerCase()));
|
||
18 years ago
|
}
|
||
|
|
||
|
//the 'missing' method that shall be implemented:
|
||
18 years ago
|
public abstract void scrapeText(char[] text, String insideTag);
|
||
18 years ago
|
|
||
|
// the other methods must take into account to construct the return value correctly
|
||
|
public abstract void scrapeTag0(String tagname, Properties tagopts);
|
||
|
|
||
|
public abstract void scrapeTag1(String tagname, Properties tagopts, char[] text);
|
||
|
|
||
16 years ago
|
protected static String stripAllTags(String s) {
|
||
16 years ago
|
StringBuilder r = new StringBuilder(s.length());
|
||
16 years ago
|
int bc = 0;
|
||
|
char c;
|
||
|
for (int p = 0; p < s.length(); p++) {
|
||
|
c = s.charAt(p);
|
||
|
if (c == lb) {
|
||
|
bc++;
|
||
|
r.append(' ');
|
||
|
} else if (c == rb) {
|
||
16 years ago
|
bc--;
|
||
16 years ago
|
} else if (bc <= 0) {
|
||
|
r.append(c);
|
||
18 years ago
|
}
|
||
|
}
|
||
16 years ago
|
return r.toString().trim();
|
||
18 years ago
|
}
|
||
|
|
||
16 years ago
|
public static String stripAll(String s) {
|
||
16 years ago
|
return CharacterCoding.html2unicode(stripAllTags(s));
|
||
18 years ago
|
}
|
||
|
|
||
|
public void close() {
|
||
|
// free resources
|
||
|
tags0 = null;
|
||
|
tags1 = null;
|
||
|
}
|
||
|
|
||
|
}
|
||
|
|
||
|
|