You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
yacy_search_server/source/net/yacy/document/parser/html/Scraper.java

95 lines
3.2 KiB

// Scraper.java
// ---------------------------
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.document.parser.html;
/**
 * Contract for components that receive the pieces of a parsed HTML document
 * (plain text, tags, comments) and process them.
 * <p>
 * Tag names are split by the implementation into two categories: names
 * accepted by {@link #isTag0(String)} are candidates for
 * {@link #scrapeTag0(ContentScraper.Tag)}, names accepted by
 * {@link #isTag1(String)} are candidates for
 * {@link #scrapeTag1(ContentScraper.Tag)}. What distinguishes the two
 * categories is defined by each implementation and is not specified here.
 */
public interface Scraper {

    /**
     * @param tag
     *            a tag name
     * @return true when the tag name belongs to the first category of tags
     *         according to the Scraper implementation, and is therefore a
     *         candidate for processing by the
     *         {@link #scrapeTag0(ContentScraper.Tag)} implementation
     */
    boolean isTag0(String tag);

    /**
     * @param tag
     *            a tag name
     * @return true when the tag name belongs to the second category of tags
     *         according to the Scraper implementation, and is therefore a
     *         candidate for processing by the
     *         {@link #scrapeTag1(ContentScraper.Tag)} implementation
     */
    boolean isTag1(String tag);

    /**
     * Process plain text.
     *
     * @param text
     *            the plain text to process
     * @param insideTag
     *            the direct parent tag of the text, if any. May be null.
     */
    void scrapeText(char[] text, ContentScraper.Tag insideTag);

    /**
     * Process a tag belonging to the first category of tags according to the
     * Scraper implementation.
     *
     * @param tag
     *            a parsed tag
     * @see #isTag0(String)
     */
    void scrapeTag0(ContentScraper.Tag tag);

    /**
     * Process a tag belonging to the second category of tags according to the
     * Scraper implementation.
     *
     * @param tag
     *            a parsed tag
     * @see #isTag1(String)
     */
    void scrapeTag1(ContentScraper.Tag tag);

    /**
     * Processing applied to any kind of tag opening.
     *
     * @param tag
     *            a parsed tag
     */
    void scrapeAnyTagOpening(ContentScraper.Tag tag);

    /**
     * @param tag
     *            a parsed tag
     * @param parentTag
     *            the parent tag, if any (NOTE(review): nullability is not
     *            stated by this contract - confirm against callers)
     * @return true when the tag should be ignored according to the scraper
     *         implementation rules
     */
    boolean shouldIgnoreTag(ContentScraper.Tag tag, ContentScraper.Tag parentTag);

    /**
     * Process a comment.
     *
     * @param comment
     *            the comment characters (NOTE(review): presumably the content
     *            of an HTML comment; whether the comment delimiters are
     *            included is not specified here - confirm against callers)
     */
    void scrapeComment(char[] comment);

    /**
     * NOTE(review): undocumented in the original contract. Presumably signals
     * that no more content will be passed to this scraper so that it can
     * complete its processing - confirm against implementations.
     */
    void finish();

    /**
     * NOTE(review): undocumented in the original contract. Presumably releases
     * the resources held by this scraper - confirm against implementations,
     * including how this call relates to {@link #finish()}.
     */
    void close();

    /**
     * Register a listener on this scraper.
     *
     * @param listener
     *            the listener to register (NOTE(review): the events delivered
     *            to it are defined by {@link ScraperListener} and by the
     *            implementation, not by this contract)
     */
    void registerHtmlFilterEventListener(ScraperListener listener);

    /**
     * Deregister a listener from this scraper.
     *
     * @param listener
     *            the listener to deregister, presumably one previously passed
     *            to {@link #registerHtmlFilterEventListener(ScraperListener)}
     */
    void deregisterHtmlFilterEventListener(ScraperListener listener);
}