added more attributes for html parser and enhanced data structures

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7679 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent 0b02083e97
commit f6077b3cc0

@ -0,0 +1,3 @@
#!/bin/bash
cd "`dirname $0`"
./apicall.sh "/IndexControlRWIs_p.html?deleteIndex=off&deleteSolr=off&deleteCache=on&deleteCrawlQueues=off&deleteRobots=on&deleteSearchFl=on&deletecomplete=" > /dev/null

@ -0,0 +1,3 @@
#!/bin/bash
cd "`dirname $0`"
./apicall.sh "/IndexControlRWIs_p.html?deleteIndex=on&deleteSolr=on&deleteCache=off&deleteCrawlQueues=on&deleteRobots=on&deleteSearchFl=on&deletecomplete=" > /dev/null

@ -0,0 +1,3 @@
#!/bin/bash
cd "`dirname $0`"
./apicall.sh "/Crawler_p.html?bookmarkFolder=/crawlStart&crawlingDomMaxPages=10000&intention=&range=domain&indexMedia=on&recrawl=nodoubles&storeHTCache=on&sitemapURL=&repeat_time=7&crawlingQ=on&crawlingIfOlderUnit=day&cachePolicy=ifexist&indexText=on&crawlingMode=file&crawlingURL=http://&bookmarkTitle=&mustnotmatch=&crawlingstart=import&mustmatch=.*&crawlingIfOlderNumber=7&repeat_unit=seldays&crawlingDepth=0&crawlingFile=$1" > /dev/null

@ -405,13 +405,20 @@
<!-- copy searchtest -->
<copy todir="${release_main}/bin">
<fileset dir="bin">
<include name="searchtest*"/>
<include name="localsearch.sh"/>
<include name="apicall.sh"/>
<include name="importmediawiki.sh"/>
<include name="clearall.sh"/>
<include name="up.sh"/>
<include name="clearcache.sh"/>
<include name="clearindex.sh"/>
<include name="down.sh"/>
<include name="importmediawiki.sh"/>
<include name="importOAIList.sh"/>
<include name="localsearch.sh"/>
<include name="searchtest*"/>
<include name="surrogateCleanOut.sh"/>
<include name="surrogateMVin.sh"/>
<include name="surrogateMVtmp.sh"/>
<include name="surrogateRefeed.sh"/>
<include name="up.sh"/>
</fileset>
</copy>

@ -112,7 +112,7 @@ public class WebStructurePicture_p {
// recursively find domains, up to a specific depth
final GraphPlotter graph = new GraphPlotter();
if (host != null) place(graph, sb.webStructure, hash, host, nodes, timeout, 0.0, 0.0, 0, depth);
if (host != null && hash != null) place(graph, sb.webStructure, hash, host, nodes, timeout, 0.0, 0.0, 0, depth);
//graph.print();
graphPicture = graph.draw(width, height, 40, 40, 16, 16, color_back, color_dot, color_line, color_lineend, color_text);

@ -98,6 +98,7 @@ import net.yacy.document.TextParser;
import net.yacy.document.content.DCEntry;
import net.yacy.document.content.SurrogateReader;
import net.yacy.document.importer.OAIListFriendsLoader;
import net.yacy.document.parser.html.Evaluation;
import net.yacy.kelondro.blob.Tables;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
@ -485,7 +486,7 @@ public final class Switchboard extends serverSwitch {
//starting blog
initBlog();
// Init User DB
// init User DB
this.log.logConfig("Loading User DB");
final File userDbFile = new File(getDataPath(), "DATA/SETTINGS/user.heap");
this.userDB = new UserDB(userDbFile);
@ -493,7 +494,19 @@ public final class Switchboard extends serverSwitch {
", " + this.userDB.size() + " entries" +
", " + ppRamString(userDbFile.length()/1024));
// Init bookmarks DB: needs more time since this does a DNS lookup for each Bookmark.
// init html parser evaluation scheme
File parserPropertiesPath = new File("defaults/");
String[] settingsList = parserPropertiesPath.list();
for (String l: settingsList) {
if (l.startsWith("parser.") && l.endsWith(".properties")) Evaluation.add(new File(parserPropertiesPath, l));
}
parserPropertiesPath = new File(getDataPath(), "DATA/SETTINGS/");
settingsList = parserPropertiesPath.list();
for (String l: settingsList) {
if (l.startsWith("parser.") && l.endsWith(".properties")) Evaluation.add(new File(parserPropertiesPath, l));
}
// init bookmarks DB: needs more time since this does a DNS lookup for each Bookmark.
// Can be started concurrently
new Thread(){
@Override

@ -61,29 +61,6 @@ public enum SolrScheme {
InetAddress address = Domains.dnsResolve(digestURI.getHost());
if (address != null) solrdoc.addField("attr_ip", address.getHostAddress());
if (digestURI.getHost() != null) solrdoc.addField("attr_host", digestURI.getHost());
/*
private final String charset; // the charset of the document
private final List<String> keywords; // most resources provide a keyword field
private StringBuilder title; // a document title, taken from title or h1 tag; shall appear as headline of search result
private final StringBuilder creator; // author or copyright
private final String publisher; // publisher
private final List<String> sections; // if present: more titles/headlines appearing in the document
private final StringBuilder description; // an abstract, if present: short content description
private Object text; // the clear text, all that is visible
private final Map<MultiProtocolURI, String> anchors; // all links embedded as clickeable entities (anchor tags)
private final Map<MultiProtocolURI, String> rss; // all embedded rss feeds
private final Map<MultiProtocolURI, ImageEntry> images; // all visible pictures in document
// the anchors and images - Maps are URL-to-EntityDescription mappings.
// The EntityDescription appear either as visible text in anchors or as alternative
// text in image tags.
private Map<MultiProtocolURI, String> hyperlinks, audiolinks, videolinks, applinks;
private Map<String, String> emaillinks;
private MultiProtocolURI favicon;
private boolean resorted;
private int inboundLinks, outboundLinks; // counters for inbound and outbound links, are counted after calling notifyWebStructure
private Set<String> languages;
private boolean indexingDenied;
*/
solrdoc.addField("title", yacydoc.dc_title());
solrdoc.addField("author", yacydoc.dc_creator());
solrdoc.addField("description", yacydoc.dc_description());
@ -166,9 +143,17 @@ public enum SolrScheme {
// bold, italic
String[] bold = html.getBold();
if (bold.length > 0) solrdoc.addField("attr_bold", bold);
solrdoc.addField("boldcount_i", bold.length);
if (bold.length > 0) {
solrdoc.addField("attr_bold", bold);
solrdoc.addField("attr_boldcount", html.getBoldCount(bold));
}
String[] italic = html.getItalic();
if (bold.length > 0) solrdoc.addField("attr_italic", italic);
solrdoc.addField("italiccount_i", italic.length);
if (italic.length > 0) {
solrdoc.addField("attr_italic", italic);
solrdoc.addField("attr_italiccount", html.getItalicCount(italic));
}
String[] li = html.getLi();
solrdoc.addField("licount_i", li.length);
if (li.length > 0) solrdoc.addField("attr_li", li);
@ -225,6 +210,15 @@ public enum SolrScheme {
// flash embedded
solrdoc.addField("flash_b", html.containsFlash());
// generic evaluation pattern
for (String model: html.getEvaluationModelNames()) {
String[] scorenames = html.getEvaluationModelScoreNames(model);
if (scorenames.length > 0) {
solrdoc.addField("attr_" + model, scorenames);
solrdoc.addField("attr_" + model + "count", html.getEvaluationModelScoreCounts(model, scorenames));
}
}
}
return solrdoc;
}

@ -0,0 +1,39 @@
/**
* AbstractScoreMap
* Copyright 2011 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
* First released 28.04.2011 at http://yacy.net
*
* $LastChangedDate: 2011-04-14 00:04:23 +0200 (Do, 14 Apr 2011) $
* $LastChangedRevision: 7653 $
* $LastChangedBy: orbiter $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.storage;
public abstract class AbstractScoreMap<E> implements ScoreMap<E> {
/**
* apply all E/int mappings from an external ScoreMap to this ScoreMap
*/
public void inc(ScoreMap<E> map) {
if (map == null) return;
for (E entry: map) {
this.inc(entry, map.get(entry));
}
}
}
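
Taken together with the iterator() methods added to the ScoreMap implementations below, the new inc(ScoreMap) helper lets one score map be merged into another. A minimal usage sketch, assuming the no-argument ClusteredScoreMap constructor and the inc/get methods that the rest of this diff relies on:

    import net.yacy.cora.storage.ClusteredScoreMap;

    public class ScoreMergeSketch {
        public static void main(String[] args) {
            // count occurrences in one map
            ClusteredScoreMap<String> a = new ClusteredScoreMap<String>();
            a.inc("css");
            a.inc("css");
            a.inc("js");
            // a second map, e.g. collected from another document
            ClusteredScoreMap<String> b = new ClusteredScoreMap<String>();
            b.inc("css", 3);
            // merge b into a via the new AbstractScoreMap.inc(ScoreMap)
            a.inc(b);
            // the new iterator() makes every ScoreMap iterable;
            // prints css -> 5 and js -> 1 (in no particular order)
            for (String key : a) {
                System.out.println(key + " -> " + a.get(key));
            }
        }
    }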

@ -35,7 +35,7 @@ import java.util.TreeMap;
import net.yacy.cora.document.UTF8;
public final class ClusteredScoreMap<E> implements ReversibleScoreMap<E> {
public final class ClusteredScoreMap<E> extends AbstractScoreMap<E> implements ReversibleScoreMap<E> {
protected final Map<E, Long> map; // a mapping from a reference to the cluster key
protected final TreeMap<Long, E> pam; // a mapping from the cluster key to the reference
@ -48,6 +48,10 @@ public final class ClusteredScoreMap<E> implements ReversibleScoreMap<E> {
gcount = 0;
encnt = 0;
}
public Iterator<E> iterator() {
return map.keySet().iterator();
}
public synchronized void clear() {
map.clear();

@ -35,7 +35,7 @@ import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicLong;
public class ConcurrentScoreMap<E> implements ScoreMap<E> {
public class ConcurrentScoreMap<E> extends AbstractScoreMap<E> implements ScoreMap<E> {
protected final ConcurrentHashMap<E, AtomicLong> map; // a mapping from a reference to the cluster key
private long gcount;
@ -44,6 +44,10 @@ public class ConcurrentScoreMap<E> implements ScoreMap<E> {
map = new ConcurrentHashMap<E, AtomicLong>();
gcount = 0;
}
public Iterator<E> iterator() {
return map.keySet().iterator();
}
public synchronized void clear() {
map.clear();

@ -38,7 +38,7 @@ import java.util.TreeSet;
import java.util.concurrent.atomic.AtomicInteger;
public class OrderedScoreMap<E> implements ScoreMap<E> {
public class OrderedScoreMap<E> extends AbstractScoreMap<E> implements ScoreMap<E> {
protected final Map<E, AtomicInteger> map; // a mapping from a reference to the cluster key
@ -49,6 +49,10 @@ public class OrderedScoreMap<E> implements ScoreMap<E> {
map = new TreeMap<E, AtomicInteger>(comparator);
}
}
public Iterator<E> iterator() {
return map.keySet().iterator();
}
public synchronized void clear() {
map.clear();

@ -26,7 +26,7 @@ package net.yacy.cora.storage;
import java.util.Iterator;
public interface ScoreMap<E> {
public interface ScoreMap<E> extends Iterable<E> {
public void clear();
@ -65,4 +65,5 @@ public interface ScoreMap<E> {
public void dec(final E obj);
public void dec(final E obj, final int incrementScore);
public void inc(ScoreMap<E> map);
}

@ -402,7 +402,10 @@ dc_rights
for (Map.Entry<MultiProtocolURI, Properties> entry: anchors.entrySet()) {
url = entry.getKey();
if (url == null) continue;
if (url.getHost() != null && thishost != null && url.getHost().equals(thishost)) this.inboundlinks.put(url, "anchor"); else this.outboundlinks.put(url, "anchor");
if (url.getHost() != null && thishost != null &&
url.getHost().endsWith(thishost) ||
(thishost.startsWith("www.") && url.getHost().endsWith(thishost.substring(4)))
) this.inboundlinks.put(url, "anchor"); else this.outboundlinks.put(url, "anchor");
u = url.toNormalform(true, false);
String name = entry.getValue().getProperty("name", "");
if (u.startsWith("mailto:")) {

@ -44,8 +44,10 @@ import java.util.regex.Pattern;
import javax.swing.event.EventListenerList;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.storage.ClusteredScoreMap;
import net.yacy.document.SentenceReader;
import net.yacy.document.parser.htmlParser;
import net.yacy.document.parser.html.Evaluation.Element;
import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
@ -60,33 +62,51 @@ public class ContentScraper extends AbstractScraper implements Scraper {
// statics: for initialization of the HTMLFilterAbstractScraper
private static final Set<String> linkTags0 = new HashSet<String>(9,0.99f);
private static final Set<String> linkTags1 = new HashSet<String>(7,0.99f);
public enum TagType {
singleton, pair;
}
public enum Tag {
html(TagType.singleton), // scraped as singleton to get attached properties like 'lang'
body(TagType.singleton), // scraped as singleton to get attached properties like 'class'
div(TagType.singleton), // scraped as singleton to get attached properties like 'id'
img(TagType.singleton),
base(TagType.singleton),
frame(TagType.singleton),
meta(TagType.singleton),
area(TagType.singleton),
link(TagType.singleton),
embed(TagType.singleton), //added by [MN]
param(TagType.singleton), //added by [MN]
a(TagType.pair),
h1(TagType.pair),
h2(TagType.pair),
h3(TagType.pair),
h4(TagType.pair),
h5(TagType.pair),
h6(TagType.pair),
title(TagType.pair),
b(TagType.pair),
strong(TagType.pair),
i(TagType.pair),
li(TagType.pair),
iframe(TagType.pair),
script(TagType.pair);
public TagType type;
private Tag(TagType type) {
this.type = type;
}
}
// all these tags must be given in lowercase, because the tags from the files are compared in lowercase
static {
linkTags0.add("html"); // scraped as tag 0 to get attached properties like 'lang'
linkTags0.add("img");
linkTags0.add("base");
linkTags0.add("frame");
linkTags0.add("meta");
linkTags0.add("area");
linkTags0.add("link");
linkTags0.add("script");
linkTags0.add("embed"); //added by [MN]
linkTags0.add("param"); //added by [MN]
linkTags1.add("a");
linkTags1.add("h1");
linkTags1.add("h2");
linkTags1.add("h3");
linkTags1.add("h4");
linkTags1.add("h5");
linkTags1.add("h6");
linkTags1.add("title");
linkTags1.add("b");
linkTags1.add("strong");
linkTags1.add("i");
linkTags1.add("li");
linkTags1.add("iframe");
for (Tag tag: Tag.values()) {
if (tag.type == TagType.singleton) linkTags0.add(tag.name());
if (tag.type == TagType.pair) linkTags1.add(tag.name());
}
//<iframe src="../../../index.htm" name="SELFHTML_in_a_box" width="90%" height="400">
}
@ -99,7 +119,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private String title;
//private String headline;
private List<String>[] headlines;
private List<String> bold, italic, li;
private ClusteredScoreMap<String> bold, italic;
private List<String> li;
private CharBuffer content;
private final EventListenerList htmlFilterEventListeners;
private float lon, lat;
@ -113,6 +134,11 @@ public class ContentScraper extends AbstractScraper implements Scraper {
* The document root {@link MultiProtocolURI}
*/
private MultiProtocolURI root;
/**
* evaluation scores: count appearance of specific attributes
*/
private Evaluation.Scores evaluationScores;
@SuppressWarnings("unchecked")
public ContentScraper(final MultiProtocolURI root) {
@ -120,6 +146,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
// it is only the reference for relative links
super(linkTags0, linkTags1);
this.root = root;
this.evaluationScores = new Evaluation.Scores();
this.rss = new HashMap<MultiProtocolURI, String>();
this.css = new HashMap<MultiProtocolURI, String>();
this.anchors = new HashMap<MultiProtocolURI, Properties>();
@ -131,19 +158,23 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.title = "";
this.headlines = new ArrayList[6];
for (int i = 0; i < this.headlines.length; i++) headlines[i] = new ArrayList<String>();
this.bold = new ArrayList<String>();
this.italic = new ArrayList<String>();
this.bold = new ClusteredScoreMap<String>();
this.italic = new ClusteredScoreMap<String>();
this.li = new ArrayList<String>();
this.content = new CharBuffer(1024);
this.htmlFilterEventListeners = new EventListenerList();
this.lon = 0.0f;
this.lat = 0.0f;
Evaluation.match(Element.url, root.toNormalform(false, false), this.evaluationScores);
}
public void scrapeText(final char[] newtext, final String insideTag) {
// System.out.println("SCRAPE: " + UTF8.String(newtext));
int p, pl, q, s = 0;
// match evaluation pattern
Evaluation.match(Element.text, newtext, this.evaluationScores);
// try to find location information in text
// Opencaching:
// <nobr>N 50o 05.453&#039;</nobr><nobr>E 008o 30.191&#039;</nobr>
@ -246,11 +277,14 @@ public class ContentScraper extends AbstractScraper implements Scraper {
try {
final int width = Integer.parseInt(tagopts.getProperty("width", "-1"));
final int height = Integer.parseInt(tagopts.getProperty("height", "-1"));
//if (width > 15 && height > 15) {
final MultiProtocolURI url = absolutePath(tagopts.getProperty("src", ""));
final ImageEntry ie = new ImageEntry(url, tagopts.getProperty("alt", ""), width, height, -1);
addImage(images, ie);
//}
String src = tagopts.getProperty("src", "");
if (src.length() > 0) {
final MultiProtocolURI url = absolutePath(src);
if (url != null) {
final ImageEntry ie = new ImageEntry(url, tagopts.getProperty("alt", ""), width, height, -1);
addImage(images, ie);
}
}
} catch (final NumberFormatException e) {}
} else if(tagname.equalsIgnoreCase("base")) {
try {
@ -262,16 +296,24 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} else if (tagname.equalsIgnoreCase("iframe")) {
anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts /* with property "name" */);
iframes.add(absolutePath(tagopts.getProperty("src", "")));
} else if (tagname.equalsIgnoreCase("script")) {
script.add(absolutePath(tagopts.getProperty("src", "")));
} else if (tagname.equalsIgnoreCase("body")) {
String c = tagopts.getProperty("class", "");
Evaluation.match(Element.bodyclass, c, this.evaluationScores);
} else if (tagname.equalsIgnoreCase("div")) {
String id = tagopts.getProperty("id", "");
Evaluation.match(Element.divid, id, this.evaluationScores);
} else if (tagname.equalsIgnoreCase("meta")) {
String name = tagopts.getProperty("name", "");
String content = tagopts.getProperty("content","");
if (name.length() > 0) {
metas.put(name.toLowerCase(), CharacterCoding.html2unicode(tagopts.getProperty("content","")));
metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content));
if (name.equals("generator")) {
Evaluation.match(Element.metagenerator, content, this.evaluationScores);
}
} else {
name = tagopts.getProperty("http-equiv", "");
if (name.length() > 0) {
metas.put(name.toLowerCase(), CharacterCoding.html2unicode(tagopts.getProperty("content","")));
metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content));
}
}
} else if (tagname.equalsIgnoreCase("area")) {
@ -281,7 +323,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
Properties p = new Properties(); p.put("name", areatitle);
if (href.length() > 0) anchors.put(absolutePath(href), p);
} else if (tagname.equalsIgnoreCase("link")) {
final MultiProtocolURI newLink = absolutePath(tagopts.getProperty("href", ""));
String href = tagopts.getProperty("href", "");
final MultiProtocolURI newLink = absolutePath(href);
if (newLink != null) {
final String rel = tagopts.getProperty("rel", "");
@ -296,6 +339,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
rss.put(newLink, linktitle);
} else if (rel.equalsIgnoreCase("stylesheet") && type.equalsIgnoreCase("text/css")) {
css.put(newLink, rel);
Evaluation.match(Element.csspath, href, this.evaluationScores);
} else if (!rel.equalsIgnoreCase("stylesheet") && !rel.equalsIgnoreCase("alternate stylesheet")) {
Properties p = new Properties(); p.put("name", linktitle);
anchors.put(newLink, p);
@ -356,21 +400,34 @@ public class ContentScraper extends AbstractScraper implements Scraper {
title = recursiveParse(text);
} else if ((tagname.equalsIgnoreCase("b")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) bold.add(h);
if (h.length() > 0) bold.inc(h);
} else if ((tagname.equalsIgnoreCase("strong")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) bold.add(h);
if (h.length() > 0) bold.inc(h);
} else if ((tagname.equalsIgnoreCase("i")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) italic.add(h);
if (h.length() > 0) italic.inc(h);
} else if ((tagname.equalsIgnoreCase("li")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) li.add(h);
} else if (tagname.equalsIgnoreCase("script")) {
String src = tagopts.getProperty("src", "");
if (src.length() > 0) {
script.add(absolutePath(src));
Evaluation.match(Element.scriptpath, src, this.evaluationScores);
} else {
Evaluation.match(Element.scriptcode, text, this.evaluationScores);
}
}
// fire event
fireScrapeTag1(tagname, tagopts, text);
}
public void scrapeComment(final char[] comment) {
Evaluation.match(Element.comment, comment, this.evaluationScores);
}
private String recursiveParse(final char[] inlineHtml) {
if (inlineHtml.length < 14) return cleanLine(super.stripAll(inlineHtml));
@ -446,11 +503,29 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
public String[] getBold() {
return this.bold.toArray(new String[this.bold.size()]);
List<String> a = new ArrayList<String>();
Iterator<String> i = this.bold.keys(false);
while (i.hasNext()) a.add(i.next());
return a.toArray(new String[a.size()]);
}
public String[] getBoldCount(String[] a) {
String[] counter = new String[a.length];
for (int i = 0; i < a.length; i++) counter[i] = Integer.toString(this.bold.get(a[i]));
return counter;
}
public String[] getItalic() {
return this.italic.toArray(new String[this.italic.size()]);
List<String> a = new ArrayList<String>();
Iterator<String> i = this.italic.keys(false);
while (i.hasNext()) a.add(i.next());
return a.toArray(new String[a.size()]);
}
public String[] getItalicCount(String[] a) {
String[] counter = new String[a.length];
for (int i = 0; i < a.length; i++) counter[i] = Integer.toString(this.italic.get(a[i]));
return counter;
}
public String[] getLi() {
@ -663,6 +738,33 @@ public class ContentScraper extends AbstractScraper implements Scraper {
return this.lat;
}
/**
* produce all model names
* @return a set of model names
*/
public Set<String> getEvaluationModelNames() {
return this.evaluationScores.getModelNames();
}
public String[] getEvaluationModelScoreNames(String modelName) {
List<String> a = new ArrayList<String>();
ClusteredScoreMap<String> scores = this.evaluationScores.getScores(modelName);
if (scores != null) {
Iterator<String> i = scores.keys(false);
while (i.hasNext()) a.add(i.next());
}
return a.toArray(new String[a.size()]);
}
public String[] getEvaluationModelScoreCounts(String modelName, String[] a) {
ClusteredScoreMap<String> scores = this.evaluationScores.getScores(modelName);
String[] counter = new String[a.length];
if (scores != null) {
for (int i = 0; i < a.length; i++) counter[i] = Integer.toString(scores.get(a[i]));
}
return counter;
}
/*
* (non-Javadoc)
* @see de.anomic.htmlFilter.htmlFilterScraper#close()

@ -0,0 +1,196 @@
package net.yacy.document.parser.html;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import net.yacy.cora.storage.ClusteredScoreMap;
import net.yacy.kelondro.logging.Log;
/*
* This class provides methods to apply a pattern analysis to html files.
* The pattern analysis is generic and can be configured using a field-name/pattern property
* configuration file.
* Keys in such a configuration file have the structure
* <subject-name>_<document-element>
* and the values are Java regular expressions.
* An html file is scanned for pattern matches within a specific <document-element>,
* and if such a match is found, the <subject-name> is collected as a
* subject for the scanned document.
* patternProperties files must have special file names: the file name
* starts with the word "parser." and must end with ".properties";
* everything in between is the name of a solr multi-value field where
* the collected subject names are stored.
*/
public class Evaluation {
private static List<Model> models = new ArrayList<Model>(); // the list of all models that shall be applied
public static enum Element {
text,
bodyclass,
divid,
csspath,
metagenerator,
url,
scriptpath,
scriptcode,
comment;
}
private static class Attribute {
public String subject; // the name of the attribute
public Pattern pattern; // the pattern that must match for that attribute
public Attribute(String subject, Pattern pattern) {
this.subject = subject;
this.pattern = pattern;
}
}
private static class Model {
private String modelName;
private Map<Element, List<Attribute>> elementMatcher; // a mapping from element-names to lists of Attributes
public Model(File patternProperties) throws IOException {
if (!patternProperties.exists()) throw new IOException("File does not exist: " + patternProperties);
String name = patternProperties.getName();
if (!name.startsWith("parser.")) throw new IOException("file name must start with 'parser.': " + name);
if (!name.endsWith(".properties")) throw new IOException("file name must end with '.properties': " + name);
this.modelName = name.substring(7, name.length() - 11);
if (this.modelName.length() < 1) throw new IOException("file name too short: " + name);
// load the file
Properties p = new Properties();
p.load(new FileReader(patternProperties));
// iterate through the properties and generate method patterns
elementMatcher = new HashMap<Element, List<Attribute>>();
String subject, elementName;
Element element;
Pattern pattern;
for (Map.Entry<Object, Object> entry: p.entrySet()) {
String k = (String) entry.getKey();
String v = (String) entry.getValue();
int w = k.indexOf('_');
if (w < 0) {
Log.logSevere("PatternAnalysis", "wrong configuration in " + name + ": separator '_' missing: " + k);
continue;
}
subject = k.substring(0, w);
elementName = k.substring(w + 1);
try {
pattern = Pattern.compile(v);
} catch (PatternSyntaxException e) {
Log.logSevere("PatternAnalysis", "bad pattern in " + name + ": '" + k + "=" + v + "' - " + e.getDescription());
continue;
}
element = Element.valueOf(elementName);
if (element == null) {
Log.logSevere("PatternAnalysis", "unknown element in " + name + ": " + elementName);
continue;
}
List<Attribute> attributeList = this.elementMatcher.get(element);
if (attributeList == null) {
attributeList = new ArrayList<Attribute>();
this.elementMatcher.put(element, attributeList);
}
attributeList.add(new Attribute(subject, pattern));
}
}
public String getName() {
return this.modelName;
}
/**
* match element content against the patterns registered for a specific element
* @param element - the name of the element as Element enum type
* @param content - the content of the element
* @return a score map of subject names that matched, with the number of matches per subject
*/
public ClusteredScoreMap<String> match(Element element, String content) {
ClusteredScoreMap<String> subjects = new ClusteredScoreMap<String>();
List<Attribute> patterns = this.elementMatcher.get(element);
if (patterns == null) return subjects;
for (Attribute attribute: patterns) {
if (attribute.pattern.matcher(content).matches()) subjects.inc(attribute.subject);
}
return subjects;
}
}
public static class Scores {
private Map<String, ClusteredScoreMap<String>> modelMap; // a map from model names to attribute scores
public Scores() {
this.modelMap = new HashMap<String, ClusteredScoreMap<String>>();
}
/**
* produce all model names
* @return a set of model names
*/
public Set<String> getModelNames() {
return this.modelMap.keySet();
}
/**
* calculate the scores for a model
* the score is an attribute/count map which counts how often a specific attribute was found
* @param modelName
* @return
*/
public ClusteredScoreMap<String> getScores(String modelName) {
return this.modelMap.get(modelName);
}
}
/**
* add a model to the evaluation set
* @param f
* @throws IOException
*/
public static void add(File f) throws IOException {
Model pattern = new Model(f);
models.add(pattern);
}
/**
* match some content within a specific element
* this will increase statistic counters for models if a model matches
* @param element - the element where a matching is made
* @param content - the content of the element which shall be matched
* @param scores - the score object where the scores are stored
*/
public static void match(Element element, String content, Scores scores) {
if (models.isEmpty()) return; // fast return if this feature is not used
ClusteredScoreMap<String> newScores, oldScores;
for (Model pattern: models) {
newScores = pattern.match(element, content);
oldScores = scores.getScores(pattern.getName());
if (oldScores == null) {
oldScores = new ClusteredScoreMap<String>();
scores.modelMap.put(pattern.getName(), oldScores);
}
oldScores.inc(newScores);
}
}
public static void match(Element element, char[] content, Scores scores) {
if (models.isEmpty()) return; // fast return if this feature is not used
match(element, new String(content), scores);
}
}
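
To make the configuration format described in the class comment concrete, a model file of the kind the Switchboard loads from defaults/ or DATA/SETTINGS/ could look like the sketch below. The file name, subject names and patterns are hypothetical; a file named parser.cms.properties would define a model "cms", whose matched subjects SolrScheme then stores in the multi-value fields attr_cms and attr_cmscount:

    # hypothetical file: defaults/parser.cms.properties
    # key format: <subject-name>_<document-element>, value: a Java regular expression
    # note: the pattern must match the whole element content (Matcher.matches() is used)
    wordpress_metagenerator = WordPress.*
    wordpress_csspath = .*/wp-content/.*
    joomla_metagenerator = Joomla!.*
    joomla_divid = .*(joomla|com_content).*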

@ -36,6 +36,7 @@ public class ImageEntry implements Comparable<ImageEntry>, Comparator<ImageEntry
private final long fileSize;
public ImageEntry(final MultiProtocolURI url, final String alt, final int width, final int height, long fileSize) {
assert url != null;
this.url = url;
this.alt = alt;
this.width = width;

@ -37,6 +37,8 @@ public interface Scraper {
public void scrapeTag0(String tagname, Properties tagopts);
public void scrapeTag1(String tagname, Properties tagopts, char[] text);
public void scrapeComment(final char[] comment);
public void close();

@ -395,7 +395,9 @@ public final class TransformerWriter extends Writer {
buffer.charAt(buffer.length() - 3) == dash) {
// comment is at end
inComment = false;
if (out != null) out.write(buffer.getChars());
char[] comment = buffer.getChars();
if (scraper != null) scraper.scrapeComment(comment);
if (out != null) out.write(comment);
// buffer = new serverByteBuffer();
buffer.reset();
}
