From f6077b3cc0ad9b2c7327259e8e771cb939a1caf0 Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 28 Apr 2011 13:09:01 +0000 Subject: [PATCH] added more attributes for html parser and enhanced data structures git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7679 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- bin/clearcache.sh | 3 + bin/clearindex.sh | 3 + bin/importurllist.sh | 3 + build.xml | 15 +- htroot/WebStructurePicture_p.java | 2 +- source/de/anomic/search/Switchboard.java | 17 +- .../services/federated/solr/SolrScheme.java | 44 ++-- .../yacy/cora/storage/AbstractScoreMap.java | 39 ++++ .../yacy/cora/storage/ClusteredScoreMap.java | 6 +- .../yacy/cora/storage/ConcurrentScoreMap.java | 6 +- .../yacy/cora/storage/OrderedScoreMap.java | 6 +- source/net/yacy/cora/storage/ScoreMap.java | 3 +- source/net/yacy/document/Document.java | 5 +- .../document/parser/html/ContentScraper.java | 188 +++++++++++++---- .../yacy/document/parser/html/Evaluation.java | 196 ++++++++++++++++++ .../yacy/document/parser/html/ImageEntry.java | 1 + .../yacy/document/parser/html/Scraper.java | 2 + .../parser/html/TransformerWriter.java | 4 +- 18 files changed, 462 insertions(+), 81 deletions(-) create mode 100755 bin/clearcache.sh create mode 100755 bin/clearindex.sh create mode 100755 bin/importurllist.sh create mode 100644 source/net/yacy/cora/storage/AbstractScoreMap.java create mode 100644 source/net/yacy/document/parser/html/Evaluation.java diff --git a/bin/clearcache.sh b/bin/clearcache.sh new file mode 100755 index 000000000..209be29c5 --- /dev/null +++ b/bin/clearcache.sh @@ -0,0 +1,3 @@ +#!/bin/bash +cd "`dirname $0`" +./apicall.sh "/IndexControlRWIs_p.html?deleteIndex=off&deleteSolr=off&deleteCache=on&deleteCrawlQueues=off&deleteRobots=on&deleteSearchFl=on&deletecomplete=" > /dev/null diff --git a/bin/clearindex.sh b/bin/clearindex.sh new file mode 100755 index 000000000..535371981 --- /dev/null +++ b/bin/clearindex.sh @@ -0,0 +1,3 @@ +#!/bin/bash +cd "`dirname $0`" 
+./apicall.sh "/IndexControlRWIs_p.html?deleteIndex=on&deleteSolr=on&deleteCache=off&deleteCrawlQueues=on&deleteRobots=on&deleteSearchFl=on&deletecomplete=" > /dev/null diff --git a/bin/importurllist.sh b/bin/importurllist.sh new file mode 100755 index 000000000..ce943f7e4 --- /dev/null +++ b/bin/importurllist.sh @@ -0,0 +1,3 @@ +#!/bin/bash +cd "`dirname $0`" +./apicall.sh "/Crawler_p.html?bookmarkFolder=/crawlStart&crawlingDomMaxPages=10000&intention=&range=domain&indexMedia=on&recrawl=nodoubles&storeHTCache=on&sitemapURL=&repeat_time=7&crawlingQ=on&crawlingIfOlderUnit=day&cachePolicy=ifexist&indexText=on&crawlingMode=file&crawlingURL=http://&bookmarkTitle=&mustnotmatch=&crawlingstart=import&mustmatch=.*&crawlingIfOlderNumber=7&repeat_unit=seldays&crawlingDepth=0&crawlingFile=$1" > /dev/null diff --git a/build.xml b/build.xml index 46c690039..14955a82b 100644 --- a/build.xml +++ b/build.xml @@ -405,13 +405,20 @@ - - - - + + + + + + + + + + + diff --git a/htroot/WebStructurePicture_p.java b/htroot/WebStructurePicture_p.java index 89fe5dc4f..310a203d6 100644 --- a/htroot/WebStructurePicture_p.java +++ b/htroot/WebStructurePicture_p.java @@ -112,7 +112,7 @@ public class WebStructurePicture_p { // recursively find domains, up to a specific depth final GraphPlotter graph = new GraphPlotter(); - if (host != null) place(graph, sb.webStructure, hash, host, nodes, timeout, 0.0, 0.0, 0, depth); + if (host != null && hash != null) place(graph, sb.webStructure, hash, host, nodes, timeout, 0.0, 0.0, 0, depth); //graph.print(); graphPicture = graph.draw(width, height, 40, 40, 16, 16, color_back, color_dot, color_line, color_lineend, color_text); diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index aff6fc76b..3b52f113a 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -98,6 +98,7 @@ import net.yacy.document.TextParser; import net.yacy.document.content.DCEntry; import 
net.yacy.document.content.SurrogateReader; import net.yacy.document.importer.OAIListFriendsLoader; +import net.yacy.document.parser.html.Evaluation; import net.yacy.kelondro.blob.Tables; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; @@ -485,7 +486,7 @@ public final class Switchboard extends serverSwitch { //starting blog initBlog(); - // Init User DB + // init User DB this.log.logConfig("Loading User DB"); final File userDbFile = new File(getDataPath(), "DATA/SETTINGS/user.heap"); this.userDB = new UserDB(userDbFile); @@ -493,7 +494,19 @@ public final class Switchboard extends serverSwitch { ", " + this.userDB.size() + " entries" + ", " + ppRamString(userDbFile.length()/1024)); - // Init bookmarks DB: needs more time since this does a DNS lookup for each Bookmark. + // init html parser evaluation scheme + File parserPropertiesPath = new File("defaults/"); + String[] settingsList = parserPropertiesPath.list(); + if (settingsList != null) for (String l: settingsList) { // File.list() yields null for a missing or unreadable directory + if (l.startsWith("parser.") && l.endsWith(".properties")) Evaluation.add(new File(parserPropertiesPath, l)); + } + parserPropertiesPath = new File(getDataPath(), "DATA/SETTINGS/"); + settingsList = parserPropertiesPath.list(); + if (settingsList != null) for (String l: settingsList) { // same null guard for the user settings directory + if (l.startsWith("parser.") && l.endsWith(".properties")) Evaluation.add(new File(parserPropertiesPath, l)); + } + + // init bookmarks DB: needs more time since this does a DNS lookup for each Bookmark. 
// Can be started concurrently new Thread(){ @Override diff --git a/source/net/yacy/cora/services/federated/solr/SolrScheme.java b/source/net/yacy/cora/services/federated/solr/SolrScheme.java index 32a891878..b080849db 100644 --- a/source/net/yacy/cora/services/federated/solr/SolrScheme.java +++ b/source/net/yacy/cora/services/federated/solr/SolrScheme.java @@ -61,29 +61,6 @@ public enum SolrScheme { InetAddress address = Domains.dnsResolve(digestURI.getHost()); if (address != null) solrdoc.addField("attr_ip", address.getHostAddress()); if (digestURI.getHost() != null) solrdoc.addField("attr_host", digestURI.getHost()); - /* - private final String charset; // the charset of the document - private final List keywords; // most resources provide a keyword field - private StringBuilder title; // a document title, taken from title or h1 tag; shall appear as headline of search result - private final StringBuilder creator; // author or copyright - private final String publisher; // publisher - private final List sections; // if present: more titles/headlines appearing in the document - private final StringBuilder description; // an abstract, if present: short content description - private Object text; // the clear text, all that is visible - private final Map anchors; // all links embedded as clickeable entities (anchor tags) - private final Map rss; // all embedded rss feeds - private final Map images; // all visible pictures in document - // the anchors and images - Maps are URL-to-EntityDescription mappings. - // The EntityDescription appear either as visible text in anchors or as alternative - // text in image tags. 
- private Map hyperlinks, audiolinks, videolinks, applinks; - private Map emaillinks; - private MultiProtocolURI favicon; - private boolean resorted; - private int inboundLinks, outboundLinks; // counters for inbound and outbound links, are counted after calling notifyWebStructure - private Set languages; - private boolean indexingDenied; - */ solrdoc.addField("title", yacydoc.dc_title()); solrdoc.addField("author", yacydoc.dc_creator()); solrdoc.addField("description", yacydoc.dc_description()); @@ -166,9 +143,17 @@ public enum SolrScheme { // bold, italic String[] bold = html.getBold(); - if (bold.length > 0) solrdoc.addField("attr_bold", bold); + solrdoc.addField("boldcount_i", bold.length); + if (bold.length > 0) { + solrdoc.addField("attr_bold", bold); + solrdoc.addField("attr_boldcount", html.getBoldCount(bold)); + } String[] italic = html.getItalic(); - if (bold.length > 0) solrdoc.addField("attr_italic", italic); + solrdoc.addField("italiccount_i", italic.length); + if (italic.length > 0) { + solrdoc.addField("attr_italic", italic); + solrdoc.addField("attr_italiccount", html.getItalicCount(italic)); + } String[] li = html.getLi(); solrdoc.addField("licount_i", li.length); if (li.length > 0) solrdoc.addField("attr_li", li); @@ -225,6 +210,15 @@ public enum SolrScheme { // flash embedded solrdoc.addField("flash_b", html.containsFlash()); + + // generic evaluation pattern + for (String model: html.getEvaluationModelNames()) { + String[] scorenames = html.getEvaluationModelScoreNames(model); + if (scorenames.length > 0) { + solrdoc.addField("attr_" + model, scorenames); + solrdoc.addField("attr_" + model + "count", html.getEvaluationModelScoreCounts(model, scorenames)); + } + } } return solrdoc; } diff --git a/source/net/yacy/cora/storage/AbstractScoreMap.java b/source/net/yacy/cora/storage/AbstractScoreMap.java new file mode 100644 index 000000000..59339b1ce --- /dev/null +++ b/source/net/yacy/cora/storage/AbstractScoreMap.java @@ -0,0 +1,39 @@ +/** + * 
AbstractScoreMap + * Copyright 2011 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany + * First released 28.04.2011 at http://yacy.net + * + * $LastChangedDate: 2011-04-14 00:04:23 +0200 (Do, 14 Apr 2011) $ + * $LastChangedRevision: 7653 $ + * $LastChangedBy: orbiter $ + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + +package net.yacy.cora.storage; + +public abstract class AbstractScoreMap implements ScoreMap { + + /** + * apply all E/int mappings from an external ScoreMap to this ScoreMap + */ + public void inc(ScoreMap map) { + if (map == null) return; + for (E entry: map) { + this.inc(entry, map.get(entry)); + } + } + +} diff --git a/source/net/yacy/cora/storage/ClusteredScoreMap.java b/source/net/yacy/cora/storage/ClusteredScoreMap.java index b066013db..148be9b1c 100644 --- a/source/net/yacy/cora/storage/ClusteredScoreMap.java +++ b/source/net/yacy/cora/storage/ClusteredScoreMap.java @@ -35,7 +35,7 @@ import java.util.TreeMap; import net.yacy.cora.document.UTF8; -public final class ClusteredScoreMap implements ReversibleScoreMap { +public final class ClusteredScoreMap extends AbstractScoreMap implements ReversibleScoreMap { protected final Map map; // a mapping from a reference to the cluster key protected final TreeMap pam; // a mapping from the cluster key to the reference @@ -48,6 +48,10 @@ public final class 
ClusteredScoreMap implements ReversibleScoreMap { gcount = 0; encnt = 0; } + + public Iterator iterator() { + return map.keySet().iterator(); + } public synchronized void clear() { map.clear(); diff --git a/source/net/yacy/cora/storage/ConcurrentScoreMap.java b/source/net/yacy/cora/storage/ConcurrentScoreMap.java index 3cdcebb44..973817f80 100644 --- a/source/net/yacy/cora/storage/ConcurrentScoreMap.java +++ b/source/net/yacy/cora/storage/ConcurrentScoreMap.java @@ -35,7 +35,7 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicLong; -public class ConcurrentScoreMap implements ScoreMap { +public class ConcurrentScoreMap extends AbstractScoreMap implements ScoreMap { protected final ConcurrentHashMap map; // a mapping from a reference to the cluster key private long gcount; @@ -44,6 +44,10 @@ public class ConcurrentScoreMap implements ScoreMap { map = new ConcurrentHashMap(); gcount = 0; } + + public Iterator iterator() { + return map.keySet().iterator(); + } public synchronized void clear() { map.clear(); diff --git a/source/net/yacy/cora/storage/OrderedScoreMap.java b/source/net/yacy/cora/storage/OrderedScoreMap.java index 50d7d4ed1..5ecf3e5e9 100644 --- a/source/net/yacy/cora/storage/OrderedScoreMap.java +++ b/source/net/yacy/cora/storage/OrderedScoreMap.java @@ -38,7 +38,7 @@ import java.util.TreeSet; import java.util.concurrent.atomic.AtomicInteger; -public class OrderedScoreMap implements ScoreMap { +public class OrderedScoreMap extends AbstractScoreMap implements ScoreMap { protected final Map map; // a mapping from a reference to the cluster key @@ -49,6 +49,10 @@ public class OrderedScoreMap implements ScoreMap { map = new TreeMap(comparator); } } + + public Iterator iterator() { + return map.keySet().iterator(); + } public synchronized void clear() { map.clear(); diff --git a/source/net/yacy/cora/storage/ScoreMap.java b/source/net/yacy/cora/storage/ScoreMap.java index c94c4e5c3..7c76280af 100644 --- 
a/source/net/yacy/cora/storage/ScoreMap.java +++ b/source/net/yacy/cora/storage/ScoreMap.java @@ -26,7 +26,7 @@ package net.yacy.cora.storage; import java.util.Iterator; -public interface ScoreMap { +public interface ScoreMap extends Iterable { public void clear(); @@ -65,4 +65,5 @@ public interface ScoreMap { public void dec(final E obj); public void dec(final E obj, final int incrementScore); + public void inc(ScoreMap map); } diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index 6ce544a46..3f7562e44 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -402,7 +402,10 @@ dc_rights for (Map.Entry entry: anchors.entrySet()) { url = entry.getKey(); if (url == null) continue; - if (url.getHost() != null && thishost != null && url.getHost().equals(thishost)) this.inboundlinks.put(url, "anchor"); else this.outboundlinks.put(url, "anchor"); + if (url.getHost() != null && thishost != null && // both hosts must be non-null before either comparison below + (url.getHost().endsWith(thishost) || + (thishost.startsWith("www.") && url.getHost().endsWith(thishost.substring(4)))) // grouped: && binds tighter than ||, ungrouped this branch skipped the null checks + ) this.inboundlinks.put(url, "anchor"); else this.outboundlinks.put(url, "anchor"); u = url.toNormalform(true, false); String name = entry.getValue().getProperty("name", ""); if (u.startsWith("mailto:")) { diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 52a6502b3..83b75d9ad 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -44,8 +44,10 @@ import java.util.regex.Pattern; import javax.swing.event.EventListenerList; import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.storage.ClusteredScoreMap; import net.yacy.document.SentenceReader; import net.yacy.document.parser.htmlParser; +import net.yacy.document.parser.html.Evaluation.Element; import net.yacy.kelondro.io.CharBuffer; import 
net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.FileUtils; @@ -60,33 +62,51 @@ public class ContentScraper extends AbstractScraper implements Scraper { // statics: for initialization of the HTMLFilterAbstractScraper private static final Set linkTags0 = new HashSet(9,0.99f); private static final Set linkTags1 = new HashSet(7,0.99f); - + + public enum TagType { + singleton, pair; + } + + public enum Tag { + html(TagType.singleton), // scraped as singleton to get attached properties like 'lang' + body(TagType.singleton), // scraped as singleton to get attached properties like 'class' + div(TagType.singleton), // scraped as singleton to get attached properties like 'id' + img(TagType.singleton), + base(TagType.singleton), + frame(TagType.singleton), + meta(TagType.singleton), + area(TagType.singleton), + link(TagType.singleton), + embed(TagType.singleton), //added by [MN] + param(TagType.singleton), //added by [MN] + + a(TagType.pair), + h1(TagType.pair), + h2(TagType.pair), + h3(TagType.pair), + h4(TagType.pair), + h5(TagType.pair), + h6(TagType.pair), + title(TagType.pair), + b(TagType.pair), + strong(TagType.pair), + i(TagType.pair), + li(TagType.pair), + iframe(TagType.pair), + script(TagType.pair); + + public final TagType type; // final: enum constants are shared, so the classification must not be reassignable + private Tag(TagType type) { + this.type = type; + } + } + // all these tags must be given in lowercase, because the tags from the files are compared in lowercase static { - linkTags0.add("html"); // scraped as tag 0 to get attached properties like 'lang' - linkTags0.add("img"); - linkTags0.add("base"); - linkTags0.add("frame"); - linkTags0.add("meta"); - linkTags0.add("area"); - linkTags0.add("link"); - linkTags0.add("script"); - linkTags0.add("embed"); //added by [MN] - linkTags0.add("param"); //added by [MN] - - linkTags1.add("a"); - linkTags1.add("h1"); - linkTags1.add("h2"); - linkTags1.add("h3"); - linkTags1.add("h4"); - linkTags1.add("h5"); - linkTags1.add("h6"); - linkTags1.add("title"); - linkTags1.add("b"); - 
linkTags1.add("strong"); - linkTags1.add("i"); - linkTags1.add("li"); - linkTags1.add("iframe"); + for (Tag tag: Tag.values()) { + if (tag.type == TagType.singleton) linkTags0.add(tag.name()); + if (tag.type == TagType.pair) linkTags1.add(tag.name()); + } //