From b5ac29c9a5ccf28620e95f37a572fc9d8476b88f Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Fri, 30 Jan 2015 13:20:56 +0100 Subject: [PATCH] added an HTML field scraper which reads text from HTML entities of a given CSS class and extends a given vocabulary with a term consisting of the text content of the HTML class tag. Additionally, the term is included in the semantic facet of the document. This allows the creation of a faceted search on documents without the pre-creation of vocabularies; instead, the vocabulary is created on-the-fly, possibly for use in other crawls. If any of the term scraping for a specific vocabulary is successful on a document, this vocabulary is excluded from auto-annotation on the page. To use this feature, do the following: - create a vocabulary on /Vocabulary_p.html (if it does not exist yet) - in /CrawlStartExpert.html you will now see the vocabularies as a column in a table. The second column provides text fields where you can name the class of HTML entities from which the literal of the corresponding vocabulary shall be scraped - when doing a search, you will see the content of the scraped fields in a navigation facet for the given vocabulary --- htroot/ConfigHeuristics_p.java | 1 - htroot/CrawlStartExpert.html | 26 +++++- htroot/CrawlStartExpert.java | 22 ++++- htroot/Crawler_p.java | 29 +++++- htroot/QuickCrawlLink_p.java | 3 +- htroot/osm.java | 2 +- .../language/synonyms/AutotaggingLibrary.java | 19 +++- .../net/yacy/cora/lod/vocabulary/Tagging.java | 44 ++++----- source/net/yacy/crawler/CrawlSwitchboard.java | 27 ++++-- .../net/yacy/crawler/data/CrawlProfile.java | 19 +++- .../net/yacy/crawler/retrieval/Response.java | 3 +- source/net/yacy/data/BookmarkHelper.java | 3 +- .../net/yacy/data/ymark/YMarkAutoTagger.java | 2 +- .../net/yacy/data/ymark/YMarkCrawlStart.java | 3 +- source/net/yacy/document/Condenser.java | 38 ++++++-- source/net/yacy/document/Parser.java | 2 + source/net/yacy/document/TextParser.java | 17 ++-- 
.../net/yacy/document/VocabularyScraper.java | 90 +++++++++++++++++++ .../document/importer/MediawikiImporter.java | 3 +- .../net/yacy/document/parser/apkParser.java | 3 +- .../yacy/document/parser/audioTagParser.java | 3 +- .../parser/augment/AugmentParser.java | 5 +- .../net/yacy/document/parser/bzipParser.java | 5 +- .../net/yacy/document/parser/csvParser.java | 3 +- .../net/yacy/document/parser/docParser.java | 3 +- .../net/yacy/document/parser/dwgParser.java | 3 +- .../yacy/document/parser/genericParser.java | 3 +- .../net/yacy/document/parser/gzipParser.java | 5 +- .../document/parser/html/ContentScraper.java | 44 +++++---- .../parser/html/ScraperInputStream.java | 4 +- .../net/yacy/document/parser/htmlParser.java | 16 ++-- .../parser/images/genericImageParser.java | 5 +- .../parser/images/metadataImageParser.java | 5 +- .../document/parser/linkScraperParser.java | 5 +- source/net/yacy/document/parser/mmParser.java | 3 +- .../net/yacy/document/parser/odtParser.java | 3 +- .../net/yacy/document/parser/ooxmlParser.java | 3 +- .../net/yacy/document/parser/pdfParser.java | 5 +- .../net/yacy/document/parser/pptParser.java | 3 +- source/net/yacy/document/parser/psParser.java | 3 +- .../net/yacy/document/parser/rdfParser.java | 3 +- .../document/parser/rdfa/impl/RDFaParser.java | 11 +-- .../net/yacy/document/parser/rssParser.java | 3 +- .../net/yacy/document/parser/rtfParser.java | 3 +- .../yacy/document/parser/sevenzipParser.java | 5 +- .../yacy/document/parser/sidAudioParser.java | 3 +- .../yacy/document/parser/sitemapParser.java | 3 +- .../net/yacy/document/parser/swfParser.java | 3 +- .../net/yacy/document/parser/tarParser.java | 5 +- .../yacy/document/parser/torrentParser.java | 7 +- .../net/yacy/document/parser/vcfParser.java | 3 +- .../net/yacy/document/parser/vsdParser.java | 3 +- .../net/yacy/document/parser/xlsParser.java | 3 +- .../net/yacy/document/parser/zipParser.java | 5 +- .../http/servlets/YaCyDefaultServlet.java | 1 - 
.../net/yacy/repository/LoaderDispatcher.java | 2 +- source/net/yacy/search/Switchboard.java | 5 +- .../net/yacy/search/index/DocumentIndex.java | 5 +- source/net/yacy/search/index/Segment.java | 2 +- 59 files changed, 419 insertions(+), 141 deletions(-) create mode 100644 source/net/yacy/document/VocabularyScraper.java diff --git a/htroot/ConfigHeuristics_p.java b/htroot/ConfigHeuristics_p.java index 6bd9768c9..02222c749 100644 --- a/htroot/ConfigHeuristics_p.java +++ b/htroot/ConfigHeuristics_p.java @@ -197,7 +197,6 @@ public class ConfigHeuristics_p { return prop; } - @SuppressWarnings("unused") private static void writeopensearchcfg(final Switchboard sb, final serverObjects post) { // read index schema table flags diff --git a/htroot/CrawlStartExpert.html b/htroot/CrawlStartExpert.html index d44849ff2..5760f1455 100644 --- a/htroot/CrawlStartExpert.html +++ b/htroot/CrawlStartExpert.html @@ -443,7 +443,7 @@
Robot Behaviour
-
+
info You are running YaCy in non-p2p mode and because YaCy can be used as replacement for commercial search appliances @@ -460,6 +460,30 @@
#(/agentSelect)# + + #(vocabularySelect)#:: +
+ Enrich Vocabulary +
+
+
+ info + You can use class names to enrich the terms of a vocabulary based on the text content that appears on web pages. Please write the names of classes into the matrix. + + + + #{vocabularyset}# + + + + + #{/vocabularyset}# +
VocabularyClass
#[name]#
+
+
+
+ #(/vocabularySelect)# +
Snapshot Creation
diff --git a/htroot/CrawlStartExpert.java b/htroot/CrawlStartExpert.java index 783518c8b..886f35470 100644 --- a/htroot/CrawlStartExpert.java +++ b/htroot/CrawlStartExpert.java @@ -25,12 +25,15 @@ // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA import java.util.ArrayList; +import java.util.Collection; import java.util.List; +import net.yacy.cora.lod.vocabulary.Tagging; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.Html2Image; import net.yacy.crawler.data.CrawlProfile; +import net.yacy.document.LibraryProvider; import net.yacy.search.Switchboard; import net.yacy.search.schema.CollectionSchema; import net.yacy.server.serverObjects; @@ -508,10 +511,23 @@ public class CrawlStartExpert { prop.put("agentSelect_list", agentNames.size()); } - prop.put("agentSelect_defaultAgentName", - ClientIdentification.yacyInternetCrawlerAgentName); - + prop.put("agentSelect_defaultAgentName", ClientIdentification.yacyInternetCrawlerAgentName); + // ---------- Enrich Vocabulary + Collection vocs = LibraryProvider.autotagging.getVocabularies(); + if (vocs.size() == 0) { + prop.put("vocabularySelect", 0); + } else { + prop.put("vocabularySelect", 1); + int count = 0; + for (Tagging v: vocs) { + prop.put("vocabularySelect_vocabularyset_" + count + "_name", v.getName()); + prop.put("vocabularySelect_vocabularyset_" + count + "_value", ""); + count++; + } + prop.put("vocabularySelect_vocabularyset", count); + } + // ---------- Snapshot generation boolean wkhtmltopdfAvailable = Html2Image.wkhtmltopdfAvailable(); boolean convertAvailable = Html2Image.convertAvailable(); diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index 58c0a830a..e36940b6d 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -42,6 +42,8 @@ import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.RequestHeader; import 
net.yacy.cora.util.ConcurrentLog; +import net.yacy.cora.util.JSONException; +import net.yacy.cora.util.JSONObject; import net.yacy.cora.util.SpaceExceededException; import net.yacy.crawler.CrawlSwitchboard; import net.yacy.crawler.data.Cache; @@ -51,6 +53,7 @@ import net.yacy.crawler.retrieval.SitemapImporter; import net.yacy.crawler.robots.RobotsTxt; import net.yacy.data.WorkTables; import net.yacy.document.Document; +import net.yacy.document.VocabularyScraper; import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.TransformerWriter; import net.yacy.kelondro.index.RowHandleSet; @@ -445,6 +448,27 @@ public class Crawler_p { boolean snapshotsLoadImage = post.getBoolean("snapshotsLoadImage"); boolean snapshotsReplaceOld = post.getBoolean("snapshotsReplaceOld"); + // get vocabulary scraper info + JSONObject vocabulary_scraper = new JSONObject(); // key = vocabulary_name, value = properties with key = type (i.e. 'class') and value = keyword in context + for (String key: post.keySet()) { + if (key.startsWith("vocabulary_")) { + if (key.endsWith("_class")) { + String vocabulary = key.substring(11, key.length() - 6); + String value = post.get(key); + if (value != null && value.length() > 0) { + JSONObject props; + try { + props = vocabulary_scraper.getJSONObject(vocabulary); + } catch (JSONException e) { + props = new JSONObject(); + vocabulary_scraper.put(vocabulary, props); + } + props.put("class", value); + } + } + } + } + // prepare a new crawling profile final CrawlProfile profile; byte[] handle; @@ -476,7 +500,8 @@ public class Crawler_p { snapshotsReplaceOld, cachePolicy, collection, - agentName); + agentName, + new VocabularyScraper(vocabulary_scraper)); handle = ASCII.getBytes(profile.handle()); // before we fire up a new crawl, we make sure that another crawl with the same name is not running @@ -559,7 +584,7 @@ public class Crawler_p { try { // check if the crawl filter works correctly Pattern.compile(newcrawlingMustMatch); - 
final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000); + final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new VocabularyScraper()); final Writer writer = new TransformerWriter(null, null, scraper, null, false); if (crawlingFile != null && crawlingFile.exists()) { FileUtils.copy(new FileInputStream(crawlingFile), writer); diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java index b699d4589..106b10151 100644 --- a/htroot/QuickCrawlLink_p.java +++ b/htroot/QuickCrawlLink_p.java @@ -155,7 +155,8 @@ public class QuickCrawlLink_p { -1, false, true, CacheStrategy.IFFRESH, collection, - ClientIdentification.yacyIntranetCrawlerAgentName); + ClientIdentification.yacyIntranetCrawlerAgentName, + null); sb.crawler.putActive(pe.handle().getBytes(), pe); } catch (final Exception e) { // mist diff --git a/htroot/osm.java b/htroot/osm.java index d7d35b060..42802a40a 100644 --- a/htroot/osm.java +++ b/htroot/osm.java @@ -29,7 +29,7 @@ import net.yacy.visualization.RasterPlotter.DrawMode; public class osm { - public static EncodedImage respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, @SuppressWarnings("unused") final serverSwitch env) { + public static EncodedImage respond(final RequestHeader header, final serverObjects post, @SuppressWarnings("unused") final serverSwitch env) { int zoom = 10; double lat = 50.11670d; diff --git a/source/net/yacy/cora/language/synonyms/AutotaggingLibrary.java b/source/net/yacy/cora/language/synonyms/AutotaggingLibrary.java index 35fa9b768..54197e6bb 100644 --- a/source/net/yacy/cora/language/synonyms/AutotaggingLibrary.java +++ b/source/net/yacy/cora/language/synonyms/AutotaggingLibrary.java @@ -23,6 +23,7 @@ package net.yacy.cora.language.synonyms; import java.io.File; import java.io.IOException; import java.util.Collection; +import java.util.HashSet; import java.util.Map; import java.util.Set; import 
java.util.concurrent.ConcurrentHashMap; @@ -94,6 +95,13 @@ public class AutotaggingLibrary { return this.vocabularies.get(name); } + public Set getVocabularyNames() { + // this must return a clone of the set to prevent that the vocabularies are destroyed in a side effect + HashSet names = new HashSet<>(); + names.addAll(this.vocabularies.keySet()); + return names; + } + public Collection getVocabularies() { return this.vocabularies.values(); } @@ -143,13 +151,16 @@ public class AutotaggingLibrary { return 4; } - public Tagging.Metatag getTagFromTerm(String term) { + public Tagging.Metatag getTagFromTerm(Set vocabularies, String term) { if (this.vocabularies.isEmpty()) return null; Tagging.Metatag tag; term = Tagging.normalizeTerm(term); - for (Map.Entry v: this.vocabularies.entrySet()) { - tag = v.getValue().getMetatagFromSynonym(term); - if (tag != null) return tag; + for (String vocabularyName: vocabularies) { + Tagging t = this.vocabularies.get(vocabularyName); + if (t != null) { + tag = t.getMetatagFromSynonym(term); + if (tag != null) return tag; + } } return null; } diff --git a/source/net/yacy/cora/lod/vocabulary/Tagging.java b/source/net/yacy/cora/lod/vocabulary/Tagging.java index 98e7af2c1..f642ca6f3 100644 --- a/source/net/yacy/cora/lod/vocabulary/Tagging.java +++ b/source/net/yacy/cora/lod/vocabulary/Tagging.java @@ -275,32 +275,34 @@ public class Tagging { public void put(String term, String synonyms, String objectlink) throws IOException { if (this.propFile == null) return; - TempFile tmp = new TempFile(); - BlockingQueue list = Files.concurentLineReader(this.propFile); - String line; - boolean written = false; - try { - vocloop: while ((line = list.take()) != Files.POISON_LINE) { - String[] pl = parseLine(line); - if (pl == null) { - continue vocloop; + synchronized (this) { + TempFile tmp = new TempFile(); + BlockingQueue list = Files.concurentLineReader(this.propFile); + String line; + boolean written = false; + try { + vocloop: while ((line = 
list.take()) != Files.POISON_LINE) { + String[] pl = parseLine(line); + if (pl == null) { + continue vocloop; + } + if (pl[0].equals(term)) { + tmp.writer.write(term + (synonyms == null || synonyms.isEmpty() ? "" : ":" + synonyms) + (objectlink == null || objectlink.isEmpty() || objectlink.equals(this.objectspace + term) ? "" : "#" + objectlink) + "\n"); + written = true; + } else { + tmp.writer.write(pl[0] + (pl[1] == null || pl[1].isEmpty() ? "" : ":" + pl[1]) + (pl[2] == null || pl[2].isEmpty() || pl[2].equals(this.objectspace + pl[0]) ? "" : "#" + pl[2]) + "\n"); + } } - if (pl[0].equals(term)) { + if (!written) { tmp.writer.write(term + (synonyms == null || synonyms.isEmpty() ? "" : ":" + synonyms) + (objectlink == null || objectlink.isEmpty() || objectlink.equals(this.objectspace + term) ? "" : "#" + objectlink) + "\n"); - written = true; - } else { - tmp.writer.write(pl[0] + (pl[1] == null || pl[1].isEmpty() ? "" : ":" + pl[1]) + (pl[2] == null || pl[2].isEmpty() || pl[2].equals(this.objectspace + pl[0]) ? "" : "#" + pl[2]) + "\n"); } + } catch (final InterruptedException e) { } - if (!written) { - tmp.writer.write(term + (synonyms == null || synonyms.isEmpty() ? "" : ":" + synonyms) + (objectlink == null || objectlink.isEmpty() || objectlink.equals(this.objectspace + term) ? 
"" : "#" + objectlink) + "\n"); - } - } catch (final InterruptedException e) { + tmp.writer.close(); + this.propFile.delete(); + tmp.file.renameTo(this.propFile); + init(); } - tmp.writer.close(); - this.propFile.delete(); - tmp.file.renameTo(this.propFile); - init(); } public void delete(String term) throws IOException { diff --git a/source/net/yacy/crawler/CrawlSwitchboard.java b/source/net/yacy/crawler/CrawlSwitchboard.java index 35f6b9305..4472c59e0 100644 --- a/source/net/yacy/crawler/CrawlSwitchboard.java +++ b/source/net/yacy/crawler/CrawlSwitchboard.java @@ -295,7 +295,8 @@ public final class CrawlSwitchboard { -1, false, true, CacheStrategy.IFFRESH, "robot_" + CRAWL_PROFILE_PROXY, - ClientIdentification.yacyProxyAgentName); + ClientIdentification.yacyProxyAgentName, + null); this.profilesActiveCrawls.put( UTF8.getBytes(this.defaultProxyProfile.handle()), this.defaultProxyProfile); @@ -325,7 +326,8 @@ public final class CrawlSwitchboard { -1, false, true, CacheStrategy.IFFRESH, "robot_" + CRAWL_PROFILE_REMOTE, - ClientIdentification.yacyInternetCrawlerAgentName); + ClientIdentification.yacyInternetCrawlerAgentName, + null); this.profilesActiveCrawls.put( UTF8.getBytes(this.defaultRemoteProfile.handle()), this.defaultRemoteProfile); @@ -355,7 +357,8 @@ public final class CrawlSwitchboard { -1, false, true, CacheStrategy.IFEXIST, "robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, - ClientIdentification.yacyIntranetCrawlerAgentName); + ClientIdentification.yacyIntranetCrawlerAgentName, + null); this.profilesActiveCrawls.put( UTF8.getBytes(this.defaultTextSnippetLocalProfile.handle()), this.defaultTextSnippetLocalProfile); @@ -385,7 +388,8 @@ public final class CrawlSwitchboard { -1, false, true, CacheStrategy.IFEXIST, "robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, - ClientIdentification.yacyIntranetCrawlerAgentName); + ClientIdentification.yacyIntranetCrawlerAgentName, + null); this.profilesActiveCrawls.put( 
UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()), this.defaultTextSnippetGlobalProfile); @@ -416,7 +420,8 @@ public final class CrawlSwitchboard { -1, false, true, CacheStrategy.IFEXIST, "robot_" + CRAWL_PROFILE_GREEDY_LEARNING_TEXT, - ClientIdentification.browserAgentName); + ClientIdentification.browserAgentName, + null); this.profilesActiveCrawls.put( UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()), this.defaultTextSnippetGlobalProfile); @@ -446,7 +451,8 @@ public final class CrawlSwitchboard { -1, false, true, CacheStrategy.IFEXIST, "robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, - ClientIdentification.yacyIntranetCrawlerAgentName); + ClientIdentification.yacyIntranetCrawlerAgentName, + null); this.profilesActiveCrawls.put( UTF8.getBytes(this.defaultMediaSnippetLocalProfile.handle()), this.defaultMediaSnippetLocalProfile); @@ -476,7 +482,8 @@ public final class CrawlSwitchboard { -1, false, true, CacheStrategy.IFEXIST, "robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, - ClientIdentification.yacyIntranetCrawlerAgentName); + ClientIdentification.yacyIntranetCrawlerAgentName, + null); this.profilesActiveCrawls.put( UTF8.getBytes(this.defaultMediaSnippetGlobalProfile.handle()), this.defaultMediaSnippetGlobalProfile); @@ -506,7 +513,8 @@ public final class CrawlSwitchboard { -1, false, true, CacheStrategy.NOCACHE, "robot_" + CRAWL_PROFILE_SURROGATE, - ClientIdentification.yacyIntranetCrawlerAgentName); + ClientIdentification.yacyIntranetCrawlerAgentName, + null); this.profilesActiveCrawls.put( UTF8.getBytes(this.defaultSurrogateProfile.handle()), this.defaultSurrogateProfile); @@ -539,7 +547,8 @@ public final class CrawlSwitchboard { -1, false, true, CacheStrategy.NOCACHE, collection, - ClientIdentification.yacyIntranetCrawlerAgentName); + ClientIdentification.yacyIntranetCrawlerAgentName, + null); this.profilesActiveCrawls.put(UTF8.getBytes(genericPushProfile.handle()), genericPushProfile); this.defaultPushProfiles.put(collection, 
genericPushProfile); return genericPushProfile; diff --git a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java index a564dfb6a..f90b25a7f 100644 --- a/source/net/yacy/crawler/data/CrawlProfile.java +++ b/source/net/yacy/crawler/data/CrawlProfile.java @@ -45,6 +45,7 @@ import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.util.CommonPattern; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.CrawlSwitchboard; +import net.yacy.document.VocabularyScraper; import net.yacy.kelondro.data.word.Word; import net.yacy.search.query.QueryParams; import net.yacy.server.serverObjects; @@ -78,6 +79,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M public static final String REMOTE_INDEXING = "remoteIndexing"; public static final String CACHE_STRAGEGY = "cacheStrategy"; public static final String COLLECTIONS = "collections"; + public static final String SCRAPER = "scraper"; public static final String CRAWLER_URL_MUSTMATCH = "crawlerURLMustMatch"; public static final String CRAWLER_URL_MUSTNOTMATCH = "crawlerURLMustNotMatch"; public static final String CRAWLER_IP_MUSTMATCH = "crawlerIPMustMatch"; @@ -99,6 +101,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M private Pattern indexcontentmustmatch = null, indexcontentmustnotmatch = null; private final Map doms; + private final VocabularyScraper scraper; /** * Constructor which creates CrawlPofile from parameters. 
@@ -151,7 +154,8 @@ public class CrawlProfile extends ConcurrentHashMap implements M final boolean snapshotsReplaceOld, final CacheStrategy cacheStrategy, final String collections, - final String userAgentName) { + final String userAgentName, + final VocabularyScraper scraper) { super(40); if (name == null || name.isEmpty()) { throw new NullPointerException("name must not be null or empty"); @@ -189,18 +193,29 @@ public class CrawlProfile extends ConcurrentHashMap implements M put(SNAPSHOTS_REPLACEOLD, snapshotsReplaceOld); put(CACHE_STRAGEGY, cacheStrategy.toString()); put(COLLECTIONS, CommonPattern.SPACE.matcher(collections.trim()).replaceAll("")); + // we transform the scraper information into a JSON Array + this.scraper = scraper == null ? new VocabularyScraper() : scraper; + String jsonString = this.scraper.toString(); + assert jsonString != null && jsonString.length() > 0 && jsonString.charAt(0) == '{' : "jsonString = " + jsonString; + put(SCRAPER, jsonString); } /** - * Constructor which creats a CrawlProfile from values in a Map. + * Constructor which creates a CrawlProfile from values in a Map. * @param ext contains values */ public CrawlProfile(final Map ext) { super(ext == null ? 1 : ext.size()); if (ext != null) putAll(ext); this.doms = new ConcurrentHashMap(); + String jsonString = ext.get(SCRAPER); + this.scraper = jsonString == null || jsonString.length() == 0 ? 
new VocabularyScraper() : new VocabularyScraper(jsonString); } + public VocabularyScraper scraper() { + return this.scraper; + } + public void domInc(final String domain) { final AtomicInteger dp = this.doms.get(domain); if (dp == null) { diff --git a/source/net/yacy/crawler/retrieval/Response.java b/source/net/yacy/crawler/retrieval/Response.java index 67039016a..615465199 100644 --- a/source/net/yacy/crawler/retrieval/Response.java +++ b/source/net/yacy/crawler/retrieval/Response.java @@ -44,6 +44,7 @@ import net.yacy.crawler.data.ResultURLs.EventOrigin; import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.TextParser; +import net.yacy.document.VocabularyScraper; import net.yacy.search.Switchboard; public class Response { @@ -864,7 +865,7 @@ public class Response { final String supportError = TextParser.supports(url(), this.responseHeader == null ? null : this.responseHeader.mime()); if (supportError != null) throw new Parser.Failure("no parser support:" + supportError, url()); try { - return TextParser.parseSource(new AnchorURL(url()), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? "UTF-8" : this.responseHeader.getCharacterEncoding(), this.request.depth(), this.content); + return TextParser.parseSource(new AnchorURL(url()), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? 
"UTF-8" : this.responseHeader.getCharacterEncoding(), new VocabularyScraper(), this.request.depth(), this.content); } catch (final Exception e) { return null; } diff --git a/source/net/yacy/data/BookmarkHelper.java b/source/net/yacy/data/BookmarkHelper.java index c93019db1..c10c144c1 100644 --- a/source/net/yacy/data/BookmarkHelper.java +++ b/source/net/yacy/data/BookmarkHelper.java @@ -52,6 +52,7 @@ import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.util.ConcurrentLog; import net.yacy.data.BookmarksDB.Bookmark; import net.yacy.data.BookmarksDB.Tag; +import net.yacy.document.VocabularyScraper; import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.TransformerWriter; import net.yacy.kelondro.data.word.Word; @@ -138,7 +139,7 @@ public class BookmarkHelper { final Set tags=ListManager.string2set(tag); //this allow multiple default tags try { //load the links - final ContentScraper scraper = new ContentScraper(baseURL, 10000); + final ContentScraper scraper = new ContentScraper(baseURL, 10000, new VocabularyScraper()); //OutputStream os = new htmlFilterOutputStream(null, scraper, null, false); final Writer writer = new TransformerWriter(null, null, scraper, null, false); FileUtils.copy(input,writer); diff --git a/source/net/yacy/data/ymark/YMarkAutoTagger.java b/source/net/yacy/data/ymark/YMarkAutoTagger.java index 8a0be1e60..c80ff37a3 100644 --- a/source/net/yacy/data/ymark/YMarkAutoTagger.java +++ b/source/net/yacy/data/ymark/YMarkAutoTagger.java @@ -87,7 +87,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle } //get words from document - final Map words = new Condenser(document, true, true, LibraryProvider.dymLib, false, false).words(); + final Map words = new Condenser(document, null, true, true, LibraryProvider.dymLib, false, false).words(); // generate potential tags from document title, description and subject final int bufferSize = document.dc_title().length() + 
document.dc_description().length + document.dc_subject(' ').length() + 32; diff --git a/source/net/yacy/data/ymark/YMarkCrawlStart.java b/source/net/yacy/data/ymark/YMarkCrawlStart.java index 91183afcf..b14c10dc9 100644 --- a/source/net/yacy/data/ymark/YMarkCrawlStart.java +++ b/source/net/yacy/data/ymark/YMarkCrawlStart.java @@ -189,7 +189,8 @@ public class YMarkCrawlStart extends HashMap{ -1, false, true, CacheStrategy.IFFRESH, "robot_" + CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, - ClientIdentification.yacyIntranetCrawlerAgentName); // TODO: make this a default profile in CrawlSwitchboard + ClientIdentification.yacyIntranetCrawlerAgentName, + null); // TODO: make this a default profile in CrawlSwitchboard sb.crawler.putActive(pe.handle().getBytes(), pe); return sb.crawlStacker.stackCrawl(new Request( sb.peers.mySeed().hash.getBytes(), diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java index 10b4f6f77..06a2b2807 100644 --- a/source/net/yacy/document/Condenser.java +++ b/source/net/yacy/document/Condenser.java @@ -45,6 +45,7 @@ import net.yacy.cora.document.analysis.Classification.ContentDomain; import net.yacy.cora.document.analysis.EnhancedTextProfileSignature; import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.id.AnchorURL; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.federate.solr.Ranking; import net.yacy.cora.language.synonyms.SynonymLibrary; @@ -91,6 +92,7 @@ public final class Condenser { public Condenser( final Document document, + final VocabularyScraper scraper, final boolean indexText, final boolean indexMedia, final WordCache meaningLib, @@ -122,7 +124,7 @@ public final class Condenser { if (indexText) { String text = document.getTextString(); if (findDatesInContent) this.dates_in_content = DateDetection.parse(text); - createCondensement(text, meaningLib, doAutotagging); + 
createCondensement(document.dc_source(), text, meaningLib, doAutotagging, scraper); // the phrase counter: // phrase 0 are words taken from the URL // phrase 1 is the MainTitle @@ -249,12 +251,12 @@ public final class Condenser { this.exact_signature = EnhancedTextProfileSignature.getSignatureLong(text); } - private Condenser(final String text, final WordCache meaningLib, boolean doAutotagging) { + private Condenser(final DigestURL root, final String text, final WordCache meaningLib, final boolean doAutotagging, final VocabularyScraper scraper) { this.languageIdentificator = null; // we don't need that here // analysis = new Properties(); this.words = new TreeMap(); this.synonyms = new HashSet(); - createCondensement(text, meaningLib, doAutotagging); + createCondensement(root, text, meaningLib, doAutotagging, scraper); } private void insertTextToWords( @@ -324,7 +326,7 @@ public final class Condenser { return this.languageIdentificator.getLanguage(); } - private void createCondensement(final String text, final WordCache meaningLib, boolean doAutotagging) { + private void createCondensement(final DigestURL root, final String text, final WordCache meaningLib, boolean doAutotagging, final VocabularyScraper scraper) { assert text != null; final Set currsentwords = new HashSet(); String word = ""; @@ -355,7 +357,29 @@ public final class Condenser { // get tags from autotagging if (doAutotagging) { - for (int wordc = 1; wordc <= wordcache.length + 1; wordc++) { + Set vocabularyNames = LibraryProvider.autotagging.getVocabularyNames(); + //Collection vocabularies = LibraryProvider.autotagging.getVocabularies(); + //assert vocabularyNames.size() == vocabularies.size(); + Map vocMap = scraper.removeVocMap(root); + if (vocMap != null) { + for (Map.Entry entry: vocMap.entrySet()) { + String navigatorName = entry.getKey(); + String term = entry.getValue(); + vocabularyNames.remove(navigatorName); + Tagging vocabulary = LibraryProvider.autotagging.getVocabulary(navigatorName); + 
if (vocabulary != null) { + // extend the vocabulary + String obj = vocabulary.getObjectlink(term); + if (obj == null) try {vocabulary.put(term, "", root.toNormalform(true));} catch (IOException e) {} // this makes IO, be careful! + // create annotation + tag = vocabulary.getMetatagFromTerm(term); + Set tagset = new HashSet<>(); + tagset.add(tag); + this.tags.put(navigatorName, tagset); + } + } + } + if (vocabularyNames.size() > 0) for (int wordc = 1; wordc <= wordcache.length + 1; wordc++) { // wordc is number of words that are tested StringBuilder sb = new StringBuilder(); if (wordc == 1) { @@ -368,7 +392,7 @@ public final class Condenser { } String testterm = sb.toString().trim(); //System.out.println("Testing: " + testterm); - tag = LibraryProvider.autotagging.getTagFromTerm(testterm); + tag = LibraryProvider.autotagging.getTagFromTerm(vocabularyNames, testterm); if (tag != null) { String navigatorName = tag.getVocabularyName(); Set tagset = this.tags.get(navigatorName); @@ -461,7 +485,7 @@ public final class Condenser { public static Map getWords(final String text, final WordCache meaningLib) { // returns a word/indexWord relation map if (text == null) return null; - return new Condenser(text, meaningLib, false).words(); + return new Condenser(null, text, meaningLib, false, null).words(); } public static void main(final String[] args) { diff --git a/source/net/yacy/document/Parser.java b/source/net/yacy/document/Parser.java index 272666e8f..be7b49eba 100644 --- a/source/net/yacy/document/Parser.java +++ b/source/net/yacy/document/Parser.java @@ -48,6 +48,7 @@ public interface Parser { * @param url the url of the source * @param mimeType the mime type of the source, if known * @param charset the charset of the source, if known + * @param scraper an entity scraper to detect facets from text annotation context * @param source a input stream * @return a list of documents that result from parsing the source * @throws Parser.Failure @@ -57,6 +58,7 @@ public 
interface Parser { AnchorURL url, String mimeType, String charset, + VocabularyScraper scraper, InputStream source ) throws Parser.Failure, InterruptedException; diff --git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java index c731e3d9e..0898f3c35 100644 --- a/source/net/yacy/document/TextParser.java +++ b/source/net/yacy/document/TextParser.java @@ -166,6 +166,7 @@ public final class TextParser { final AnchorURL location, final String mimeType, final String charset, + final VocabularyScraper scraper, final int depth, final File sourceFile ) throws InterruptedException, Parser.Failure { @@ -180,7 +181,7 @@ public final class TextParser { throw new Parser.Failure(errorMsg, location); } sourceStream = new BufferedInputStream(new FileInputStream(sourceFile)); - docs = parseSource(location, mimeType, charset, depth, sourceFile.length(), sourceStream); + docs = parseSource(location, mimeType, charset, scraper, depth, sourceFile.length(), sourceStream); } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e; @@ -197,6 +198,7 @@ public final class TextParser { final AnchorURL location, String mimeType, final String charset, + final VocabularyScraper scraper, final int depth, final byte[] content ) throws Parser.Failure { @@ -212,7 +214,7 @@ public final class TextParser { } assert !idioms.isEmpty() : "no parsers applied for url " + location.toNormalform(true); - Document[] docs = parseSource(location, mimeType, idioms, charset, depth, content); + Document[] docs = parseSource(location, mimeType, idioms, charset, scraper, depth, content); return docs; } @@ -221,6 +223,7 @@ public final class TextParser { final AnchorURL location, String mimeType, final String charset, + final VocabularyScraper scraper, final int depth, final long contentLength, final InputStream sourceStream @@ -241,7 +244,7 @@ public final class TextParser { // then 
we use only one stream-oriented parser. if (idioms.size() == 1 || contentLength > Integer.MAX_VALUE) { // use a specific stream-oriented parser - return parseSource(location, mimeType, idioms.iterator().next(), charset, sourceStream); + return parseSource(location, mimeType, idioms.iterator().next(), charset, scraper, sourceStream); } // in case that we know more parsers we first transform the content into a byte[] and use that as base @@ -252,7 +255,7 @@ public final class TextParser { } catch (final IOException e) { throw new Parser.Failure(e.getMessage(), location); } - Document[] docs = parseSource(location, mimeType, idioms, charset, depth, b); + Document[] docs = parseSource(location, mimeType, idioms, charset, scraper, depth, b); return docs; } @@ -262,6 +265,7 @@ public final class TextParser { final String mimeType, final Parser parser, final String charset, + final VocabularyScraper scraper, final InputStream sourceStream ) throws Parser.Failure { if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing '" + location + "' from stream"); @@ -271,7 +275,7 @@ public final class TextParser { if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'."); try { - final Document[] docs = parser.parse(location, mimeType, documentCharset, sourceStream); + final Document[] docs = parser.parse(location, mimeType, documentCharset, scraper, sourceStream); return docs; } catch (final Exception e) { throw new Parser.Failure("parser failed: " + parser.getName(), location); @@ -283,6 +287,7 @@ public final class TextParser { final String mimeType, final Set parsers, final String charset, + final VocabularyScraper scraper, final int depth, final byte[] sourceArray ) throws Parser.Failure { @@ -305,7 +310,7 @@ public final class TextParser { bis = new ByteArrayInputStream(sourceArray); } try { - docs = parser.parse(location, mimeType, documentCharset, bis); + docs = 
parser.parse(location, mimeType, documentCharset, scraper, bis); } catch (final Parser.Failure e) { failedParser.put(parser, e); //log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e); diff --git a/source/net/yacy/document/VocabularyScraper.java b/source/net/yacy/document/VocabularyScraper.java new file mode 100644 index 000000000..967a4afbd --- /dev/null +++ b/source/net/yacy/document/VocabularyScraper.java @@ -0,0 +1,90 @@ +/** + * VocabularyScraper + * Copyright 2015 by Michael Peter Christen + * First released 30.01.2015 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . 
+ */ + +package net.yacy.document; + +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.util.JSONException; +import net.yacy.cora.util.JSONObject; +import net.yacy.kelondro.io.CharBuffer; + +public class VocabularyScraper { + + private final JSONObject scraperDefinition; + private Map classVocabulary; // a mapping from class names to the vocabulary where this class should be mapped + private final Map> vocMap; // a mapping from a document to a map from vocabularies to terms + + public VocabularyScraper() { + this.classVocabulary = null; + this.scraperDefinition = new JSONObject(); + this.vocMap = new ConcurrentHashMap<>(); + } + + public VocabularyScraper(JSONObject init) { + // init must be a property list of property lists: the key of the top property list is the name of the vocabulary, the name of the embedded property list is the entity class and the value of the embedded property is the entity name + this.scraperDefinition = init == null ? 
new JSONObject() : init; + this.vocMap = new ConcurrentHashMap<>(); + if (this.scraperDefinition.length() == 0) { + this.classVocabulary = null; + } else { + this.classVocabulary = new ConcurrentHashMap<>(); + for (String voc: this.scraperDefinition.keySet()) { + JSONObject props = this.scraperDefinition.getJSONObject(voc); + try { + String classtype = props.getString("class"); + this.classVocabulary.put(classtype, voc); + } catch (JSONException e) {} + } + if (this.classVocabulary.size() == 0) this.classVocabulary = null; + } + } + + public VocabularyScraper(String init) { + this(new JSONObject(init)); + } + + @Override + public String toString() { + return this.scraperDefinition.toString(); + } + + public void check(DigestURL root, String className, CharBuffer content) { + if (this.classVocabulary == null) return; + String voc = this.classVocabulary.get(className); + if (voc == null) return; + // record the mapping + ConcurrentHashMap vocmap = this.vocMap.get(root); + if (vocmap == null) { + synchronized (this) { + vocmap = new ConcurrentHashMap<>(); + this.vocMap.put(root, vocmap); + } + } + if (!vocmap.containsKey(voc)) vocmap.put(voc, content.toString()); // we put only the first occurrence of the entity into the vocmap + } + + public Map removeVocMap(DigestURL root) { + return this.vocMap.remove(root); + } + +} diff --git a/source/net/yacy/document/importer/MediawikiImporter.java b/source/net/yacy/document/importer/MediawikiImporter.java index 1e9e7a6ef..9e6ba1116 100644 --- a/source/net/yacy/document/importer/MediawikiImporter.java +++ b/source/net/yacy/document/importer/MediawikiImporter.java @@ -62,6 +62,7 @@ import net.yacy.data.wiki.WikiParser; import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.TextParser; +import net.yacy.document.VocabularyScraper; import net.yacy.document.content.SurrogateReader; import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; @@ -523,7 +524,7 @@ public class 
MediawikiImporter extends Thread implements Importer { public void genDocument() throws Parser.Failure { try { this.url = new AnchorURL(this.urlStub + this.title); - final Document[] parsed = TextParser.parseSource(this.url, "text/html", "UTF-8", 1, UTF8.getBytes(this.html)); + final Document[] parsed = TextParser.parseSource(this.url, "text/html", "UTF-8", new VocabularyScraper(), 1, UTF8.getBytes(this.html)); this.document = Document.mergeDocuments(this.url, "text/html", parsed); // the wiki parser is not able to find the proper title in the source text, so it must be set here this.document.setTitle(this.title); diff --git a/source/net/yacy/document/parser/apkParser.java b/source/net/yacy/document/parser/apkParser.java index 938bad85c..0eacb05f6 100644 --- a/source/net/yacy/document/parser/apkParser.java +++ b/source/net/yacy/document/parser/apkParser.java @@ -43,6 +43,7 @@ import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; +import net.yacy.document.VocabularyScraper; public class apkParser extends AbstractParser implements Parser { @@ -53,7 +54,7 @@ public class apkParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException { /* * things to discover: diff --git a/source/net/yacy/document/parser/audioTagParser.java b/source/net/yacy/document/parser/audioTagParser.java index 35d5c8c93..73195c0a0 100644 --- a/source/net/yacy/document/parser/audioTagParser.java +++ b/source/net/yacy/document/parser/audioTagParser.java @@ -41,6 +41,7 @@ import net.yacy.cora.util.ConcurrentLog; import 
net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; +import net.yacy.document.VocabularyScraper; import org.jaudiotagger.audio.AudioFile; import org.jaudiotagger.audio.AudioFileIO; @@ -70,7 +71,7 @@ public class audioTagParser extends AbstractParser implements Parser { @Override public Document[] parse(final AnchorURL location, final String mimeType, - final String charset, final InputStream source) + final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException { String filename = location.getFileName(); diff --git a/source/net/yacy/document/parser/augment/AugmentParser.java b/source/net/yacy/document/parser/augment/AugmentParser.java index c970f5d4f..6b78cf0d3 100644 --- a/source/net/yacy/document/parser/augment/AugmentParser.java +++ b/source/net/yacy/document/parser/augment/AugmentParser.java @@ -13,6 +13,7 @@ import net.yacy.data.ymark.YMarkUtil; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; +import net.yacy.document.VocabularyScraper; import net.yacy.document.parser.rdfa.impl.RDFaParser; import net.yacy.kelondro.blob.Tables; import net.yacy.search.Switchboard; @@ -37,9 +38,9 @@ public class AugmentParser extends AbstractParser implements Parser { } @Override - public Document[] parse(AnchorURL url, String mimeType, String charset, InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse(AnchorURL url, String mimeType, String charset, final VocabularyScraper scraper, InputStream source) throws Parser.Failure, InterruptedException { - Document[] htmlDocs = this.rdfaParser.parse(url, mimeType, charset, source); + Document[] htmlDocs = this.rdfaParser.parse(url, mimeType, charset, scraper, source); for (final Document doc : htmlDocs) { /* analyze(doc, url, mimeType, charset); // enrich document text */ diff --git 
a/source/net/yacy/document/parser/bzipParser.java b/source/net/yacy/document/parser/bzipParser.java index 373bc955d..4d2c9dd6f 100644 --- a/source/net/yacy/document/parser/bzipParser.java +++ b/source/net/yacy/document/parser/bzipParser.java @@ -36,6 +36,7 @@ import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.TextParser; +import net.yacy.document.VocabularyScraper; import net.yacy.kelondro.util.FileUtils; import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; @@ -57,7 +58,7 @@ public class bzipParser extends AbstractParser implements Parser { @Override public Document[] parse(final AnchorURL location, final String mimeType, - final String charset, final InputStream source) + final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException { File tempFile = null; @@ -94,7 +95,7 @@ public class bzipParser extends AbstractParser implements Parser { out.close(); // creating a new parser class to parse the unzipped content - docs = TextParser.parseSource(location, null, null, 999, tempFile); + docs = TextParser.parseSource(location, null, null, scraper, 999, tempFile); } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e; diff --git a/source/net/yacy/document/parser/csvParser.java b/source/net/yacy/document/parser/csvParser.java index f6476e37b..717aadf2b 100644 --- a/source/net/yacy/document/parser/csvParser.java +++ b/source/net/yacy/document/parser/csvParser.java @@ -38,6 +38,7 @@ import net.yacy.cora.util.CommonPattern; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; +import net.yacy.document.VocabularyScraper; /** * a parser for comma-separated values @@ -52,7 +53,7 @@ public class csvParser extends AbstractParser implements Parser { } 
@Override - public Document[] parse(AnchorURL location, String mimeType, String charset, InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse(AnchorURL location, String mimeType, String charset, final VocabularyScraper scraper, InputStream source) throws Parser.Failure, InterruptedException { // construct a document using all cells of the document // the first row is used as headline // all lines are artificially terminated by a '.' to separate them as sentence for the condenser. diff --git a/source/net/yacy/document/parser/docParser.java b/source/net/yacy/document/parser/docParser.java index 909be3fe3..556e956b3 100644 --- a/source/net/yacy/document/parser/docParser.java +++ b/source/net/yacy/document/parser/docParser.java @@ -35,6 +35,7 @@ import net.yacy.cora.util.CommonPattern; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; +import net.yacy.document.VocabularyScraper; import org.apache.poi.hwpf.extractor.WordExtractor; @@ -57,7 +58,7 @@ public class docParser extends AbstractParser implements Parser { @SuppressWarnings("deprecation") @Override public Document[] parse(final AnchorURL location, final String mimeType, - final String charset, final InputStream source) + final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException { final WordExtractor extractor; diff --git a/source/net/yacy/document/parser/dwgParser.java b/source/net/yacy/document/parser/dwgParser.java index e5980df64..66b902eeb 100644 --- a/source/net/yacy/document/parser/dwgParser.java +++ b/source/net/yacy/document/parser/dwgParser.java @@ -29,6 +29,7 @@ import net.yacy.cora.document.id.AnchorURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; +import net.yacy.document.VocabularyScraper; import net.yacy.kelondro.util.MemoryControl; import org.apache.poi.util.StringUtil; @@ 
-60,7 +61,7 @@ public class dwgParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException { // check memory for parser if (!MemoryControl.request(200 * 1024 * 1024, true)) diff --git a/source/net/yacy/document/parser/genericParser.java b/source/net/yacy/document/parser/genericParser.java index 748767462..53e6e46cb 100644 --- a/source/net/yacy/document/parser/genericParser.java +++ b/source/net/yacy/document/parser/genericParser.java @@ -32,6 +32,7 @@ import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; +import net.yacy.document.VocabularyScraper; /** * this parser can parse just anything because it uses only the uri/file/path information @@ -46,7 +47,7 @@ public class genericParser extends AbstractParser implements Parser { @Override public Document[] parse(final AnchorURL location, final String mimeType, - final String charset, final InputStream source1) + final String charset, final VocabularyScraper scraper, final InputStream source1) throws Parser.Failure, InterruptedException { String filename = location.getFileName(); final Document[] docs = new Document[]{new Document( diff --git a/source/net/yacy/document/parser/gzipParser.java b/source/net/yacy/document/parser/gzipParser.java index 6f36eb541..5a57e219a 100644 --- a/source/net/yacy/document/parser/gzipParser.java +++ b/source/net/yacy/document/parser/gzipParser.java @@ -37,6 +37,7 @@ import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.TextParser; +import 
net.yacy.document.VocabularyScraper; import net.yacy.kelondro.util.FileUtils; @@ -55,7 +56,7 @@ public class gzipParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException { File tempFile = null; Document[] docs = null; @@ -79,7 +80,7 @@ public class gzipParser extends AbstractParser implements Parser { out.close(); // creating a new parser class to parse the unzipped content - docs = TextParser.parseSource(location, null, null, 999, tempFile); + docs = TextParser.parseSource(location, null, null, scraper, 999, tempFile); } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e; diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 02ee617b4..bcee1744b 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -59,6 +59,7 @@ import net.yacy.cora.storage.SizeLimitedSet; import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.NumberTools; import net.yacy.document.SentenceReader; +import net.yacy.document.VocabularyScraper; import net.yacy.document.parser.htmlParser; import net.yacy.document.parser.html.Evaluation.Element; import net.yacy.document.parser.images.genericImageParser; @@ -88,7 +89,6 @@ public class ContentScraper extends AbstractScraper implements Scraper { public enum TagName { html(TagType.singleton), // scraped as singleton to get attached properties like 'lang' body(TagType.singleton), // scraped as singleton to get 
attached properties like 'class' - div(TagType.singleton), // scraped as singleton to get attached properties like 'id' img(TagType.singleton), base(TagType.singleton), frame(TagType.singleton), @@ -115,7 +115,8 @@ public class ContentScraper extends AbstractScraper implements Scraper { i(TagType.pair), li(TagType.pair), script(TagType.pair), - style(TagType.pair); + span(TagType.pair), + div(TagType.pair); public TagType type; private TagName(final TagType type) { @@ -185,6 +186,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { private double lon, lat; private AnchorURL canonical, publisher; private final int maxLinks; + private final VocabularyScraper vocabularyScraper; private int breadcrumbs; @@ -203,14 +205,21 @@ public class ContentScraper extends AbstractScraper implements Scraper { */ private final Evaluation evaluationScores; + /** + * scrape a document + * @param root the document root url + * @param maxLinks the maximum number of links to scapre + * @param classDetector a map from class names to vocabulary names to scrape content from the DOM with associated class name + */ @SuppressWarnings("unchecked") - public ContentScraper(final DigestURL root, int maxLinks) { + public ContentScraper(final DigestURL root, int maxLinks, final VocabularyScraper vocabularyScraper) { // the root value here will not be used to load the resource. 
// it is only the reference for relative links super(linkTags0, linkTags1); assert root != null; this.root = root; this.maxLinks = maxLinks; + this.vocabularyScraper = vocabularyScraper; this.evaluationScores = new Evaluation(); this.rss = new SizeLimitedMap(maxLinks); this.css = new SizeLimitedMap(maxLinks); @@ -392,15 +401,8 @@ public class ContentScraper extends AbstractScraper implements Scraper { this.frames.add(src); this.evaluationScores.match(Element.framepath, src.toNormalform(true)); } else if (tag.name.equalsIgnoreCase("body")) { - final String c = tag.opts.getProperty("class", EMPTY_STRING); - this.evaluationScores.match(Element.bodyclass, c); - } else if (tag.name.equalsIgnoreCase("div")) { - final String id = tag.opts.getProperty("id", EMPTY_STRING); - this.evaluationScores.match(Element.divid, id); - final String itemtype = tag.opts.getProperty("itemtype", EMPTY_STRING); - if (itemtype.equals("http://data-vocabulary.org/Breadcrumb")) { - breadcrumbs++; - } + final String classprop = tag.opts.getProperty("class", EMPTY_STRING); + this.evaluationScores.match(Element.bodyclass, classprop); } else if (tag.name.equalsIgnoreCase("meta")) { final String content = tag.opts.getProperty("content", EMPTY_STRING); String name = tag.opts.getProperty("name", EMPTY_STRING); @@ -509,6 +511,9 @@ public class ContentScraper extends AbstractScraper implements Scraper { @Override public void scrapeTag1(Tag tag) { + final String classprop = tag.opts.getProperty("class", EMPTY_STRING); + //System.out.println("class = " + classprop); + this.vocabularyScraper.check(this.root, classprop, tag.content); // System.out.println("ScrapeTag1: tag.tagname=" + tag.tagname + ", opts=" + tag.opts.toString() + ", text=" + UTF8.String(text)); if (tag.name.equalsIgnoreCase("a") && tag.content.length() < 2048) { String href = tag.opts.getProperty("href", EMPTY_STRING); @@ -536,7 +541,14 @@ public class ContentScraper extends AbstractScraper implements Scraper { 
this.evaluationScores.match(Element.apath, href); } final String h; - if ((tag.name.equalsIgnoreCase("h1")) && (tag.content.length() < 1024)) { + if (tag.name.equalsIgnoreCase("div")) { + final String id = tag.opts.getProperty("id", EMPTY_STRING); + this.evaluationScores.match(Element.divid, id); + final String itemtype = tag.opts.getProperty("itemtype", EMPTY_STRING); + if (itemtype.equals("http://data-vocabulary.org/Breadcrumb")) { + breadcrumbs++; + } + } else if ((tag.name.equalsIgnoreCase("h1")) && (tag.content.length() < 1024)) { h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars()))); if (h.length() > 0) this.headlines[0].add(h); } else if((tag.name.equalsIgnoreCase("h2")) && (tag.content.length() < 1024)) { @@ -601,7 +613,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { // start a new scraper to parse links inside this text // parsing the content - final ContentScraper scraper = new ContentScraper(this.root, this.maxLinks); + final ContentScraper scraper = new ContentScraper(this.root, this.maxLinks, this.vocabularyScraper); final TransformerWriter writer = new TransformerWriter(null, null, scraper, null, false); try { FileUtils.copy(new CharArrayReader(inlineHtml), writer); @@ -1090,13 +1102,13 @@ public class ContentScraper extends AbstractScraper implements Scraper { if (page == null) throw new IOException("no content in file " + file.toString()); // scrape document to look up charset - final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page),"UTF-8", new DigestURL("http://localhost"),null,false, maxLinks); + final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page), "UTF-8", new VocabularyScraper(), new DigestURL("http://localhost"), null, false, maxLinks); String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset()); htmlFilter.close(); if (charset == null) charset = Charset.defaultCharset().toString(); // scrape 
content - final ContentScraper scraper = new ContentScraper(new DigestURL("http://localhost"), maxLinks); + final ContentScraper scraper = new ContentScraper(new DigestURL("http://localhost"), maxLinks, new VocabularyScraper()); final Writer writer = new TransformerWriter(null, null, scraper, null, false); FileUtils.copy(new ByteArrayInputStream(page), writer, Charset.forName(charset)); writer.close(); diff --git a/source/net/yacy/document/parser/html/ScraperInputStream.java b/source/net/yacy/document/parser/html/ScraperInputStream.java index 764c4550d..b63a56cc4 100644 --- a/source/net/yacy/document/parser/html/ScraperInputStream.java +++ b/source/net/yacy/document/parser/html/ScraperInputStream.java @@ -37,6 +37,7 @@ import java.util.Properties; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.util.CommonPattern; +import net.yacy.document.VocabularyScraper; public class ScraperInputStream extends InputStream implements ScraperListener { @@ -59,6 +60,7 @@ public class ScraperInputStream extends InputStream implements ScraperListener { public ScraperInputStream( final InputStream inStream, final String inputStreamCharset, + final VocabularyScraper vocabularyScraper, final DigestURL rooturl, final Transformer transformer, final boolean passbyIfBinarySuspect, @@ -68,7 +70,7 @@ public class ScraperInputStream extends InputStream implements ScraperListener { this.bufferedIn = new BufferedInputStream(inStream, (int) preBufferSize); this.bufferedIn.mark((int) preBufferSize); - final ContentScraper scraper = new ContentScraper(rooturl, maxLinks); + final ContentScraper scraper = new ContentScraper(rooturl, maxLinks, vocabularyScraper); scraper.registerHtmlFilterEventListener(this); try { diff --git a/source/net/yacy/document/parser/htmlParser.java b/source/net/yacy/document/parser/htmlParser.java index 881fa6cba..db1cf3a23 100644 --- a/source/net/yacy/document/parser/htmlParser.java +++ b/source/net/yacy/document/parser/htmlParser.java @@ -45,6 +45,7 @@ 
import net.yacy.cora.util.CommonPattern; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; +import net.yacy.document.VocabularyScraper; import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.ImageEntry; import net.yacy.document.parser.html.ScraperInputStream; @@ -86,13 +87,13 @@ public class htmlParser extends AbstractParser implements Parser { public Document[] parse( final AnchorURL location, final String mimeType, - final String documentCharset, + final String documentCharset, final VocabularyScraper vocscraper, final InputStream sourceStream) throws Parser.Failure, InterruptedException { try { // first get a document from the parsed html Charset[] detectedcharsetcontainer = new Charset[]{null}; - final ContentScraper scraper = parseToScraper(location, documentCharset, detectedcharsetcontainer, sourceStream, maxLinks); + final ContentScraper scraper = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, sourceStream, maxLinks); // parseToScraper also detects/corrects/sets charset from html content tag final Document document = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraper); @@ -150,7 +151,7 @@ public class htmlParser extends AbstractParser implements Parser { return ppd; } - public static ContentScraper parseToScraper(final DigestURL location, final String documentCharset, String input, int maxLinks) throws IOException { + public static ContentScraper parseToScraper(final DigestURL location, final String documentCharset, final VocabularyScraper vocabularyScraper, String input, int maxLinks) throws IOException { Charset[] detectedcharsetcontainer = new Charset[]{null}; InputStream sourceStream; try { @@ -160,7 +161,7 @@ public class htmlParser extends AbstractParser implements Parser { } ContentScraper scraper; try { - scraper = parseToScraper(location, documentCharset, detectedcharsetcontainer, sourceStream, 
maxLinks); + scraper = parseToScraper(location, documentCharset, vocabularyScraper, detectedcharsetcontainer, sourceStream, maxLinks); } catch (Failure e) { throw new IOException(e.getMessage()); } @@ -170,6 +171,7 @@ public class htmlParser extends AbstractParser implements Parser { public static ContentScraper parseToScraper( final DigestURL location, final String documentCharset, + final VocabularyScraper vocabularyScraper, Charset[] detectedcharsetcontainer, InputStream sourceStream, final int maxLinks) throws Parser.Failure, IOException { @@ -186,7 +188,7 @@ public class htmlParser extends AbstractParser implements Parser { if (charset == null) { ScraperInputStream htmlFilter = null; try { - htmlFilter = new ScraperInputStream(sourceStream, documentCharset, location, null, false, maxLinks); + htmlFilter = new ScraperInputStream(sourceStream, documentCharset, vocabularyScraper, location, null, false, maxLinks); sourceStream = htmlFilter; charset = htmlFilter.detectCharset(); } catch (final IOException e1) { @@ -220,7 +222,7 @@ public class htmlParser extends AbstractParser implements Parser { } // parsing the content - final ContentScraper scraper = new ContentScraper(location, maxLinks); + final ContentScraper scraper = new ContentScraper(location, maxLinks, vocabularyScraper); final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false, Math.max(64, Math.min(4096, sourceStream.available()))); try { FileUtils.copy(sourceStream, writer, detectedcharsetcontainer[0]); @@ -322,7 +324,7 @@ public class htmlParser extends AbstractParser implements Parser { try { url = new AnchorURL(args[0]); final byte[] content = url.get(ClientIdentification.yacyInternetCrawlerAgent, null, null); - final Document[] document = new htmlParser().parse(url, "text/html", "utf-8", new ByteArrayInputStream(content)); + final Document[] document = new htmlParser().parse(url, "text/html", "utf-8", new VocabularyScraper(), new ByteArrayInputStream(content)); final 
String title = document[0].dc_title(); System.out.println(title); } catch (final MalformedURLException e) { diff --git a/source/net/yacy/document/parser/images/genericImageParser.java b/source/net/yacy/document/parser/images/genericImageParser.java index 51df60dc6..db08ac783 100644 --- a/source/net/yacy/document/parser/images/genericImageParser.java +++ b/source/net/yacy/document/parser/images/genericImageParser.java @@ -53,6 +53,7 @@ import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; +import net.yacy.document.VocabularyScraper; import net.yacy.document.parser.html.ImageEntry; import net.yacy.document.parser.images.bmpParser.IMAGEMAP; import net.yacy.kelondro.util.FileUtils; @@ -92,7 +93,7 @@ public class genericImageParser extends AbstractParser implements Parser { public Document[] parse( final AnchorURL location, final String mimeType, - final String documentCharset, + final String documentCharset, final VocabularyScraper scraper, final InputStream sourceStream) throws Parser.Failure, InterruptedException { ImageInfo ii = null; @@ -314,7 +315,7 @@ public class genericImageParser extends AbstractParser implements Parser { AnchorURL uri; try { uri = new AnchorURL("http://localhost/" + image.getName()); - final Document[] document = parser.parse(uri, "image/" + MultiProtocolURL.getFileExtension(uri.getFileName()), "UTF-8", new FileInputStream(image)); + final Document[] document = parser.parse(uri, "image/" + MultiProtocolURL.getFileExtension(uri.getFileName()), "UTF-8", new VocabularyScraper(), new FileInputStream(image)); System.out.println(document[0].toString()); } catch (final MalformedURLException e) { e.printStackTrace(); diff --git a/source/net/yacy/document/parser/images/metadataImageParser.java b/source/net/yacy/document/parser/images/metadataImageParser.java index 90bbf6b1a..eef448faf 100644 --- 
a/source/net/yacy/document/parser/images/metadataImageParser.java +++ b/source/net/yacy/document/parser/images/metadataImageParser.java @@ -33,6 +33,7 @@ import com.drew.metadata.Directory; import com.drew.metadata.Metadata; import com.drew.metadata.Tag; import com.drew.metadata.exif.GpsDirectory; + import java.io.IOException; import java.io.InputStream; import java.io.BufferedInputStream; @@ -42,11 +43,13 @@ import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; + import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; +import net.yacy.document.VocabularyScraper; /** @@ -84,7 +87,7 @@ public class metadataImageParser extends AbstractParser implements Parser { public Document[] parse( final AnchorURL location, final String mimeType, - final String documentCharset, + final String documentCharset, final VocabularyScraper scraper, final InputStream sourceStream) throws Parser.Failure, InterruptedException { String title = null; diff --git a/source/net/yacy/document/parser/linkScraperParser.java b/source/net/yacy/document/parser/linkScraperParser.java index a7a586361..4c0abbdd4 100644 --- a/source/net/yacy/document/parser/linkScraperParser.java +++ b/source/net/yacy/document/parser/linkScraperParser.java @@ -28,6 +28,7 @@ import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; +import net.yacy.document.VocabularyScraper; /** * This parser is used if we know that the content is text but the exact format is unknown. 
@@ -59,10 +60,10 @@ public class linkScraperParser extends AbstractParser implements Parser { } @Override public Document[] parse(final AnchorURL location, final String mimeType, - final String charset, final InputStream source) + final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException { - Document[] htmlParserDocs = new htmlParser().parse(location, mimeType, charset, source); + Document[] htmlParserDocs = new htmlParser().parse(location, mimeType, charset, scraper, source); Document htmlParserDoc = htmlParserDocs == null ? null : Document.mergeDocuments(location, mimeType, htmlParserDocs); diff --git a/source/net/yacy/document/parser/mmParser.java b/source/net/yacy/document/parser/mmParser.java index d326bd02e..0781eea3c 100644 --- a/source/net/yacy/document/parser/mmParser.java +++ b/source/net/yacy/document/parser/mmParser.java @@ -39,6 +39,7 @@ import net.yacy.cora.document.id.AnchorURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; +import net.yacy.document.VocabularyScraper; import org.xml.sax.Attributes; import org.xml.sax.SAXException; @@ -71,7 +72,7 @@ public class mmParser extends AbstractParser implements Parser { @Override public Document[] parse(final AnchorURL location, final String mimeType, - final String charset, final InputStream source) + final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException { final StringBuilder sb = new StringBuilder(); diff --git a/source/net/yacy/document/parser/odtParser.java b/source/net/yacy/document/parser/odtParser.java index 5e344f111..588d1432d 100644 --- a/source/net/yacy/document/parser/odtParser.java +++ b/source/net/yacy/document/parser/odtParser.java @@ -48,6 +48,7 @@ import net.yacy.cora.document.id.DigestURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import 
net.yacy.document.Parser; +import net.yacy.document.VocabularyScraper; import net.yacy.document.parser.xml.ODContentHandler; import net.yacy.document.parser.xml.ODMetaHandler; import net.yacy.kelondro.io.CharBuffer; @@ -215,7 +216,7 @@ public class odtParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException { File dest = null; try { // creating a tempfile diff --git a/source/net/yacy/document/parser/ooxmlParser.java b/source/net/yacy/document/parser/ooxmlParser.java index 6371ea5c0..6535c95ed 100644 --- a/source/net/yacy/document/parser/ooxmlParser.java +++ b/source/net/yacy/document/parser/ooxmlParser.java @@ -48,6 +48,7 @@ import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; +import net.yacy.document.VocabularyScraper; import net.yacy.document.parser.xml.ODContentHandler; import net.yacy.document.parser.xml.ODMetaHandler; import net.yacy.kelondro.io.CharBuffer; @@ -201,7 +202,7 @@ public class ooxmlParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException { File dest = null; try { // creating a tempfile diff --git a/source/net/yacy/document/parser/pdfParser.java b/source/net/yacy/document/parser/pdfParser.java index 
c6e0634db..52df35bba 100644 --- a/source/net/yacy/document/parser/pdfParser.java +++ b/source/net/yacy/document/parser/pdfParser.java @@ -59,6 +59,7 @@ import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; +import net.yacy.document.VocabularyScraper; import net.yacy.kelondro.io.CharBuffer; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.MemoryControl; @@ -85,7 +86,7 @@ public class pdfParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException { // check memory for parser if (!MemoryControl.request(200 * 1024 * 1024, false)) @@ -375,7 +376,7 @@ public class pdfParser extends AbstractParser implements Parser { final AbstractParser parser = new pdfParser(); Document document = null; try { - document = Document.mergeDocuments(null, "application/pdf", parser.parse(null, "application/pdf", null, new FileInputStream(pdfFile))); + document = Document.mergeDocuments(null, "application/pdf", parser.parse(null, "application/pdf", null, new VocabularyScraper(), new FileInputStream(pdfFile))); } catch (final Parser.Failure e) { System.err.println("Cannot parse file " + pdfFile.getAbsolutePath()); ConcurrentLog.logException(e); diff --git a/source/net/yacy/document/parser/pptParser.java b/source/net/yacy/document/parser/pptParser.java index 78f4c2452..f9ef2397e 100644 --- a/source/net/yacy/document/parser/pptParser.java +++ b/source/net/yacy/document/parser/pptParser.java @@ -37,6 +37,7 @@ import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.AbstractParser; import 
net.yacy.document.Document; import net.yacy.document.Parser; +import net.yacy.document.VocabularyScraper; import org.apache.poi.hslf.extractor.PowerPointExtractor; @@ -62,7 +63,7 @@ public class pptParser extends AbstractParser implements Parser { */ @Override public Document[] parse(final AnchorURL location, final String mimeType, - final String charset, final InputStream source) throws Parser.Failure, + final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException { try { /* diff --git a/source/net/yacy/document/parser/psParser.java b/source/net/yacy/document/parser/psParser.java index 0d3f5ea53..09cda757e 100644 --- a/source/net/yacy/document/parser/psParser.java +++ b/source/net/yacy/document/parser/psParser.java @@ -41,6 +41,7 @@ import net.yacy.cora.document.id.DigestURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; +import net.yacy.document.VocabularyScraper; import net.yacy.kelondro.util.FileUtils; @@ -258,7 +259,7 @@ public class psParser extends AbstractParser implements Parser { @Override public Document[] parse(final AnchorURL location, final String mimeType, - final String charset, final InputStream source) + final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException { File tempFile = null; diff --git a/source/net/yacy/document/parser/rdfParser.java b/source/net/yacy/document/parser/rdfParser.java index 14c8b26c3..6f3b6fee8 100644 --- a/source/net/yacy/document/parser/rdfParser.java +++ b/source/net/yacy/document/parser/rdfParser.java @@ -34,6 +34,7 @@ import net.yacy.cora.document.id.AnchorURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; +import net.yacy.document.VocabularyScraper; public class rdfParser extends AbstractParser implements Parser { @@ -46,7 +47,7 @@ public class rdfParser extends 
AbstractParser implements Parser { @Override public Document[] parse(final AnchorURL url, final String mimeType, - final String charset, final InputStream source) + final String charset, final VocabularyScraper scraper, final InputStream source) throws Failure, InterruptedException { diff --git a/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java b/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java index e4cadf152..2a36f962d 100644 --- a/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java +++ b/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java @@ -23,6 +23,7 @@ import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; +import net.yacy.document.VocabularyScraper; import net.yacy.document.parser.htmlParser; import net.yacy.document.parser.rdfa.IRDFaTriple; @@ -48,10 +49,10 @@ public class RDFaParser extends AbstractParser implements Parser { @Override public Document[] parse(AnchorURL url, String mimeType, - String charset, InputStream source) throws Failure, + String charset, final VocabularyScraper scraper, InputStream source) throws Failure, InterruptedException { - Document[] htmlDocs = parseHtml(url, mimeType, charset, source); + Document[] htmlDocs = parseHtml(url, mimeType, charset, scraper, source); // TODO: current hardcoded restriction: apply rdfa parser only on selected sources. 
@@ -97,12 +98,12 @@ public class RDFaParser extends AbstractParser implements Parser { } private Document[] parseHtml(AnchorURL url, String mimeType, - String charset, InputStream source) throws Failure, + String charset, VocabularyScraper scraper, InputStream source) throws Failure, InterruptedException { Document[] htmlDocs = null; try { - htmlDocs = this.hp.parse(url, mimeType, charset, source); + htmlDocs = this.hp.parse(url, mimeType, charset, scraper, source); source.reset(); } catch (final IOException e1) { @@ -179,7 +180,7 @@ public class RDFaParser extends AbstractParser implements Parser { if (aReader != null) { RDFaParser aParser = new RDFaParser(); try { - aParser.parse(new AnchorURL(args[0]),"","",aURL.openStream()); + aParser.parse(new AnchorURL(args[0]), "", "", new VocabularyScraper(), aURL.openStream()); } catch (final FileNotFoundException e) { e.printStackTrace(); } catch (final IOException e) { diff --git a/source/net/yacy/document/parser/rssParser.java b/source/net/yacy/document/parser/rssParser.java index 9d88faad6..116adae0b 100644 --- a/source/net/yacy/document/parser/rssParser.java +++ b/source/net/yacy/document/parser/rssParser.java @@ -43,6 +43,7 @@ import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.TextParser; +import net.yacy.document.VocabularyScraper; import net.yacy.document.parser.html.ImageEntry; public class rssParser extends AbstractParser implements Parser { @@ -59,7 +60,7 @@ public class rssParser extends AbstractParser implements Parser { @Override public Document[] parse(final AnchorURL url, final String mimeType, - final String charset, final InputStream source) + final String charset, final VocabularyScraper scraper, final InputStream source) throws Failure, InterruptedException { RSSReader rssReader; try { diff --git a/source/net/yacy/document/parser/rtfParser.java b/source/net/yacy/document/parser/rtfParser.java index d03e7836b..06d7bd5ee 
100644 --- a/source/net/yacy/document/parser/rtfParser.java +++ b/source/net/yacy/document/parser/rtfParser.java @@ -37,6 +37,7 @@ import net.yacy.cora.document.id.AnchorURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; +import net.yacy.document.VocabularyScraper; public class rtfParser extends AbstractParser implements Parser { @@ -53,7 +54,7 @@ public class rtfParser extends AbstractParser implements Parser { @Override public Document[] parse(final AnchorURL location, final String mimeType, - final String charset, final InputStream source) + final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException { try { diff --git a/source/net/yacy/document/parser/sevenzipParser.java b/source/net/yacy/document/parser/sevenzipParser.java index 2dacd8ba9..5c22533aa 100644 --- a/source/net/yacy/document/parser/sevenzipParser.java +++ b/source/net/yacy/document/parser/sevenzipParser.java @@ -40,6 +40,7 @@ import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.TextParser; +import net.yacy.document.VocabularyScraper; import net.yacy.kelondro.util.FileUtils; import SevenZip.ArchiveExtractCallback; import SevenZip.IInStream; @@ -105,7 +106,7 @@ public class sevenzipParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException { try { final ByteArrayOutputStream cfos = new ByteArrayOutputStream(); FileUtils.copy(source, cfos); @@ -171,7 +172,7 @@ public class sevenzipParser extends AbstractParser implements Parser { 
// below for reversion of the effects final AnchorURL url = AnchorURL.newAnchor(this.doc.dc_source(), this.prefix + "/" + super.filePath); final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1)); - theDocs = TextParser.parseSource(url, mime, null, this.doc.getDepth() + 1, this.cfos.toByteArray()); + theDocs = TextParser.parseSource(url, mime, null, new VocabularyScraper(), this.doc.getDepth() + 1, this.cfos.toByteArray()); this.doc.addSubDocuments(theDocs); } diff --git a/source/net/yacy/document/parser/sidAudioParser.java b/source/net/yacy/document/parser/sidAudioParser.java index ad13aeca4..4f1cbf5c1 100644 --- a/source/net/yacy/document/parser/sidAudioParser.java +++ b/source/net/yacy/document/parser/sidAudioParser.java @@ -35,6 +35,7 @@ import net.yacy.cora.document.id.AnchorURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; +import net.yacy.document.VocabularyScraper; // this is a new implementation of this parser idiom using multiple documents as result set @@ -58,7 +59,7 @@ public class sidAudioParser extends AbstractParser implements Parser { @Override public Document[] parse(final AnchorURL location, final String mimeType, - final String charset, final InputStream source) + final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException { try { final int available = source.available(); diff --git a/source/net/yacy/document/parser/sitemapParser.java b/source/net/yacy/document/parser/sitemapParser.java index af68a7658..ecc5eb393 100644 --- a/source/net/yacy/document/parser/sitemapParser.java +++ b/source/net/yacy/document/parser/sitemapParser.java @@ -51,6 +51,7 @@ import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.TextParser; +import net.yacy.document.VocabularyScraper; import 
net.yacy.document.parser.html.ImageEntry; import net.yacy.kelondro.io.ByteCountInputStream; @@ -70,7 +71,7 @@ public class sitemapParser extends AbstractParser implements Parser { @Override public Document[] parse(final AnchorURL url, final String mimeType, - final String charset, final InputStream source) + final String charset, final VocabularyScraper scraper, final InputStream source) throws Failure, InterruptedException { final List docs = new ArrayList(); SitemapReader sitemap = new SitemapReader(source, ClientIdentification.yacyInternetCrawlerAgent); diff --git a/source/net/yacy/document/parser/swfParser.java b/source/net/yacy/document/parser/swfParser.java index c009782fb..ac1c9c2ce 100644 --- a/source/net/yacy/document/parser/swfParser.java +++ b/source/net/yacy/document/parser/swfParser.java @@ -37,6 +37,7 @@ import net.yacy.cora.document.id.AnchorURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; +import net.yacy.document.VocabularyScraper; import pt.tumba.parser.swf.SWF2HTML; public class swfParser extends AbstractParser implements Parser { @@ -56,7 +57,7 @@ public class swfParser extends AbstractParser implements Parser { */ @Override public Document[] parse(final AnchorURL location, final String mimeType, - final String charset, final InputStream source) + final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException { diff --git a/source/net/yacy/document/parser/tarParser.java b/source/net/yacy/document/parser/tarParser.java index 79ccca964..e9bdb96bc 100644 --- a/source/net/yacy/document/parser/tarParser.java +++ b/source/net/yacy/document/parser/tarParser.java @@ -40,6 +40,7 @@ import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.TextParser; +import net.yacy.document.VocabularyScraper; import net.yacy.kelondro.util.FileUtils; import 
org.apache.tools.tar.TarEntry; @@ -61,7 +62,7 @@ public class tarParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL url, final String mimeType, final String charset, InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse(final AnchorURL url, final String mimeType, final String charset, final VocabularyScraper scraper, InputStream source) throws Parser.Failure, InterruptedException { final List docacc = new ArrayList(); Document[] subDocs = null; @@ -90,7 +91,7 @@ public class tarParser extends AbstractParser implements Parser { try { tmp = FileUtils.createTempFile(this.getClass(), name); FileUtils.copy(tis, tmp, entry.getSize()); - subDocs = TextParser.parseSource(AnchorURL.newAnchor(url, "#" + name), mime, null, 999, tmp); + subDocs = TextParser.parseSource(AnchorURL.newAnchor(url, "#" + name), mime, null, scraper, 999, tmp); if (subDocs == null) continue; for (final Document d: subDocs) docacc.add(d); } catch (final Parser.Failure e) { diff --git a/source/net/yacy/document/parser/torrentParser.java b/source/net/yacy/document/parser/torrentParser.java index 5691d14c6..abe9caed4 100644 --- a/source/net/yacy/document/parser/torrentParser.java +++ b/source/net/yacy/document/parser/torrentParser.java @@ -40,6 +40,7 @@ import net.yacy.document.Condenser; import net.yacy.document.Document; import net.yacy.document.LibraryProvider; import net.yacy.document.Parser; +import net.yacy.document.VocabularyScraper; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.util.BDecoder; import net.yacy.kelondro.util.BDecoder.BObject; @@ -56,7 +57,7 @@ public class torrentParser extends AbstractParser implements Parser { } @Override - public Document[] parse(AnchorURL location, String mimeType, String charset, InputStream source) + public Document[] parse(AnchorURL location, String mimeType, String charset, final VocabularyScraper scraper, InputStream source) throws Parser.Failure, 
InterruptedException { byte[] b = null; try { @@ -119,8 +120,8 @@ public class torrentParser extends AbstractParser implements Parser { try { byte[] b = FileUtils.read(new File(args[0])); torrentParser parser = new torrentParser(); - Document[] d = parser.parse(new AnchorURL("http://localhost/test.torrent"), null, "UTF-8", new ByteArrayInputStream(b)); - Condenser c = new Condenser(d[0], true, true, LibraryProvider.dymLib, false, false); + Document[] d = parser.parse(new AnchorURL("http://localhost/test.torrent"), null, "UTF-8", new VocabularyScraper(), new ByteArrayInputStream(b)); + Condenser c = new Condenser(d[0], null, true, true, LibraryProvider.dymLib, false, false); Map w = c.words(); for (Map.Entry e: w.entrySet()) System.out.println("Word: " + e.getKey() + " - " + e.getValue().posInText); } catch (final IOException e) { diff --git a/source/net/yacy/document/parser/vcfParser.java b/source/net/yacy/document/parser/vcfParser.java index 0144622ae..107e89feb 100644 --- a/source/net/yacy/document/parser/vcfParser.java +++ b/source/net/yacy/document/parser/vcfParser.java @@ -46,6 +46,7 @@ import net.yacy.cora.util.CommonPattern; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; +import net.yacy.document.VocabularyScraper; /** * Vcard specification: http://www.imc.org/pdi/vcard-21.txt @@ -65,7 +66,7 @@ public class vcfParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL url, final String mimeType, final String charset, final InputStream source) + public Document[] parse(final AnchorURL url, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException { try { diff --git a/source/net/yacy/document/parser/vsdParser.java b/source/net/yacy/document/parser/vsdParser.java index 59c1e484a..9e53f1085 100644 --- a/source/net/yacy/document/parser/vsdParser.java +++ 
b/source/net/yacy/document/parser/vsdParser.java @@ -37,6 +37,7 @@ import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; +import net.yacy.document.VocabularyScraper; import org.apache.poi.hdgf.extractor.VisioTextExtractor; import org.apache.poi.hpsf.SummaryInformation; @@ -66,7 +67,7 @@ public class vsdParser extends AbstractParser implements Parser { * all extracted information about the parsed document */ @Override - public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) + public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException { Document theDoc = null; diff --git a/source/net/yacy/document/parser/xlsParser.java b/source/net/yacy/document/parser/xlsParser.java index beffaf038..40c925493 100644 --- a/source/net/yacy/document/parser/xlsParser.java +++ b/source/net/yacy/document/parser/xlsParser.java @@ -36,6 +36,7 @@ import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; +import net.yacy.document.VocabularyScraper; import org.apache.poi.hssf.eventusermodel.HSSFEventFactory; import org.apache.poi.hssf.eventusermodel.HSSFListener; @@ -68,7 +69,7 @@ public class xlsParser extends AbstractParser implements Parser { */ @Override public Document[] parse(final AnchorURL location, final String mimeType, - final String charset, final InputStream source) throws Parser.Failure, + final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException { return new XLSHSSFListener().parse(location, mimeType, charset, source); } diff --git a/source/net/yacy/document/parser/zipParser.java b/source/net/yacy/document/parser/zipParser.java 
index 2f381270a..2438354f1 100644 --- a/source/net/yacy/document/parser/zipParser.java +++ b/source/net/yacy/document/parser/zipParser.java @@ -38,6 +38,7 @@ import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.TextParser; +import net.yacy.document.VocabularyScraper; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.MemoryControl; @@ -62,7 +63,7 @@ public class zipParser extends AbstractParser implements Parser { @Override public Document[] parse(final AnchorURL url, final String mimeType, - final String charset, final InputStream source) + final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException { // check memory for parser if (!MemoryControl.request(200 * 1024 * 1024, false)) @@ -89,7 +90,7 @@ public class zipParser extends AbstractParser implements Parser { FileUtils.copy(zis, tmp, entry.getSize()); final DigestURL virtualURL = DigestURL.newURL(url, "#" + name); //this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false)); - docs = TextParser.parseSource(new AnchorURL(virtualURL), mime, null, 999, tmp); + docs = TextParser.parseSource(new AnchorURL(virtualURL), mime, null, scraper, 999, tmp); if (docs == null) continue; for (final Document d: docs) docacc.add(d); } catch (final Parser.Failure e) { diff --git a/source/net/yacy/http/servlets/YaCyDefaultServlet.java b/source/net/yacy/http/servlets/YaCyDefaultServlet.java index 9895662ee..a532da397 100644 --- a/source/net/yacy/http/servlets/YaCyDefaultServlet.java +++ b/source/net/yacy/http/servlets/YaCyDefaultServlet.java @@ -1045,7 +1045,6 @@ public class YaCyDefaultServlet extends HttpServlet { upload.setFileSizeMax(SIZE_FILE_THRESHOLD); try { // Parse the request to get form field items - @SuppressWarnings("unchecked") List fileItems = upload.parseRequest(request); // Process the uploaded file items Iterator i = 
fileItems.iterator(); diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java index 6eaeac9ef..84a01a08d 100644 --- a/source/net/yacy/repository/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -418,7 +418,7 @@ public final class LoaderDispatcher { final String supportError = TextParser.supports(url, responseHeader.mime()); if (supportError != null) throw new IOException("no parser support: " + supportError); try { - documents = TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), response.depth(), response.getContent()); + documents = TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), response.profile().scraper(), response.depth(), response.getContent()); if (documents == null) throw new IOException("document == null"); } catch (final Exception e) { throw new IOException("parser error: " + e.getMessage()); diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 53be77b38..3908a3995 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -2570,6 +2570,7 @@ public final class Switchboard extends serverSwitch { new AnchorURL(response.url()), response.getMimeType(), response.getCharacterEncoding(), + response.profile().scraper(), response.depth(), response.getContent()); if ( documents == null ) { @@ -2750,7 +2751,7 @@ public final class Switchboard extends serverSwitch { for ( int i = 0; i < in.documents.length; i++ ) { condenser[i] = new Condenser( - in.documents[i], in.queueEntry.profile().indexText(), + in.documents[i], in.queueEntry.profile().scraper(), in.queueEntry.profile().indexText(), in.queueEntry.profile().indexMedia(), LibraryProvider.dymLib, true, this.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.dates_in_content_sxt)); @@ -3189,7 +3190,7 @@ public final class Switchboard extends 
serverSwitch { throw new Parser.Failure("indexing is denied", url); } final Condenser condenser = new Condenser( - document, true, true, LibraryProvider.dymLib, true, + document, null, true, true, LibraryProvider.dymLib, true, Switchboard.this.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.dates_in_content_sxt)); ResultImages.registerImages(url, document, true); Switchboard.this.webStructure.generateCitationReference(url, document); diff --git a/source/net/yacy/search/index/DocumentIndex.java b/source/net/yacy/search/index/DocumentIndex.java index 29a61bad2..a8ef16402 100644 --- a/source/net/yacy/search/index/DocumentIndex.java +++ b/source/net/yacy/search/index/DocumentIndex.java @@ -42,6 +42,7 @@ import net.yacy.document.Condenser; import net.yacy.document.Document; import net.yacy.document.LibraryProvider; import net.yacy.document.TextParser; +import net.yacy.document.VocabularyScraper; import net.yacy.kelondro.workflow.WorkflowProcessor; import net.yacy.search.schema.CollectionConfiguration; import net.yacy.search.schema.WebgraphConfiguration; @@ -149,7 +150,7 @@ public class DocumentIndex extends Segment { length = -1; } try { - documents = TextParser.parseSource(url, null, null, 0, length, url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent, null, null)); + documents = TextParser.parseSource(url, null, null, new VocabularyScraper(), 0, length, url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent, null, null)); } catch (final Exception e ) { throw new IOException("cannot parse " + url.toNormalform(false) + ": " + e.getMessage()); } @@ -158,7 +159,7 @@ public class DocumentIndex extends Segment { int c = 0; for ( final Document document : documents ) { if (document == null) continue; - final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, true, true); + final Condenser condenser = new Condenser(document, null, true, true, LibraryProvider.dymLib, true, true); rows[c++] = 
super.storeDocument( url, diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index de2bdd757..b5bd460e2 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -761,7 +761,7 @@ public class Segment { } // get the word set Set words = null; - words = new Condenser(document, true, true, null, false, false).words().keySet(); + words = new Condenser(document, null, true, true, null, false, false).words().keySet(); // delete all word references int count = 0;