added a html field scraper which reads text from html entities of a

given css class and extends a given vocabulary with a term consisting
of the text content of the html class tag. Additionally, the term is
included into the semantic facet of the document. This allows the
creation of faceted search for documents without the pre-creation of
vocabularies; instead, the vocabulary is created on-the-fly, possibly
for use in other crawls. If term scraping for a specific vocabulary
succeeds on a document, that vocabulary is excluded from
auto-annotation of that document.

To use this feature, do the following:
- create a vocabulary on /Vocabulary_p.html (if it does not exist yet)
- in /CrawlStartExpert.html you will now see the vocabularies as a column
in a table. The second column provides text fields where you can enter
the class of the html entities from which the literal of the corresponding
vocabulary shall be scraped
- when doing a search, you will see the content of the scraped fields in
a navigation facet for the given vocabulary
pull/1/head
Michael Peter Christen 10 years ago
parent 1cb290170e
commit b5ac29c9a5

@ -197,7 +197,6 @@ public class ConfigHeuristics_p {
return prop; return prop;
} }
@SuppressWarnings("unused")
private static void writeopensearchcfg(final Switchboard sb, final serverObjects post) { private static void writeopensearchcfg(final Switchboard sb, final serverObjects post) {
// read index schema table flags // read index schema table flags

@ -443,7 +443,7 @@
<fieldset> <fieldset>
<legend>Robot Behaviour</legend> <legend>Robot Behaviour</legend>
<dl> <dl>
<dt><label for="collection">Use Special User Agent and robot identification</label></dt> <dt><label>Use Special User Agent and robot identification</label></dt>
<dd> <dd>
<span class="info" style="float:right"><img src="env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;"> <span class="info" style="float:right"><img src="env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
You are running YaCy in non-p2p mode and because YaCy can be used as replacement for commercial search appliances You are running YaCy in non-p2p mode and because YaCy can be used as replacement for commercial search appliances
@ -460,6 +460,30 @@
</dl> </dl>
</fieldset> </fieldset>
#(/agentSelect)# #(/agentSelect)#
#(vocabularySelect)#::
<fieldset>
<legend>Enrich Vocabulary</legend>
<dl>
<dt><label>Scraping Fields</label></dt>
<dd>
<span class="info" style="float:right"><img src="env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
You can use class names to enrich the terms of a vocabulary based on the text content that appears on web pages. Please write the names of classes into the matrix.
</span></span>
<table class="table table-condensed">
<tr><th>Vocabulary</th><th>Class</th></tr>
#{vocabularyset}#
<tr>
<td>#[name]#</td>
<td><input name="vocabulary_#[name]#_class" id="vocabulary_#[name]#_class" type="text" size="55" maxlength="1028" value="#[value]#" /></td>
</tr>
#{/vocabularyset}#
</table>
</dd>
</dl>
</fieldset>
#(/vocabularySelect)#
<fieldset> <fieldset>
<legend>Snapshot Creation</legend> <legend>Snapshot Creation</legend>
<dl> <dl>

@ -25,12 +25,15 @@
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collection;
import java.util.List; import java.util.List;
import net.yacy.cora.lod.vocabulary.Tagging;
import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.Html2Image; import net.yacy.cora.util.Html2Image;
import net.yacy.crawler.data.CrawlProfile; import net.yacy.crawler.data.CrawlProfile;
import net.yacy.document.LibraryProvider;
import net.yacy.search.Switchboard; import net.yacy.search.Switchboard;
import net.yacy.search.schema.CollectionSchema; import net.yacy.search.schema.CollectionSchema;
import net.yacy.server.serverObjects; import net.yacy.server.serverObjects;
@ -508,10 +511,23 @@ public class CrawlStartExpert {
prop.put("agentSelect_list", agentNames.size()); prop.put("agentSelect_list", agentNames.size());
} }
prop.put("agentSelect_defaultAgentName", prop.put("agentSelect_defaultAgentName", ClientIdentification.yacyInternetCrawlerAgentName);
ClientIdentification.yacyInternetCrawlerAgentName);
// ---------- Enrich Vocabulary
// Publish one template row per known vocabulary. The "vocabularySelect" switch is set to 0
// when no vocabulary exists and to 1 otherwise; each row gets the vocabulary name and an
// empty default for the class text field.
final Collection<Tagging> vocs = LibraryProvider.autotagging.getVocabularies();
if (vocs.isEmpty()) {
    prop.put("vocabularySelect", 0);
} else {
    prop.put("vocabularySelect", 1);
    int row = 0;
    for (final Tagging vocabulary : vocs) {
        final String rowKey = "vocabularySelect_vocabularyset_" + row;
        prop.put(rowKey + "_name", vocabulary.getName());
        prop.put(rowKey + "_value", "");
        row++;
    }
    // number of rows the template loop has to render
    prop.put("vocabularySelect_vocabularyset", row);
}
// ---------- Snapshot generation // ---------- Snapshot generation
boolean wkhtmltopdfAvailable = Html2Image.wkhtmltopdfAvailable(); boolean wkhtmltopdfAvailable = Html2Image.wkhtmltopdfAvailable();
boolean convertAvailable = Html2Image.convertAvailable(); boolean convertAvailable = Html2Image.convertAvailable();

@ -42,6 +42,8 @@ import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.JSONException;
import net.yacy.cora.util.JSONObject;
import net.yacy.cora.util.SpaceExceededException; import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.CrawlSwitchboard; import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.crawler.data.Cache; import net.yacy.crawler.data.Cache;
@ -51,6 +53,7 @@ import net.yacy.crawler.retrieval.SitemapImporter;
import net.yacy.crawler.robots.RobotsTxt; import net.yacy.crawler.robots.RobotsTxt;
import net.yacy.data.WorkTables; import net.yacy.data.WorkTables;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.TransformerWriter; import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.index.RowHandleSet; import net.yacy.kelondro.index.RowHandleSet;
@ -445,6 +448,27 @@ public class Crawler_p {
boolean snapshotsLoadImage = post.getBoolean("snapshotsLoadImage"); boolean snapshotsLoadImage = post.getBoolean("snapshotsLoadImage");
boolean snapshotsReplaceOld = post.getBoolean("snapshotsReplaceOld"); boolean snapshotsReplaceOld = post.getBoolean("snapshotsReplaceOld");
// get vocabulary scraper info
// Collects the scraping rules posted as form fields named "vocabulary_<name>_class":
// for every such field with a non-empty value, the value is stored as the "class"
// property of the vocabulary <name>.
JSONObject vocabulary_scraper = new JSONObject(); // key = vocabulary_name, value = properties with key = type (i.e. 'class') and value = keyword in context
for (String key: post.keySet()) {
    if (!key.startsWith("vocabulary_") || !key.endsWith("_class")) continue;
    // "vocabulary_" has 11 characters and "_class" has 6. A key like "vocabulary_class"
    // passes both checks above because the underscore overlaps, and substring(11, 10)
    // would then throw StringIndexOutOfBoundsException. Requiring more than 17 characters
    // rules that out and also skips an empty vocabulary name.
    if (key.length() <= 11 + 6) continue;
    String vocabulary = key.substring(11, key.length() - 6);
    String value = post.get(key);
    if (value == null || value.length() == 0) continue; // field left blank: nothing to scrape
    JSONObject props;
    try {
        props = vocabulary_scraper.getJSONObject(vocabulary);
    } catch (JSONException e) {
        // no properties recorded for this vocabulary yet: create and register them
        props = new JSONObject();
        vocabulary_scraper.put(vocabulary, props);
    }
    props.put("class", value);
}
// prepare a new crawling profile // prepare a new crawling profile
final CrawlProfile profile; final CrawlProfile profile;
byte[] handle; byte[] handle;
@ -476,7 +500,8 @@ public class Crawler_p {
snapshotsReplaceOld, snapshotsReplaceOld,
cachePolicy, cachePolicy,
collection, collection,
agentName); agentName,
new VocabularyScraper(vocabulary_scraper));
handle = ASCII.getBytes(profile.handle()); handle = ASCII.getBytes(profile.handle());
// before we fire up a new crawl, we make sure that another crawl with the same name is not running // before we fire up a new crawl, we make sure that another crawl with the same name is not running
@ -559,7 +584,7 @@ public class Crawler_p {
try { try {
// check if the crawl filter works correctly // check if the crawl filter works correctly
Pattern.compile(newcrawlingMustMatch); Pattern.compile(newcrawlingMustMatch);
final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000); final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new VocabularyScraper());
final Writer writer = new TransformerWriter(null, null, scraper, null, false); final Writer writer = new TransformerWriter(null, null, scraper, null, false);
if (crawlingFile != null && crawlingFile.exists()) { if (crawlingFile != null && crawlingFile.exists()) {
FileUtils.copy(new FileInputStream(crawlingFile), writer); FileUtils.copy(new FileInputStream(crawlingFile), writer);

@ -155,7 +155,8 @@ public class QuickCrawlLink_p {
-1, false, true, -1, false, true,
CacheStrategy.IFFRESH, CacheStrategy.IFFRESH,
collection, collection,
ClientIdentification.yacyIntranetCrawlerAgentName); ClientIdentification.yacyIntranetCrawlerAgentName,
null);
sb.crawler.putActive(pe.handle().getBytes(), pe); sb.crawler.putActive(pe.handle().getBytes(), pe);
} catch (final Exception e) { } catch (final Exception e) {
// mist // mist

@ -29,7 +29,7 @@ import net.yacy.visualization.RasterPlotter.DrawMode;
public class osm { public class osm {
public static EncodedImage respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, @SuppressWarnings("unused") final serverSwitch env) { public static EncodedImage respond(final RequestHeader header, final serverObjects post, @SuppressWarnings("unused") final serverSwitch env) {
int zoom = 10; int zoom = 10;
double lat = 50.11670d; double lat = 50.11670d;

@ -23,6 +23,7 @@ package net.yacy.cora.language.synonyms;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.util.Collection; import java.util.Collection;
import java.util.HashSet;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
@ -94,6 +95,13 @@ public class AutotaggingLibrary {
return this.vocabularies.get(name); return this.vocabularies.get(name);
} }
/**
 * Returns the names of all vocabularies held by this library.
 *
 * @return a new, independent set of vocabulary names; changes made to it by the
 *         caller do not affect the vocabularies stored in this library
 */
public Set<String> getVocabularyNames() {
    // hand out a copy, never the live key set: removing a name from the key set itself
    // would remove the vocabulary from the library as a side effect
    return new HashSet<>(this.vocabularies.keySet());
}
public Collection<Tagging> getVocabularies() { public Collection<Tagging> getVocabularies() {
return this.vocabularies.values(); return this.vocabularies.values();
} }
@ -143,13 +151,16 @@ public class AutotaggingLibrary {
return 4; return 4;
} }
public Tagging.Metatag getTagFromTerm(String term) { public Tagging.Metatag getTagFromTerm(Set<String> vocabularies, String term) {
if (this.vocabularies.isEmpty()) return null; if (this.vocabularies.isEmpty()) return null;
Tagging.Metatag tag; Tagging.Metatag tag;
term = Tagging.normalizeTerm(term); term = Tagging.normalizeTerm(term);
for (Map.Entry<String, Tagging> v: this.vocabularies.entrySet()) { for (String vocabularyName: vocabularies) {
tag = v.getValue().getMetatagFromSynonym(term); Tagging t = this.vocabularies.get(vocabularyName);
if (tag != null) return tag; if (t != null) {
tag = t.getMetatagFromSynonym(term);
if (tag != null) return tag;
}
} }
return null; return null;
} }

@ -275,32 +275,34 @@ public class Tagging {
public void put(String term, String synonyms, String objectlink) throws IOException { public void put(String term, String synonyms, String objectlink) throws IOException {
if (this.propFile == null) return; if (this.propFile == null) return;
TempFile tmp = new TempFile(); synchronized (this) {
BlockingQueue<String> list = Files.concurentLineReader(this.propFile); TempFile tmp = new TempFile();
String line; BlockingQueue<String> list = Files.concurentLineReader(this.propFile);
boolean written = false; String line;
try { boolean written = false;
vocloop: while ((line = list.take()) != Files.POISON_LINE) { try {
String[] pl = parseLine(line); vocloop: while ((line = list.take()) != Files.POISON_LINE) {
if (pl == null) { String[] pl = parseLine(line);
continue vocloop; if (pl == null) {
continue vocloop;
}
if (pl[0].equals(term)) {
tmp.writer.write(term + (synonyms == null || synonyms.isEmpty() ? "" : ":" + synonyms) + (objectlink == null || objectlink.isEmpty() || objectlink.equals(this.objectspace + term) ? "" : "#" + objectlink) + "\n");
written = true;
} else {
tmp.writer.write(pl[0] + (pl[1] == null || pl[1].isEmpty() ? "" : ":" + pl[1]) + (pl[2] == null || pl[2].isEmpty() || pl[2].equals(this.objectspace + pl[0]) ? "" : "#" + pl[2]) + "\n");
}
} }
if (pl[0].equals(term)) { if (!written) {
tmp.writer.write(term + (synonyms == null || synonyms.isEmpty() ? "" : ":" + synonyms) + (objectlink == null || objectlink.isEmpty() || objectlink.equals(this.objectspace + term) ? "" : "#" + objectlink) + "\n"); tmp.writer.write(term + (synonyms == null || synonyms.isEmpty() ? "" : ":" + synonyms) + (objectlink == null || objectlink.isEmpty() || objectlink.equals(this.objectspace + term) ? "" : "#" + objectlink) + "\n");
written = true;
} else {
tmp.writer.write(pl[0] + (pl[1] == null || pl[1].isEmpty() ? "" : ":" + pl[1]) + (pl[2] == null || pl[2].isEmpty() || pl[2].equals(this.objectspace + pl[0]) ? "" : "#" + pl[2]) + "\n");
} }
} catch (final InterruptedException e) {
} }
if (!written) { tmp.writer.close();
tmp.writer.write(term + (synonyms == null || synonyms.isEmpty() ? "" : ":" + synonyms) + (objectlink == null || objectlink.isEmpty() || objectlink.equals(this.objectspace + term) ? "" : "#" + objectlink) + "\n"); this.propFile.delete();
} tmp.file.renameTo(this.propFile);
} catch (final InterruptedException e) { init();
} }
tmp.writer.close();
this.propFile.delete();
tmp.file.renameTo(this.propFile);
init();
} }
public void delete(String term) throws IOException { public void delete(String term) throws IOException {

@ -295,7 +295,8 @@ public final class CrawlSwitchboard {
-1, false, true, -1, false, true,
CacheStrategy.IFFRESH, CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_PROXY, "robot_" + CRAWL_PROFILE_PROXY,
ClientIdentification.yacyProxyAgentName); ClientIdentification.yacyProxyAgentName,
null);
this.profilesActiveCrawls.put( this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultProxyProfile.handle()), UTF8.getBytes(this.defaultProxyProfile.handle()),
this.defaultProxyProfile); this.defaultProxyProfile);
@ -325,7 +326,8 @@ public final class CrawlSwitchboard {
-1, false, true, -1, false, true,
CacheStrategy.IFFRESH, CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_REMOTE, "robot_" + CRAWL_PROFILE_REMOTE,
ClientIdentification.yacyInternetCrawlerAgentName); ClientIdentification.yacyInternetCrawlerAgentName,
null);
this.profilesActiveCrawls.put( this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultRemoteProfile.handle()), UTF8.getBytes(this.defaultRemoteProfile.handle()),
this.defaultRemoteProfile); this.defaultRemoteProfile);
@ -355,7 +357,8 @@ public final class CrawlSwitchboard {
-1, false, true, -1, false, true,
CacheStrategy.IFEXIST, CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, "robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_TEXT,
ClientIdentification.yacyIntranetCrawlerAgentName); ClientIdentification.yacyIntranetCrawlerAgentName,
null);
this.profilesActiveCrawls.put( this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultTextSnippetLocalProfile.handle()), UTF8.getBytes(this.defaultTextSnippetLocalProfile.handle()),
this.defaultTextSnippetLocalProfile); this.defaultTextSnippetLocalProfile);
@ -385,7 +388,8 @@ public final class CrawlSwitchboard {
-1, false, true, -1, false, true,
CacheStrategy.IFEXIST, CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, "robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT,
ClientIdentification.yacyIntranetCrawlerAgentName); ClientIdentification.yacyIntranetCrawlerAgentName,
null);
this.profilesActiveCrawls.put( this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()), UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()),
this.defaultTextSnippetGlobalProfile); this.defaultTextSnippetGlobalProfile);
@ -416,7 +420,8 @@ public final class CrawlSwitchboard {
-1, false, true, -1, false, true,
CacheStrategy.IFEXIST, CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_GREEDY_LEARNING_TEXT, "robot_" + CRAWL_PROFILE_GREEDY_LEARNING_TEXT,
ClientIdentification.browserAgentName); ClientIdentification.browserAgentName,
null);
this.profilesActiveCrawls.put( this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()), UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()),
this.defaultTextSnippetGlobalProfile); this.defaultTextSnippetGlobalProfile);
@ -446,7 +451,8 @@ public final class CrawlSwitchboard {
-1, false, true, -1, false, true,
CacheStrategy.IFEXIST, CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, "robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName); ClientIdentification.yacyIntranetCrawlerAgentName,
null);
this.profilesActiveCrawls.put( this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultMediaSnippetLocalProfile.handle()), UTF8.getBytes(this.defaultMediaSnippetLocalProfile.handle()),
this.defaultMediaSnippetLocalProfile); this.defaultMediaSnippetLocalProfile);
@ -476,7 +482,8 @@ public final class CrawlSwitchboard {
-1, false, true, -1, false, true,
CacheStrategy.IFEXIST, CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, "robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName); ClientIdentification.yacyIntranetCrawlerAgentName,
null);
this.profilesActiveCrawls.put( this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultMediaSnippetGlobalProfile.handle()), UTF8.getBytes(this.defaultMediaSnippetGlobalProfile.handle()),
this.defaultMediaSnippetGlobalProfile); this.defaultMediaSnippetGlobalProfile);
@ -506,7 +513,8 @@ public final class CrawlSwitchboard {
-1, false, true, -1, false, true,
CacheStrategy.NOCACHE, CacheStrategy.NOCACHE,
"robot_" + CRAWL_PROFILE_SURROGATE, "robot_" + CRAWL_PROFILE_SURROGATE,
ClientIdentification.yacyIntranetCrawlerAgentName); ClientIdentification.yacyIntranetCrawlerAgentName,
null);
this.profilesActiveCrawls.put( this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultSurrogateProfile.handle()), UTF8.getBytes(this.defaultSurrogateProfile.handle()),
this.defaultSurrogateProfile); this.defaultSurrogateProfile);
@ -539,7 +547,8 @@ public final class CrawlSwitchboard {
-1, false, true, -1, false, true,
CacheStrategy.NOCACHE, CacheStrategy.NOCACHE,
collection, collection,
ClientIdentification.yacyIntranetCrawlerAgentName); ClientIdentification.yacyIntranetCrawlerAgentName,
null);
this.profilesActiveCrawls.put(UTF8.getBytes(genericPushProfile.handle()), genericPushProfile); this.profilesActiveCrawls.put(UTF8.getBytes(genericPushProfile.handle()), genericPushProfile);
this.defaultPushProfiles.put(collection, genericPushProfile); this.defaultPushProfiles.put(collection, genericPushProfile);
return genericPushProfile; return genericPushProfile;

@ -45,6 +45,7 @@ import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.util.CommonPattern; import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.CrawlSwitchboard; import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.Word;
import net.yacy.search.query.QueryParams; import net.yacy.search.query.QueryParams;
import net.yacy.server.serverObjects; import net.yacy.server.serverObjects;
@ -78,6 +79,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public static final String REMOTE_INDEXING = "remoteIndexing"; public static final String REMOTE_INDEXING = "remoteIndexing";
public static final String CACHE_STRAGEGY = "cacheStrategy"; public static final String CACHE_STRAGEGY = "cacheStrategy";
public static final String COLLECTIONS = "collections"; public static final String COLLECTIONS = "collections";
public static final String SCRAPER = "scraper";
public static final String CRAWLER_URL_MUSTMATCH = "crawlerURLMustMatch"; public static final String CRAWLER_URL_MUSTMATCH = "crawlerURLMustMatch";
public static final String CRAWLER_URL_MUSTNOTMATCH = "crawlerURLMustNotMatch"; public static final String CRAWLER_URL_MUSTNOTMATCH = "crawlerURLMustNotMatch";
public static final String CRAWLER_IP_MUSTMATCH = "crawlerIPMustMatch"; public static final String CRAWLER_IP_MUSTMATCH = "crawlerIPMustMatch";
@ -99,6 +101,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
private Pattern indexcontentmustmatch = null, indexcontentmustnotmatch = null; private Pattern indexcontentmustmatch = null, indexcontentmustnotmatch = null;
private final Map<String, AtomicInteger> doms; private final Map<String, AtomicInteger> doms;
private final VocabularyScraper scraper;
/** /**
* Constructor which creates CrawlPofile from parameters. * Constructor which creates CrawlPofile from parameters.
@ -151,7 +154,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
final boolean snapshotsReplaceOld, final boolean snapshotsReplaceOld,
final CacheStrategy cacheStrategy, final CacheStrategy cacheStrategy,
final String collections, final String collections,
final String userAgentName) { final String userAgentName,
final VocabularyScraper scraper) {
super(40); super(40);
if (name == null || name.isEmpty()) { if (name == null || name.isEmpty()) {
throw new NullPointerException("name must not be null or empty"); throw new NullPointerException("name must not be null or empty");
@ -189,18 +193,29 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
put(SNAPSHOTS_REPLACEOLD, snapshotsReplaceOld); put(SNAPSHOTS_REPLACEOLD, snapshotsReplaceOld);
put(CACHE_STRAGEGY, cacheStrategy.toString()); put(CACHE_STRAGEGY, cacheStrategy.toString());
put(COLLECTIONS, CommonPattern.SPACE.matcher(collections.trim()).replaceAll("")); put(COLLECTIONS, CommonPattern.SPACE.matcher(collections.trim()).replaceAll(""));
// we transform the scraper information into a JSON Array
this.scraper = scraper == null ? new VocabularyScraper() : scraper;
String jsonString = this.scraper.toString();
assert jsonString != null && jsonString.length() > 0 && jsonString.charAt(0) == '{' : "jsonString = " + jsonString;
put(SCRAPER, jsonString);
} }
/** /**
* Constructor which creats a CrawlProfile from values in a Map. * Constructor which creates a CrawlProfile from values in a Map.
* @param ext contains values * @param ext contains values
*/ */
public CrawlProfile(final Map<String, String> ext) { public CrawlProfile(final Map<String, String> ext) {
super(ext == null ? 1 : ext.size()); super(ext == null ? 1 : ext.size());
if (ext != null) putAll(ext); if (ext != null) putAll(ext);
this.doms = new ConcurrentHashMap<String, AtomicInteger>(); this.doms = new ConcurrentHashMap<String, AtomicInteger>();
String jsonString = ext.get(SCRAPER);
this.scraper = jsonString == null || jsonString.length() == 0 ? new VocabularyScraper() : new VocabularyScraper(jsonString);
} }
/**
 * Returns the vocabulary scraper attached to this crawl profile.
 *
 * @return the scraper held by this profile; both constructors fall back to an empty
 *         {@code new VocabularyScraper()} when no scraper information is supplied
 */
public VocabularyScraper scraper() {
return this.scraper;
}
public void domInc(final String domain) { public void domInc(final String domain) {
final AtomicInteger dp = this.doms.get(domain); final AtomicInteger dp = this.doms.get(domain);
if (dp == null) { if (dp == null) {

@ -44,6 +44,7 @@ import net.yacy.crawler.data.ResultURLs.EventOrigin;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.TextParser; import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.search.Switchboard; import net.yacy.search.Switchboard;
public class Response { public class Response {
@ -864,7 +865,7 @@ public class Response {
final String supportError = TextParser.supports(url(), this.responseHeader == null ? null : this.responseHeader.mime()); final String supportError = TextParser.supports(url(), this.responseHeader == null ? null : this.responseHeader.mime());
if (supportError != null) throw new Parser.Failure("no parser support:" + supportError, url()); if (supportError != null) throw new Parser.Failure("no parser support:" + supportError, url());
try { try {
return TextParser.parseSource(new AnchorURL(url()), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? "UTF-8" : this.responseHeader.getCharacterEncoding(), this.request.depth(), this.content); return TextParser.parseSource(new AnchorURL(url()), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? "UTF-8" : this.responseHeader.getCharacterEncoding(), new VocabularyScraper(), this.request.depth(), this.content);
} catch (final Exception e) { } catch (final Exception e) {
return null; return null;
} }

@ -52,6 +52,7 @@ import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.ConcurrentLog;
import net.yacy.data.BookmarksDB.Bookmark; import net.yacy.data.BookmarksDB.Bookmark;
import net.yacy.data.BookmarksDB.Tag; import net.yacy.data.BookmarksDB.Tag;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.TransformerWriter; import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.Word;
@ -138,7 +139,7 @@ public class BookmarkHelper {
final Set<String> tags=ListManager.string2set(tag); //this allow multiple default tags final Set<String> tags=ListManager.string2set(tag); //this allow multiple default tags
try { try {
//load the links //load the links
final ContentScraper scraper = new ContentScraper(baseURL, 10000); final ContentScraper scraper = new ContentScraper(baseURL, 10000, new VocabularyScraper());
//OutputStream os = new htmlFilterOutputStream(null, scraper, null, false); //OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
final Writer writer = new TransformerWriter(null, null, scraper, null, false); final Writer writer = new TransformerWriter(null, null, scraper, null, false);
FileUtils.copy(input,writer); FileUtils.copy(input,writer);

@ -87,7 +87,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
} }
//get words from document //get words from document
final Map<String, Word> words = new Condenser(document, true, true, LibraryProvider.dymLib, false, false).words(); final Map<String, Word> words = new Condenser(document, null, true, true, LibraryProvider.dymLib, false, false).words();
// generate potential tags from document title, description and subject // generate potential tags from document title, description and subject
final int bufferSize = document.dc_title().length() + document.dc_description().length + document.dc_subject(' ').length() + 32; final int bufferSize = document.dc_title().length() + document.dc_description().length + document.dc_subject(' ').length() + 32;

@ -189,7 +189,8 @@ public class YMarkCrawlStart extends HashMap<String,String>{
-1, false, true, -1, false, true,
CacheStrategy.IFFRESH, CacheStrategy.IFFRESH,
"robot_" + CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, "robot_" + CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName); // TODO: make this a default profile in CrawlSwitchboard ClientIdentification.yacyIntranetCrawlerAgentName,
null); // TODO: make this a default profile in CrawlSwitchboard
sb.crawler.putActive(pe.handle().getBytes(), pe); sb.crawler.putActive(pe.handle().getBytes(), pe);
return sb.crawlStacker.stackCrawl(new Request( return sb.crawlStacker.stackCrawl(new Request(
sb.peers.mySeed().hash.getBytes(), sb.peers.mySeed().hash.getBytes(),

@ -45,6 +45,7 @@ import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.document.analysis.EnhancedTextProfileSignature; import net.yacy.cora.document.analysis.EnhancedTextProfileSignature;
import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.solr.Ranking; import net.yacy.cora.federate.solr.Ranking;
import net.yacy.cora.language.synonyms.SynonymLibrary; import net.yacy.cora.language.synonyms.SynonymLibrary;
@ -91,6 +92,7 @@ public final class Condenser {
public Condenser( public Condenser(
final Document document, final Document document,
final VocabularyScraper scraper,
final boolean indexText, final boolean indexText,
final boolean indexMedia, final boolean indexMedia,
final WordCache meaningLib, final WordCache meaningLib,
@ -122,7 +124,7 @@ public final class Condenser {
if (indexText) { if (indexText) {
String text = document.getTextString(); String text = document.getTextString();
if (findDatesInContent) this.dates_in_content = DateDetection.parse(text); if (findDatesInContent) this.dates_in_content = DateDetection.parse(text);
createCondensement(text, meaningLib, doAutotagging); createCondensement(document.dc_source(), text, meaningLib, doAutotagging, scraper);
// the phrase counter: // the phrase counter:
// phrase 0 are words taken from the URL // phrase 0 are words taken from the URL
// phrase 1 is the MainTitle // phrase 1 is the MainTitle
@ -249,12 +251,12 @@ public final class Condenser {
this.exact_signature = EnhancedTextProfileSignature.getSignatureLong(text); this.exact_signature = EnhancedTextProfileSignature.getSignatureLong(text);
} }
private Condenser(final String text, final WordCache meaningLib, boolean doAutotagging) { private Condenser(final DigestURL root, final String text, final WordCache meaningLib, final boolean doAutotagging, final VocabularyScraper scraper) {
this.languageIdentificator = null; // we don't need that here this.languageIdentificator = null; // we don't need that here
// analysis = new Properties(); // analysis = new Properties();
this.words = new TreeMap<String, Word>(); this.words = new TreeMap<String, Word>();
this.synonyms = new HashSet<String>(); this.synonyms = new HashSet<String>();
createCondensement(text, meaningLib, doAutotagging); createCondensement(root, text, meaningLib, doAutotagging, scraper);
} }
private void insertTextToWords( private void insertTextToWords(
@ -324,7 +326,7 @@ public final class Condenser {
return this.languageIdentificator.getLanguage(); return this.languageIdentificator.getLanguage();
} }
private void createCondensement(final String text, final WordCache meaningLib, boolean doAutotagging) { private void createCondensement(final DigestURL root, final String text, final WordCache meaningLib, boolean doAutotagging, final VocabularyScraper scraper) {
assert text != null; assert text != null;
final Set<String> currsentwords = new HashSet<String>(); final Set<String> currsentwords = new HashSet<String>();
String word = ""; String word = "";
@ -355,7 +357,29 @@ public final class Condenser {
// get tags from autotagging // get tags from autotagging
if (doAutotagging) { if (doAutotagging) {
for (int wordc = 1; wordc <= wordcache.length + 1; wordc++) { Set<String> vocabularyNames = LibraryProvider.autotagging.getVocabularyNames();
//Collection<Tagging> vocabularies = LibraryProvider.autotagging.getVocabularies();
//assert vocabularyNames.size() == vocabularies.size();
Map<String, String> vocMap = scraper.removeVocMap(root);
if (vocMap != null) {
for (Map.Entry<String, String> entry: vocMap.entrySet()) {
String navigatorName = entry.getKey();
String term = entry.getValue();
vocabularyNames.remove(navigatorName);
Tagging vocabulary = LibraryProvider.autotagging.getVocabulary(navigatorName);
if (vocabulary != null) {
// extend the vocabulary
String obj = vocabulary.getObjectlink(term);
if (obj == null) try {vocabulary.put(term, "", root.toNormalform(true));} catch (IOException e) {} // this makes IO, be careful!
// create annotation
tag = vocabulary.getMetatagFromTerm(term);
Set<Tagging.Metatag> tagset = new HashSet<>();
tagset.add(tag);
this.tags.put(navigatorName, tagset);
}
}
}
if (vocabularyNames.size() > 0) for (int wordc = 1; wordc <= wordcache.length + 1; wordc++) {
// wordc is number of words that are tested // wordc is number of words that are tested
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
if (wordc == 1) { if (wordc == 1) {
@ -368,7 +392,7 @@ public final class Condenser {
} }
String testterm = sb.toString().trim(); String testterm = sb.toString().trim();
//System.out.println("Testing: " + testterm); //System.out.println("Testing: " + testterm);
tag = LibraryProvider.autotagging.getTagFromTerm(testterm); tag = LibraryProvider.autotagging.getTagFromTerm(vocabularyNames, testterm);
if (tag != null) { if (tag != null) {
String navigatorName = tag.getVocabularyName(); String navigatorName = tag.getVocabularyName();
Set<Tagging.Metatag> tagset = this.tags.get(navigatorName); Set<Tagging.Metatag> tagset = this.tags.get(navigatorName);
@ -461,7 +485,7 @@ public final class Condenser {
public static Map<String, Word> getWords(final String text, final WordCache meaningLib) { public static Map<String, Word> getWords(final String text, final WordCache meaningLib) {
// returns a word/indexWord relation map // returns a word/indexWord relation map
if (text == null) return null; if (text == null) return null;
return new Condenser(text, meaningLib, false).words(); return new Condenser(null, text, meaningLib, false, null).words();
} }
public static void main(final String[] args) { public static void main(final String[] args) {

@ -48,6 +48,7 @@ public interface Parser {
* @param url the url of the source * @param url the url of the source
* @param mimeType the mime type of the source, if known * @param mimeType the mime type of the source, if known
* @param charset the charset of the source, if known * @param charset the charset of the source, if known
* @param scraper an entity scraper to detect facets from text annotation context
* @param source a input stream * @param source a input stream
* @return a list of documents that result from parsing the source * @return a list of documents that result from parsing the source
* @throws Parser.Failure * @throws Parser.Failure
@ -57,6 +58,7 @@ public interface Parser {
AnchorURL url, AnchorURL url,
String mimeType, String mimeType,
String charset, String charset,
VocabularyScraper scraper,
InputStream source InputStream source
) throws Parser.Failure, InterruptedException; ) throws Parser.Failure, InterruptedException;

@ -166,6 +166,7 @@ public final class TextParser {
final AnchorURL location, final AnchorURL location,
final String mimeType, final String mimeType,
final String charset, final String charset,
final VocabularyScraper scraper,
final int depth, final int depth,
final File sourceFile final File sourceFile
) throws InterruptedException, Parser.Failure { ) throws InterruptedException, Parser.Failure {
@ -180,7 +181,7 @@ public final class TextParser {
throw new Parser.Failure(errorMsg, location); throw new Parser.Failure(errorMsg, location);
} }
sourceStream = new BufferedInputStream(new FileInputStream(sourceFile)); sourceStream = new BufferedInputStream(new FileInputStream(sourceFile));
docs = parseSource(location, mimeType, charset, depth, sourceFile.length(), sourceStream); docs = parseSource(location, mimeType, charset, scraper, depth, sourceFile.length(), sourceStream);
} catch (final Exception e) { } catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e;
@ -197,6 +198,7 @@ public final class TextParser {
final AnchorURL location, final AnchorURL location,
String mimeType, String mimeType,
final String charset, final String charset,
final VocabularyScraper scraper,
final int depth, final int depth,
final byte[] content final byte[] content
) throws Parser.Failure { ) throws Parser.Failure {
@ -212,7 +214,7 @@ public final class TextParser {
} }
assert !idioms.isEmpty() : "no parsers applied for url " + location.toNormalform(true); assert !idioms.isEmpty() : "no parsers applied for url " + location.toNormalform(true);
Document[] docs = parseSource(location, mimeType, idioms, charset, depth, content); Document[] docs = parseSource(location, mimeType, idioms, charset, scraper, depth, content);
return docs; return docs;
} }
@ -221,6 +223,7 @@ public final class TextParser {
final AnchorURL location, final AnchorURL location,
String mimeType, String mimeType,
final String charset, final String charset,
final VocabularyScraper scraper,
final int depth, final int depth,
final long contentLength, final long contentLength,
final InputStream sourceStream final InputStream sourceStream
@ -241,7 +244,7 @@ public final class TextParser {
// then we use only one stream-oriented parser. // then we use only one stream-oriented parser.
if (idioms.size() == 1 || contentLength > Integer.MAX_VALUE) { if (idioms.size() == 1 || contentLength > Integer.MAX_VALUE) {
// use a specific stream-oriented parser // use a specific stream-oriented parser
return parseSource(location, mimeType, idioms.iterator().next(), charset, sourceStream); return parseSource(location, mimeType, idioms.iterator().next(), charset, scraper, sourceStream);
} }
// in case that we know more parsers we first transform the content into a byte[] and use that as base // in case that we know more parsers we first transform the content into a byte[] and use that as base
@ -252,7 +255,7 @@ public final class TextParser {
} catch (final IOException e) { } catch (final IOException e) {
throw new Parser.Failure(e.getMessage(), location); throw new Parser.Failure(e.getMessage(), location);
} }
Document[] docs = parseSource(location, mimeType, idioms, charset, depth, b); Document[] docs = parseSource(location, mimeType, idioms, charset, scraper, depth, b);
return docs; return docs;
} }
@ -262,6 +265,7 @@ public final class TextParser {
final String mimeType, final String mimeType,
final Parser parser, final Parser parser,
final String charset, final String charset,
final VocabularyScraper scraper,
final InputStream sourceStream final InputStream sourceStream
) throws Parser.Failure { ) throws Parser.Failure {
if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing '" + location + "' from stream"); if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing '" + location + "' from stream");
@ -271,7 +275,7 @@ public final class TextParser {
if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'."); if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'.");
try { try {
final Document[] docs = parser.parse(location, mimeType, documentCharset, sourceStream); final Document[] docs = parser.parse(location, mimeType, documentCharset, scraper, sourceStream);
return docs; return docs;
} catch (final Exception e) { } catch (final Exception e) {
throw new Parser.Failure("parser failed: " + parser.getName(), location); throw new Parser.Failure("parser failed: " + parser.getName(), location);
@ -283,6 +287,7 @@ public final class TextParser {
final String mimeType, final String mimeType,
final Set<Parser> parsers, final Set<Parser> parsers,
final String charset, final String charset,
final VocabularyScraper scraper,
final int depth, final int depth,
final byte[] sourceArray final byte[] sourceArray
) throws Parser.Failure { ) throws Parser.Failure {
@ -305,7 +310,7 @@ public final class TextParser {
bis = new ByteArrayInputStream(sourceArray); bis = new ByteArrayInputStream(sourceArray);
} }
try { try {
docs = parser.parse(location, mimeType, documentCharset, bis); docs = parser.parse(location, mimeType, documentCharset, scraper, bis);
} catch (final Parser.Failure e) { } catch (final Parser.Failure e) {
failedParser.put(parser, e); failedParser.put(parser, e);
//log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e); //log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);

@ -0,0 +1,90 @@
/**
* VocabularyScraper
* Copyright 2015 by Michael Peter Christen
* First released 30.01.2015 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.document;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.JSONException;
import net.yacy.cora.util.JSONObject;
import net.yacy.kelondro.io.CharBuffer;
/**
 * Collects terms for vocabularies from the text content of html entities.
 * A scraper definition assigns an html class name to a vocabulary name; whenever
 * {@link #check(DigestURL, String, CharBuffer)} is called with such a class name,
 * the given content is recorded as the term of that vocabulary for the given document.
 * The recorded terms of a document are handed out (and forgotten) with
 * {@link #removeVocMap(DigestURL)}.
 */
public class VocabularyScraper {

    /** the definition this scraper was created from; never null */
    private final JSONObject scraperDefinition;

    /** a mapping from class names to the vocabulary where this class should be mapped; null if no mapping is defined */
    private final Map<String, String> classVocabulary;

    /** a mapping from a document to a map from vocabularies to terms */
    private final ConcurrentHashMap<DigestURL, ConcurrentHashMap<String, String>> vocMap;

    /**
     * Create a scraper without any class-to-vocabulary mapping; {@link #check} records nothing.
     */
    public VocabularyScraper() {
        this.classVocabulary = null;
        this.scraperDefinition = new JSONObject();
        this.vocMap = new ConcurrentHashMap<>();
    }

    /**
     * Create a scraper from a definition object.
     * @param init a property list of property lists: the key of the top property list is
     *        the name of the vocabulary; the embedded property list must have a property
     *        "class" whose value is the html class name to be scraped for that vocabulary.
     *        May be null, which is handled like an empty definition.
     */
    public VocabularyScraper(JSONObject init) {
        this.scraperDefinition = init == null ? new JSONObject() : init;
        this.vocMap = new ConcurrentHashMap<>();
        Map<String, String> mapping = null;
        if (this.scraperDefinition.length() > 0) {
            mapping = new ConcurrentHashMap<>();
            for (String voc: this.scraperDefinition.keySet()) {
                try {
                    // the lookup of the embedded object is done inside the try block as well:
                    // a malformed entry for one vocabulary must not prevent the use of the others
                    JSONObject props = this.scraperDefinition.getJSONObject(voc);
                    if (props == null) continue;
                    String classtype = props.getString("class");
                    if (classtype != null) mapping.put(classtype, voc); // ConcurrentHashMap does not accept null keys
                } catch (JSONException e) {
                    // deliberately ignored: a vocabulary without a usable "class" property is simply not scraped
                }
            }
            if (mapping.isEmpty()) mapping = null;
        }
        this.classVocabulary = mapping;
    }

    /**
     * Create a scraper from the string representation of a definition object.
     * @param init the string which is parsed into a JSONObject, see {@link #VocabularyScraper(JSONObject)}
     */
    public VocabularyScraper(String init) {
        this(new JSONObject(init));
    }

    /**
     * @return the string representation of the scraper definition
     */
    @Override
    public String toString() {
        return this.scraperDefinition.toString();
    }

    /**
     * Record the content of an html entity as vocabulary term if the class name of the
     * entity is assigned to a vocabulary. Only the first content which is seen for a
     * vocabulary within the same document is recorded.
     * @param root the url of the document where the entity appears
     * @param className the value of the class property of the entity
     * @param content the text content of the entity
     */
    public void check(DigestURL root, String className, CharBuffer content) {
        if (this.classVocabulary == null) return;
        // null keys would cause a NullPointerException in the concurrent maps
        if (root == null || className == null || content == null) return;
        String voc = this.classVocabulary.get(className);
        if (voc == null) return;

        // get or create the term map of the document; putIfAbsent makes sure that
        // concurrent callers for the same document end up with the same map instance
        ConcurrentHashMap<String, String> terms = this.vocMap.get(root);
        if (terms == null) {
            ConcurrentHashMap<String, String> fresh = new ConcurrentHashMap<>();
            terms = this.vocMap.putIfAbsent(root, fresh);
            if (terms == null) terms = fresh;
        }

        // we put only the first occurrence of the entity into the term map;
        // the containsKey test only avoids a needless string conversion, putIfAbsent decides atomically
        if (!terms.containsKey(voc)) terms.putIfAbsent(voc, content.toString());
    }

    /**
     * Hand out the terms which had been recorded for a document and forget them.
     * @param root the url of the document
     * @return a mapping from vocabulary names to terms or null if nothing was recorded for the document
     */
    public Map<String, String> removeVocMap(DigestURL root) {
        if (root == null) return null; // the concurrent map does not accept null keys
        return this.vocMap.remove(root);
    }

}

@ -62,6 +62,7 @@ import net.yacy.data.wiki.WikiParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.TextParser; import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.content.SurrogateReader; import net.yacy.document.content.SurrogateReader;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
@ -523,7 +524,7 @@ public class MediawikiImporter extends Thread implements Importer {
public void genDocument() throws Parser.Failure { public void genDocument() throws Parser.Failure {
try { try {
this.url = new AnchorURL(this.urlStub + this.title); this.url = new AnchorURL(this.urlStub + this.title);
final Document[] parsed = TextParser.parseSource(this.url, "text/html", "UTF-8", 1, UTF8.getBytes(this.html)); final Document[] parsed = TextParser.parseSource(this.url, "text/html", "UTF-8", new VocabularyScraper(), 1, UTF8.getBytes(this.html));
this.document = Document.mergeDocuments(this.url, "text/html", parsed); this.document = Document.mergeDocuments(this.url, "text/html", parsed);
// the wiki parser is not able to find the proper title in the source text, so it must be set here // the wiki parser is not able to find the proper title in the source text, so it must be set here
this.document.setTitle(this.title); this.document.setTitle(this.title);

@ -43,6 +43,7 @@ import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
public class apkParser extends AbstractParser implements Parser { public class apkParser extends AbstractParser implements Parser {
@ -53,7 +54,7 @@ public class apkParser extends AbstractParser implements Parser {
} }
@Override @Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException {
/* /*
* things to discover: * things to discover:

@ -41,6 +41,7 @@ import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import org.jaudiotagger.audio.AudioFile; import org.jaudiotagger.audio.AudioFile;
import org.jaudiotagger.audio.AudioFileIO; import org.jaudiotagger.audio.AudioFileIO;
@ -70,7 +71,7 @@ public class audioTagParser extends AbstractParser implements Parser {
@Override @Override
public Document[] parse(final AnchorURL location, final String mimeType, public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final InputStream source) final String charset, final VocabularyScraper scraper, final InputStream source)
throws Parser.Failure, InterruptedException { throws Parser.Failure, InterruptedException {
String filename = location.getFileName(); String filename = location.getFileName();

@ -13,6 +13,7 @@ import net.yacy.data.ymark.YMarkUtil;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.rdfa.impl.RDFaParser; import net.yacy.document.parser.rdfa.impl.RDFaParser;
import net.yacy.kelondro.blob.Tables; import net.yacy.kelondro.blob.Tables;
import net.yacy.search.Switchboard; import net.yacy.search.Switchboard;
@ -37,9 +38,9 @@ public class AugmentParser extends AbstractParser implements Parser {
} }
@Override @Override
public Document[] parse(AnchorURL url, String mimeType, String charset, InputStream source) throws Parser.Failure, InterruptedException { public Document[] parse(AnchorURL url, String mimeType, String charset, final VocabularyScraper scraper, InputStream source) throws Parser.Failure, InterruptedException {
Document[] htmlDocs = this.rdfaParser.parse(url, mimeType, charset, source); Document[] htmlDocs = this.rdfaParser.parse(url, mimeType, charset, scraper, source);
for (final Document doc : htmlDocs) { for (final Document doc : htmlDocs) {
/* analyze(doc, url, mimeType, charset); // enrich document text */ /* analyze(doc, url, mimeType, charset); // enrich document text */

@ -36,6 +36,7 @@ import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.TextParser; import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.FileUtils;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
@ -57,7 +58,7 @@ public class bzipParser extends AbstractParser implements Parser {
@Override @Override
public Document[] parse(final AnchorURL location, final String mimeType, public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final InputStream source) final String charset, final VocabularyScraper scraper, final InputStream source)
throws Parser.Failure, InterruptedException { throws Parser.Failure, InterruptedException {
File tempFile = null; File tempFile = null;
@ -94,7 +95,7 @@ public class bzipParser extends AbstractParser implements Parser {
out.close(); out.close();
// creating a new parser class to parse the unzipped content // creating a new parser class to parse the unzipped content
docs = TextParser.parseSource(location, null, null, 999, tempFile); docs = TextParser.parseSource(location, null, null, scraper, 999, tempFile);
} catch (final Exception e) { } catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e;

@ -38,6 +38,7 @@ import net.yacy.cora.util.CommonPattern;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
/** /**
* a parser for comma-separated values * a parser for comma-separated values
@ -52,7 +53,7 @@ public class csvParser extends AbstractParser implements Parser {
} }
@Override @Override
public Document[] parse(AnchorURL location, String mimeType, String charset, InputStream source) throws Parser.Failure, InterruptedException { public Document[] parse(AnchorURL location, String mimeType, String charset, final VocabularyScraper scraper, InputStream source) throws Parser.Failure, InterruptedException {
// construct a document using all cells of the document // construct a document using all cells of the document
// the first row is used as headline // the first row is used as headline
// all lines are artificially terminated by a '.' to separate them as sentence for the condenser. // all lines are artificially terminated by a '.' to separate them as sentence for the condenser.

@ -35,6 +35,7 @@ import net.yacy.cora.util.CommonPattern;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import org.apache.poi.hwpf.extractor.WordExtractor; import org.apache.poi.hwpf.extractor.WordExtractor;
@ -57,7 +58,7 @@ public class docParser extends AbstractParser implements Parser {
@SuppressWarnings("deprecation") @SuppressWarnings("deprecation")
@Override @Override
public Document[] parse(final AnchorURL location, final String mimeType, public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final InputStream source) final String charset, final VocabularyScraper scraper, final InputStream source)
throws Parser.Failure, InterruptedException { throws Parser.Failure, InterruptedException {
final WordExtractor extractor; final WordExtractor extractor;

@ -29,6 +29,7 @@ import net.yacy.cora.document.id.AnchorURL;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.util.MemoryControl; import net.yacy.kelondro.util.MemoryControl;
import org.apache.poi.util.StringUtil; import org.apache.poi.util.StringUtil;
@ -60,7 +61,7 @@ public class dwgParser extends AbstractParser implements Parser {
} }
@Override @Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException {
// check memory for parser // check memory for parser
if (!MemoryControl.request(200 * 1024 * 1024, true)) if (!MemoryControl.request(200 * 1024 * 1024, true))

@ -32,6 +32,7 @@ import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
/** /**
* this parser can parse just anything because it uses only the uri/file/path information * this parser can parse just anything because it uses only the uri/file/path information
@ -46,7 +47,7 @@ public class genericParser extends AbstractParser implements Parser {
@Override @Override
public Document[] parse(final AnchorURL location, final String mimeType, public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final InputStream source1) final String charset, final VocabularyScraper scraper, final InputStream source1)
throws Parser.Failure, InterruptedException { throws Parser.Failure, InterruptedException {
String filename = location.getFileName(); String filename = location.getFileName();
final Document[] docs = new Document[]{new Document( final Document[] docs = new Document[]{new Document(

@ -37,6 +37,7 @@ import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.TextParser; import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.FileUtils;
@ -55,7 +56,7 @@ public class gzipParser extends AbstractParser implements Parser {
} }
@Override @Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException {
File tempFile = null; File tempFile = null;
Document[] docs = null; Document[] docs = null;
@ -79,7 +80,7 @@ public class gzipParser extends AbstractParser implements Parser {
out.close(); out.close();
// creating a new parser class to parse the unzipped content // creating a new parser class to parse the unzipped content
docs = TextParser.parseSource(location, null, null, 999, tempFile); docs = TextParser.parseSource(location, null, null, scraper, 999, tempFile);
} catch (final Exception e) { } catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e;

@ -59,6 +59,7 @@ import net.yacy.cora.storage.SizeLimitedSet;
import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.NumberTools; import net.yacy.cora.util.NumberTools;
import net.yacy.document.SentenceReader; import net.yacy.document.SentenceReader;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.htmlParser; import net.yacy.document.parser.htmlParser;
import net.yacy.document.parser.html.Evaluation.Element; import net.yacy.document.parser.html.Evaluation.Element;
import net.yacy.document.parser.images.genericImageParser; import net.yacy.document.parser.images.genericImageParser;
@ -88,7 +89,6 @@ public class ContentScraper extends AbstractScraper implements Scraper {
public enum TagName { public enum TagName {
html(TagType.singleton), // scraped as singleton to get attached properties like 'lang' html(TagType.singleton), // scraped as singleton to get attached properties like 'lang'
body(TagType.singleton), // scraped as singleton to get attached properties like 'class' body(TagType.singleton), // scraped as singleton to get attached properties like 'class'
div(TagType.singleton), // scraped as singleton to get attached properties like 'id'
img(TagType.singleton), img(TagType.singleton),
base(TagType.singleton), base(TagType.singleton),
frame(TagType.singleton), frame(TagType.singleton),
@ -115,7 +115,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
i(TagType.pair), i(TagType.pair),
li(TagType.pair), li(TagType.pair),
script(TagType.pair), script(TagType.pair),
style(TagType.pair); span(TagType.pair),
div(TagType.pair);
public TagType type; public TagType type;
private TagName(final TagType type) { private TagName(final TagType type) {
@ -185,6 +186,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private double lon, lat; private double lon, lat;
private AnchorURL canonical, publisher; private AnchorURL canonical, publisher;
private final int maxLinks; private final int maxLinks;
private final VocabularyScraper vocabularyScraper;
private int breadcrumbs; private int breadcrumbs;
@ -203,14 +205,21 @@ public class ContentScraper extends AbstractScraper implements Scraper {
*/ */
private final Evaluation evaluationScores; private final Evaluation evaluationScores;
/**
* scrape a document
* @param root the document root url
* @param maxLinks the maximum number of links to scrape
* @param vocabularyScraper a scraper which maps html class names to vocabulary names; used to scrape content from DOM elements carrying such a class name
*/
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
public ContentScraper(final DigestURL root, int maxLinks) { public ContentScraper(final DigestURL root, int maxLinks, final VocabularyScraper vocabularyScraper) {
// the root value here will not be used to load the resource. // the root value here will not be used to load the resource.
// it is only the reference for relative links // it is only the reference for relative links
super(linkTags0, linkTags1); super(linkTags0, linkTags1);
assert root != null; assert root != null;
this.root = root; this.root = root;
this.maxLinks = maxLinks; this.maxLinks = maxLinks;
this.vocabularyScraper = vocabularyScraper;
this.evaluationScores = new Evaluation(); this.evaluationScores = new Evaluation();
this.rss = new SizeLimitedMap<DigestURL, String>(maxLinks); this.rss = new SizeLimitedMap<DigestURL, String>(maxLinks);
this.css = new SizeLimitedMap<DigestURL, String>(maxLinks); this.css = new SizeLimitedMap<DigestURL, String>(maxLinks);
@ -392,15 +401,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.frames.add(src); this.frames.add(src);
this.evaluationScores.match(Element.framepath, src.toNormalform(true)); this.evaluationScores.match(Element.framepath, src.toNormalform(true));
} else if (tag.name.equalsIgnoreCase("body")) { } else if (tag.name.equalsIgnoreCase("body")) {
final String c = tag.opts.getProperty("class", EMPTY_STRING); final String classprop = tag.opts.getProperty("class", EMPTY_STRING);
this.evaluationScores.match(Element.bodyclass, c); this.evaluationScores.match(Element.bodyclass, classprop);
} else if (tag.name.equalsIgnoreCase("div")) {
final String id = tag.opts.getProperty("id", EMPTY_STRING);
this.evaluationScores.match(Element.divid, id);
final String itemtype = tag.opts.getProperty("itemtype", EMPTY_STRING);
if (itemtype.equals("http://data-vocabulary.org/Breadcrumb")) {
breadcrumbs++;
}
} else if (tag.name.equalsIgnoreCase("meta")) { } else if (tag.name.equalsIgnoreCase("meta")) {
final String content = tag.opts.getProperty("content", EMPTY_STRING); final String content = tag.opts.getProperty("content", EMPTY_STRING);
String name = tag.opts.getProperty("name", EMPTY_STRING); String name = tag.opts.getProperty("name", EMPTY_STRING);
@ -509,6 +511,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
@Override @Override
public void scrapeTag1(Tag tag) { public void scrapeTag1(Tag tag) {
final String classprop = tag.opts.getProperty("class", EMPTY_STRING);
//System.out.println("class = " + classprop);
this.vocabularyScraper.check(this.root, classprop, tag.content);
// System.out.println("ScrapeTag1: tag.tagname=" + tag.tagname + ", opts=" + tag.opts.toString() + ", text=" + UTF8.String(text)); // System.out.println("ScrapeTag1: tag.tagname=" + tag.tagname + ", opts=" + tag.opts.toString() + ", text=" + UTF8.String(text));
if (tag.name.equalsIgnoreCase("a") && tag.content.length() < 2048) { if (tag.name.equalsIgnoreCase("a") && tag.content.length() < 2048) {
String href = tag.opts.getProperty("href", EMPTY_STRING); String href = tag.opts.getProperty("href", EMPTY_STRING);
@ -536,7 +541,14 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.evaluationScores.match(Element.apath, href); this.evaluationScores.match(Element.apath, href);
} }
final String h; final String h;
if ((tag.name.equalsIgnoreCase("h1")) && (tag.content.length() < 1024)) { if (tag.name.equalsIgnoreCase("div")) {
final String id = tag.opts.getProperty("id", EMPTY_STRING);
this.evaluationScores.match(Element.divid, id);
final String itemtype = tag.opts.getProperty("itemtype", EMPTY_STRING);
if (itemtype.equals("http://data-vocabulary.org/Breadcrumb")) {
breadcrumbs++;
}
} else if ((tag.name.equalsIgnoreCase("h1")) && (tag.content.length() < 1024)) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars()))); h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.headlines[0].add(h); if (h.length() > 0) this.headlines[0].add(h);
} else if((tag.name.equalsIgnoreCase("h2")) && (tag.content.length() < 1024)) { } else if((tag.name.equalsIgnoreCase("h2")) && (tag.content.length() < 1024)) {
@ -601,7 +613,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
// start a new scraper to parse links inside this text // start a new scraper to parse links inside this text
// parsing the content // parsing the content
final ContentScraper scraper = new ContentScraper(this.root, this.maxLinks); final ContentScraper scraper = new ContentScraper(this.root, this.maxLinks, this.vocabularyScraper);
final TransformerWriter writer = new TransformerWriter(null, null, scraper, null, false); final TransformerWriter writer = new TransformerWriter(null, null, scraper, null, false);
try { try {
FileUtils.copy(new CharArrayReader(inlineHtml), writer); FileUtils.copy(new CharArrayReader(inlineHtml), writer);
@ -1090,13 +1102,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (page == null) throw new IOException("no content in file " + file.toString()); if (page == null) throw new IOException("no content in file " + file.toString());
// scrape document to look up charset // scrape document to look up charset
final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page),"UTF-8", new DigestURL("http://localhost"),null,false, maxLinks); final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page), "UTF-8", new VocabularyScraper(), new DigestURL("http://localhost"), null, false, maxLinks);
String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset()); String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset());
htmlFilter.close(); htmlFilter.close();
if (charset == null) charset = Charset.defaultCharset().toString(); if (charset == null) charset = Charset.defaultCharset().toString();
// scrape content // scrape content
final ContentScraper scraper = new ContentScraper(new DigestURL("http://localhost"), maxLinks); final ContentScraper scraper = new ContentScraper(new DigestURL("http://localhost"), maxLinks, new VocabularyScraper());
final Writer writer = new TransformerWriter(null, null, scraper, null, false); final Writer writer = new TransformerWriter(null, null, scraper, null, false);
FileUtils.copy(new ByteArrayInputStream(page), writer, Charset.forName(charset)); FileUtils.copy(new ByteArrayInputStream(page), writer, Charset.forName(charset));
writer.close(); writer.close();

@ -37,6 +37,7 @@ import java.util.Properties;
import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.CommonPattern; import net.yacy.cora.util.CommonPattern;
import net.yacy.document.VocabularyScraper;
public class ScraperInputStream extends InputStream implements ScraperListener { public class ScraperInputStream extends InputStream implements ScraperListener {
@ -59,6 +60,7 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
public ScraperInputStream( public ScraperInputStream(
final InputStream inStream, final InputStream inStream,
final String inputStreamCharset, final String inputStreamCharset,
final VocabularyScraper vocabularyScraper,
final DigestURL rooturl, final DigestURL rooturl,
final Transformer transformer, final Transformer transformer,
final boolean passbyIfBinarySuspect, final boolean passbyIfBinarySuspect,
@ -68,7 +70,7 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
this.bufferedIn = new BufferedInputStream(inStream, (int) preBufferSize); this.bufferedIn = new BufferedInputStream(inStream, (int) preBufferSize);
this.bufferedIn.mark((int) preBufferSize); this.bufferedIn.mark((int) preBufferSize);
final ContentScraper scraper = new ContentScraper(rooturl, maxLinks); final ContentScraper scraper = new ContentScraper(rooturl, maxLinks, vocabularyScraper);
scraper.registerHtmlFilterEventListener(this); scraper.registerHtmlFilterEventListener(this);
try { try {

@ -45,6 +45,7 @@ import net.yacy.cora.util.CommonPattern;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry; import net.yacy.document.parser.html.ImageEntry;
import net.yacy.document.parser.html.ScraperInputStream; import net.yacy.document.parser.html.ScraperInputStream;
@ -86,13 +87,13 @@ public class htmlParser extends AbstractParser implements Parser {
public Document[] parse( public Document[] parse(
final AnchorURL location, final AnchorURL location,
final String mimeType, final String mimeType,
final String documentCharset, final String documentCharset, final VocabularyScraper vocscraper,
final InputStream sourceStream) throws Parser.Failure, InterruptedException { final InputStream sourceStream) throws Parser.Failure, InterruptedException {
try { try {
// first get a document from the parsed html // first get a document from the parsed html
Charset[] detectedcharsetcontainer = new Charset[]{null}; Charset[] detectedcharsetcontainer = new Charset[]{null};
final ContentScraper scraper = parseToScraper(location, documentCharset, detectedcharsetcontainer, sourceStream, maxLinks); final ContentScraper scraper = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, sourceStream, maxLinks);
// parseToScraper also detects/corrects/sets charset from html content tag // parseToScraper also detects/corrects/sets charset from html content tag
final Document document = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraper); final Document document = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraper);
@ -150,7 +151,7 @@ public class htmlParser extends AbstractParser implements Parser {
return ppd; return ppd;
} }
public static ContentScraper parseToScraper(final DigestURL location, final String documentCharset, String input, int maxLinks) throws IOException { public static ContentScraper parseToScraper(final DigestURL location, final String documentCharset, final VocabularyScraper vocabularyScraper, String input, int maxLinks) throws IOException {
Charset[] detectedcharsetcontainer = new Charset[]{null}; Charset[] detectedcharsetcontainer = new Charset[]{null};
InputStream sourceStream; InputStream sourceStream;
try { try {
@ -160,7 +161,7 @@ public class htmlParser extends AbstractParser implements Parser {
} }
ContentScraper scraper; ContentScraper scraper;
try { try {
scraper = parseToScraper(location, documentCharset, detectedcharsetcontainer, sourceStream, maxLinks); scraper = parseToScraper(location, documentCharset, vocabularyScraper, detectedcharsetcontainer, sourceStream, maxLinks);
} catch (Failure e) { } catch (Failure e) {
throw new IOException(e.getMessage()); throw new IOException(e.getMessage());
} }
@ -170,6 +171,7 @@ public class htmlParser extends AbstractParser implements Parser {
public static ContentScraper parseToScraper( public static ContentScraper parseToScraper(
final DigestURL location, final DigestURL location,
final String documentCharset, final String documentCharset,
final VocabularyScraper vocabularyScraper,
Charset[] detectedcharsetcontainer, Charset[] detectedcharsetcontainer,
InputStream sourceStream, InputStream sourceStream,
final int maxLinks) throws Parser.Failure, IOException { final int maxLinks) throws Parser.Failure, IOException {
@ -186,7 +188,7 @@ public class htmlParser extends AbstractParser implements Parser {
if (charset == null) { if (charset == null) {
ScraperInputStream htmlFilter = null; ScraperInputStream htmlFilter = null;
try { try {
htmlFilter = new ScraperInputStream(sourceStream, documentCharset, location, null, false, maxLinks); htmlFilter = new ScraperInputStream(sourceStream, documentCharset, vocabularyScraper, location, null, false, maxLinks);
sourceStream = htmlFilter; sourceStream = htmlFilter;
charset = htmlFilter.detectCharset(); charset = htmlFilter.detectCharset();
} catch (final IOException e1) { } catch (final IOException e1) {
@ -220,7 +222,7 @@ public class htmlParser extends AbstractParser implements Parser {
} }
// parsing the content // parsing the content
final ContentScraper scraper = new ContentScraper(location, maxLinks); final ContentScraper scraper = new ContentScraper(location, maxLinks, vocabularyScraper);
final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false, Math.max(64, Math.min(4096, sourceStream.available()))); final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false, Math.max(64, Math.min(4096, sourceStream.available())));
try { try {
FileUtils.copy(sourceStream, writer, detectedcharsetcontainer[0]); FileUtils.copy(sourceStream, writer, detectedcharsetcontainer[0]);
@ -322,7 +324,7 @@ public class htmlParser extends AbstractParser implements Parser {
try { try {
url = new AnchorURL(args[0]); url = new AnchorURL(args[0]);
final byte[] content = url.get(ClientIdentification.yacyInternetCrawlerAgent, null, null); final byte[] content = url.get(ClientIdentification.yacyInternetCrawlerAgent, null, null);
final Document[] document = new htmlParser().parse(url, "text/html", "utf-8", new ByteArrayInputStream(content)); final Document[] document = new htmlParser().parse(url, "text/html", "utf-8", new VocabularyScraper(), new ByteArrayInputStream(content));
final String title = document[0].dc_title(); final String title = document[0].dc_title();
System.out.println(title); System.out.println(title);
} catch (final MalformedURLException e) { } catch (final MalformedURLException e) {

@ -53,6 +53,7 @@ import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.ImageEntry; import net.yacy.document.parser.html.ImageEntry;
import net.yacy.document.parser.images.bmpParser.IMAGEMAP; import net.yacy.document.parser.images.bmpParser.IMAGEMAP;
import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.FileUtils;
@ -92,7 +93,7 @@ public class genericImageParser extends AbstractParser implements Parser {
public Document[] parse( public Document[] parse(
final AnchorURL location, final AnchorURL location,
final String mimeType, final String mimeType,
final String documentCharset, final String documentCharset, final VocabularyScraper scraper,
final InputStream sourceStream) throws Parser.Failure, InterruptedException { final InputStream sourceStream) throws Parser.Failure, InterruptedException {
ImageInfo ii = null; ImageInfo ii = null;
@ -314,7 +315,7 @@ public class genericImageParser extends AbstractParser implements Parser {
AnchorURL uri; AnchorURL uri;
try { try {
uri = new AnchorURL("http://localhost/" + image.getName()); uri = new AnchorURL("http://localhost/" + image.getName());
final Document[] document = parser.parse(uri, "image/" + MultiProtocolURL.getFileExtension(uri.getFileName()), "UTF-8", new FileInputStream(image)); final Document[] document = parser.parse(uri, "image/" + MultiProtocolURL.getFileExtension(uri.getFileName()), "UTF-8", new VocabularyScraper(), new FileInputStream(image));
System.out.println(document[0].toString()); System.out.println(document[0].toString());
} catch (final MalformedURLException e) { } catch (final MalformedURLException e) {
e.printStackTrace(); e.printStackTrace();

@ -33,6 +33,7 @@ import com.drew.metadata.Directory;
import com.drew.metadata.Metadata; import com.drew.metadata.Metadata;
import com.drew.metadata.Tag; import com.drew.metadata.Tag;
import com.drew.metadata.exif.GpsDirectory; import com.drew.metadata.exif.GpsDirectory;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.BufferedInputStream; import java.io.BufferedInputStream;
@ -42,11 +43,13 @@ import java.util.HashMap;
import java.util.HashSet; import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import java.util.List; import java.util.List;
import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
/** /**
@ -84,7 +87,7 @@ public class metadataImageParser extends AbstractParser implements Parser {
public Document[] parse( public Document[] parse(
final AnchorURL location, final AnchorURL location,
final String mimeType, final String mimeType,
final String documentCharset, final String documentCharset, final VocabularyScraper scraper,
final InputStream sourceStream) throws Parser.Failure, InterruptedException { final InputStream sourceStream) throws Parser.Failure, InterruptedException {
String title = null; String title = null;

@ -28,6 +28,7 @@ import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
/** /**
* This parser is used if we know that the content is text but the exact format is unknown. * This parser is used if we know that the content is text but the exact format is unknown.
@ -59,10 +60,10 @@ public class linkScraperParser extends AbstractParser implements Parser {
} }
@Override @Override
public Document[] parse(final AnchorURL location, final String mimeType, public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final InputStream source) final String charset, final VocabularyScraper scraper, final InputStream source)
throws Parser.Failure, InterruptedException { throws Parser.Failure, InterruptedException {
Document[] htmlParserDocs = new htmlParser().parse(location, mimeType, charset, source); Document[] htmlParserDocs = new htmlParser().parse(location, mimeType, charset, scraper, source);
Document htmlParserDoc = htmlParserDocs == null ? null : Document.mergeDocuments(location, mimeType, htmlParserDocs); Document htmlParserDoc = htmlParserDocs == null ? null : Document.mergeDocuments(location, mimeType, htmlParserDocs);

@ -39,6 +39,7 @@ import net.yacy.cora.document.id.AnchorURL;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import org.xml.sax.Attributes; import org.xml.sax.Attributes;
import org.xml.sax.SAXException; import org.xml.sax.SAXException;
@ -71,7 +72,7 @@ public class mmParser extends AbstractParser implements Parser {
@Override @Override
public Document[] parse(final AnchorURL location, final String mimeType, public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final InputStream source) final String charset, final VocabularyScraper scraper, final InputStream source)
throws Parser.Failure, InterruptedException throws Parser.Failure, InterruptedException
{ {
final StringBuilder sb = new StringBuilder(); final StringBuilder sb = new StringBuilder();

@ -48,6 +48,7 @@ import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.xml.ODContentHandler; import net.yacy.document.parser.xml.ODContentHandler;
import net.yacy.document.parser.xml.ODMetaHandler; import net.yacy.document.parser.xml.ODMetaHandler;
import net.yacy.kelondro.io.CharBuffer; import net.yacy.kelondro.io.CharBuffer;
@ -215,7 +216,7 @@ public class odtParser extends AbstractParser implements Parser {
} }
@Override @Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException {
File dest = null; File dest = null;
try { try {
// creating a tempfile // creating a tempfile

@ -48,6 +48,7 @@ import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.xml.ODContentHandler; import net.yacy.document.parser.xml.ODContentHandler;
import net.yacy.document.parser.xml.ODMetaHandler; import net.yacy.document.parser.xml.ODMetaHandler;
import net.yacy.kelondro.io.CharBuffer; import net.yacy.kelondro.io.CharBuffer;
@ -201,7 +202,7 @@ public class ooxmlParser extends AbstractParser implements Parser {
} }
@Override @Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException {
File dest = null; File dest = null;
try { try {
// creating a tempfile // creating a tempfile

@ -59,6 +59,7 @@ import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.io.CharBuffer; import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.MemoryControl; import net.yacy.kelondro.util.MemoryControl;
@ -85,7 +86,7 @@ public class pdfParser extends AbstractParser implements Parser {
} }
@Override @Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException {
// check memory for parser // check memory for parser
if (!MemoryControl.request(200 * 1024 * 1024, false)) if (!MemoryControl.request(200 * 1024 * 1024, false))
@ -375,7 +376,7 @@ public class pdfParser extends AbstractParser implements Parser {
final AbstractParser parser = new pdfParser(); final AbstractParser parser = new pdfParser();
Document document = null; Document document = null;
try { try {
document = Document.mergeDocuments(null, "application/pdf", parser.parse(null, "application/pdf", null, new FileInputStream(pdfFile))); document = Document.mergeDocuments(null, "application/pdf", parser.parse(null, "application/pdf", null, new VocabularyScraper(), new FileInputStream(pdfFile)));
} catch (final Parser.Failure e) { } catch (final Parser.Failure e) {
System.err.println("Cannot parse file " + pdfFile.getAbsolutePath()); System.err.println("Cannot parse file " + pdfFile.getAbsolutePath());
ConcurrentLog.logException(e); ConcurrentLog.logException(e);

@ -37,6 +37,7 @@ import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import org.apache.poi.hslf.extractor.PowerPointExtractor; import org.apache.poi.hslf.extractor.PowerPointExtractor;
@ -62,7 +63,7 @@ public class pptParser extends AbstractParser implements Parser {
*/ */
@Override @Override
public Document[] parse(final AnchorURL location, final String mimeType, public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final InputStream source) throws Parser.Failure, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure,
InterruptedException { InterruptedException {
try { try {
/* /*

@ -41,6 +41,7 @@ import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.FileUtils;
@ -258,7 +259,7 @@ public class psParser extends AbstractParser implements Parser {
@Override @Override
public Document[] parse(final AnchorURL location, final String mimeType, public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final InputStream source) final String charset, final VocabularyScraper scraper, final InputStream source)
throws Parser.Failure, InterruptedException { throws Parser.Failure, InterruptedException {
File tempFile = null; File tempFile = null;

@ -34,6 +34,7 @@ import net.yacy.cora.document.id.AnchorURL;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
public class rdfParser extends AbstractParser implements Parser { public class rdfParser extends AbstractParser implements Parser {
@ -46,7 +47,7 @@ public class rdfParser extends AbstractParser implements Parser {
@Override @Override
public Document[] parse(final AnchorURL url, final String mimeType, public Document[] parse(final AnchorURL url, final String mimeType,
final String charset, final InputStream source) final String charset, final VocabularyScraper scraper, final InputStream source)
throws Failure, InterruptedException { throws Failure, InterruptedException {

@ -23,6 +23,7 @@ import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.htmlParser; import net.yacy.document.parser.htmlParser;
import net.yacy.document.parser.rdfa.IRDFaTriple; import net.yacy.document.parser.rdfa.IRDFaTriple;
@ -48,10 +49,10 @@ public class RDFaParser extends AbstractParser implements Parser {
@Override @Override
public Document[] parse(AnchorURL url, String mimeType, public Document[] parse(AnchorURL url, String mimeType,
String charset, InputStream source) throws Failure, String charset, final VocabularyScraper scraper, InputStream source) throws Failure,
InterruptedException { InterruptedException {
Document[] htmlDocs = parseHtml(url, mimeType, charset, source); Document[] htmlDocs = parseHtml(url, mimeType, charset, scraper, source);
// TODO: current hardcoded restriction: apply rdfa parser only on selected sources. // TODO: current hardcoded restriction: apply rdfa parser only on selected sources.
@ -97,12 +98,12 @@ public class RDFaParser extends AbstractParser implements Parser {
} }
private Document[] parseHtml(AnchorURL url, String mimeType, private Document[] parseHtml(AnchorURL url, String mimeType,
String charset, InputStream source) throws Failure, String charset, VocabularyScraper scraper, InputStream source) throws Failure,
InterruptedException { InterruptedException {
Document[] htmlDocs = null; Document[] htmlDocs = null;
try { try {
htmlDocs = this.hp.parse(url, mimeType, charset, source); htmlDocs = this.hp.parse(url, mimeType, charset, scraper, source);
source.reset(); source.reset();
} catch (final IOException e1) { } catch (final IOException e1) {
@ -179,7 +180,7 @@ public class RDFaParser extends AbstractParser implements Parser {
if (aReader != null) { if (aReader != null) {
RDFaParser aParser = new RDFaParser(); RDFaParser aParser = new RDFaParser();
try { try {
aParser.parse(new AnchorURL(args[0]),"","",aURL.openStream()); aParser.parse(new AnchorURL(args[0]), "", "", new VocabularyScraper(), aURL.openStream());
} catch (final FileNotFoundException e) { } catch (final FileNotFoundException e) {
e.printStackTrace(); e.printStackTrace();
} catch (final IOException e) { } catch (final IOException e) {

@ -43,6 +43,7 @@ import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.TextParser; import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.ImageEntry; import net.yacy.document.parser.html.ImageEntry;
public class rssParser extends AbstractParser implements Parser { public class rssParser extends AbstractParser implements Parser {
@ -59,7 +60,7 @@ public class rssParser extends AbstractParser implements Parser {
@Override @Override
public Document[] parse(final AnchorURL url, final String mimeType, public Document[] parse(final AnchorURL url, final String mimeType,
final String charset, final InputStream source) final String charset, final VocabularyScraper scraper, final InputStream source)
throws Failure, InterruptedException { throws Failure, InterruptedException {
RSSReader rssReader; RSSReader rssReader;
try { try {

@ -37,6 +37,7 @@ import net.yacy.cora.document.id.AnchorURL;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
public class rtfParser extends AbstractParser implements Parser { public class rtfParser extends AbstractParser implements Parser {
@ -53,7 +54,7 @@ public class rtfParser extends AbstractParser implements Parser {
@Override @Override
public Document[] parse(final AnchorURL location, final String mimeType, public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final InputStream source) final String charset, final VocabularyScraper scraper, final InputStream source)
throws Parser.Failure, InterruptedException { throws Parser.Failure, InterruptedException {
try { try {

@ -40,6 +40,7 @@ import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.TextParser; import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.FileUtils;
import SevenZip.ArchiveExtractCallback; import SevenZip.ArchiveExtractCallback;
import SevenZip.IInStream; import SevenZip.IInStream;
@ -105,7 +106,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
} }
@Override @Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException {
try { try {
final ByteArrayOutputStream cfos = new ByteArrayOutputStream(); final ByteArrayOutputStream cfos = new ByteArrayOutputStream();
FileUtils.copy(source, cfos); FileUtils.copy(source, cfos);
@ -171,7 +172,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
// below for reversion of the effects // below for reversion of the effects
final AnchorURL url = AnchorURL.newAnchor(this.doc.dc_source(), this.prefix + "/" + super.filePath); final AnchorURL url = AnchorURL.newAnchor(this.doc.dc_source(), this.prefix + "/" + super.filePath);
final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1)); final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1));
theDocs = TextParser.parseSource(url, mime, null, this.doc.getDepth() + 1, this.cfos.toByteArray()); theDocs = TextParser.parseSource(url, mime, null, new VocabularyScraper(), this.doc.getDepth() + 1, this.cfos.toByteArray());
this.doc.addSubDocuments(theDocs); this.doc.addSubDocuments(theDocs);
} }

@ -35,6 +35,7 @@ import net.yacy.cora.document.id.AnchorURL;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
// this is a new implementation of this parser idiom using multiple documents as result set // this is a new implementation of this parser idiom using multiple documents as result set
@ -58,7 +59,7 @@ public class sidAudioParser extends AbstractParser implements Parser {
@Override @Override
public Document[] parse(final AnchorURL location, final String mimeType, public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final InputStream source) final String charset, final VocabularyScraper scraper, final InputStream source)
throws Parser.Failure, InterruptedException { throws Parser.Failure, InterruptedException {
try { try {
final int available = source.available(); final int available = source.available();

@ -51,6 +51,7 @@ import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.TextParser; import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.ImageEntry; import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.io.ByteCountInputStream; import net.yacy.kelondro.io.ByteCountInputStream;
@ -70,7 +71,7 @@ public class sitemapParser extends AbstractParser implements Parser {
@Override @Override
public Document[] parse(final AnchorURL url, final String mimeType, public Document[] parse(final AnchorURL url, final String mimeType,
final String charset, final InputStream source) final String charset, final VocabularyScraper scraper, final InputStream source)
throws Failure, InterruptedException { throws Failure, InterruptedException {
final List<Document> docs = new ArrayList<Document>(); final List<Document> docs = new ArrayList<Document>();
SitemapReader sitemap = new SitemapReader(source, ClientIdentification.yacyInternetCrawlerAgent); SitemapReader sitemap = new SitemapReader(source, ClientIdentification.yacyInternetCrawlerAgent);

@ -37,6 +37,7 @@ import net.yacy.cora.document.id.AnchorURL;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import pt.tumba.parser.swf.SWF2HTML; import pt.tumba.parser.swf.SWF2HTML;
public class swfParser extends AbstractParser implements Parser { public class swfParser extends AbstractParser implements Parser {
@ -56,7 +57,7 @@ public class swfParser extends AbstractParser implements Parser {
*/ */
@Override @Override
public Document[] parse(final AnchorURL location, final String mimeType, public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final InputStream source) final String charset, final VocabularyScraper scraper, final InputStream source)
throws Parser.Failure, InterruptedException throws Parser.Failure, InterruptedException
{ {

@ -40,6 +40,7 @@ import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.TextParser; import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.FileUtils;
import org.apache.tools.tar.TarEntry; import org.apache.tools.tar.TarEntry;
@ -61,7 +62,7 @@ public class tarParser extends AbstractParser implements Parser {
} }
@Override @Override
public Document[] parse(final AnchorURL url, final String mimeType, final String charset, InputStream source) throws Parser.Failure, InterruptedException { public Document[] parse(final AnchorURL url, final String mimeType, final String charset, final VocabularyScraper scraper, InputStream source) throws Parser.Failure, InterruptedException {
final List<Document> docacc = new ArrayList<Document>(); final List<Document> docacc = new ArrayList<Document>();
Document[] subDocs = null; Document[] subDocs = null;
@ -90,7 +91,7 @@ public class tarParser extends AbstractParser implements Parser {
try { try {
tmp = FileUtils.createTempFile(this.getClass(), name); tmp = FileUtils.createTempFile(this.getClass(), name);
FileUtils.copy(tis, tmp, entry.getSize()); FileUtils.copy(tis, tmp, entry.getSize());
subDocs = TextParser.parseSource(AnchorURL.newAnchor(url, "#" + name), mime, null, 999, tmp); subDocs = TextParser.parseSource(AnchorURL.newAnchor(url, "#" + name), mime, null, scraper, 999, tmp);
if (subDocs == null) continue; if (subDocs == null) continue;
for (final Document d: subDocs) docacc.add(d); for (final Document d: subDocs) docacc.add(d);
} catch (final Parser.Failure e) { } catch (final Parser.Failure e) {

@ -40,6 +40,7 @@ import net.yacy.document.Condenser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.LibraryProvider; import net.yacy.document.LibraryProvider;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.util.BDecoder; import net.yacy.kelondro.util.BDecoder;
import net.yacy.kelondro.util.BDecoder.BObject; import net.yacy.kelondro.util.BDecoder.BObject;
@ -56,7 +57,7 @@ public class torrentParser extends AbstractParser implements Parser {
} }
@Override @Override
public Document[] parse(AnchorURL location, String mimeType, String charset, InputStream source) public Document[] parse(AnchorURL location, String mimeType, String charset, final VocabularyScraper scraper, InputStream source)
throws Parser.Failure, InterruptedException { throws Parser.Failure, InterruptedException {
byte[] b = null; byte[] b = null;
try { try {
@ -119,8 +120,8 @@ public class torrentParser extends AbstractParser implements Parser {
try { try {
byte[] b = FileUtils.read(new File(args[0])); byte[] b = FileUtils.read(new File(args[0]));
torrentParser parser = new torrentParser(); torrentParser parser = new torrentParser();
Document[] d = parser.parse(new AnchorURL("http://localhost/test.torrent"), null, "UTF-8", new ByteArrayInputStream(b)); Document[] d = parser.parse(new AnchorURL("http://localhost/test.torrent"), null, "UTF-8", new VocabularyScraper(), new ByteArrayInputStream(b));
Condenser c = new Condenser(d[0], true, true, LibraryProvider.dymLib, false, false); Condenser c = new Condenser(d[0], null, true, true, LibraryProvider.dymLib, false, false);
Map<String, Word> w = c.words(); Map<String, Word> w = c.words();
for (Map.Entry<String, Word> e: w.entrySet()) System.out.println("Word: " + e.getKey() + " - " + e.getValue().posInText); for (Map.Entry<String, Word> e: w.entrySet()) System.out.println("Word: " + e.getKey() + " - " + e.getValue().posInText);
} catch (final IOException e) { } catch (final IOException e) {

@ -46,6 +46,7 @@ import net.yacy.cora.util.CommonPattern;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
/** /**
* Vcard specification: http://www.imc.org/pdi/vcard-21.txt * Vcard specification: http://www.imc.org/pdi/vcard-21.txt
@ -65,7 +66,7 @@ public class vcfParser extends AbstractParser implements Parser {
} }
@Override @Override
public Document[] parse(final AnchorURL url, final String mimeType, final String charset, final InputStream source) public Document[] parse(final AnchorURL url, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source)
throws Parser.Failure, InterruptedException { throws Parser.Failure, InterruptedException {
try { try {

@ -37,6 +37,7 @@ import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import org.apache.poi.hdgf.extractor.VisioTextExtractor; import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.apache.poi.hpsf.SummaryInformation; import org.apache.poi.hpsf.SummaryInformation;
@ -66,7 +67,7 @@ public class vsdParser extends AbstractParser implements Parser {
* all extracted information about the parsed document * all extracted information about the parsed document
*/ */
@Override @Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source)
throws Parser.Failure, InterruptedException { throws Parser.Failure, InterruptedException {
Document theDoc = null; Document theDoc = null;

@ -36,6 +36,7 @@ import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import org.apache.poi.hssf.eventusermodel.HSSFEventFactory; import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
import org.apache.poi.hssf.eventusermodel.HSSFListener; import org.apache.poi.hssf.eventusermodel.HSSFListener;
@ -68,7 +69,7 @@ public class xlsParser extends AbstractParser implements Parser {
*/ */
@Override @Override
public Document[] parse(final AnchorURL location, final String mimeType, public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final InputStream source) throws Parser.Failure, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure,
InterruptedException { InterruptedException {
return new XLSHSSFListener().parse(location, mimeType, charset, source); return new XLSHSSFListener().parse(location, mimeType, charset, source);
} }

@ -38,6 +38,7 @@ import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.TextParser; import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.MemoryControl; import net.yacy.kelondro.util.MemoryControl;
@ -62,7 +63,7 @@ public class zipParser extends AbstractParser implements Parser {
@Override @Override
public Document[] parse(final AnchorURL url, final String mimeType, public Document[] parse(final AnchorURL url, final String mimeType,
final String charset, final InputStream source) final String charset, final VocabularyScraper scraper, final InputStream source)
throws Parser.Failure, InterruptedException { throws Parser.Failure, InterruptedException {
// check memory for parser // check memory for parser
if (!MemoryControl.request(200 * 1024 * 1024, false)) if (!MemoryControl.request(200 * 1024 * 1024, false))
@ -89,7 +90,7 @@ public class zipParser extends AbstractParser implements Parser {
FileUtils.copy(zis, tmp, entry.getSize()); FileUtils.copy(zis, tmp, entry.getSize());
final DigestURL virtualURL = DigestURL.newURL(url, "#" + name); final DigestURL virtualURL = DigestURL.newURL(url, "#" + name);
//this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false)); //this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false));
docs = TextParser.parseSource(new AnchorURL(virtualURL), mime, null, 999, tmp); docs = TextParser.parseSource(new AnchorURL(virtualURL), mime, null, scraper, 999, tmp);
if (docs == null) continue; if (docs == null) continue;
for (final Document d: docs) docacc.add(d); for (final Document d: docs) docacc.add(d);
} catch (final Parser.Failure e) { } catch (final Parser.Failure e) {

@ -1045,7 +1045,6 @@ public class YaCyDefaultServlet extends HttpServlet {
upload.setFileSizeMax(SIZE_FILE_THRESHOLD); upload.setFileSizeMax(SIZE_FILE_THRESHOLD);
try { try {
// Parse the request to get form field items // Parse the request to get form field items
@SuppressWarnings("unchecked")
List<FileItem> fileItems = upload.parseRequest(request); List<FileItem> fileItems = upload.parseRequest(request);
// Process the uploaded file items // Process the uploaded file items
Iterator<FileItem> i = fileItems.iterator(); Iterator<FileItem> i = fileItems.iterator();

@ -418,7 +418,7 @@ public final class LoaderDispatcher {
final String supportError = TextParser.supports(url, responseHeader.mime()); final String supportError = TextParser.supports(url, responseHeader.mime());
if (supportError != null) throw new IOException("no parser support: " + supportError); if (supportError != null) throw new IOException("no parser support: " + supportError);
try { try {
documents = TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), response.depth(), response.getContent()); documents = TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), response.profile().scraper(), response.depth(), response.getContent());
if (documents == null) throw new IOException("document == null"); if (documents == null) throw new IOException("document == null");
} catch (final Exception e) { } catch (final Exception e) {
throw new IOException("parser error: " + e.getMessage()); throw new IOException("parser error: " + e.getMessage());

@ -2570,6 +2570,7 @@ public final class Switchboard extends serverSwitch {
new AnchorURL(response.url()), new AnchorURL(response.url()),
response.getMimeType(), response.getMimeType(),
response.getCharacterEncoding(), response.getCharacterEncoding(),
response.profile().scraper(),
response.depth(), response.depth(),
response.getContent()); response.getContent());
if ( documents == null ) { if ( documents == null ) {
@ -2750,7 +2751,7 @@ public final class Switchboard extends serverSwitch {
for ( int i = 0; i < in.documents.length; i++ ) { for ( int i = 0; i < in.documents.length; i++ ) {
condenser[i] = condenser[i] =
new Condenser( new Condenser(
in.documents[i], in.queueEntry.profile().indexText(), in.documents[i], in.queueEntry.profile().scraper(), in.queueEntry.profile().indexText(),
in.queueEntry.profile().indexMedia(), in.queueEntry.profile().indexMedia(),
LibraryProvider.dymLib, true, this.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.dates_in_content_sxt)); LibraryProvider.dymLib, true, this.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.dates_in_content_sxt));
@ -3189,7 +3190,7 @@ public final class Switchboard extends serverSwitch {
throw new Parser.Failure("indexing is denied", url); throw new Parser.Failure("indexing is denied", url);
} }
final Condenser condenser = new Condenser( final Condenser condenser = new Condenser(
document, true, true, LibraryProvider.dymLib, true, document, null, true, true, LibraryProvider.dymLib, true,
Switchboard.this.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.dates_in_content_sxt)); Switchboard.this.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.dates_in_content_sxt));
ResultImages.registerImages(url, document, true); ResultImages.registerImages(url, document, true);
Switchboard.this.webStructure.generateCitationReference(url, document); Switchboard.this.webStructure.generateCitationReference(url, document);

@ -42,6 +42,7 @@ import net.yacy.document.Condenser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.LibraryProvider; import net.yacy.document.LibraryProvider;
import net.yacy.document.TextParser; import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.workflow.WorkflowProcessor; import net.yacy.kelondro.workflow.WorkflowProcessor;
import net.yacy.search.schema.CollectionConfiguration; import net.yacy.search.schema.CollectionConfiguration;
import net.yacy.search.schema.WebgraphConfiguration; import net.yacy.search.schema.WebgraphConfiguration;
@ -149,7 +150,7 @@ public class DocumentIndex extends Segment {
length = -1; length = -1;
} }
try { try {
documents = TextParser.parseSource(url, null, null, 0, length, url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent, null, null)); documents = TextParser.parseSource(url, null, null, new VocabularyScraper(), 0, length, url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent, null, null));
} catch (final Exception e ) { } catch (final Exception e ) {
throw new IOException("cannot parse " + url.toNormalform(false) + ": " + e.getMessage()); throw new IOException("cannot parse " + url.toNormalform(false) + ": " + e.getMessage());
} }
@ -158,7 +159,7 @@ public class DocumentIndex extends Segment {
int c = 0; int c = 0;
for ( final Document document : documents ) { for ( final Document document : documents ) {
if (document == null) continue; if (document == null) continue;
final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, true, true); final Condenser condenser = new Condenser(document, null, true, true, LibraryProvider.dymLib, true, true);
rows[c++] = rows[c++] =
super.storeDocument( super.storeDocument(
url, url,

@ -761,7 +761,7 @@ public class Segment {
} }
// get the word set // get the word set
Set<String> words = null; Set<String> words = null;
words = new Condenser(document, true, true, null, false, false).words().keySet(); words = new Condenser(document, null, true, true, null, false, false).words().keySet();
// delete all word references // delete all word references
int count = 0; int count = 0;

Loading…
Cancel
Save