given css class and extends a given vocabulary with a term consisting of the text content of the html class tag. Additionally, the term is included in the semantic facet of the document. This allows the creation of a faceted search over documents without the pre-creation of vocabularies; instead, the vocabulary is created on-the-fly, possibly for use in other crawls. If any of the term scraping for a specific vocabulary is successful on a document, this vocabulary is excluded from auto-annotation on the page. To use this feature, do the following:
- create a vocabulary on /Vocabulary_p.html (if it does not exist yet)
- in /CrawlStartExpert.html you will now see the vocabularies as a column in a table. The second column provides text fields where you can name the class of the html entities from which the literal of the corresponding vocabulary shall be scraped
- when doing a search, you will see the content of the scraped fields in a navigation facet for the given vocabulary
pull/1/head
parent
1cb290170e
commit
b5ac29c9a5
@ -0,0 +1,90 @@
|
||||
/**
|
||||
* VocabularyScraper
|
||||
* Copyright 2015 by Michael Peter Christen
|
||||
* First released 30.01.2015 at http://yacy.net
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file lgpl21.txt
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package net.yacy.document;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
import net.yacy.cora.document.id.DigestURL;
|
||||
import net.yacy.cora.util.JSONException;
|
||||
import net.yacy.cora.util.JSONObject;
|
||||
import net.yacy.kelondro.io.CharBuffer;
|
||||
|
||||
public class VocabularyScraper {
|
||||
|
||||
private final JSONObject scraperDefinition;
|
||||
private Map<String, String> classVocabulary; // a mapping from class names to the vocabulary where this class should be mapped
|
||||
private final Map<DigestURL, ConcurrentHashMap<String, String>> vocMap; // a mapping from a document to a map from vocabularies to terms
|
||||
|
||||
public VocabularyScraper() {
|
||||
this.classVocabulary = null;
|
||||
this.scraperDefinition = new JSONObject();
|
||||
this.vocMap = new ConcurrentHashMap<>();
|
||||
}
|
||||
|
||||
public VocabularyScraper(JSONObject init) {
|
||||
// init must be a property list of property lists: the key of the top property list is the name of the vocabulary, the name of the embedded property list is the entity class and the value of the embedded property is the entity name
|
||||
this.scraperDefinition = init == null ? new JSONObject() : init;
|
||||
this.vocMap = new ConcurrentHashMap<>();
|
||||
if (this.scraperDefinition.length() == 0) {
|
||||
this.classVocabulary = null;
|
||||
} else {
|
||||
this.classVocabulary = new ConcurrentHashMap<>();
|
||||
for (String voc: this.scraperDefinition.keySet()) {
|
||||
JSONObject props = this.scraperDefinition.getJSONObject(voc);
|
||||
try {
|
||||
String classtype = props.getString("class");
|
||||
this.classVocabulary.put(classtype, voc);
|
||||
} catch (JSONException e) {}
|
||||
}
|
||||
if (this.classVocabulary.size() == 0) this.classVocabulary = null;
|
||||
}
|
||||
}
|
||||
|
||||
public VocabularyScraper(String init) {
|
||||
this(new JSONObject(init));
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return this.scraperDefinition.toString();
|
||||
}
|
||||
|
||||
public void check(DigestURL root, String className, CharBuffer content) {
|
||||
if (this.classVocabulary == null) return;
|
||||
String voc = this.classVocabulary.get(className);
|
||||
if (voc == null) return;
|
||||
// record the mapping
|
||||
ConcurrentHashMap<String, String> vocmap = this.vocMap.get(root);
|
||||
if (vocmap == null) {
|
||||
synchronized (this) {
|
||||
vocmap = new ConcurrentHashMap<>();
|
||||
this.vocMap.put(root, vocmap);
|
||||
}
|
||||
}
|
||||
if (!vocmap.containsKey(voc)) vocmap.put(voc, content.toString()); // we put only the first occurrence of the entity into the vocmap
|
||||
}
|
||||
|
||||
public Map<String, String> removeVocMap(DigestURL root) {
|
||||
return this.vocMap.remove(root);
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in new issue