snippet computation update

pull/1/head
orbiter 11 years ago
parent cf9e7fdbb8
commit 487021fb0a

@@ -45,6 +45,7 @@ import net.yacy.cora.order.Base64Order;
import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.retrieval.Response; import net.yacy.crawler.retrieval.Response;
import net.yacy.document.Condenser; import net.yacy.document.Condenser;
import net.yacy.document.SentenceReader;
import net.yacy.kelondro.data.word.WordReferenceRow; import net.yacy.kelondro.data.word.WordReferenceRow;
import net.yacy.kelondro.data.word.WordReferenceVars; import net.yacy.kelondro.data.word.WordReferenceVars;
import net.yacy.kelondro.util.Bitfield; import net.yacy.kelondro.util.Bitfield;
@@ -211,6 +212,42 @@ public class URIMetadataNode extends SolrDocument {
return a.get(0); return a.get(0);
} }
public List<String> h1() {
ArrayList<String> a = getStringList(CollectionSchema.h1_txt);
if (a == null || a.size() == 0) return new ArrayList<String>(0);
return a;
}
public List<String> h2() {
ArrayList<String> a = getStringList(CollectionSchema.h2_txt);
if (a == null || a.size() == 0) return new ArrayList<String>(0);
return a;
}
public List<String> h3() {
ArrayList<String> a = getStringList(CollectionSchema.h3_txt);
if (a == null || a.size() == 0) return new ArrayList<String>(0);
return a;
}
public List<String> h4() {
ArrayList<String> a = getStringList(CollectionSchema.h4_txt);
if (a == null || a.size() == 0) return new ArrayList<String>(0);
return a;
}
public List<String> h5() {
ArrayList<String> a = getStringList(CollectionSchema.h5_txt);
if (a == null || a.size() == 0) return new ArrayList<String>(0);
return a;
}
public List<String> h6() {
ArrayList<String> a = getStringList(CollectionSchema.h6_txt);
if (a == null || a.size() == 0) return new ArrayList<String>(0);
return a;
}
public String dc_creator() { public String dc_creator() {
return getString(CollectionSchema.author); return getString(CollectionSchema.author);
} }
@@ -418,6 +455,18 @@ public class URIMetadataNode extends SolrDocument {
return getString(CollectionSchema.text_t); return getString(CollectionSchema.text_t);
} }
public List<StringBuilder> getSentences(final boolean pre) {
List<StringBuilder> sentences = new ArrayList<>();
String text = this.getText();
if (text == null || text.length() == 0) return sentences;
SentenceReader sr = new SentenceReader(text, pre);
while (sr.hasNext()) sentences.add(sr.next());
sr.close();
sr = null;
text = null;
return sentences;
}
public ArrayList<String> getDescription() { public ArrayList<String> getDescription() {
return getStringList(CollectionSchema.description_txt); return getStringList(CollectionSchema.description_txt);
} }

@@ -30,6 +30,7 @@ import java.util.Collection;
import java.util.Comparator; import java.util.Comparator;
import java.util.HashSet; import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import java.util.List;
import java.util.Set; import java.util.Set;
import java.util.regex.Pattern; import java.util.regex.Pattern;
@@ -178,75 +179,88 @@ public class TextSnippet implements Comparable&lt;TextSnippet&gt;, Comparator&lt;TextSnippet&gt; {
// this requires that the document is parsed after loading // this requires that the document is parsed after loading
String textline = null; String textline = null;
HandleSet remainingHashes = queryhashes.clone(); HandleSet remainingHashes = queryhashes.clone();
{ //encapsulate potential expensive sentences List<StringBuilder> sentences = null;
Collection<StringBuilder> sentences = null;
// try to get the snippet from metadata
// try to get the snippet from metadata removeMatchingHashes(row.url().toTokens(), remainingHashes);
removeMatchingHashes(row.url().toTokens(), remainingHashes); removeMatchingHashes(row.dc_title(), remainingHashes);
removeMatchingHashes(row.dc_title(), remainingHashes); removeMatchingHashes(row.dc_creator(), remainingHashes);
removeMatchingHashes(row.dc_creator(), remainingHashes); removeMatchingHashes(row.dc_subject(), remainingHashes);
removeMatchingHashes(row.dc_subject(), remainingHashes);
if (!remainingHashes.isEmpty()) {
// we did not find everything in the metadata, look further into the document itself.
if (!remainingHashes.isEmpty()) { // first acquire the sentences:
// we did not find everything in the metadata, look further into the document itself. String solrText = row.getText();
if (solrText != null && solrText.length() > 0) {
// first acquire the sentences: // compute sentences from solr query
String solrText = row.getText(); sentences = row.getSentences(pre);
if (solrText != null) { } else if (net.yacy.crawler.data.Cache.has(url.hash())) {
// compute sentences from solr query // get the sentences from the cache
SentenceReader sr = new SentenceReader(solrText, pre); final Request request = loader == null ? null : loader.request(url, true, reindexing);
sentences = new ArrayList<StringBuilder>(); Response response;
while (sr.hasNext()) { try {
sentences.add(sr.next()); response = loader == null || request == null ? null : loader.load(request, CacheStrategy.CACHEONLY, BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent);
} } catch (final IOException e1) {
sr.close(); response = null;
sr = null; }
solrText = null; Document document = null;
} else if (net.yacy.crawler.data.Cache.has(url.hash())) { if (response != null) {
// get the sentences from the cache
final Request request = loader == null ? null : loader.request(url, true, reindexing);
Response response;
try { try {
response = loader == null || request == null ? null : loader.load(request, CacheStrategy.CACHEONLY, BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent); document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
} catch (final IOException e1) { sentences = document.getSentences(pre);
response = null; response = null;
} document = null;
Document document = null; } catch (final Parser.Failure e) {
if (response != null) {
try {
document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
sentences = document.getSentences(pre);
response = null;
document = null;
} catch (final Parser.Failure e) {
}
} }
} }
if (sentences == null) { }
// not found the snippet if (sentences == null) {
init(url.hash(), null, false, ResultClass.SOURCE_METADATA, null); // not found the snippet
init(url.hash(), null, false, ResultClass.SOURCE_METADATA, null);
return;
}
if (sentences.size() > 0) {
try {
final SnippetExtractor tsr = new SnippetExtractor(sentences, remainingHashes, snippetMaxLength);
textline = tsr.getSnippet();
remainingHashes = tsr.getRemainingWords();
} catch (final UnsupportedOperationException e) {
init(url.hash(), null, false, ResultClass.ERROR_NO_MATCH, "snippet extractor failed:" + e.getMessage());
return; return;
} }
}
}
if (sentences.size() > 0) { if (remainingHashes.isEmpty()) {
try { // we found the snippet or the query is fully included in the headline or url
final SnippetExtractor tsr = new SnippetExtractor(sentences, remainingHashes, snippetMaxLength); if (textline == null || textline.length() == 0) {
textline = tsr.getSnippet(); // this is the case where we don't have a snippet because all search words are included in the headline or the url
remainingHashes = tsr.getRemainingWords(); String solrText = row.getText();
} catch (final UnsupportedOperationException e) { if (solrText != null && solrText.length() > 0) {
init(url.hash(), null, false, ResultClass.ERROR_NO_MATCH, "snippet extractor failed:" + e.getMessage()); // compute sentences from solr query
return; sentences = row.getSentences(pre);
}
} }
} if (sentences == null || sentences.size() == 0) {
textline = row.dc_subject();
if (remainingHashes.isEmpty()) { } else {
// we found the snippet // use the first lines from the text after the h1 tag as snippet
if (textline == null) { // get first the h1 tag
if (sentences == null) { List<String> h1 = row.h1();
textline = row.dc_subject(); if (h1 != null && h1.size() > 0 && sentences.size() > 2) {
} else { // find first appearance of first h1 in sencences and then take the next sentence
// use the first lines from the text as snippet String h1s = h1.get(0);
if (h1s.length() > 0) {
solrsearch: for (int i = 0; i < sentences.size() - 2; i++) {
if (sentences.get(i).toString().startsWith(h1s)) {
textline = sentences.get(i + 1).toString();
break solrsearch;
}
}
}
}
if (textline == null) {
final StringBuilder s = new StringBuilder(snippetMaxLength); final StringBuilder s = new StringBuilder(snippetMaxLength);
for (final StringBuilder t: sentences) { for (final StringBuilder t: sentences) {
s.append(t).append(' '); s.append(t).append(' ');
@@ -256,69 +270,69 @@ public class TextSnippet implements Comparable&lt;TextSnippet&gt;, Comparator&lt;TextSnippet&gt; {
textline = s.toString(); textline = s.toString();
} }
} }
init(url.hash(), textline.length() > 0 ? textline : this.line, false, ResultClass.SOURCE_METADATA, null);
return;
}
sentences = null; // we don't need this here any more
// try to load the resource from the cache
Response response = null;
try {
response = loader == null ? null : loader.load(loader.request(url, true, reindexing), (url.isFile() || url.isSMB()) ? CacheStrategy.NOCACHE : (cacheStrategy == null ? CacheStrategy.CACHEONLY : cacheStrategy), BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent);
} catch (final IOException e) {
response = null;
} }
init(url.hash(), textline.length() > 0 ? textline : this.line, false, ResultClass.SOURCE_METADATA, null);
return;
}
sentences = null; // we don't need this here any more
// try to load the resource from the cache
Response response = null;
try {
response = loader == null ? null : loader.load(loader.request(url, true, reindexing), (url.isFile() || url.isSMB()) ? CacheStrategy.NOCACHE : (cacheStrategy == null ? CacheStrategy.CACHEONLY : cacheStrategy), BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent);
} catch (final IOException e) {
response = null;
}
if (response == null) { if (response == null) {
// in case that we did not get any result we can still return a success when we are not allowed to go online // in case that we did not get any result we can still return a success when we are not allowed to go online
if (cacheStrategy == null || cacheStrategy.mustBeOffline()) { if (cacheStrategy == null || cacheStrategy.mustBeOffline()) {
init(url.hash(), null, false, ResultClass.ERROR_SOURCE_LOADING, "omitted network load (not allowed), no cache entry"); init(url.hash(), null, false, ResultClass.ERROR_SOURCE_LOADING, "omitted network load (not allowed), no cache entry");
return;
}
// if it is still not available, report an error
init(url.hash(), null, false, ResultClass.ERROR_RESOURCE_LOADING, "error loading resource from net, no cache entry");
return; return;
} }
if (!response.fromCache()) { // if it is still not available, report an error
// place entry on indexing queue init(url.hash(), null, false, ResultClass.ERROR_RESOURCE_LOADING, "error loading resource from net, no cache entry");
Switchboard.getSwitchboard().toIndexer(response); return;
this.resultStatus = ResultClass.SOURCE_WEB; }
}
// parse the document to get all sentenced; available for snippet computation if (!response.fromCache()) {
Document document = null; // place entry on indexing queue
try { Switchboard.getSwitchboard().toIndexer(response);
document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse()); this.resultStatus = ResultClass.SOURCE_WEB;
} catch (final Parser.Failure e) { }
init(url.hash(), null, false, ResultClass.ERROR_PARSER_FAILED, e.getMessage()); // cannot be parsed
return;
}
if (document == null) {
init(url.hash(), null, false, ResultClass.ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed
return;
}
// compute sentences from parsed document // parse the document to get all sentenced; available for snippet computation
sentences = document.getSentences(pre); Document document = null;
document.close(); try {
document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
} catch (final Parser.Failure e) {
init(url.hash(), null, false, ResultClass.ERROR_PARSER_FAILED, e.getMessage()); // cannot be parsed
return;
}
if (document == null) {
init(url.hash(), null, false, ResultClass.ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed
return;
}
if (sentences == null) { // compute sentences from parsed document
init(url.hash(), null, false, ResultClass.ERROR_PARSER_NO_LINES, "parser returned no sentences"); sentences = document.getSentences(pre);
return; document.close();
}
try { if (sentences == null) {
final SnippetExtractor tsr = new SnippetExtractor(sentences, remainingHashes, snippetMaxLength); init(url.hash(), null, false, ResultClass.ERROR_PARSER_NO_LINES, "parser returned no sentences");
textline = tsr.getSnippet(); return;
remainingHashes = tsr.getRemainingWords(); }
} catch (final UnsupportedOperationException e) {
init(url.hash(), null, false, ResultClass.ERROR_NO_MATCH, "snippet extractor failed:" + e.getMessage()); try {
return; final SnippetExtractor tsr = new SnippetExtractor(sentences, remainingHashes, snippetMaxLength);
} textline = tsr.getSnippet();
sentences = null; remainingHashes = tsr.getRemainingWords();
} //encapsulate potential expensive sentences END } catch (final UnsupportedOperationException e) {
init(url.hash(), null, false, ResultClass.ERROR_NO_MATCH, "snippet extractor failed:" + e.getMessage());
return;
}
sentences = null;
if (textline == null || !remainingHashes.isEmpty()) { if (textline == null || !remainingHashes.isEmpty()) {
init(url.hash(), null, false, ResultClass.ERROR_NO_MATCH, "no matching snippet found"); init(url.hash(), null, false, ResultClass.ERROR_NO_MATCH, "no matching snippet found");

Loading…
Cancel
Save