Enhanced snippet fetch: removed a bug that caused documents to be parsed even if a Solr text was available.
pull/1/head
Michael Peter Christen 13 years ago
parent 18f989dfb1
commit 9bece5ac5f

@ -99,6 +99,10 @@ public class URIMetadataNode implements URIMetadata {
this.ranking = ranking; this.ranking = ranking;
} }
/**
 * Returns the Solr document held in this node's {@code doc} field.
 *
 * @return the value of {@code this.doc}; the reference is handed out directly, not copied
 */
public SolrDocument getDocument() {
return this.doc;
}
private int getInt(YaCySchema field) { private int getInt(YaCySchema field) {
Integer x = (Integer) this.doc.getFieldValue(field.name()); Integer x = (Integer) this.doc.getFieldValue(field.name());
if (x == null) return 0; if (x == null) return 0;

@ -35,7 +35,6 @@ import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.Classification; import net.yacy.cora.document.Classification;
import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.services.federated.solr.SolrConnector;
import net.yacy.cora.services.federated.yacy.CacheStrategy; import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.cora.sorting.ScoreMap; import net.yacy.cora.sorting.ScoreMap;
import net.yacy.cora.sorting.WeakPriorityBlockingQueue; import net.yacy.cora.sorting.WeakPriorityBlockingQueue;
@ -45,6 +44,7 @@ import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.SpaceExceededException; import net.yacy.cora.util.SpaceExceededException;
import net.yacy.document.Condenser; import net.yacy.document.Condenser;
import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.RowHandleSet; import net.yacy.kelondro.index.RowHandleSet;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
@ -54,6 +54,7 @@ import net.yacy.peers.graphics.ProfilingGraph;
import net.yacy.repository.LoaderDispatcher; import net.yacy.repository.LoaderDispatcher;
import net.yacy.search.EventTracker; import net.yacy.search.EventTracker;
import net.yacy.search.Switchboard; import net.yacy.search.Switchboard;
import net.yacy.search.index.MetadataRepository;
import net.yacy.search.index.Segment; import net.yacy.search.index.Segment;
import net.yacy.search.snippet.MediaSnippet; import net.yacy.search.snippet.MediaSnippet;
import net.yacy.search.snippet.ResultEntry; import net.yacy.search.snippet.ResultEntry;
@ -439,7 +440,7 @@ public class SnippetProcess {
private final CacheStrategy cacheStrategy; private final CacheStrategy cacheStrategy;
private final int neededResults; private final int neededResults;
private boolean shallrun; private boolean shallrun;
private final SolrConnector solr; private final MetadataRepository metadata;
public Worker(final long maxlifetime, final CacheStrategy cacheStrategy, final int neededResults) { public Worker(final long maxlifetime, final CacheStrategy cacheStrategy, final int neededResults) {
this.cacheStrategy = cacheStrategy; this.cacheStrategy = cacheStrategy;
@ -447,7 +448,7 @@ public class SnippetProcess {
this.timeout = System.currentTimeMillis() + Math.max(1000, maxlifetime); this.timeout = System.currentTimeMillis() + Math.max(1000, maxlifetime);
this.neededResults = neededResults; this.neededResults = neededResults;
this.shallrun = true; this.shallrun = true;
this.solr = SnippetProcess.this.rankingProcess.getQuery().getSegment().getSolr(); this.metadata = SnippetProcess.this.rankingProcess.getQuery().getSegment().urlMetadata();
} }
@Override @Override
@ -496,16 +497,18 @@ public class SnippetProcess {
// in case that we have an attached solr, we load also the solr document // in case that we have an attached solr, we load also the solr document
String solrContent = null; String solrContent = null;
if (this.solr != null) { SolrDocument sd = null;
SolrDocument sd = null; if (page instanceof URIMetadataNode) {
sd = ((URIMetadataNode) page).getDocument();
} else {
try { try {
sd = this.solr.get(ASCII.String(page.hash())); sd = this.metadata.getSolr().get(ASCII.String(page.hash()));
} catch (IOException e) { } catch (IOException e) {
Log.logException(e); Log.logException(e);
} }
if (sd != null) { }
solrContent = Switchboard.getSwitchboard().index.getSolrScheme().solrGetText(sd); if (sd != null) {
} solrContent = this.metadata.getSolrScheme().solrGetText(sd);
} }
resultEntry = fetchSnippet(page, solrContent, this.cacheStrategy); // does not fetch snippets if snippetMode == 0 resultEntry = fetchSnippet(page, solrContent, this.cacheStrategy); // does not fetch snippets if snippetMode == 0

@ -179,37 +179,24 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
{ //encapsulate potential expensive sentences { //encapsulate potential expensive sentences
Collection<StringBuilder> sentences = null; Collection<StringBuilder> sentences = null;
// try the solr text first
if (solrText != null) {
// compute sentences from solr query
final SentenceReader sr = new SentenceReader(solrText, pre);
sentences = new ArrayList<StringBuilder>();
while (sr.hasNext()) {
sentences.add(sr.next());
}
if (sentences != null) {
try {
final SnippetExtractor tsr = new SnippetExtractor(sentences, remainingHashes, snippetMaxLength);
textline = tsr.getSnippet();
remainingHashes = tsr.getRemainingWords();
} catch (final UnsupportedOperationException e) {
init(url.hash(), null, ResultClass.ERROR_NO_MATCH, "no matching snippet found");
return;
}
}
}
// try to get the snippet from metadata // try to get the snippet from metadata
removeMatchingHashes(row.dc_title(), remainingHashes); removeMatchingHashes(row.dc_title(), remainingHashes);
removeMatchingHashes(row.dc_creator(), remainingHashes); removeMatchingHashes(row.dc_creator(), remainingHashes);
removeMatchingHashes(row.dc_subject(), remainingHashes); removeMatchingHashes(row.dc_subject(), remainingHashes);
removeMatchingHashes(row.url().toNormalform(true, true).replace('-', ' '), remainingHashes); removeMatchingHashes(row.url().toNormalform(true, true).replace('-', ' '), remainingHashes);
if (remainingHashes.isEmpty()) { if (!remainingHashes.isEmpty()) {
// the snippet is fully inside the metadata! // we did not find everything in the metadata, look further into the document itself.
if (de.anomic.crawler.Cache.has(url.hash())) { // first acquire the sentences:
if (solrText != null) {
// compute sentences from solr query
final SentenceReader sr = new SentenceReader(solrText, pre);
sentences = new ArrayList<StringBuilder>();
while (sr.hasNext()) {
sentences.add(sr.next());
}
} else if (de.anomic.crawler.Cache.has(url.hash())) {
// get the sentences from the cache // get the sentences from the cache
final Request request = loader == null ? null : loader.request(url, true, reindexing); final Request request = loader == null ? null : loader.request(url, true, reindexing);
Response response; Response response;
@ -227,19 +214,41 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
} }
} }
} }
if (sentences == null) { if (sentences == null) {
// not found the snippet
init(url.hash(), null, ResultClass.SOURCE_METADATA, null); init(url.hash(), null, ResultClass.SOURCE_METADATA, null);
return; return;
} }
// use the first lines from the text as snippet
final StringBuilder s = new StringBuilder(snippetMaxLength); if (sentences.size() > 0) {
for (final StringBuilder t: sentences) { try {
s.append(t).append(' '); final SnippetExtractor tsr = new SnippetExtractor(sentences, remainingHashes, snippetMaxLength);
if (s.length() >= snippetMaxLength / 4 * 3) break; textline = tsr.getSnippet();
remainingHashes = tsr.getRemainingWords();
} catch (final UnsupportedOperationException e) {
init(url.hash(), null, ResultClass.ERROR_NO_MATCH, "no matching snippet found");
return;
}
}
}
if (remainingHashes.isEmpty()) {
// we found the snippet
if (textline == null) {
if (sentences == null) {
textline = row.dc_subject();
} else {
// use the first lines from the text as snippet
final StringBuilder s = new StringBuilder(snippetMaxLength);
for (final StringBuilder t: sentences) {
s.append(t).append(' ');
if (s.length() >= snippetMaxLength / 4 * 3) break;
}
if (s.length() > snippetMaxLength) { s.setLength(snippetMaxLength); s.trimToSize(); }
textline = s.toString();
}
} }
if (s.length() > snippetMaxLength) { s.setLength(snippetMaxLength); s.trimToSize(); } init(url.hash(), textline.length() > 0 ? textline : this.line, ResultClass.SOURCE_METADATA, null);
init(url.hash(), s.length() > 0 ? s.toString() : this.line, ResultClass.SOURCE_METADATA, null);
return; return;
} }
@ -313,7 +322,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
//if (videoline != null) line += (line.isEmpty()) ? videoline : "<br />" + videoline; //if (videoline != null) line += (line.isEmpty()) ? videoline : "<br />" + videoline;
//if (appline != null) line += (line.isEmpty()) ? appline : "<br />" + appline; //if (appline != null) line += (line.isEmpty()) ? appline : "<br />" + appline;
//if (hrefline != null) line += (line.isEmpty()) ? hrefline : "<br />" + hrefline; //if (hrefline != null) line += (line.isEmpty()) ? hrefline : "<br />" + hrefline;
if (textline != null) snippetLine += (snippetLine.isEmpty()) ? textline : "<br />" + textline; //if (textline != null) snippetLine += (snippetLine.isEmpty()) ? textline : "<br />" + textline;
if (snippetLine == null || !remainingHashes.isEmpty()) { if (snippetLine == null || !remainingHashes.isEmpty()) {
init(url.hash(), null, ResultClass.ERROR_NO_MATCH, "no matching snippet found"); init(url.hash(), null, ResultClass.ERROR_NO_MATCH, "no matching snippet found");

Loading…
Cancel
Save