Refactor ResultEntry to extend URIMetadataNode (a SolrDocument)

so that the common access routines are shared and reused instead of being duplicated as delegating wrapper methods
pull/8/head
reger 10 years ago
parent d882991bc5
commit 3d53da8236

@ -208,7 +208,7 @@ public class yacysearchitem {
prop.put("content_showVocabulary", sb.getConfigBool("search.result.show.vocabulary", true) ? 1 : 0); prop.put("content_showVocabulary", sb.getConfigBool("search.result.show.vocabulary", true) ? 1 : 0);
if (showEvent) prop.put("content_showEvent_date", GenericFormatter.RFC1123_SHORT_FORMATTER.format(events[0])); if (showEvent) prop.put("content_showEvent_date", GenericFormatter.RFC1123_SHORT_FORMATTER.format(events[0]));
prop.put("content_showDate_date", GenericFormatter.RFC1123_SHORT_FORMATTER.format(result.modified())); prop.put("content_showDate_date", GenericFormatter.RFC1123_SHORT_FORMATTER.format(result.moddate()));
prop.putHTML("content_showSize_sizename", RSSMessage.sizename(result.filesize())); prop.putHTML("content_showSize_sizename", RSSMessage.sizename(result.filesize()));
prop.put("content_showMetadata_urlhash", urlhash); prop.put("content_showMetadata_urlhash", urlhash);
prop.put("content_showParser_urlhash", urlhash); prop.put("content_showParser_urlhash", urlhash);
@ -218,7 +218,7 @@ public class yacysearchitem {
prop.put("content_showProxy_link", resultUrlstring); prop.put("content_showProxy_link", resultUrlstring);
prop.put("content_showHostBrowser_link", resultUrlstring); prop.put("content_showHostBrowser_link", resultUrlstring);
if (sb.getConfigBool("search.result.show.vocabulary", true)) { if (sb.getConfigBool("search.result.show.vocabulary", true)) {
URIMetadataNode node = result.getNode(); URIMetadataNode node = result;
int c = 0; int c = 0;
for (Map.Entry<String, Object> entry: node.entrySet()) { for (Map.Entry<String, Object> entry: node.entrySet()) {
String key = entry.getKey(); String key = entry.getKey();
@ -239,7 +239,7 @@ public class yacysearchitem {
} }
prop.put("content_urlhexhash", Seed.b64Hash2hexHash(urlhash)); prop.put("content_urlhexhash", Seed.b64Hash2hexHash(urlhash));
prop.putHTML("content_urlname", nxTools.shortenURLString(result.urlname(), MAX_URL_LENGTH)); prop.putHTML("content_urlname", nxTools.shortenURLString(result.urlname(), MAX_URL_LENGTH));
prop.put("content_date822", isAtomFeed ? ISO8601Formatter.FORMATTER.format(result.modified()) : HeaderFramework.formatRFC1123(result.modified())); prop.put("content_date822", isAtomFeed ? ISO8601Formatter.FORMATTER.format(result.moddate()) : HeaderFramework.formatRFC1123(result.moddate()));
if (showEvent) prop.put("content_showEvent_date822", isAtomFeed ? ISO8601Formatter.FORMATTER.format(events[0]) : HeaderFramework.formatRFC1123(events[0])); if (showEvent) prop.put("content_showEvent_date822", isAtomFeed ? ISO8601Formatter.FORMATTER.format(events[0]) : HeaderFramework.formatRFC1123(events[0]));
//prop.put("content_ybr", RankingProcess.ybr(result.hash())); //prop.put("content_ybr", RankingProcess.ybr(result.hash()));
prop.putHTML("content_size", Integer.toString(result.filesize())); // we don't use putNUM here because that number shall be usable as sorting key. To print the size, use 'sizename' prop.putHTML("content_size", Integer.toString(result.filesize())); // we don't use putNUM here because that number shall be usable as sorting key. To print the size, use 'sizename'
@ -248,9 +248,9 @@ public class yacysearchitem {
prop.putXML("content_file", resultFileName); // putXML for rss prop.putXML("content_file", resultFileName); // putXML for rss
prop.putXML("content_path", resultURL.getPath()); // putXML for rss prop.putXML("content_path", resultURL.getPath()); // putXML for rss
prop.put("content_nl", (item == theSearch.query.offset) ? 0 : 1); prop.put("content_nl", (item == theSearch.query.offset) ? 0 : 1);
prop.putHTML("content_publisher", result.publisher()); prop.putHTML("content_publisher", result.dc_publisher());
prop.putHTML("content_creator", result.creator());// author prop.putHTML("content_creator", result.dc_creator());// author
prop.putHTML("content_subject", result.subject()); prop.putHTML("content_subject", result.dc_subject());
final Iterator<String> query = theSearch.query.getQueryGoal().getIncludeStrings(); final Iterator<String> query = theSearch.query.getQueryGoal().getIncludeStrings();
final StringBuilder s = new StringBuilder(theSearch.query.getQueryGoal().getIncludeSize() * 20); final StringBuilder s = new StringBuilder(theSearch.query.getQueryGoal().getIncludeSize() * 20);
while (query.hasNext()) s.append('+').append(query.next()); while (query.hasNext()) s.append('+').append(query.next());
@ -263,7 +263,7 @@ public class yacysearchitem {
prop.put("content_description", desc); prop.put("content_description", desc);
prop.putXML("content_description-xml", desc); prop.putXML("content_description-xml", desc);
prop.putJSON("content_description-json", desc); prop.putJSON("content_description-json", desc);
prop.put("content_mimetype",result.getNode().mime()); // for atom <link> type attribute prop.put("content_mimetype", result.mime()); // for atom <link> type attribute
final HeuristicResult heuristic = theSearch.getHeuristic(result.hash()); final HeuristicResult heuristic = theSearch.getHeuristic(result.hash());
if (heuristic == null) { if (heuristic == null) {
prop.put("content_heuristic", 0); prop.put("content_heuristic", 0);

@ -1570,25 +1570,22 @@ public final class SearchEvent {
public ImageResult oneImageResult(final int item, final long timeout) throws MalformedURLException { public ImageResult oneImageResult(final int item, final long timeout) throws MalformedURLException {
if (item < imageViewed.size()) return nthImage(item); if (item < imageViewed.size()) return nthImage(item);
if (imageSpareGood.size() > 0) return nextSpare(); // first put out all good spare, but no bad spare if (imageSpareGood.size() > 0) return nextSpare(); // first put out all good spare, but no bad spare
ResultEntry ms = oneResult(imagePageCounter++, timeout); // we must use a different counter here because the image counter can be higher when one page filled up several spare ResultEntry doc = oneResult(imagePageCounter++, timeout); // we must use a different counter here because the image counter can be higher when one page filled up several spare
// check if the match was made in the url or in the image links // check if the match was made in the url or in the image links
if (ms == null) { if (doc == null) {
if (hasSpare()) return nextSpare(); if (hasSpare()) return nextSpare();
throw new MalformedURLException("no image url found"); throw new MalformedURLException("no image url found");
} }
// try to get more // try to get more
SolrDocument doc = ms.getNode();
// there can be two different kinds of image hits: either the document itself is an image or images are embedded in the links of text documents. // there can be two different kinds of image hits: either the document itself is an image or images are embedded in the links of text documents.
String mime = (String) doc.getFirstValue(CollectionSchema.content_type.getSolrFieldName()); String mime = (String) doc.getFirstValue(CollectionSchema.content_type.getSolrFieldName());
// boolean fakeImageHost = ms.url().getHost() != null && ms.url().getHost().indexOf("wikipedia") > 0; // pages with image extension from wikipedia do not contain image files but html files... I know this is a bad hack, but many results come from wikipedia and we must handle that // boolean fakeImageHost = ms.url().getHost() != null && ms.url().getHost().indexOf("wikipedia") > 0; // pages with image extension from wikipedia do not contain image files but html files... I know this is a bad hack, but many results come from wikipedia and we must handle that
// generalize the above hack (for a url with a file extension that is actually an html page, i.e. has an html mime) // generalize the above hack (for a url with a file extension that is actually an html page, i.e. has an html mime)
char docType = Response.docType(mime); // first look at mime (as some html pages have img extension (like wikipedia) if (doc.doctype() == Response.DT_IMAGE) {
if (docType == Response.DT_UNKNOWN) docType = Response.docType(ms.url()); // try extension if mime wasn't successful String id = ASCII.String(doc.hash());
if (!imageViewed.containsKey(id) && !containsSpare(id)) imageSpareGood.put(id, new ImageResult(doc.url(), doc.url(), "", doc.title(), 0, 0, 0));
if (docType == Response.DT_IMAGE) {
String id = ASCII.String(ms.hash());
if (!imageViewed.containsKey(id) && !containsSpare(id)) imageSpareGood.put(id, new ImageResult(ms.url(), ms.url(), "", ms.title(), 0, 0, 0));
} else { } else {
Collection<Object> altO = doc.getFieldValues(CollectionSchema.images_alt_sxt.getSolrFieldName()); Collection<Object> altO = doc.getFieldValues(CollectionSchema.images_alt_sxt.getSolrFieldName());
Collection<Object> imgO = doc.getFieldValues(CollectionSchema.images_urlstub_sxt.getSolrFieldName()); Collection<Object> imgO = doc.getFieldValues(CollectionSchema.images_urlstub_sxt.getSolrFieldName());
@ -1612,7 +1609,7 @@ public final class SearchEvent {
boolean sizeok = h != null && w != null && h.intValue() > 16 && w.intValue() > 16; boolean sizeok = h != null && w != null && h.intValue() > 16 && w.intValue() > 16;
String id = ASCII.String(imageUrl.hash()); String id = ASCII.String(imageUrl.hash());
if (!imageViewed.containsKey(id) && !containsSpare(id)) { if (!imageViewed.containsKey(id) && !containsSpare(id)) {
ImageResult imageResult = new ImageResult(ms.url(), imageUrl, "", image_alt, w == null ? 0 : w, h == null ? 0 : h, 0); ImageResult imageResult = new ImageResult(doc.url(), imageUrl, "", image_alt, w == null ? 0 : w, h == null ? 0 : h, 0);
if (match || sizeok) imageSpareGood.put(id, imageResult); else imageSpareBad.put(id, imageResult); if (match || sizeok) imageSpareGood.put(id, imageResult); else imageSpareBad.put(id, imageResult);
} }
} catch (MalformedURLException e) { } catch (MalformedURLException e) {

@ -30,7 +30,6 @@ import java.io.IOException;
import java.util.Comparator; import java.util.Comparator;
import java.util.Date; import java.util.Date;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.Base64Order;
import net.yacy.cora.util.ByteArray; import net.yacy.cora.util.ByteArray;
@ -39,21 +38,15 @@ import net.yacy.document.Condenser;
import net.yacy.document.parser.pdfParser; import net.yacy.document.parser.pdfParser;
import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceRow;
import net.yacy.kelondro.data.word.WordReferenceVars;
import net.yacy.kelondro.rwi.Reference;
import net.yacy.kelondro.util.Bitfield;
import net.yacy.peers.Seed; import net.yacy.peers.Seed;
import net.yacy.peers.SeedDB; import net.yacy.peers.SeedDB;
import net.yacy.search.index.Segment; import net.yacy.search.index.Segment;
import net.yacy.search.schema.CollectionSchema; import net.yacy.search.schema.CollectionSchema;
public class ResultEntry implements Comparable<ResultEntry>, Comparator<ResultEntry> { public class ResultEntry extends URIMetadataNode implements Comparable<ResultEntry>, Comparator<ResultEntry> {
// payload objects // payload objects
private final URIMetadataNode urlentry;
private String alternative_urlstring; private String alternative_urlstring;
private String alternative_urlname; private String alternative_urlname;
private final TextSnippet textSnippet; private final TextSnippet textSnippet;
@ -63,8 +56,8 @@ public class ResultEntry implements Comparable<ResultEntry>, Comparator<ResultEn
final Segment indexSegment, final Segment indexSegment,
SeedDB peers, SeedDB peers,
final TextSnippet textSnippet) { final TextSnippet textSnippet) {
this.urlentry = urlentry; super(urlentry);
this.urlentry.setField(CollectionSchema.text_t.getSolrFieldName(), ""); // clear the text field which eats up most of the space; it was used for snippet computation which is in a separate field here this.removeFields(CollectionSchema.text_t.getSolrFieldName()); // clear the text field which eats up most of the space; it was used for snippet computation which is in a separate field here
this.indexSegment = indexSegment; this.indexSegment = indexSegment;
this.alternative_urlstring = null; this.alternative_urlstring = null;
this.alternative_urlname = null; this.alternative_urlname = null;
@ -102,7 +95,7 @@ public class ResultEntry implements Comparable<ResultEntry>, Comparator<ResultEn
@Override @Override
public int hashCode() { public int hashCode() {
if (this.hashCache == Integer.MIN_VALUE) { if (this.hashCache == Integer.MIN_VALUE) {
this.hashCache = ByteArray.hashCode(this.urlentry.hash()); this.hashCache = ByteArray.hashCode(this.hash());
} }
return this.hashCache; return this.hashCache;
} }
@ -112,29 +105,18 @@ public class ResultEntry implements Comparable<ResultEntry>, Comparator<ResultEn
if (obj == null) return false; if (obj == null) return false;
if (!(obj instanceof ResultEntry)) return false; if (!(obj instanceof ResultEntry)) return false;
ResultEntry other = (ResultEntry) obj; ResultEntry other = (ResultEntry) obj;
return Base64Order.enhancedCoder.equal(this.urlentry.hash(), other.urlentry.hash()); return Base64Order.enhancedCoder.equal(this.hash(), other.hash());
}
public URIMetadataNode getNode() {
return this.urlentry;
}
public byte[] hash() {
return this.urlentry.hash();
}
public DigestURL url() {
return this.urlentry.url();
}
public Bitfield flags() {
return this.urlentry.flags();
} }
public String urlstring() { public String urlstring() {
if (this.alternative_urlstring != null) return this.alternative_urlstring; if (this.alternative_urlstring != null) return this.alternative_urlstring;
if (!pdfParser.individualPages) return this.url().toNormalform(true); if (!pdfParser.individualPages) return this.url().toNormalform(true);
if (!"pdf".equals(MultiProtocolURL.getFileExtension(this.urlentry.url().getFileName()).toLowerCase())) return this.url().toNormalform(true); if (!"pdf".equals(MultiProtocolURL.getFileExtension(this.url().getFileName()).toLowerCase())) return this.url().toNormalform(true);
// for pdf links we rewrite the url // for pdf links we rewrite the url
// this is a special treatment of pdf files which can be split into subpages // this is a special treatment of pdf files which can be split into subpages
String pageprop = pdfParser.individualPagePropertyname; String pageprop = pdfParser.individualPagePropertyname;
String resultUrlstring = this.urlentry.url().toNormalform(true); String resultUrlstring = this.url().toNormalform(true);
int p = resultUrlstring.lastIndexOf(pageprop + "="); int p = resultUrlstring.lastIndexOf(pageprop + "=");
if (p > 0) { if (p > 0) {
return resultUrlstring.substring(0, p - 1) + "#page=" + resultUrlstring.substring(p + pageprop.length() + 1); return resultUrlstring.substring(0, p - 1) + "#page=" + resultUrlstring.substring(p + pageprop.length() + 1);
@ -145,72 +127,22 @@ public class ResultEntry implements Comparable<ResultEntry>, Comparator<ResultEn
return (this.alternative_urlname == null) ? MultiProtocolURL.unescape(urlstring()) : this.alternative_urlname; return (this.alternative_urlname == null) ? MultiProtocolURL.unescape(urlstring()) : this.alternative_urlname;
} }
public String title() { public String title() {
String titlestr = this.urlentry.dc_title(); String titlestr = this.dc_title();
// if title is empty use filename as title // if title is empty use filename as title
if (titlestr.isEmpty()) { // if url has no filename, title is still empty (e.g. "www.host.com/" ) if (titlestr.isEmpty()) { // if url has no filename, title is still empty (e.g. "www.host.com/" )
titlestr = this.url() != null ? this.url().getFileName() : ""; titlestr = this.url() != null ? this.url().getFileName() : "";
} }
return titlestr; return titlestr;
} }
public String publisher() {
// dc:publisher
return this.urlentry.dc_publisher();
}
public String creator() {
// dc:creator, the author
return this.urlentry.dc_creator();
}
public String subject() {
// dc:subject, keywords
return this.urlentry.dc_subject();
}
public TextSnippet textSnippet() { public TextSnippet textSnippet() {
return this.textSnippet; return this.textSnippet;
} }
public Date modified() {
return this.urlentry.moddate();
}
public Date[] events() { public Date[] events() {
return this.urlentry.datesInContent(); return this.datesInContent();
}
public int filesize() {
return this.urlentry.filesize();
} }
public int referencesCount() { public int referencesCount() {
// urlCitationIndex index might be null (= configuration option) // urlCitationIndex index might be null (= configuration option)
return this.indexSegment.connectedCitation() ? this.indexSegment.urlCitation().count(this.urlentry.hash()) : 0; return this.indexSegment.connectedCitation() ? this.indexSegment.urlCitation().count(this.hash()) : 0;
}
public int llocal() {
return this.urlentry.llocal();
}
public int lother() {
return this.urlentry.lother();
}
public int limage() {
return this.urlentry.limage();
}
public int laudio() {
return this.urlentry.laudio();
}
public int lvideo() {
return this.urlentry.lvideo();
}
public int lapp() {
return this.urlentry.lapp();
}
public double lat() {
return this.urlentry.lat();
}
public double lon() {
return this.urlentry.lon();
}
public WordReference word() {
final Reference word = this.urlentry.word();
if (word == null) return null;
if (word instanceof WordReferenceVars) return (WordReferenceVars) word;
if (word instanceof WordReferenceRow) return (WordReferenceRow) word;
assert word instanceof WordReferenceRow || word instanceof WordReferenceVars : word == null ? "word = null" : "type = " + word.getClass().getCanonicalName();
return null;
} }
public boolean hasTextSnippet() { public boolean hasTextSnippet() {
return (this.textSnippet != null) && (!this.textSnippet.getErrorCode().fail()); return (this.textSnippet != null) && (!this.textSnippet.getErrorCode().fail());
@ -218,19 +150,16 @@ public class ResultEntry implements Comparable<ResultEntry>, Comparator<ResultEn
public String resource() { public String resource() {
// generate transport resource // generate transport resource
if ((this.textSnippet == null) || (!this.textSnippet.exists())) { if ((this.textSnippet == null) || (!this.textSnippet.exists())) {
return this.urlentry.toString(); return this.toString();
} }
return this.urlentry.toString(this.textSnippet.getLineRaw()); return this.toString(this.textSnippet.getLineRaw());
} }
@Override @Override
public int compareTo(ResultEntry o) { public int compareTo(ResultEntry o) {
return Base64Order.enhancedCoder.compare(this.urlentry.hash(), o.urlentry.hash()); return Base64Order.enhancedCoder.compare(this.hash(), o.hash());
} }
@Override @Override
public int compare(ResultEntry o1, ResultEntry o2) { public int compare(ResultEntry o1, ResultEntry o2) {
return Base64Order.enhancedCoder.compare(o1.urlentry.hash(), o2.urlentry.hash()); return Base64Order.enhancedCoder.compare(o1.hash(), o2.hash());
}
public float score() {
return this.urlentry.score();
} }
} }

Loading…
Cancel
Save