- get nice text_t values from metadata conversions that are stored into
solr as fulltext search index.
- added slow migration from old metadata to solr index entries: each
entry from the old metadata is removed from that data structure and
written into solr.
pull/1/head
orbiter 13 years ago
parent 99ef57f103
commit d7ea45f698

@ -211,15 +211,26 @@ public final class Fulltext implements Iterable<byte[]> {
// get the metadata from Solr
try {
SolrDocument doc = this.solr.get(ASCII.String(urlHash));
if (doc != null) return new URIMetadataNode(doc, wre, weight);
if (doc != null) {
if (this.urlIndexFile != null) this.urlIndexFile.remove(urlHash);
return new URIMetadataNode(doc, wre, weight);
}
} catch (IOException e) {
Log.logException(e);
}
// get the metadata from the old metadata index
if (this.urlIndexFile != null) try {
final Row.Entry entry = this.urlIndexFile.get(urlHash, false);
if (entry != null) return new URIMetadataRow(entry, wre, weight);
if (this.connectedSolr()) {
// slow migration to solr
final Row.Entry entry = this.urlIndexFile.remove(urlHash);
if (entry == null) return null;
URIMetadataRow row = new URIMetadataRow(entry, wre, weight);
this.putDocument(this.solrScheme.metadata2solr(row));
return row;
}
final Row.Entry entry = this.urlIndexFile.get(urlHash, false);
if (entry != null) return new URIMetadataRow(entry, wre, weight);
} catch (final IOException e) {
Log.logException(e);
}
@ -242,14 +253,25 @@ public final class Fulltext implements Iterable<byte[]> {
// get the document from Solr
try {
SolrDocument doc = this.solr.get(ASCII.String(urlHash));
if (doc != null) return doc;
if (doc != null) {
if (this.urlIndexFile != null) this.urlIndexFile.remove(urlHash);
return doc;
}
} catch (IOException e) {
Log.logException(e);
}
// get the document from the old metadata index
if (this.urlIndexFile != null) try {
final Row.Entry entry = this.urlIndexFile.get(urlHash, false);
if (this.connectedSolr()) {
// slow migration to solr
final Row.Entry entry = this.urlIndexFile.remove(urlHash);
if (entry == null) return null;
URIMetadataRow row = new URIMetadataRow(entry, wre, weight);
this.putDocument(this.solrScheme.metadata2solr(row));
return ClientUtils.toSolrDocument(getSolrScheme().metadata2solr(row));
}
final Row.Entry entry = this.urlIndexFile.get(urlHash, false);
if (entry == null) return null;
return ClientUtils.toSolrDocument(getSolrScheme().metadata2solr(new URIMetadataRow(entry, wre, weight)));
} catch (final IOException e) {

@ -206,16 +206,16 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
if (allAttr || contains(YaCySchema.description)) addSolr(solrdoc, YaCySchema.description, md.snippet());
if (allAttr || contains(YaCySchema.content_type)) addSolr(solrdoc, YaCySchema.content_type, Response.doctype2mime(digestURI.getFileExtension(), md.doctype()));
if (allAttr || contains(YaCySchema.last_modified)) addSolr(solrdoc, YaCySchema.last_modified, md.moddate());
if (allAttr || contains(YaCySchema.text_t)) addSolr(solrdoc, YaCySchema.text_t, ""); // not delivered in metadata
if (allAttr || contains(YaCySchema.wordcount_i)) addSolr(solrdoc, YaCySchema.wordcount_i, md.wordCount());
if (allAttr || contains(YaCySchema.keywords)) {
String keywords = md.dc_subject();
Bitfield flags = md.flags();
if (flags.get(Condenser.flag_cat_indexof)) {
if (keywords == null || keywords.isEmpty()) keywords = "indexof"; else {
if (keywords.indexOf(',') > 0) keywords += ", indexof"; else keywords += " indexof";
}
}
String keywords = md.dc_subject();
Bitfield flags = md.flags();
if (flags.get(Condenser.flag_cat_indexof)) {
if (keywords == null || keywords.isEmpty()) keywords = "indexof"; else {
if (keywords.indexOf(',') > 0) keywords += ", indexof"; else keywords += " indexof";
}
}
if (allAttr || contains(YaCySchema.keywords)) {
addSolr(solrdoc, YaCySchema.keywords, keywords);
}
@ -250,9 +250,28 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
if (allAttr || contains(YaCySchema.audiolinkscount_i)) addSolr(solrdoc, YaCySchema.audiolinkscount_i, md.laudio());
if (allAttr || contains(YaCySchema.videolinkscount_i)) addSolr(solrdoc, YaCySchema.videolinkscount_i, md.lvideo());
if (allAttr || contains(YaCySchema.applinkscount_i)) addSolr(solrdoc, YaCySchema.applinkscount_i, md.lapp());
if (allAttr || contains(YaCySchema.text_t)) {
// construct the text from other metadata parts.
// This is necessary here since that is used to search the link when no other data (parsed text body) is available
StringBuilder sb = new StringBuilder(120);
accText(sb, md.dc_title());
accText(sb, md.dc_creator());
accText(sb, md.dc_publisher());
accText(sb, md.snippet());
accText(sb, digestURI.toTokens());
accText(sb, keywords);
addSolr(solrdoc, YaCySchema.text_t, sb.toString());
}
return solrdoc;
}
/**
 * Appends one text fragment to the accumulated text as a sentence.
 * <p>
 * The fragment is trimmed before use. A {@code null}, empty, or
 * whitespace-only fragment is ignored and leaves {@code sb} untouched.
 * Otherwise a single space is inserted when {@code sb} already holds
 * content, the trimmed fragment is appended, and a trailing {@code '.'}
 * is added unless the fragment already ends with one.
 *
 * @param sb   the accumulator that receives the text
 * @param text the fragment to append; may be {@code null}
 */
private static void accText(final StringBuilder sb, String text) {
    if (text == null) return;
    // Trim BEFORE the emptiness check: a whitespace-only fragment would
    // otherwise pass the check, trim down to "" and make charAt(-1) throw.
    text = text.trim();
    if (text.isEmpty()) return;
    if (sb.length() != 0) sb.append(' ');
    sb.append(text);
    if (text.charAt(text.length() - 1) != '.') sb.append('.');
}
public SolrDoc yacy2solr(final String id, final ResponseHeader header, final Document yacydoc, final URIMetadata metadata) {
// we use the SolrCell design as index scheme

Loading…
Cancel
Save