Merge branch 'master' of git://gitorious.org/~reger/yacy/bbyacy-rc1

pull/1/head
Michael Peter Christen 13 years ago
commit d88eb657fd

@ -79,7 +79,7 @@ public class Document {
private List<String> titles; // the document titles, taken from title and/or h1 tag; shall appear as headline of search result private List<String> titles; // the document titles, taken from title and/or h1 tag; shall appear as headline of search result
private final StringBuilder creator; // author or copyright private final StringBuilder creator; // author or copyright
private final String publisher; // publisher private final String publisher; // publisher
private List<String> sections; // if present: more titles/headlines appearing in the document private final List<String> sections; // if present: more titles/headlines appearing in the document
private final StringBuilder description; // an abstract, if present: short content description private final StringBuilder description; // an abstract, if present: short content description
private Object text; // the clear text, all that is visible private Object text; // the clear text, all that is visible
private final Map<MultiProtocolURI, Properties> anchors; // all links embedded as clickeable entities (anchor tags) private final Map<MultiProtocolURI, Properties> anchors; // all links embedded as clickeable entities (anchor tags)
@ -116,9 +116,10 @@ public class Document {
this.parserObject = parserObject; this.parserObject = parserObject;
this.keywords = new LinkedList<String>(); this.keywords = new LinkedList<String>();
if (keywords != null) this.keywords.addAll(Arrays.asList(keywords)); if (keywords != null) this.keywords.addAll(Arrays.asList(keywords));
this.titles = titles; this.titles = (titles == null) ? new ArrayList<String>(1) : titles;
this.creator = (author == null) ? new StringBuilder(0) : new StringBuilder(author); this.creator = (author == null) ? new StringBuilder(0) : new StringBuilder(author);
this.sections = (sections == null) ? new LinkedList<String>() : Arrays.asList(sections); this.sections = new LinkedList<String>() ;
if (sections != null) this.sections.addAll(Arrays.asList(sections));
this.description = (abstrct == null) ? new StringBuilder(0) : new StringBuilder(abstrct); this.description = (abstrct == null) ? new StringBuilder(0) : new StringBuilder(abstrct);
this.lon = lon; this.lon = lon;
this.lat = lat; this.lat = lat;
@ -631,17 +632,7 @@ dc_rights
public void addSubDocuments(final Document[] docs) throws IOException { public void addSubDocuments(final Document[] docs) throws IOException {
for (final Document doc: docs) { for (final Document doc: docs) {
// check the class, as addAll might not be supported if the list was initialized via Arrays.asList (fixed-size list)
if (this.sections.getClass() == java.util.LinkedList.class) {
this.sections.addAll(doc.sections); this.sections.addAll(doc.sections);
} else {
/* sections might have been initialized via Arrays.asList (which returns a fixed-size list that does not support addAll),
so a new list must be assigned */
LinkedList<String> tmplist = new LinkedList<String>();
tmplist.addAll(this.sections);
tmplist.addAll(doc.sections);
this.sections = tmplist;
}
this.titles.addAll(doc.titles()); this.titles.addAll(doc.titles());
this.keywords.addAll(doc.getKeywords()); this.keywords.addAll(doc.getKeywords());

@ -35,7 +35,7 @@ public class AugmentParser extends AbstractParser implements Parser {
} }
@Override @Override
public Document[] parse(DigestURI url, String mimeType, String charset, InputStream source) throws Failure, InterruptedException { public Document[] parse(DigestURI url, String mimeType, String charset, InputStream source) throws Parser.Failure, InterruptedException {
Document[] htmlDocs = this.rdfaParser.parse(url, mimeType, charset, source); Document[] htmlDocs = this.rdfaParser.parse(url, mimeType, charset, source);
try { try {
@ -44,48 +44,26 @@ public class AugmentParser extends AbstractParser implements Parser {
Log.logException(e); Log.logException(e);
} }
Document alreadyParsedDocument = htmlDocs[0]; for (final Document doc : htmlDocs) {
Document superDoc = analyze(alreadyParsedDocument, url, mimeType, charset); /* analyze(doc, url, mimeType, charset); // enrich document text */
Document augmentDoc = parseAndAugment(url, mimeType, charset); parseAndAugment(doc, url, mimeType, charset); // enrich document with additional tags
Document[] retDocs = new Document[htmlDocs.length + 1];
for (int i = 1; i < htmlDocs.length; i++) {
retDocs[i - 1] = htmlDocs[i];
} }
return htmlDocs;
retDocs[retDocs.length - 1] = augmentDoc;
retDocs[retDocs.length - 2] = superDoc;
try { // merge additional result docs into the parse main document
alreadyParsedDocument.addSubDocuments(retDocs);
} catch (IOException ex) {
Log.logException(ex);
}
Document[] finalretDocs = new Document[1]; // return the merged document
finalretDocs[0] = alreadyParsedDocument;
return finalretDocs;
} }
private static Document analyze (Document alreadyParsedDocument, DigestURI url, /* TODO: not implemented yet
*
private void analyze(Document origDoc, DigestURI url,
String mimeType, String charset) { String mimeType, String charset) {
Document newDoc = new Document(url, mimeType, charset, null, null, null, singleList(""), "",
"", null, "", 0, 0, null, null, null, null, false);
// if the magic word appears in the document, perform extra actions. // if the magic word appears in the document, perform extra actions.
if (alreadyParsedDocument.getKeywords().contains("magicword")) { if (origDoc.getKeywords().contains("magicword")) {
String all = ""; String all = "";
all = "yacylatest"; all = "yacylatest";
newDoc = new Document(url, mimeType, charset, null, null, null, singleList(""), "", // TODO: append content of string all to origDoc.text, maybe use Document.mergeDocuments() to do so
"", null, "", 0, 0, all, null, null, null, false);
} }
return newDoc;
} }
*/
private Document parseAndAugment(DigestURI url, String mimeType, String charset) { private void parseAndAugment(Document origDoc, DigestURI url, String mimeType, String charset) {
String all = "";
Document newDoc = new Document(url, mimeType, charset, null, null, null, singleList(""), "",
"", null, "", 0, 0, all, null, null, null, false);
Iterator<net.yacy.kelondro.blob.Tables.Row> it; Iterator<net.yacy.kelondro.blob.Tables.Row> it;
try { try {
@ -93,20 +71,18 @@ public class AugmentParser extends AbstractParser implements Parser {
it = Switchboard.getSwitchboard().tables.orderBy(it, -1, "timestamp_creation").iterator(); it = Switchboard.getSwitchboard().tables.orderBy(it, -1, "timestamp_creation").iterator();
while (it.hasNext()) { while (it.hasNext()) {
net.yacy.kelondro.blob.Tables.Row r = it.next(); net.yacy.kelondro.blob.Tables.Row r = it.next();
if (r.get("url", "").equals (url.toNormalform(false))) { if (r.get("url", "").equals(url.toNormalform(false))) {
Set<String> tags = new HashSet<String>(); Set<String> tags = new HashSet<String>();
for (String s : YMarkUtil.keysStringToSet(r.get("scitag", ""))) { for (String s : YMarkUtil.keysStringToSet(r.get("scitag", ""))) {
tags.add(s); tags.add(s);
} }
newDoc.addTags(tags); origDoc.addTags(tags);
} }
} }
} catch (IOException e) { } catch (IOException e) {
Log.logException(e); Log.logException(e);
} }
return newDoc;
} }

Loading…
Cancel
Save