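Added author tag to indexing content: the HTML "author" meta tag (falling back to "copyright") is now stored with each parsed document and indexed.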

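Enhanced composition of the title: the content of the title tag and the "title" meta tag are now combined when they differ.
TODO: insert author information for external parsers (they currently pass an empty author string).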

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3488 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 18 years ago
parent 6be57983a8
commit a738b57b31

@ -238,7 +238,8 @@ public class yacysearch {
map.put("url", comp.url().toNormalform().replace(',', '|')); map.put("url", comp.url().toNormalform().replace(',', '|'));
map.put("title", comp.descr().replace(',', ' ')); map.put("title", comp.descr().replace(',', ' '));
map.put("description", ((document == null) ? comp.descr() : document.getMainLongTitle()).replace(',', ' ')); map.put("description", ((document == null) ? comp.descr() : document.getMainLongTitle()).replace(',', ' '));
map.put("tags", ((document == null) ? "" : document.getKeywords(' '))); map.put("author", ((document == null) ? "" : document.getAuthor()));
map.put("tags", ((document == null) ? "" : document.getKeywords(' ')));
yacyCore.newsPool.publishMyNews(new yacyNewsRecord("stippadd", map)); yacyCore.newsPool.publishMyNews(new yacyNewsRecord("stippadd", map));
document.close(); document.close();
} }

@ -275,8 +275,22 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
public String getTitle() { public String getTitle() {
// construct a title string, even if the document has no title // construct a title string, even if the document has no title
// if there is one, return it
if (title.length() > 0) return title; // some documents have a title tag as meta tag
String s = (String) metas.get("title");
// try to construct the title with the content of the title tag
if (title.length() > 0) {
if (s == null) {
return title;
} else {
if ((title.compareToIgnoreCase(s) == 0) || (title.indexOf(s) >= 0)) return s; else return title + ": " + s;
}
} else {
if (s != null) {
return s;
}
}
// othervise take any headline // othervise take any headline
for (int i = 0; i < 4; i++) { for (int i = 0; i < 4; i++) {
@ -284,7 +298,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
} }
// take description tag // take description tag
String s = getDescription(); s = getDescription();
if (s.length() > 0) return s; if (s.length() > 0) return s;
// extract headline from content // extract headline from content
@ -336,9 +350,11 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
if (s == null) return ""; else return s; if (s == null) return ""; else return s;
} }
public String getCopyright() { public String getAuthor() {
String s = (String) metas.get("copyright"); String s = (String) metas.get("author");
if (s == null) return ""; else return s; if (s == null) s = (String) metas.get("copyright");
if (s == null) return "";
return s;
} }
public String[] getContentLanguages() { public String[] getContentLanguages() {

@ -97,6 +97,7 @@ implements Parser {
replaceAll("\r"," "). replaceAll("\r"," ").
replaceAll("\t"," "), replaceAll("\t"," "),
null, null,
"", // TODO: AUTHOR
null, null,
null, null,
contents.getBytes("UTF-8"), contents.getBytes("UTF-8"),

@ -173,6 +173,7 @@ public class odtParser extends AbstractParser implements Parser {
docKeywords, docKeywords,
docShortTitle, docShortTitle,
docLongTitle, docLongTitle,
"", // TODO: AUTHOR
null, null,
docDescription, docDescription,
contentBytes, contentBytes,
@ -186,6 +187,7 @@ public class odtParser extends AbstractParser implements Parser {
docKeywords, docKeywords,
docShortTitle, docShortTitle,
docLongTitle, docLongTitle,
"", // TODO: AUTHOR
null, null,
docDescription, docDescription,
writerFile, writerFile,

@ -158,6 +158,7 @@ public class pdfParser extends AbstractParser implements Parser {
docKeywords, docKeywords,
docSubject, docSubject,
docTitle, docTitle,
"", // TODO: AUTHOR
null, null,
null, null,
contentBytes, contentBytes,
@ -171,6 +172,7 @@ public class pdfParser extends AbstractParser implements Parser {
docKeywords, docKeywords,
docSubject, docSubject,
docTitle, docTitle,
"", // TODO: AUTHOR
null, null,
null, null,
writerFile, writerFile,

@ -114,6 +114,7 @@ public class pptParser extends AbstractParser implements Parser {
replaceAll("\r"," "). replaceAll("\r"," ").
replaceAll("\t"," "), replaceAll("\t"," "),
null, null,
"", // TODO: AUTHOR
null, null,
null, null,
contents.getBytes("UTF-8"), contents.getBytes("UTF-8"),

@ -152,6 +152,7 @@ public class rpmParser extends AbstractParser implements Parser {
null, null,
name, name,
summary, summary,
"", // TODO: AUTHOR
null, null,
description, description,
content.toString().getBytes("UTF-8"), content.toString().getBytes("UTF-8"),

@ -190,6 +190,7 @@ public class rssParser extends AbstractParser implements Parser {
null, null,
null, null,
feedTitle, feedTitle,
"", // TODO: AUTHOR
(String[]) feedSections.toArray(new String[feedSections.size()]), (String[]) feedSections.toArray(new String[feedSections.size()]),
feedDescription, feedDescription,
text.getBytes(), text.getBytes(),

@ -102,6 +102,7 @@ implements Parser {
replaceAll("\r"," "). replaceAll("\r"," ").
replaceAll("\t"," "), replaceAll("\t"," "),
null, null,
"", // TODO: AUTHOR
null, null,
null, null,
bodyText.getBytes("UTF-8"), bodyText.getBytes("UTF-8"),

@ -135,6 +135,7 @@ public class swfParser extends AbstractParser implements Parser {
replaceAll("\r"," "). replaceAll("\r"," ").
replaceAll("\t"," "), //short title replaceAll("\t"," "), //short title
longTitle, // a long document title longTitle, // a long document title
"", // TODO: AUTHOR
sections, // an array of section headlines sections, // an array of section headlines
abstrct, // an abstract abstrct, // an abstract
contents.getBytes("UTF-8"), // the parsed document text contents.getBytes("UTF-8"), // the parsed document text

@ -213,6 +213,7 @@ public class tarParser extends AbstractParser implements Parser {
docKeywords.toString().split(" |,"), docKeywords.toString().split(" |,"),
docShortTitle.toString(), docShortTitle.toString(),
docLongTitle.toString(), docLongTitle.toString(),
"", // TODO: AUTHOR
(String[])docSections.toArray(new String[docSections.size()]), (String[])docSections.toArray(new String[docSections.size()]),
docAbstrct.toString(), docAbstrct.toString(),
((serverByteBuffer)docText).toByteArray(), ((serverByteBuffer)docText).toByteArray(),
@ -226,6 +227,7 @@ public class tarParser extends AbstractParser implements Parser {
docKeywords.toString().split(" |,"), docKeywords.toString().split(" |,"),
docShortTitle.toString(), docShortTitle.toString(),
docLongTitle.toString(), docLongTitle.toString(),
"", // TODO: AUTHOR
(String[])docSections.toArray(new String[docSections.size()]), (String[])docSections.toArray(new String[docSections.size()]),
docAbstrct.toString(), docAbstrct.toString(),
outputFile, outputFile,

@ -247,6 +247,7 @@ public class vcfParser extends AbstractParser implements Parser {
null, // a list of extracted keywords null, // a list of extracted keywords
null, // a short document title null, // a short document title
parsedTitle.toString(), // a long document title parsedTitle.toString(), // a long document title
"", // TODO: AUTHOR
sections, // an array of section headlines sections, // an array of section headlines
"vCard", // an abstract "vCard", // an abstract
text, // the parsed document text text, // the parsed document text

@ -142,6 +142,7 @@ public class xlsParser extends AbstractParser implements Parser, HSSFListener {
replaceAll("\r"," "). replaceAll("\r"," ").
replaceAll("\t"," "), replaceAll("\t"," "),
null, null,
"", // TODO: AUTHOR
null, null,
null, null,
contents.getBytes("UTF-8"), contents.getBytes("UTF-8"),

@ -197,6 +197,7 @@ public class zipParser extends AbstractParser implements Parser {
docKeywords.toString().split(" |,"), docKeywords.toString().split(" |,"),
docShortTitle.toString(), docShortTitle.toString(),
docLongTitle.toString(), docLongTitle.toString(),
"", // TODO: AUTHOR
(String[])docSections.toArray(new String[docSections.size()]), (String[])docSections.toArray(new String[docSections.size()]),
docAbstrct.toString(), docAbstrct.toString(),
((serverByteBuffer)docText).toByteArray(), ((serverByteBuffer)docText).toByteArray(),
@ -210,6 +211,7 @@ public class zipParser extends AbstractParser implements Parser {
docKeywords.toString().split(" |,"), docKeywords.toString().split(" |,"),
docShortTitle.toString(), docShortTitle.toString(),
docLongTitle.toString(), docLongTitle.toString(),
"", // TODO: AUTHOR
(String[])docSections.toArray(new String[docSections.size()]), (String[])docSections.toArray(new String[docSections.size()]),
docAbstrct.toString(), docAbstrct.toString(),
outputFile, outputFile,

@ -154,7 +154,7 @@ public final class plasmaCondenser {
insertTextToWords(document.getMainLongTitle(), 1, indexRWIEntryNew.flag_app_descr, wflags); insertTextToWords(document.getMainLongTitle(), 1, indexRWIEntryNew.flag_app_descr, wflags);
insertTextToWords(document.getMainShortTitle(), 2, indexRWIEntryNew.flag_app_descr, wflags); insertTextToWords(document.getMainShortTitle(), 2, indexRWIEntryNew.flag_app_descr, wflags);
insertTextToWords(document.getAbstract(), 3, indexRWIEntryNew.flag_app_descr, wflags); insertTextToWords(document.getAbstract(), 3, indexRWIEntryNew.flag_app_descr, wflags);
// missing: author! insertTextToWords(document.getAuthor(), 4, indexRWIEntryNew.flag_app_descr, wflags);
// missing: tags! // missing: tags!
String[] titles = document.getSectionTitles(); String[] titles = document.getSectionTitles();
for (int i = 0; i < titles.length; i++) { for (int i = 0; i < titles.length; i++) {

@ -753,11 +753,19 @@ public final class plasmaParser {
String[] sections = new String[scraper.getHeadlines(1).length + scraper.getHeadlines(2).length + scraper.getHeadlines(3).length + scraper.getHeadlines(4).length]; String[] sections = new String[scraper.getHeadlines(1).length + scraper.getHeadlines(2).length + scraper.getHeadlines(3).length + scraper.getHeadlines(4).length];
int p = 0; int p = 0;
for (int i = 1; i <= 4; i++) for (int j = 0; j < scraper.getHeadlines(i).length; j++) sections[p++] = scraper.getHeadlines(i)[j]; for (int i = 1; i <= 4; i++) for (int j = 0; j < scraper.getHeadlines(i).length; j++) sections[p++] = scraper.getHeadlines(i)[j];
plasmaParserDocument ppd = new plasmaParserDocument(new URL(location.toNormalform()), plasmaParserDocument ppd = new plasmaParserDocument(
mimeType, charSet, scraper.getKeywords(), new URL(location.toNormalform()),
scraper.getTitle(), scraper.getTitle(), mimeType,
sections, scraper.getDescription(), charSet,
scraper.getText(), scraper.getAnchors(), scraper.getImages()); scraper.getKeywords(),
scraper.getTitle(),
scraper.getTitle(),
scraper.getAuthor(),
sections,
scraper.getDescription(),
scraper.getText(),
scraper.getAnchors(),
scraper.getImages());
//scraper.close(); //scraper.close();
return ppd; return ppd;
} catch (MalformedURLException e) { } catch (MalformedURLException e) {

@ -66,6 +66,7 @@ public class plasmaParserDocument {
private String[] keywords; // most resources provide a keyword field private String[] keywords; // most resources provide a keyword field
private String shortTitle; // a shortTitle mostly appears in the window header (border) private String shortTitle; // a shortTitle mostly appears in the window header (border)
private String longTitle; // the real title of the document, commonly h1-tags private String longTitle; // the real title of the document, commonly h1-tags
private String author; // author or copyright
private String[] sections; // if present: more titles/headlines appearing in the document private String[] sections; // if present: more titles/headlines appearing in the document
private String abstrct; // an abstract, if present: short content description private String abstrct; // an abstract, if present: short content description
private Object text; // the clear text, all that is visible private Object text; // the clear text, all that is visible
@ -80,7 +81,7 @@ public class plasmaParserDocument {
private InputStream textStream; private InputStream textStream;
public plasmaParserDocument(URL location, String mimeType, String charset, public plasmaParserDocument(URL location, String mimeType, String charset,
String[] keywords, String shortTitle, String longTitle, String[] keywords, String shortTitle, String longTitle, String author,
String[] sections, String abstrct, String[] sections, String abstrct,
byte[] text, Map anchors, TreeSet images) { byte[] text, Map anchors, TreeSet images) {
this.location = location; this.location = location;
@ -89,6 +90,7 @@ public class plasmaParserDocument {
this.keywords = (keywords==null) ? new String[0] : keywords; this.keywords = (keywords==null) ? new String[0] : keywords;
this.shortTitle = (shortTitle==null)?"":shortTitle; this.shortTitle = (shortTitle==null)?"":shortTitle;
this.longTitle = (longTitle==null)?"":longTitle; this.longTitle = (longTitle==null)?"":longTitle;
this.author = (author==null)?"":author;
this.sections = (sections==null)?new String[0]:sections; this.sections = (sections==null)?new String[0]:sections;
this.abstrct = (abstrct==null)?"":abstrct; this.abstrct = (abstrct==null)?"":abstrct;
this.text = (text==null)?new byte[0]:text; this.text = (text==null)?new byte[0]:text;
@ -103,7 +105,7 @@ public class plasmaParserDocument {
} }
public plasmaParserDocument(URL location, String mimeType, String charset, public plasmaParserDocument(URL location, String mimeType, String charset,
String[] keywords, String shortTitle, String longTitle, String[] keywords, String shortTitle, String longTitle, String author,
String[] sections, String abstrct, String[] sections, String abstrct,
File text, Map anchors, TreeSet images) { File text, Map anchors, TreeSet images) {
this.location = location; this.location = location;
@ -112,6 +114,7 @@ public class plasmaParserDocument {
this.keywords = (keywords==null) ? new String[0] : keywords; this.keywords = (keywords==null) ? new String[0] : keywords;
this.shortTitle = (shortTitle==null)?"":shortTitle; this.shortTitle = (shortTitle==null)?"":shortTitle;
this.longTitle = (longTitle==null)?"":longTitle; this.longTitle = (longTitle==null)?"":longTitle;
this.author = (author==null)?"":author;
this.sections = (sections==null)?new String[0]:sections; this.sections = (sections==null)?new String[0]:sections;
this.abstrct = (abstrct==null)?"":abstrct; this.abstrct = (abstrct==null)?"":abstrct;
this.text = text; this.text = text;
@ -157,6 +160,10 @@ public class plasmaParserDocument {
if (abstrct != null) return abstrct; else return getMainLongTitle(); if (abstrct != null) return abstrct; else return getMainLongTitle();
} }
public String getAuthor() {
if (author != null) return author; else return "";
}
public InputStream getText() { public InputStream getText() {
try { try {
if (this.text == null) return null; if (this.text == null) return null;

@ -1645,6 +1645,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
return false; return false;
} }
if ((sbQueue.size() == 0) && ((getThread(CRAWLJOB_LOCAL_CRAWL).getJobCount() == 0))) setPerformance((int) Math.max(120, 60000 / getConfigLong(INDEX_DIST_BUSYSLEEP, 6000))); // if there is no activity, set low performance
// flush some entries from the RAM cache // flush some entries from the RAM cache
if (sbQueue.size() == 0) wordIndex.flushCacheSome(); // permanent flushing only if we are not busy if (sbQueue.size() == 0) wordIndex.flushCacheSome(); // permanent flushing only if we are not busy
wordIndex.loadedURL.flushCacheSome(); wordIndex.loadedURL.flushCacheSome();
@ -2227,8 +2229,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
indexURLEntry newEntry = wordIndex.loadedURL.newEntry( indexURLEntry newEntry = wordIndex.loadedURL.newEntry(
entry.url(), // URL entry.url(), // URL
docDescription, // document description docDescription, // document description
"", // author document.getAuthor(), // author
"", // tags document.getKeywords(' '), // tags
"", // ETag "", // ETag
docDate, // modification date docDate, // modification date
new Date(), // loaded date new Date(), // loaded date

@ -503,7 +503,7 @@ filterOutStopwordsFromTopwords=true
80_indexing_busysleep=50 80_indexing_busysleep=50
80_indexing_memprereq=4194304 80_indexing_memprereq=4194304
82_crawlstack_idlesleep=5000 82_crawlstack_idlesleep=5000
82_crawlstack_busysleep=0 82_crawlstack_busysleep=10
82_crawlstack_memprereq=1048576 82_crawlstack_memprereq=1048576
90_cleanup_idlesleep=300000 90_cleanup_idlesleep=300000
90_cleanup_busysleep=300000 90_cleanup_busysleep=300000

Loading…
Cancel
Save