diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 4f8cd191f..10de09dfd 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -238,7 +238,8 @@ public class yacysearch { map.put("url", comp.url().toNormalform().replace(',', '|')); map.put("title", comp.descr().replace(',', ' ')); map.put("description", ((document == null) ? comp.descr() : document.getMainLongTitle()).replace(',', ' ')); - map.put("tags", ((document == null) ? "" : document.getKeywords(' '))); + map.put("author", ((document == null) ? "" : document.getAuthor())); + map.put("tags", ((document == null) ? "" : document.getKeywords(' '))); yacyCore.newsPool.publishMyNews(new yacyNewsRecord("stippadd", map)); document.close(); } diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java index 54fd5252a..e6a6610f4 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java @@ -275,8 +275,22 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen public String getTitle() { // construct a title string, even if the document has no title - // if there is one, return it - if (title.length() > 0) return title; + + // some documents have a title tag as meta tag + String s = (String) metas.get("title"); + + // try to construct the title with the content of the title tag + if (title.length() > 0) { + if (s == null) { + return title; + } else { + if ((title.compareToIgnoreCase(s) == 0) || (title.indexOf(s) >= 0)) return s; else return title + ": " + s; + } + } else { + if (s != null) { + return s; + } + } // othervise take any headline for (int i = 0; i < 4; i++) { @@ -284,7 +298,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen } // take description tag - String s = getDescription(); + s = getDescription(); if (s.length() > 0) return s; // extract headline from content @@ -336,9 +350,11 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen if (s == null) return ""; else return s; } - public String getCopyright() { - String s = (String) metas.get("copyright"); - if (s == null) return ""; else return s; + public String getAuthor() { + String s = (String) metas.get("author"); + if (s == null) s = (String) metas.get("copyright"); + if (s == null) return ""; + return s; } public String[] getContentLanguages() { diff --git a/source/de/anomic/plasma/parser/doc/docParser.java b/source/de/anomic/plasma/parser/doc/docParser.java index 92c116b4c..1ccc83991 100644 --- a/source/de/anomic/plasma/parser/doc/docParser.java +++ b/source/de/anomic/plasma/parser/doc/docParser.java @@ -97,6 +97,7 @@ implements Parser { replaceAll("\r"," "). replaceAll("\t"," "), null, + "", // TODO: AUTHOR null, null, contents.getBytes("UTF-8"), diff --git a/source/de/anomic/plasma/parser/odt/odtParser.java b/source/de/anomic/plasma/parser/odt/odtParser.java index eedbac6ec..7851f038c 100644 --- a/source/de/anomic/plasma/parser/odt/odtParser.java +++ b/source/de/anomic/plasma/parser/odt/odtParser.java @@ -173,6 +173,7 @@ public class odtParser extends AbstractParser implements Parser { docKeywords, docShortTitle, docLongTitle, + "", // TODO: AUTHOR null, docDescription, contentBytes, @@ -186,6 +187,7 @@ public class odtParser extends AbstractParser implements Parser { docKeywords, docShortTitle, docLongTitle, + "", // TODO: AUTHOR null, docDescription, writerFile, diff --git a/source/de/anomic/plasma/parser/pdf/pdfParser.java b/source/de/anomic/plasma/parser/pdf/pdfParser.java index 174d8fbd9..6cc8bf644 100644 --- a/source/de/anomic/plasma/parser/pdf/pdfParser.java +++ b/source/de/anomic/plasma/parser/pdf/pdfParser.java @@ -158,6 +158,7 @@ public class pdfParser extends AbstractParser implements Parser { docKeywords, docSubject, docTitle, + "", // TODO: AUTHOR null, null, contentBytes, @@ -171,6 +172,7 @@ public class pdfParser extends AbstractParser implements Parser { docKeywords, docSubject, docTitle, + "", // TODO: AUTHOR null, null, writerFile, diff --git a/source/de/anomic/plasma/parser/ppt/pptParser.java b/source/de/anomic/plasma/parser/ppt/pptParser.java index 456195617..223bc2ef5 100644 --- a/source/de/anomic/plasma/parser/ppt/pptParser.java +++ b/source/de/anomic/plasma/parser/ppt/pptParser.java @@ -114,6 +114,7 @@ public class pptParser extends AbstractParser implements Parser { replaceAll("\r"," "). replaceAll("\t"," "), null, + "", // TODO: AUTHOR null, null, contents.getBytes("UTF-8"), diff --git a/source/de/anomic/plasma/parser/rpm/rpmParser.java b/source/de/anomic/plasma/parser/rpm/rpmParser.java index 681a0a943..f7615e268 100644 --- a/source/de/anomic/plasma/parser/rpm/rpmParser.java +++ b/source/de/anomic/plasma/parser/rpm/rpmParser.java @@ -152,6 +152,7 @@ public class rpmParser extends AbstractParser implements Parser { null, name, summary, + "", // TODO: AUTHOR null, description, content.toString().getBytes("UTF-8"), diff --git a/source/de/anomic/plasma/parser/rss/rssParser.java b/source/de/anomic/plasma/parser/rss/rssParser.java index dbf3d11ee..b5d4ce614 100644 --- a/source/de/anomic/plasma/parser/rss/rssParser.java +++ b/source/de/anomic/plasma/parser/rss/rssParser.java @@ -190,6 +190,7 @@ public class rssParser extends AbstractParser implements Parser { null, null, feedTitle, + "", // TODO: AUTHOR (String[]) feedSections.toArray(new String[feedSections.size()]), feedDescription, text.getBytes(), diff --git a/source/de/anomic/plasma/parser/rtf/rtfParser.java b/source/de/anomic/plasma/parser/rtf/rtfParser.java index de5e3ff72..4e6f047b6 100644 --- a/source/de/anomic/plasma/parser/rtf/rtfParser.java +++ b/source/de/anomic/plasma/parser/rtf/rtfParser.java @@ -102,6 +102,7 @@ implements Parser { replaceAll("\r"," "). replaceAll("\t"," "), null, + "", // TODO: AUTHOR null, null, bodyText.getBytes("UTF-8"), diff --git a/source/de/anomic/plasma/parser/swf/swfParser.java b/source/de/anomic/plasma/parser/swf/swfParser.java index 6428b009e..d225ca40c 100644 --- a/source/de/anomic/plasma/parser/swf/swfParser.java +++ b/source/de/anomic/plasma/parser/swf/swfParser.java @@ -135,6 +135,7 @@ public class swfParser extends AbstractParser implements Parser { replaceAll("\r"," "). replaceAll("\t"," "), //short title longTitle, // a long document title + "", // TODO: AUTHOR sections, // an array of section headlines abstrct, // an abstract contents.getBytes("UTF-8"), // the parsed document text diff --git a/source/de/anomic/plasma/parser/tar/tarParser.java b/source/de/anomic/plasma/parser/tar/tarParser.java index 4f066232a..5c1604e41 100644 --- a/source/de/anomic/plasma/parser/tar/tarParser.java +++ b/source/de/anomic/plasma/parser/tar/tarParser.java @@ -213,6 +213,7 @@ public class tarParser extends AbstractParser implements Parser { docKeywords.toString().split(" |,"), docShortTitle.toString(), docLongTitle.toString(), + "", // TODO: AUTHOR (String[])docSections.toArray(new String[docSections.size()]), docAbstrct.toString(), ((serverByteBuffer)docText).toByteArray(), @@ -226,6 +227,7 @@ public class tarParser extends AbstractParser implements Parser { docKeywords.toString().split(" |,"), docShortTitle.toString(), docLongTitle.toString(), + "", // TODO: AUTHOR (String[])docSections.toArray(new String[docSections.size()]), docAbstrct.toString(), outputFile, diff --git a/source/de/anomic/plasma/parser/vcf/vcfParser.java b/source/de/anomic/plasma/parser/vcf/vcfParser.java index f553d5032..defbdc43c 100644 --- a/source/de/anomic/plasma/parser/vcf/vcfParser.java +++ b/source/de/anomic/plasma/parser/vcf/vcfParser.java @@ -247,6 +247,7 @@ public class vcfParser extends AbstractParser implements Parser { null, // a list of extracted keywords null, // a short document title parsedTitle.toString(), // a long document title + "", // TODO: AUTHOR sections, // an array of section headlines "vCard", // an abstract text, // the parsed document text diff --git a/source/de/anomic/plasma/parser/xls/xlsParser.java b/source/de/anomic/plasma/parser/xls/xlsParser.java index c5a6df98e..1eeca049b 100644 --- a/source/de/anomic/plasma/parser/xls/xlsParser.java +++ b/source/de/anomic/plasma/parser/xls/xlsParser.java @@ -142,6 +142,7 @@ public class xlsParser extends AbstractParser implements Parser, HSSFListener { replaceAll("\r"," "). replaceAll("\t"," "), null, + "", // TODO: AUTHOR null, null, contents.getBytes("UTF-8"), diff --git a/source/de/anomic/plasma/parser/zip/zipParser.java b/source/de/anomic/plasma/parser/zip/zipParser.java index e672df7dd..55e56a54a 100644 --- a/source/de/anomic/plasma/parser/zip/zipParser.java +++ b/source/de/anomic/plasma/parser/zip/zipParser.java @@ -197,6 +197,7 @@ public class zipParser extends AbstractParser implements Parser { docKeywords.toString().split(" |,"), docShortTitle.toString(), docLongTitle.toString(), + "", // TODO: AUTHOR (String[])docSections.toArray(new String[docSections.size()]), docAbstrct.toString(), ((serverByteBuffer)docText).toByteArray(), @@ -210,6 +211,7 @@ public class zipParser extends AbstractParser implements Parser { docKeywords.toString().split(" |,"), docShortTitle.toString(), docLongTitle.toString(), + "", // TODO: AUTHOR (String[])docSections.toArray(new String[docSections.size()]), docAbstrct.toString(), outputFile, diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java index 54ea93751..9c5e37535 100644 --- a/source/de/anomic/plasma/plasmaCondenser.java +++ b/source/de/anomic/plasma/plasmaCondenser.java @@ -154,7 +154,7 @@ public final class plasmaCondenser { insertTextToWords(document.getMainLongTitle(), 1, indexRWIEntryNew.flag_app_descr, wflags); insertTextToWords(document.getMainShortTitle(), 2, indexRWIEntryNew.flag_app_descr, wflags); insertTextToWords(document.getAbstract(), 3, indexRWIEntryNew.flag_app_descr, wflags); - // missing: author! + insertTextToWords(document.getAuthor(), 4, indexRWIEntryNew.flag_app_descr, wflags); // missing: tags! String[] titles = document.getSectionTitles(); for (int i = 0; i < titles.length; i++) { diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index e6d9b9486..70a9fcfd6 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -753,11 +753,19 @@ public final class plasmaParser { String[] sections = new String[scraper.getHeadlines(1).length + scraper.getHeadlines(2).length + scraper.getHeadlines(3).length + scraper.getHeadlines(4).length]; int p = 0; for (int i = 1; i <= 4; i++) for (int j = 0; j < scraper.getHeadlines(i).length; j++) sections[p++] = scraper.getHeadlines(i)[j]; - plasmaParserDocument ppd = new plasmaParserDocument(new URL(location.toNormalform()), - mimeType, charSet, scraper.getKeywords(), - scraper.getTitle(), scraper.getTitle(), - sections, scraper.getDescription(), - scraper.getText(), scraper.getAnchors(), scraper.getImages()); + plasmaParserDocument ppd = new plasmaParserDocument( + new URL(location.toNormalform()), + mimeType, + charSet, + scraper.getKeywords(), + scraper.getTitle(), + scraper.getTitle(), + scraper.getAuthor(), + sections, + scraper.getDescription(), + scraper.getText(), + scraper.getAnchors(), + scraper.getImages()); //scraper.close(); return ppd; } catch (MalformedURLException e) { diff --git a/source/de/anomic/plasma/plasmaParserDocument.java b/source/de/anomic/plasma/plasmaParserDocument.java index 787d2077c..727c828ab 100644 --- a/source/de/anomic/plasma/plasmaParserDocument.java +++ b/source/de/anomic/plasma/plasmaParserDocument.java @@ -66,6 +66,7 @@ public class plasmaParserDocument { private String[] keywords; // most resources provide a keyword field private String shortTitle; // a shortTitle mostly appears in the window header (border) private String longTitle; // the real title of the document, commonly h1-tags + private String author; // author or copyright private String[] sections; // if present: more titles/headlines appearing in the document private String abstrct; // an abstract, if present: short content description private Object text; // the clear text, all that is visible @@ -80,7 +81,7 @@ public class plasmaParserDocument { private InputStream textStream; public plasmaParserDocument(URL location, String mimeType, String charset, - String[] keywords, String shortTitle, String longTitle, + String[] keywords, String shortTitle, String longTitle, String author, String[] sections, String abstrct, byte[] text, Map anchors, TreeSet images) { this.location = location; @@ -89,6 +90,7 @@ public class plasmaParserDocument { this.keywords = (keywords==null) ? new String[0] : keywords; this.shortTitle = (shortTitle==null)?"":shortTitle; this.longTitle = (longTitle==null)?"":longTitle; + this.author = (author==null)?"":author; this.sections = (sections==null)?new String[0]:sections; this.abstrct = (abstrct==null)?"":abstrct; this.text = (text==null)?new byte[0]:text; @@ -103,7 +105,7 @@ public class plasmaParserDocument { } public plasmaParserDocument(URL location, String mimeType, String charset, - String[] keywords, String shortTitle, String longTitle, + String[] keywords, String shortTitle, String longTitle, String author, String[] sections, String abstrct, File text, Map anchors, TreeSet images) { this.location = location; @@ -112,6 +114,7 @@ public class plasmaParserDocument { this.keywords = (keywords==null) ? new String[0] : keywords; this.shortTitle = (shortTitle==null)?"":shortTitle; this.longTitle = (longTitle==null)?"":longTitle; + this.author = (author==null)?"":author; this.sections = (sections==null)?new String[0]:sections; this.abstrct = (abstrct==null)?"":abstrct; this.text = text; @@ -157,6 +160,10 @@ public class plasmaParserDocument { if (abstrct != null) return abstrct; else return getMainLongTitle(); } + public String getAuthor() { + if (author != null) return author; else return ""; + } + public InputStream getText() { try { if (this.text == null) return null; diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 194b5d73a..f916ed0f7 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -1645,6 +1645,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser return false; } + if ((sbQueue.size() == 0) && ((getThread(CRAWLJOB_LOCAL_CRAWL).getJobCount() == 0))) setPerformance((int) Math.max(120, 60000 / getConfigLong(INDEX_DIST_BUSYSLEEP, 6000))); // if there is no activity, set low performance + // flush some entries from the RAM cache if (sbQueue.size() == 0) wordIndex.flushCacheSome(); // permanent flushing only if we are not busy wordIndex.loadedURL.flushCacheSome(); @@ -2227,8 +2229,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser indexURLEntry newEntry = wordIndex.loadedURL.newEntry( entry.url(), // URL docDescription, // document description - "", // author - "", // tags + document.getAuthor(), // author + document.getKeywords(' '), // tags "", // ETag docDate, // modification date new Date(), // loaded date diff --git a/yacy.init b/yacy.init index 0118940db..87fdd69eb 100644 --- a/yacy.init +++ b/yacy.init @@ -503,7 +503,7 @@ filterOutStopwordsFromTopwords=true 80_indexing_busysleep=50 80_indexing_memprereq=4194304 82_crawlstack_idlesleep=5000 -82_crawlstack_busysleep=0 +82_crawlstack_busysleep=10 82_crawlstack_memprereq=1048576 90_cleanup_idlesleep=300000 90_cleanup_busysleep=300000