added author tag to indexing content

enhanced composition of title tag. TODO: insert author information for external parsers. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3488 6c8d7289-2bf4-0310-a012-ef5d649a1542
18 years ago · a738b57b31
parent 6be57983a8
commit a738b57b31
19 changed files with 68 additions and 18 deletions
--- a/htroot/yacysearch.java
+++ b/htroot/yacysearch.java
@ -238,7 +238,8 @@ public class yacysearch {
                    map.put("url", comp.url().toNormalform().replace(',', '|'));
                    map.put("title", comp.descr().replace(',', ' '));
                    map.put("description", ((document == null) ? comp.descr() : document.getMainLongTitle()).replace(',', ' '));
-                    map.put("tags",  ((document == null) ? "" : document.getKeywords(' ')));
+                    map.put("author", ((document == null) ? "" : document.getAuthor()));
+                    map.put("tags", ((document == null) ? "" : document.getKeywords(' ')));
                    yacyCore.newsPool.publishMyNews(new yacyNewsRecord("stippadd", map));
                    document.close();
                }
--- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
@ -275,8 +275,22 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
    
    public String getTitle() {
        // construct a title string, even if the document has no title
-        // if there is one, return it
-        if (title.length() > 0) return title;
+        
+        // some documents have a title tag as meta tag
+        String s = (String) metas.get("title");
+        
+        // try to construct the title with the content of the title tag
+        if (title.length() > 0) {
+            if (s == null) {
+                return title;
+            } else {
+                if ((title.compareToIgnoreCase(s) == 0) || (title.indexOf(s) >= 0)) return s; else return title + ": " + s;
+            }
+        } else {
+            if (s != null) {
+                return s;
+            }
+        }
        
        // othervise take any headline
        for (int i = 0; i < 4; i++) {
@ -284,7 +298,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
        }
        
        // take description tag
-        String s = getDescription();
+        s = getDescription();
        if (s.length() > 0) return s;
        
        // extract headline from content
@ -336,9 +350,11 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
        if (s == null) return ""; else return s;
    }
    
-    public String getCopyright() {
-        String s = (String) metas.get("copyright");
-        if (s == null) return ""; else return s;
+    public String getAuthor() {
+        String s = (String) metas.get("author");
+        if (s == null) s = (String) metas.get("copyright");
+        if (s == null) return "";
+        return s;
    }
    
    public String[] getContentLanguages() {
--- a/source/de/anomic/plasma/parser/doc/docParser.java
+++ b/source/de/anomic/plasma/parser/doc/docParser.java
@ -97,6 +97,7 @@ implements Parser {
                          replaceAll("\r"," ").
                          replaceAll("\t"," "),
                      null,
+                      "", // TODO: AUTHOR
                      null,
                      null,
                      contents.getBytes("UTF-8"),
--- a/source/de/anomic/plasma/parser/odt/odtParser.java
+++ b/source/de/anomic/plasma/parser/odt/odtParser.java
@ -173,6 +173,7 @@ public class odtParser extends AbstractParser implements Parser {
                        docKeywords,
                        docShortTitle, 
                        docLongTitle,
+                        "", // TODO: AUTHOR
                        null,
                        docDescription,
                        contentBytes,
@ -186,6 +187,7 @@ public class odtParser extends AbstractParser implements Parser {
                        docKeywords,
                        docShortTitle, 
                        docLongTitle,
+                        "", // TODO: AUTHOR
                        null,
                        docDescription,
                        writerFile,
--- a/source/de/anomic/plasma/parser/pdf/pdfParser.java
+++ b/source/de/anomic/plasma/parser/pdf/pdfParser.java
@ -158,6 +158,7 @@ public class pdfParser extends AbstractParser implements Parser {
                        docKeywords,
                        docSubject,
                        docTitle,
+                        "", // TODO: AUTHOR
                        null,
                        null,
                        contentBytes,
@ -171,6 +172,7 @@ public class pdfParser extends AbstractParser implements Parser {
                        docKeywords,
                        docSubject,
                        docTitle,
+                        "", // TODO: AUTHOR
                        null,
                        null,
                        writerFile,
--- a/source/de/anomic/plasma/parser/ppt/pptParser.java
+++ b/source/de/anomic/plasma/parser/ppt/pptParser.java
@ -114,6 +114,7 @@ public class pptParser extends AbstractParser implements Parser {
                    replaceAll("\r"," ").
                    replaceAll("\t"," "),
                    null,
+                    "", // TODO: AUTHOR
                    null,
                    null,
                    contents.getBytes("UTF-8"),
--- a/source/de/anomic/plasma/parser/rpm/rpmParser.java
+++ b/source/de/anomic/plasma/parser/rpm/rpmParser.java
@ -152,6 +152,7 @@ public class rpmParser extends AbstractParser implements Parser {
                    null,
                    name,
                    summary,
+                    "", // TODO: AUTHOR
                    null,
                    description,
                    content.toString().getBytes("UTF-8"),
--- a/source/de/anomic/plasma/parser/rss/rssParser.java
+++ b/source/de/anomic/plasma/parser/rss/rssParser.java
@ -190,6 +190,7 @@ public class rssParser extends AbstractParser implements Parser {
                    null,
                    null,
                    feedTitle,
+                    "", // TODO: AUTHOR
                    (String[]) feedSections.toArray(new String[feedSections.size()]),
                    feedDescription,
                    text.getBytes(),
--- a/source/de/anomic/plasma/parser/rtf/rtfParser.java
+++ b/source/de/anomic/plasma/parser/rtf/rtfParser.java
@ -102,6 +102,7 @@ implements Parser {
                        replaceAll("\r"," ").
                        replaceAll("\t"," "),
                    null,
+                    "", // TODO: AUTHOR
                    null,
                    null,
                    bodyText.getBytes("UTF-8"),
--- a/source/de/anomic/plasma/parser/swf/swfParser.java
+++ b/source/de/anomic/plasma/parser/swf/swfParser.java
@ -135,6 +135,7 @@ public class swfParser extends AbstractParser implements Parser {
                          replaceAll("\r"," ").
                          replaceAll("\t"," "), //short title
                    longTitle,    // a long document title
+                    "", // TODO: AUTHOR
                    sections,     // an array of section headlines
                    abstrct,     // an abstract
                    contents.getBytes("UTF-8"),     // the parsed document text
--- a/source/de/anomic/plasma/parser/tar/tarParser.java
+++ b/source/de/anomic/plasma/parser/tar/tarParser.java
@ -213,6 +213,7 @@ public class tarParser extends AbstractParser implements Parser {
                    docKeywords.toString().split(" |,"),
                    docShortTitle.toString(), 
                    docLongTitle.toString(),
+                    "", // TODO: AUTHOR
                    (String[])docSections.toArray(new String[docSections.size()]),
                    docAbstrct.toString(),
                    ((serverByteBuffer)docText).toByteArray(),
@ -226,6 +227,7 @@ public class tarParser extends AbstractParser implements Parser {
                        docKeywords.toString().split(" |,"),
                        docShortTitle.toString(), 
                        docLongTitle.toString(),
+                        "", // TODO: AUTHOR
                        (String[])docSections.toArray(new String[docSections.size()]),
                        docAbstrct.toString(),
                        outputFile,
--- a/source/de/anomic/plasma/parser/vcf/vcfParser.java
+++ b/source/de/anomic/plasma/parser/vcf/vcfParser.java
@ -247,6 +247,7 @@ public class vcfParser extends AbstractParser implements Parser {
                    null,                       // a list of extracted keywords
                    null,                       // a short document title
                    parsedTitle.toString(),     // a long document title
+                    "",                         // TODO: AUTHOR
                    sections,                   // an array of section headlines
                    "vCard",                    // an abstract
                    text,                       // the parsed document text
--- a/source/de/anomic/plasma/parser/xls/xlsParser.java
+++ b/source/de/anomic/plasma/parser/xls/xlsParser.java
@ -142,6 +142,7 @@ public class xlsParser extends AbstractParser implements Parser, HSSFListener {
                    replaceAll("\r"," ").
                    replaceAll("\t"," "),
                    null,
+                    "", // TODO: AUTHOR
                    null,
                    null,
                    contents.getBytes("UTF-8"),
--- a/source/de/anomic/plasma/parser/zip/zipParser.java
+++ b/source/de/anomic/plasma/parser/zip/zipParser.java
@ -197,6 +197,7 @@ public class zipParser extends AbstractParser implements Parser {
                    docKeywords.toString().split(" |,"),
                    docShortTitle.toString(), 
                    docLongTitle.toString(),
+                    "", // TODO: AUTHOR
                    (String[])docSections.toArray(new String[docSections.size()]),
                    docAbstrct.toString(),
                    ((serverByteBuffer)docText).toByteArray(),
@ -210,6 +211,7 @@ public class zipParser extends AbstractParser implements Parser {
                        docKeywords.toString().split(" |,"),
                        docShortTitle.toString(), 
                        docLongTitle.toString(),
+                        "", // TODO: AUTHOR
                        (String[])docSections.toArray(new String[docSections.size()]),
                        docAbstrct.toString(),
                        outputFile,
--- a/source/de/anomic/plasma/plasmaCondenser.java
+++ b/source/de/anomic/plasma/plasmaCondenser.java
@ -154,7 +154,7 @@ public final class plasmaCondenser {
            insertTextToWords(document.getMainLongTitle(),  1, indexRWIEntryNew.flag_app_descr, wflags);
            insertTextToWords(document.getMainShortTitle(), 2, indexRWIEntryNew.flag_app_descr, wflags);
            insertTextToWords(document.getAbstract(),       3, indexRWIEntryNew.flag_app_descr, wflags);
-            // missing: author!
+            insertTextToWords(document.getAuthor(),         4, indexRWIEntryNew.flag_app_descr, wflags);
            // missing: tags!
            String[] titles = document.getSectionTitles();
            for (int i = 0; i < titles.length; i++) {
--- a/source/de/anomic/plasma/plasmaParser.java
+++ b/source/de/anomic/plasma/plasmaParser.java
@ -753,11 +753,19 @@ public final class plasmaParser {
            String[] sections = new String[scraper.getHeadlines(1).length + scraper.getHeadlines(2).length + scraper.getHeadlines(3).length + scraper.getHeadlines(4).length];
            int p = 0;
            for (int i = 1; i <= 4; i++) for (int j = 0; j < scraper.getHeadlines(i).length; j++) sections[p++] = scraper.getHeadlines(i)[j];
-            plasmaParserDocument ppd =  new plasmaParserDocument(new URL(location.toNormalform()),
-                                mimeType, charSet, scraper.getKeywords(),
-                                scraper.getTitle(), scraper.getTitle(),
-                                sections, scraper.getDescription(),
-                                scraper.getText(), scraper.getAnchors(), scraper.getImages());
+            plasmaParserDocument ppd =  new plasmaParserDocument(
+                    new URL(location.toNormalform()),
+                    mimeType,
+                    charSet,
+                    scraper.getKeywords(),
+                    scraper.getTitle(),
+                    scraper.getTitle(),
+                    scraper.getAuthor(),
+                    sections,
+                    scraper.getDescription(),
+                    scraper.getText(),
+                    scraper.getAnchors(),
+                    scraper.getImages());
            //scraper.close();
            return ppd;
        } catch (MalformedURLException e) {
--- a/source/de/anomic/plasma/plasmaParserDocument.java
+++ b/source/de/anomic/plasma/plasmaParserDocument.java
@ -66,6 +66,7 @@ public class plasmaParserDocument {
    private String[] keywords;  // most resources provide a keyword field
    private String shortTitle;  // a shortTitle mostly appears in the window header (border)
    private String longTitle;   // the real title of the document, commonly h1-tags
+    private String author;      // author or copyright
    private String[] sections;  // if present: more titles/headlines appearing in the document
    private String abstrct;     // an abstract, if present: short content description
    private Object text;  // the clear text, all that is visible
@ -80,7 +81,7 @@ public class plasmaParserDocument {
    private InputStream textStream; 
                    
    public plasmaParserDocument(URL location, String mimeType, String charset,
-                    String[] keywords, String shortTitle, String longTitle,
+                    String[] keywords, String shortTitle, String longTitle, String author,
                    String[] sections, String abstrct,
                    byte[] text, Map anchors, TreeSet images) {
        this.location = location;
@ -89,6 +90,7 @@ public class plasmaParserDocument {
        this.keywords = (keywords==null) ? new String[0] : keywords;
        this.shortTitle = (shortTitle==null)?"":shortTitle;
        this.longTitle = (longTitle==null)?"":longTitle;
+        this.author = (author==null)?"":author;
        this.sections = (sections==null)?new String[0]:sections;
        this.abstrct = (abstrct==null)?"":abstrct;
        this.text = (text==null)?new byte[0]:text;
@ -103,7 +105,7 @@ public class plasmaParserDocument {
    }
    
    public plasmaParserDocument(URL location, String mimeType, String charset,
-            String[] keywords, String shortTitle, String longTitle,
+            String[] keywords, String shortTitle, String longTitle, String author,
            String[] sections, String abstrct,
            File text, Map anchors, TreeSet images) {
        this.location = location;
@ -112,6 +114,7 @@ public class plasmaParserDocument {
        this.keywords = (keywords==null) ? new String[0] : keywords;
        this.shortTitle = (shortTitle==null)?"":shortTitle;
        this.longTitle = (longTitle==null)?"":longTitle;
+        this.author = (author==null)?"":author;
        this.sections = (sections==null)?new String[0]:sections;
        this.abstrct = (abstrct==null)?"":abstrct;
        this.text = text;
@ -157,6 +160,10 @@ public class plasmaParserDocument {
        if (abstrct != null) return abstrct; else return getMainLongTitle();
    }
    
+    public String getAuthor() {
+        if (author != null) return author; else return "";
+    }
+    
    public InputStream getText() {
        try {
            if (this.text == null) return null;
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@ -1645,6 +1645,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                return false;
            }

+            if ((sbQueue.size() == 0) && ((getThread(CRAWLJOB_LOCAL_CRAWL).getJobCount() == 0))) setPerformance((int) Math.max(120, 60000 / getConfigLong(INDEX_DIST_BUSYSLEEP, 6000))); // if there is no activity, set low performance
+            
            // flush some entries from the RAM cache
            if (sbQueue.size() == 0) wordIndex.flushCacheSome(); // permanent flushing only if we are not busy
            wordIndex.loadedURL.flushCacheSome();
@ -2227,8 +2229,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                    indexURLEntry newEntry = wordIndex.loadedURL.newEntry(
                            entry.url(),                               // URL
                            docDescription,                            // document description
-                            "",                                        // author
-                            "",                                        // tags
+                            document.getAuthor(),                      // author
+                            document.getKeywords(' '),                 // tags
                            "",                                        // ETag
                            docDate,                                   // modification date
                            new Date(),                                // loaded date
--- a/yacy.init
+++ b/yacy.init
@ -503,7 +503,7 @@ filterOutStopwordsFromTopwords=true
 80_indexing_busysleep=50
 80_indexing_memprereq=4194304
 82_crawlstack_idlesleep=5000
-82_crawlstack_busysleep=0
+82_crawlstack_busysleep=10
 82_crawlstack_memprereq=1048576
 90_cleanup_idlesleep=300000
 90_cleanup_busysleep=300000