- added parsing of the robots meta tag in HTML headers to detect a noindex request

- added evaluation and prevention of indexing in case a noindex directive is given in an HTML file git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6709 6c8d7289-2bf4-0310-a012-ef5d649a1542
15 years ago · 54af9e6b49
parent f336ed568d
commit 54af9e6b49
26 changed files with 97 additions and 67 deletions
--- a/source/de/anomic/search/Switchboard.java
+++ b/source/de/anomic/search/Switchboard.java
@ -1677,8 +1677,15 @@ public final class Switchboard extends serverSwitch {
    public indexingQueueEntry condenseDocument(final indexingQueueEntry in) {
        in.queueEntry.updateStatus(Response.QUEUE_STATE_CONDENSING);
        
-        // debug
-        if (log.isFinest()) log.logFinest("CONDENSE "+ in.queueEntry.toString());
+        if (in.document.indexingDenied()) {
+            if (log.isInfo()) log.logInfo("Not Condensed Resource '" + in.queueEntry.url().toNormalform(false, true) + "': denied by document-attached noindexing rule");
+            return new indexingQueueEntry(in.process, in.queueEntry, in.document, null);
+        }
+        
+        if (!in.queueEntry.profile().indexText() && !in.queueEntry.profile().indexMedia()) {
+            if (log.isInfo()) log.logInfo("Not Condensed Resource '" + in.queueEntry.url().toNormalform(false, true) + "': indexing not wanted by crawl profile");
+            return new indexingQueueEntry(in.process, in.queueEntry, in.document, null);
+        }
        
        // strip out words and generate statistics
        if (this.log.isFine()) log.logFine("Condensing for '" + in.queueEntry.url().toNormalform(false, true) + "'");
@ -1719,10 +1726,22 @@ public final class Switchboard extends serverSwitch {
        EventOrigin processCase = queueEntry.processCase(peers.mySeed().hash);
        if (process == Segments.Process.SURROGATES) processCase = EventOrigin.SURROGATES;

+        if (condenser == null || document.indexingDenied()) {
+            if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by rule in document, process case=" + processCase);
+            addURLtoErrorDB(queueEntry.url(), referrerURL.hash(), queueEntry.initiator(), dc_title, "unknown indexing process case"  + processCase);
+            return;
+        }
+        
+        if (!queueEntry.profile().indexText() && !queueEntry.profile().indexMedia()) {
+            if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by profile rule, process case=" + processCase);
+            addURLtoErrorDB(queueEntry.url(), referrerURL.hash(), queueEntry.initiator(), dc_title, "unknown indexing process case"  + processCase);
+            return;
+        }
+        
        // remove stopwords
        log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + queueEntry.url());

-        // STORE URL TO LOADED-URL-DB
+        // STORE WORD INDEX
        URIMetadataRow newEntry = null;
        try {
            newEntry = indexSegments.segment(process).storeDocument(
@ -1747,13 +1766,6 @@ public final class Switchboard extends serverSwitch {
                processCase               // process case
        );
        
-        // STORE WORD INDEX
-        if ((!queueEntry.profile().indexText()) && (!queueEntry.profile().indexMedia())) {
-            if (this.log.isFine()) log.logFine("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': process case=" + processCase);
-            addURLtoErrorDB(queueEntry.url(), referrerURL.hash(), queueEntry.initiator(), dc_title, "unknown indexing process case"  + processCase);
-            return;
-        }
-        
        // increment number of indexed urls
        indexedPages++;
        
--- a/source/de/anomic/yacy/graphics/WebStructureGraph.java
+++ b/source/de/anomic/yacy/graphics/WebStructureGraph.java
@ -131,8 +131,8 @@ public class WebStructureGraph {
        Base64Order.enhancedCoder.encodeLongSmart(document.getImages().size(), 2) + // count of Images in document
        Base64Order.enhancedCoder.encodeLongSmart(0, 2) +       // count of links to other documents
        Base64Order.enhancedCoder.encodeLongSmart(document.getTextLength(), 3) +   // length of plain text in bytes
-        Base64Order.enhancedCoder.encodeLongSmart(condenser.RESULT_NUMB_WORDS, 3) + // count of all appearing words
-        Base64Order.enhancedCoder.encodeLongSmart(condenser.words().size(), 3) + // count of all unique words
+        Base64Order.enhancedCoder.encodeLongSmart((condenser == null) ? 0 : condenser.RESULT_NUMB_WORDS, 3) + // count of all appearing words
+        Base64Order.enhancedCoder.encodeLongSmart((condenser == null) ? 0 : condenser.words().size(), 3) + // count of all unique words
        Base64Order.enhancedCoder.encodeLongSmart(0, 1); // Flags (update, popularity, attention, vote)
        
        //crl.append(head); crl.append ('|'); crl.append(cpl); crl.append((char) 13); crl.append((char) 10);
--- a/source/net/yacy/document/AbstractParser.java
+++ b/source/net/yacy/document/AbstractParser.java
@ -240,4 +240,5 @@ public abstract class AbstractParser implements Idiom {
    public int hashCode() { // hash is derived solely from the name returned by getName()
        return this.getName().hashCode();
    }
+
 }
--- a/source/net/yacy/document/Document.java
+++ b/source/net/yacy/document/Document.java
@ -76,11 +76,13 @@ public class Document {
    private InputStream textStream;
    private int inboundLinks, outboundLinks; // counters for inbound and outbound links, are counted after calling notifyWebStructure
    private Set<String> languages;
+    private boolean indexingDenied;
    
-    protected Document(final DigestURI location, final String mimeType, final String charset, final Set<String> languages,
+    public Document(final DigestURI location, final String mimeType, final String charset, final Set<String> languages,
                    final String[] keywords, final String title, final String author,
                    final String[] sections, final String abstrct,
-                    final Object text, final Map<DigestURI, String> anchors, final HashMap<String, ImageEntry> images) {
+                    final Object text, final Map<DigestURI, String> anchors, final HashMap<String, ImageEntry> images,
+                    boolean indexingDenied) {
        this.source = location;
        this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType;
        this.charset = charset;
@ -100,6 +102,7 @@ public class Document {
        this.inboundLinks = -1;
        this.outboundLinks = -1;
        this.languages = languages;
+        this.indexingDenied = indexingDenied;
        
        if (text == null) try {
            this.text = new CachedFileOutputStream(Idiom.MAX_KEEP_IN_MEMORY_SIZE);
@ -111,31 +114,6 @@ public class Document {
        }
    }
    
-    public Document(final DigestURI location, final String mimeType, final String charset, final Set<String> languages) {
-        this(location, mimeType, charset, languages, null, null, null, null, null, (Object)null, null, null);
-    }
-    
-    public Document(final DigestURI location, final String mimeType, final String charset, final Set<String> languages,
-                    final String[] keywords, final String title, final String author,
-                    final String[] sections, final String abstrct,
-                    final byte[] text, final Map<DigestURI, String> anchors, final HashMap<String, ImageEntry> images) {
-        this(location, mimeType, charset, languages, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
-    }
-    
-    public Document(final DigestURI location, final String mimeType, final String charset, final Set<String> languages,
-            final String[] keywords, final String title, final String author,
-            final String[] sections, final String abstrct,
-            final File text, final Map<DigestURI, String> anchors, final HashMap<String, ImageEntry> images) {
-        this(location, mimeType, charset, languages, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
-    }
-    
-    public Document(final DigestURI location, final String mimeType, final String charset, final Set<String> languages,
-            final String[] keywords, final String title, final String author,
-            final String[] sections, final String abstrct,
-            final CachedFileOutputStream text, final Map<DigestURI, String> anchors, final HashMap<String, ImageEntry> images) {
-        this(location, mimeType, charset, languages, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
-    }
-    
    public void setInboundLinks(int il) {
        this.inboundLinks = il;
    }
@ -560,6 +538,10 @@ dc_rights
        return (this.outboundLinks < 0) ? 0 : this.outboundLinks;
    }
    
+    public boolean indexingDenied() { // accessor: returns the value of this document's indexingDenied field
+        return this.indexingDenied;
+    }
+    
    public void writeXML(OutputStreamWriter os, Date date) throws IOException {
        os.write("<record>\n");
        String title = this.dc_title();
--- a/source/net/yacy/document/content/DCEntry.java
+++ b/source/net/yacy/document/content/DCEntry.java
@ -231,7 +231,8 @@ public class DCEntry extends TreeMap<String, String> {
                "",
                getDescription().getBytes("UTF-8"),
                null,
-                null);
+                null,
+                false);
        } catch (UnsupportedEncodingException e) {
            Log.logException(e);
            return null;
--- a/source/net/yacy/document/parser/csvParser.java
+++ b/source/net/yacy/document/parser/csvParser.java
@ -94,7 +94,8 @@ public class csvParser extends AbstractParser implements Idiom {
                    null,
                    sb.toString().getBytes(charset),
                    null,
-                    null);
+                    null,
+                    false);
        } catch (UnsupportedEncodingException e) {
            throw new ParserException("error in csvParser, getBytes: " + e.getMessage(), location);
        }
--- a/source/net/yacy/document/parser/docParser.java
+++ b/source/net/yacy/document/parser/docParser.java
@ -109,7 +109,8 @@ public class docParser extends AbstractParser implements Idiom {
                      null,
                      contents.toString().getBytes("UTF-8"),
                      null,
-                      null);
+                      null,
+                      false);
        } catch (UnsupportedEncodingException e) {
            throw new ParserException("error in docParser, getBytes: " + e.getMessage(), location);
        }
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@ -357,6 +357,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
    <meta name="DC.type" scheme="DCTERMS.DCMIType" content="Text" />
    */
    
+    /**
+     * Reports whether the scraped page carries a robots meta tag that forbids indexing,
+     * for example {@code <meta name="robots" content="NOINDEX, follow">}.
+     * The value is compared case-insensitively, because robots directives are not
+     * case-sensitive and are commonly written in upper or mixed case.
+     *
+     * @return true if a "robots" meta value is present and contains "noindex"
+     *         in any letter case; false if there is no such value or directive
+     */
+    public boolean indexingDenied() {
+        final String robots = metas.get("robots");
+        if (robots == null) return false;
+        // fixed locale: the default-locale toLowerCase() would turn 'I' into a dotless i under a Turkish locale
+        return robots.toLowerCase(java.util.Locale.ENGLISH).indexOf("noindex") >= 0;
+    }
+    
    public String getDescription() {
        String s = metas.get("description");
        if (s == null) s = metas.get("dc.description");
--- a/source/net/yacy/document/parser/htmlParser.java
+++ b/source/net/yacy/document/parser/htmlParser.java
@ -152,7 +152,8 @@ public class htmlParser extends AbstractParser implements Idiom {
                scraper.getDescription(),
                scraper.getText(),
                scraper.getAnchors(),
-                scraper.getImages());
+                scraper.getImages(),
+                scraper.indexingDenied());
        //scraper.close();            
        ppd.setFavicon(scraper.getFavicon());
        return ppd;
@ -242,4 +243,7 @@ public class htmlParser extends AbstractParser implements Idiom {
        return SUPPORTED_EXTENSIONS;
    }
    
+    public boolean indexingDenied() { // NOTE(review): always false; parsed documents get their flag from scraper.indexingDenied() instead - confirm this method is needed
+        return false;
+    }
 }
--- a/source/net/yacy/document/parser/images/bmpParser.java
+++ b/source/net/yacy/document/parser/images/bmpParser.java
@ -147,7 +147,8 @@ public class bmpParser extends AbstractParser implements Idiom {
             "", // description
             sb.toString().getBytes(), // content text
             anchors, // anchors
-             images); // images
+             images,
+             false); // images
    }

    public static IMAGEMAP parse(final byte[] source) {
--- a/source/net/yacy/document/parser/images/genericImageParser.java
+++ b/source/net/yacy/document/parser/images/genericImageParser.java
@ -135,7 +135,8 @@ public class genericImageParser extends AbstractParser implements Idiom {
             "", // description
             sb.toString().getBytes(), // content text
             anchors, // anchors
-             images); // images
+             images,
+             false); // images
    }
 /*
 * Document(final DigestURI location, final String mimeType, final String charset, final Set<String> languages,
--- a/source/net/yacy/document/parser/odtParser.java
+++ b/source/net/yacy/document/parser/odtParser.java
@ -201,7 +201,8 @@ public class odtParser extends AbstractParser implements Idiom {
                        docDescription,
                        contentBytes,
                        null,
-                        null);
+                        null,
+                        false);
            } else {
                theDoc = new Document(
                        location,
@ -215,7 +216,8 @@ public class odtParser extends AbstractParser implements Idiom {
                        docDescription,
                        writerFile,
                        null,
-                        null);
+                        null,
+                        false);
            }
            return theDoc;
        } catch (final Exception e) {            
--- a/source/net/yacy/document/parser/ooxmlParser.java
+++ b/source/net/yacy/document/parser/ooxmlParser.java
@ -188,7 +188,8 @@ public class ooxmlParser extends AbstractParser implements Idiom {
                        docDescription,
                        contentBytes,
                        null,
-                        null);
+                        null,
+                        false);
            } else {
                theDoc = new Document(
                        location,
@ -202,7 +203,8 @@ public class ooxmlParser extends AbstractParser implements Idiom {
                        docDescription,
                        writerFile,
                        null,
-                        null);
+                        null,
+                        false);
            }
            return theDoc;
        } catch (final Exception e) {            
--- a/source/net/yacy/document/parser/pdfParser.java
+++ b/source/net/yacy/document/parser/pdfParser.java
@ -182,7 +182,8 @@ public class pdfParser extends AbstractParser implements Idiom {
                    null,
                    contentBytes,
                    null,
-                    null);
+                    null,
+                    false);
        } else {
            theDoc = new Document(
                    location,
@ -196,7 +197,8 @@ public class pdfParser extends AbstractParser implements Idiom {
                    null,
                    writerFile,
                    null,
-                    null);                
+                    null,
+                    false);                
        }
        
        return theDoc;
--- a/source/net/yacy/document/parser/pptParser.java
+++ b/source/net/yacy/document/parser/pptParser.java
@ -105,7 +105,8 @@ public class pptParser extends AbstractParser implements Idiom {
                    null,
                    contents.getBytes("UTF-8"),
                    null,
-                    null);
+                    null,
+                    false);
            return theDoc;
        } catch (final Exception e) { 
            if (e instanceof InterruptedException) throw (InterruptedException) e;
--- a/source/net/yacy/document/parser/psParser.java
+++ b/source/net/yacy/document/parser/psParser.java
@ -131,7 +131,8 @@ public class psParser extends AbstractParser implements Idiom {
                    null,
                    outputFile,
                    null,
-                    null);         
+                    null,
+                    false);         
            
            return theDoc;
        } catch (final Exception e) {            
--- a/source/net/yacy/document/parser/rssParser.java
+++ b/source/net/yacy/document/parser/rssParser.java
@ -187,7 +187,8 @@ public class rssParser extends AbstractParser implements Idiom {
                feedDescription,
                text.getBytes(),
                anchors,
-                images);            
+                images,
+                false);            
        // close streams
        try {
            text.close();
--- a/source/net/yacy/document/parser/rtfParser.java
+++ b/source/net/yacy/document/parser/rtfParser.java
@ -89,7 +89,8 @@ public class rtfParser extends AbstractParser implements Idiom {
                    null,
                    bodyText.getBytes("UTF-8"),
                    null,
-                    null);
+                    null,
+                    false);
            
            return theDoc;             
 		}
--- a/source/net/yacy/document/parser/sevenzipParser.java
+++ b/source/net/yacy/document/parser/sevenzipParser.java
@ -71,7 +71,7 @@ public class sevenzipParser extends AbstractParser implements Idiom {
    
    public Document parse(final DigestURI location, final String mimeType, final String charset,
            final IInStream source, final long maxRamSize) throws ParserException, InterruptedException {
-        final Document doc = new Document(location, mimeType, charset, null);
+        final Document doc = new Document(location, mimeType, charset, null, null, null, null, null, null, (Object)null, null, null, false);
        Handler archive;
        super.theLogger.logFine("opening 7zip archive...");
        try {
--- a/source/net/yacy/document/parser/swfParser.java
+++ b/source/net/yacy/document/parser/swfParser.java
@ -135,7 +135,8 @@ public class swfParser extends AbstractParser implements Idiom {
                    abstrct,     // an abstract
                    contents.getBytes("UTF-8"),     // the parsed document text
                    anchors,      // a map of extracted anchors
-                    null);      // a treeset of image URLs
+                    null,
+                    false);      // a treeset of image URLs
            return theDoc;
        } catch (final Exception e) { 
            if (e instanceof InterruptedException) throw (InterruptedException) e;
--- a/source/net/yacy/document/parser/tarParser.java
+++ b/source/net/yacy/document/parser/tarParser.java
@ -131,7 +131,7 @@ public class tarParser extends AbstractParser implements Idiom {
                final String entryName = entry.getName();
                
                // getting the entry file extension
-                final int idx = entryName.lastIndexOf(".");
+                final int idx = entryName.lastIndexOf('.');
                final String entryExt = (idx > -1) ? entryName.substring(idx+1) : "";
                
                // trying to determine the mimeType per file extension   
@ -198,7 +198,8 @@ public class tarParser extends AbstractParser implements Idiom {
                    docAbstrct.toString(),
                    ((ByteBuffer)docText).getBytes(),
                    docAnchors,
-                    docImages);
+                    docImages,
+                    false);
            } else {
                result = new Document(
                        location,
@ -212,7 +213,8 @@ public class tarParser extends AbstractParser implements Idiom {
                        docAbstrct.toString(),
                        outputFile,
                        docAnchors,
-                        docImages);                
+                        docImages,
+                        false);                
            }
            
            return result;
--- a/source/net/yacy/document/parser/torrentParser.java
+++ b/source/net/yacy/document/parser/torrentParser.java
@ -122,7 +122,8 @@ public class torrentParser extends AbstractParser implements Idiom {
                    null,
                    filenames.toString().getBytes(charset),
                    null,
-                    null);
+                    null,
+                    false);
        } catch (UnsupportedEncodingException e) {
            throw new ParserException("error in torrentParser, getBytes: " + e.getMessage(), location);
        }
--- a/source/net/yacy/document/parser/vcfParser.java
+++ b/source/net/yacy/document/parser/vcfParser.java
@ -235,7 +235,8 @@ public class vcfParser extends AbstractParser implements Idiom {
                    "vCard",                    // an abstract
                    text,                       // the parsed document text
                    anchors,                    // a map of extracted anchors
-                    null);                      // a treeset of image URLs
+                    null,                       // a treeset of image URLs
+                    false);                      
            return theDoc;
        } catch (final Exception e) { 
            if (e instanceof InterruptedException) throw (InterruptedException) e;
--- a/source/net/yacy/document/parser/vsdParser.java
+++ b/source/net/yacy/document/parser/vsdParser.java
@ -132,7 +132,8 @@ public class vsdParser extends AbstractParser implements Idiom {
                    abstrct,      // an abstract
                    contents.getBytes("UTF-8"),     // the parsed document text
                    null,         // a map of extracted anchors
-                    null);        // a treeset of image URLs
+                    null,         // a treeset of image URLs
+                    false);
            return theDoc;
        } catch (final Exception e) { 
            if (e instanceof InterruptedException) throw (InterruptedException) e;
--- a/source/net/yacy/document/parser/xlsParser.java
+++ b/source/net/yacy/document/parser/xlsParser.java
@ -150,7 +150,8 @@ public class xlsParser extends AbstractParser implements Idiom {
                        null,
                        contents.getBytes("UTF-8"),
                        null,
-                        null);
+                        null,
+                        false);
                return theDoc;
            } catch (final Exception e) { 
                if (e instanceof InterruptedException) throw (InterruptedException) e;
--- a/source/net/yacy/document/parser/zipParser.java
+++ b/source/net/yacy/document/parser/zipParser.java
@ -118,7 +118,7 @@ public class zipParser extends AbstractParser implements Idiom {
                
                // Get the entry name
                final String entryName = entry.getName();                
-                final int idx = entryName.lastIndexOf(".");
+                final int idx = entryName.lastIndexOf('.');
                
                // getting the file extension
                final String entryExt = (idx > -1) ? entryName.substring(idx+1) : "";
@ -185,7 +185,8 @@ public class zipParser extends AbstractParser implements Idiom {
                    docAbstrct.toString(),
                    ((ByteBuffer)docText).getBytes(),
                    docAnchors,
-                    docImages);
+                    docImages,
+                    false);
            } else {
                result = new Document(
                        location,
@ -199,7 +200,8 @@ public class zipParser extends AbstractParser implements Idiom {
                        docAbstrct.toString(),
                        outputFile,
                        docAnchors,
-                        docImages);                
+                        docImages,
+                        false);                
            }
            
            return result;