- fix for xls parser
- fix for image parser
- temporary integration of images as document types in the crawler and indexer for testing of the image parser

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6435 6c8d7289-2bf4-0310-a012-ef5d649a1542
commit 52470d0de4
parent 5e8038ac4d
author orbiter

@@ -163,7 +163,7 @@ public class IndexControlRWIs_p {
                 index = null;
             }
             if (delurlref) {
-                for (i = 0; i < urlx.length; i++) sb.removeAllUrlReferences(segment, urlx[i], true);
+                for (i = 0; i < urlx.length; i++) segment.removeAllUrlReferences(urlx[i], sb.loader, true);
             }
             if (delurl || delurlref) {
                 for (i = 0; i < urlx.length; i++) {
@@ -180,7 +180,7 @@ public class IndexControlRWIs_p {
             // delete selected URLs
             if (post.containsKey("keyhashdelete")) try {
                 if (delurlref) {
-                    for (i = 0; i < urlx.length; i++) sb.removeAllUrlReferences(segment, urlx[i], true);
+                    for (i = 0; i < urlx.length; i++) segment.removeAllUrlReferences(urlx[i], sb.loader, true);
                 }
                 if (delurl || delurlref) {
                     for (i = 0; i < urlx.length; i++) {

@@ -136,7 +136,7 @@ public class IndexControlURLs_p {
         prop.put("result", " ");
         if (post.containsKey("urlhashdeleteall")) {
-            i = sb.removeAllUrlReferences(segment, urlhash, true);
+            i = segment.removeAllUrlReferences(urlhash, sb.loader, true);
             prop.put("result", "Deleted URL and " + i + " references from " + i + " word indexes.");
             prop.put("lurlexport", 0);
             prop.put("reload", 0);

@@ -670,14 +670,16 @@ public class Response {
         // -ranges in request
         // we checked that in shallStoreCache

-        // a picture cannot be indexed
+        // check if pictures can be indexed
         if (responseHeader != null) {
             final String mimeType = responseHeader.mime();
-            if (Classification.isPictureMime(mimeType)) { return "Media_Content_(Picture)"; }
             String parserError = TextParser.supportsMime(mimeType);
             if (parserError != null) { return "Media_Content, parser error: " + parserError; }
         }
-        if (Classification.isMediaExtension(url().getFileExtension())) { return "Media_Content_(forbidden)"; }
+        if (Classification.isMediaExtension(url().getFileExtension()) &&
+            !Classification.isImageExtension((url().getFileExtension()))) {
+            return "Media_Content_(forbidden)";
+        }

         // -if-modified-since in request
         // if the page is fresh at the very moment we can index it
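
In effect, the indexing gate in Response now behaves like the following sketch. This is a minimal illustration, not the full method; the helper name indexGateReason is hypothetical, while Classification and TextParser calls are the ones from the hunk above.

    // minimal sketch of the new gate, not the full Response logic;
    // indexGateReason is a hypothetical name for illustration
    String indexGateReason(final ResponseHeader responseHeader, final DigestURI url) {
        if (responseHeader != null) {
            // pictures are no longer rejected outright; like all other content,
            // they are checked against the available parsers
            final String parserError = TextParser.supportsMime(responseHeader.mime());
            if (parserError != null) return "Media_Content, parser error: " + parserError;
        }
        // media extensions remain forbidden unless they are image extensions
        final String ext = url.getFileExtension();
        if (Classification.isMediaExtension(ext) && !Classification.isImageExtension(ext)) {
            return "Media_Content_(forbidden)";
        }
        return null; // indexable as far as this check is concerned
    }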

@@ -28,14 +28,18 @@ package de.anomic.search;

 import java.io.File;
 import java.io.IOException;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
 import java.util.Date;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.Map;
+import java.util.Set;
 import java.util.TreeSet;

 import net.yacy.document.Condenser;
 import net.yacy.document.Document;
+import net.yacy.document.ParserException;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.meta.URIMetadataRow;
 import net.yacy.kelondro.data.navigation.NavigationReference;
@@ -53,6 +57,7 @@ import net.yacy.kelondro.rwi.ReferenceContainer;
 import net.yacy.kelondro.rwi.ReferenceFactory;
 import net.yacy.kelondro.util.ISO639;
 import net.yacy.repository.Blacklist;
+import net.yacy.repository.LoaderDispatcher;

 import de.anomic.crawler.retrieval.Response;
@@ -349,6 +354,70 @@ public class Segment {
         return newEntry;
     }

+    // method for index deletion
+    public int removeAllUrlReferences(final DigestURI url, LoaderDispatcher loader, final boolean fetchOnline) {
+        return removeAllUrlReferences(url.hash(), loader, fetchOnline);
+    }
+
+    public int removeAllUrlReferences(final String urlhash, LoaderDispatcher loader, final boolean fetchOnline) {
+        // find all the words in a specific resource and remove the url reference from every word index
+        // finally, delete the url entry
+        if (urlhash == null) return 0;
+        // determine the url string
+        final URIMetadataRow entry = urlMetadata().load(urlhash, null, 0);
+        if (entry == null) return 0;
+        final URIMetadataRow.Components metadata = entry.metadata();
+        if (metadata.url() == null) return 0;
+
+        InputStream resourceContent = null;
+        try {
+            // get the resource content
+            Object[] resource = null;
+            try {
+                resource = loader.getResource(metadata.url(), fetchOnline, 10000, true, false);
+            } catch (IOException e) {
+                Log.logWarning("removeAllUrlReferences", "cannot load: " + e.getMessage());
+            }
+            if (resource == null) {
+                // delete just the url entry
+                urlMetadata().remove(urlhash);
+                return 0;
+            } else {
+                resourceContent = (InputStream) resource[0];
+                final Long resourceContentLength = (Long) resource[1];
+
+                // parse the resource
+                final Document document = LoaderDispatcher.parseDocument(metadata.url(), resourceContentLength.longValue(), resourceContent, null);
+
+                // get the word set
+                Set<String> words = null;
+                try {
+                    words = new Condenser(document, true, true).words().keySet();
+                } catch (final UnsupportedEncodingException e) {
+                    e.printStackTrace();
+                }
+
+                // delete all word references
+                int count = 0;
+                if (words != null) count = termIndex().remove(Word.words2hashes(words), urlhash);
+
+                // finally delete the url entry itself
+                urlMetadata().remove(urlhash);
+                return count;
+            }
+        } catch (final ParserException e) {
+            return 0;
+        } catch (IOException e) {
+            e.printStackTrace();
+            return 0;
+        } finally {
+            if (resourceContent != null) try { resourceContent.close(); } catch (final Exception e) {/* ignore this */}
+        }
+    }
+
     // The Cleaner class was provided as "UrldbCleaner" by Hydrox
     public synchronized ReferenceCleaner getReferenceCleaner(final byte[] startHash) {
         return new ReferenceCleaner(startHash);
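
The servlet hunks above show the call sites after this move from Switchboard into Segment. A minimal usage sketch, assuming a Segment (segment) and a Switchboard-owned LoaderDispatcher (sb.loader) as in those hunks:

    // remove a URL and all its word references from one index segment;
    // returns the number of word references removed, or 0 if the
    // resource could not be loaded or parsed
    final int refs = segment.removeAllUrlReferences(urlhash, sb.loader, true);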

@@ -89,7 +89,6 @@ import java.io.BufferedInputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
-import java.io.InputStream;
 import java.io.UnsupportedEncodingException;
 import java.net.MalformedURLException;
 import java.security.NoSuchAlgorithmException;
@@ -105,7 +104,6 @@ import java.util.List;
 import java.util.Locale;
 import java.util.Map;
 import java.util.Properties;
-import java.util.Set;
 import java.util.TreeMap;
 import java.util.TreeSet;
 import java.util.concurrent.ConcurrentHashMap;
@@ -119,6 +117,7 @@ import net.yacy.document.ParserException;
 import net.yacy.document.content.DCEntry;
 import net.yacy.document.content.RSSMessage;
 import net.yacy.document.content.file.SurrogateReader;
+import net.yacy.document.parser.html.ImageEntry;
 import net.yacy.document.parser.xml.RSSFeed;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.meta.URIMetadataRow;
@@ -1169,7 +1168,6 @@ public final class Switchboard extends serverSwitch {
             noIndexReason = TextParser.supports(response.url(), response.getMimeType());
         }
-
         // check X-YACY-Index-Control
         // With the X-YACY-Index-Control header set to "no-index" a client could disallow
         // yacy to index the response returned as answer to a request
@@ -1583,43 +1581,43 @@ public final class Switchboard extends serverSwitch {
         return new indexingQueueEntry(in.process, in.queueEntry, document, null);
     }

-    private Document parseDocument(Response entry) throws InterruptedException {
+    private Document parseDocument(Response response) throws InterruptedException {
         Document document = null;
-        final EventOrigin processCase = entry.processCase(peers.mySeed().hash);
+        final EventOrigin processCase = response.processCase(peers.mySeed().hash);
         if (this.log.isFine()) log.logFine("processResourceStack processCase=" + processCase +
-                ", depth=" + entry.depth() +
-                ", maxDepth=" + ((entry.profile() == null) ? "null" : Integer.toString(entry.profile().depth())) +
-                ", must-match=" + ((entry.profile() == null) ? "null" : entry.profile().mustMatchPattern().toString()) +
-                ", must-not-match=" + ((entry.profile() == null) ? "null" : entry.profile().mustNotMatchPattern().toString()) +
-                ", initiatorHash=" + entry.initiator() +
+                ", depth=" + response.depth() +
+                ", maxDepth=" + ((response.profile() == null) ? "null" : Integer.toString(response.profile().depth())) +
+                ", must-match=" + ((response.profile() == null) ? "null" : response.profile().mustMatchPattern().toString()) +
+                ", must-not-match=" + ((response.profile() == null) ? "null" : response.profile().mustNotMatchPattern().toString()) +
+                ", initiatorHash=" + response.initiator() +
                 //", responseHeader=" + ((entry.responseHeader() == null) ? "null" : entry.responseHeader().toString()) +
-                ", url=" + entry.url()); // DEBUG
+                ", url=" + response.url()); // DEBUG

         // PARSE CONTENT
         final long parsingStartTime = System.currentTimeMillis();
         byte[] b = null;
         try {
             // fetch the document
-            b = Cache.getContent(entry.url());
+            b = Cache.getContent(response.url());
             if (b == null) {
-                this.log.logWarning("the resource '" + entry.url() + "' is missing in the cache.");
-                addURLtoErrorDB(entry.url(), entry.referrerHash(), entry.initiator(), entry.name(), "missing");
+                this.log.logWarning("the resource '" + response.url() + "' is missing in the cache.");
+                addURLtoErrorDB(response.url(), response.referrerHash(), response.initiator(), response.name(), "missing");
                 return null;
             }
         } catch (IOException e) {
-            this.log.logWarning("Unable fetch the resource '" + entry.url() + "'. from the cache: " + e.getMessage());
-            addURLtoErrorDB(entry.url(), entry.referrerHash(), entry.initiator(), entry.name(), e.getMessage());
+            this.log.logWarning("Unable fetch the resource '" + response.url() + "'. from the cache: " + e.getMessage());
+            addURLtoErrorDB(response.url(), response.referrerHash(), response.initiator(), response.name(), e.getMessage());
             return null;
         }

         try {
             // parse the document
-            document = TextParser.parseSource(entry.url(), entry.getMimeType(), entry.getCharacterEncoding(), b);
+            document = TextParser.parseSource(response.url(), response.getMimeType(), response.getCharacterEncoding(), b);
             assert(document != null) : "Unexpected error. Parser returned null.";
         } catch (final ParserException e) {
-            this.log.logWarning("Unable to parse the resource '" + entry.url() + "'. " + e.getMessage(), e);
-            addURLtoErrorDB(entry.url(), entry.referrerHash(), entry.initiator(), entry.name(), e.getMessage());
+            this.log.logWarning("Unable to parse the resource '" + response.url() + "'. " + e.getMessage(), e);
+            addURLtoErrorDB(response.url(), response.referrerHash(), response.initiator(), response.name(), e.getMessage());
             if (document != null) {
                 document.close();
                 document = null;
@@ -1630,43 +1628,48 @@ public final class Switchboard extends serverSwitch {
         final long parsingEndTime = System.currentTimeMillis();

         // get the document date
-        final Date docDate = entry.lastModified();
+        final Date docDate = response.lastModified();

         // put anchors on crawl stack
         final long stackStartTime = System.currentTimeMillis();
         if (
                 ((processCase == EventOrigin.PROXY_LOAD) || (processCase == EventOrigin.LOCAL_CRAWLING)) &&
-                ((entry.profile() == null) || (entry.depth() < entry.profile().depth()))
+                ((response.profile() == null) || (response.depth() < response.profile().depth()))
            ) {
+            // get the hyperlinks
             final Map<DigestURI, String> hl = document.getHyperlinks();
-            final Iterator<Map.Entry<DigestURI, String>> i = hl.entrySet().iterator();
+
+            // add all images also to the crawl stack
+            for (ImageEntry imageReference : document.getImages().values()) {
+                hl.put(imageReference.url(), imageReference.alt());
+            }
+
+            // insert those hyperlinks to the crawler
             DigestURI nextUrl;
-            Map.Entry<DigestURI, String> nextEntry;
-            while (i.hasNext()) {
+            for (Map.Entry<DigestURI, String> nextEntry : hl.entrySet()) {
                 // check for interruption
                 checkInterruption();
-                // fetching the next hyperlink
-                nextEntry = i.next();
+
+                // process the next hyperlink
                 nextUrl = nextEntry.getKey();
                 String u = nextUrl.toNormalform(true, true);
                 if (!(u.startsWith("http") || u.startsWith("ftp"))) continue;
                 // enqueue the hyperlink into the pre-notice-url db
                 crawlStacker.enqueueEntry(new Request(
-                        entry.initiator(),
+                        response.initiator(),
                         nextUrl,
-                        entry.url().hash(),
+                        response.url().hash(),
                         nextEntry.getValue(),
                         null,
                         docDate,
-                        entry.profile().handle(),
-                        entry.depth() + 1,
+                        response.profile().handle(),
+                        response.depth() + 1,
                         0,
                         0
                         ));
             }
             final long stackEndTime = System.currentTimeMillis();
-            if (log.isInfo()) log.logInfo("CRAWL: ADDED " + hl.size() + " LINKS FROM " + entry.url().toNormalform(false, true) +
+            if (log.isInfo()) log.logInfo("CRAWL: ADDED " + hl.size() + " LINKS FROM " + response.url().toNormalform(false, true) +
                     ", STACKING TIME = " + (stackEndTime-stackStartTime) +
                     ", PARSING TIME = " + (parsingEndTime-parsingStartTime));
         }
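
This hunk is where the "temporary integration of images as document types" from the commit message happens: image references are folded into the hyperlink map before stacking, so they pass through the same enqueue loop as anchors. The merge step in isolation, using the types from the diff:

    // images join the crawl stack like ordinary links: the image URL is
    // the key, the alt text stands in for anchor text; note that put()
    // overwrites the anchor text if the same URL is already linked
    final Map<DigestURI, String> hl = document.getHyperlinks();
    for (final ImageEntry imageReference : document.getImages().values()) {
        hl.put(imageReference.url(), imageReference.alt());
    }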
@@ -1807,68 +1810,6 @@ public final class Switchboard extends serverSwitch {
         }
     }

-    // method for index deletion
-    public int removeAllUrlReferences(Segment indexSegment, final DigestURI url, final boolean fetchOnline) {
-        return removeAllUrlReferences(indexSegment, url.hash(), fetchOnline);
-    }
-
-    public int removeAllUrlReferences(Segment indexSegment, final String urlhash, final boolean fetchOnline) {
-        // find all the words in a specific resource and remove the url reference from every word index
-        // finally, delete the url entry
-        if (urlhash == null) return 0;
-        // determine the url string
-        final URIMetadataRow entry = indexSegment.urlMetadata().load(urlhash, null, 0);
-        if (entry == null) return 0;
-        final URIMetadataRow.Components metadata = entry.metadata();
-        if (metadata.url() == null) return 0;
-
-        InputStream resourceContent = null;
-        try {
-            // get the resource content
-            Object[] resource = null;
-            try {
-                resource = loader.getResource(metadata.url(), fetchOnline, 10000, true, false);
-            } catch (IOException e) {
-                Log.logWarning("removeAllUrlReferences", "cannot load: " + e.getMessage());
-            }
-            if (resource == null) {
-                // delete just the url entry
-                indexSegment.urlMetadata().remove(urlhash);
-                return 0;
-            } else {
-                resourceContent = (InputStream) resource[0];
-                final Long resourceContentLength = (Long) resource[1];
-
-                // parse the resource
-                final Document document = LoaderDispatcher.parseDocument(metadata.url(), resourceContentLength.longValue(), resourceContent, null);
-
-                // get the word set
-                Set<String> words = null;
-                try {
-                    words = new Condenser(document, true, true).words().keySet();
-                } catch (final UnsupportedEncodingException e) {
-                    e.printStackTrace();
-                }
-
-                // delete all word references
-                int count = 0;
-                if (words != null) count = indexSegment.termIndex().remove(Word.words2hashes(words), urlhash);
-
-                // finally delete the url entry itself
-                indexSegment.urlMetadata().remove(urlhash);
-                return count;
-            }
-        } catch (final ParserException e) {
-            return 0;
-        } catch (IOException e) {
-            e.printStackTrace();
-            return 0;
-        } finally {
-            if (resourceContent != null) try { resourceContent.close(); } catch (final Exception e) {/* ignore this */}
-        }
-    }
-
     public int adminAuthenticated(final RequestHeader requestHeader) {

         // authorization for localhost, only if flag is set to grant localhost access as admin

@@ -239,13 +239,13 @@ public final class TextParser {
      * check if the parser supports the given content.
      * @param url
      * @param mimeType
-     * @return returns null if the content is supportet. If the content is not supported, return a error string.
+     * @return returns null if the content is supported. If the content is not supported, return an error string.
      */
     public static String supports(final DigestURI url, String mimeType) {
         try {
             // try to get a parser. If this works, we don't need the parser itself, we just return null to show that everything is ok.
-            idiomParser(url, mimeType);
-            return null;
+            List<Idiom> idioms = idiomParser(url, mimeType);
+            return (idioms == null || idioms.size() == 0) ? "no parser found" : null;
         } catch (ParserException e) {
             // in case that a parser is not available, return an error string describing the problem.
             return e.getMessage();
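
Before this change, supports() returned null whenever idiomParser() did not throw, even if no parser was found; now an empty parser list is reported as an error string. A caller-side sketch of the sharpened contract (the pattern matches the Switchboard hunk above):

    // a non-null reason means "do not index this response"
    final String noIndexReason = TextParser.supports(response.url(), response.getMimeType());
    if (noIndexReason != null) {
        // either "no parser found" or the message of a ParserException
    }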

@@ -327,6 +327,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
         return anchors;
     }

+    /**
+     * get all images
+     * @return a map of <urlhash, ImageEntry>
+     */
     public HashMap<String, ImageEntry> getImages() {
         // this resturns a String(absolute url)/htmlFilterImageEntry - relation
         return images;

@@ -81,25 +81,26 @@ public class genericImageParser extends AbstractParser implements Idiom {
             throw new ParserException(e.getMessage(), location);
         }

+        /*
         // scan the image
         int height = image.getHeight();
         int width = image.getWidth();
         Raster raster = image.getData();
         int[] pixel = raster.getPixel(0, 0, (int[])null);
-        long[] average = new long[]{0, 0, 0};
+        long[] average = new long[pixel.length];
+        for (int i = 0; i < average.length; i++) average[i] = 0L;
         int pc = 0;
         for (int x = width / 4; x < 3 * width / 4; x = x + 2) {
             for (int y = height / 4; y < 3 * height / 4; y = y + 2) {
                 pixel = raster.getPixel(x, y, pixel);
-                average[0] += pixel[0];
-                average[1] += pixel[1];
-                average[2] += pixel[2];
+                for (int i = 0; i < average.length; i++) average[i] += pixel[i];
                 pc++;
             }
         }
+        */

         // get image properties
         String [] propNames = image.getPropertyNames();
+        if (propNames == null) propNames = new String[0];
         StringBuilder sb = new StringBuilder(propNames.length * 80);
         for (String propName: propNames) {
             sb.append(propName).append(" = ").append(image.getProperty(propName)).append(" .\n");
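
The image-parser fix replaces the hard-coded three RGB accumulators with pixel.length, so grayscale or palette images with fewer raster bands no longer overrun the array; the whole scan is then commented out for now. A standalone sketch of the band-generic averaging, assuming a loaded BufferedImage named image:

    // channel-count-agnostic averaging over the center of the image
    final int width = image.getWidth();
    final int height = image.getHeight();
    final Raster raster = image.getData();
    int[] pixel = raster.getPixel(0, 0, (int[]) null); // length == number of bands
    final long[] average = new long[pixel.length];     // was: new long[]{0, 0, 0}
    int pc = 0;
    for (int x = width / 4; x < 3 * width / 4; x += 2) {
        for (int y = height / 4; y < 3 * height / 4; y += 2) {
            pixel = raster.getPixel(x, y, pixel);
            for (int i = 0; i < pixel.length; i++) average[i] += pixel[i];
            pc++;
        }
    }
    // average[i] / pc gives the mean value of band i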

@@ -40,20 +40,13 @@ import net.yacy.kelondro.data.meta.DigestURI;
 import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
 import org.apache.poi.hssf.eventusermodel.HSSFListener;
 import org.apache.poi.hssf.eventusermodel.HSSFRequest;
-import org.apache.poi.hssf.record.LabelSSTRecord;
 import org.apache.poi.hssf.record.NumberRecord;
 import org.apache.poi.hssf.record.Record;
 import org.apache.poi.hssf.record.SSTRecord;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;

-public class xlsParser extends AbstractParser implements Idiom, HSSFListener {
+public class xlsParser extends AbstractParser implements Idiom {

-    //StringBuilder for parsed text
-    private StringBuilder sbFoundStrings = null;
-
-    //sstrecord needed for event parsing
-    private SSTRecord sstrec;

     /**
      * a list of mime types that are supported by this parser class
@@ -82,12 +75,45 @@ public class xlsParser extends AbstractParser implements Idiom, HSSFListener {
      * parses the source documents and returns a plasmaParserDocument containing
      * all extracted information about the parsed document
      */
+    public Document parse(final DigestURI location, final String mimeType,
+            final String charset, final InputStream source) throws ParserException,
+            InterruptedException {
+        return new XLSHSSFListener().parse(location, mimeType, charset, source);
+    }
+
+    public Set<String> supportedMimeTypes() {
+        return SUPPORTED_MIME_TYPES;
+    }
+
+    public Set<String> supportedExtensions() {
+        return SUPPORTED_EXTENSIONS;
+    }
+
+    @Override
+    public void reset(){
+        //nothing to do
+        super.reset();
+    }
+
+    public class XLSHSSFListener implements HSSFListener {
+
+        //StringBuilder for parsed text
+        private final StringBuilder sbFoundStrings;
+
+        public XLSHSSFListener() {
+            this.sbFoundStrings = new StringBuilder(100);
+        }
+
+        /*
+         * parses the source documents and returns a Document containing
+         * all extracted information about the parsed document
+         */
         public Document parse(final DigestURI location, final String mimeType,
                 final String charset, final InputStream source) throws ParserException,
                 InterruptedException {
             try {
-                //generate new StringBuilder for parsing
-                sbFoundStrings = new StringBuilder();
-
                 //create a new org.apache.poi.poifs.filesystem.Filesystem
                 final POIFSFileSystem poifs = new POIFSFileSystem(source);
@@ -133,28 +159,13 @@ public class xlsParser extends AbstractParser implements Idiom, HSSFListener {
                 */
                 e.printStackTrace();
                 final String errorMsg = "Unable to parse the xls document '" + location + "':" + e.getMessage();
-                this.theLogger.logSevere(errorMsg);
+                theLogger.logSevere(errorMsg);
                 throw new ParserException(errorMsg, location);
-            } finally {
-                sbFoundStrings = null;
             }
         }

-    public Set<String> supportedMimeTypes() {
-        return SUPPORTED_MIME_TYPES;
-    }
-
-    public Set<String> supportedExtensions() {
-        return SUPPORTED_EXTENSIONS;
-    }
-
-    @Override
-    public void reset(){
-        //nothing to do
-        super.reset();
-    }
-
         public void processRecord(final Record record) {
+            SSTRecord sstrec = null;
             switch (record.getSid()){
                 case NumberRecord.sid: {
                     final NumberRecord numrec = (NumberRecord) record;
@@ -172,15 +183,18 @@ public class xlsParser extends AbstractParser implements Idiom, HSSFListener {
                     }
                     break;
                 }
+                /*
                 case LabelSSTRecord.sid: {
                     final LabelSSTRecord lsrec = (LabelSSTRecord)record;
                     sbFoundStrings.append( sstrec.getString(lsrec.getSSTIndex()) );
                     break;
                 }
+                */
             }

             //add line seperator
             sbFoundStrings.append( "\n" );
         }
     }
+}
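
The structural fix here: xlsParser itself used to implement HSSFListener and kept sbFoundStrings and sstrec as instance fields, so concurrent parses shared mutable state. Moving the listener into a per-call inner class gives every parse its own buffer. A sketch of the POI event flow such a listener plugs into (imports as in the diff; the "Workbook" stream name and the surrounding details are assumptions, not taken from this commit):

    // each parse() drives the event API with a fresh listener instance
    final POIFSFileSystem poifs = new POIFSFileSystem(source);
    final InputStream din = poifs.createDocumentInputStream("Workbook");
    final HSSFRequest req = new HSSFRequest();
    req.addListenerForAllRecords(new XLSHSSFListener()); // fresh per-parse state
    new HSSFEventFactory().processEvents(req, din);
    din.close();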
