@@ -89,7 +89,6 @@ import java.io.BufferedInputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.io.UnsupportedEncodingException;
 import java.net.MalformedURLException;
 import java.security.NoSuchAlgorithmException;
@@ -105,7 +104,6 @@ import java.util.List;
 import java.util.Locale;
 import java.util.Map;
 import java.util.Properties;
-import java.util.Set;
 import java.util.TreeMap;
 import java.util.TreeSet;
 import java.util.concurrent.ConcurrentHashMap;
@@ -119,6 +117,7 @@ import net.yacy.document.ParserException;
 import net.yacy.document.content.DCEntry;
 import net.yacy.document.content.RSSMessage;
 import net.yacy.document.content.file.SurrogateReader;
+import net.yacy.document.parser.html.ImageEntry;
 import net.yacy.document.parser.xml.RSSFeed;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.meta.URIMetadataRow;
@@ -1169,7 +1168,6 @@ public final class Switchboard extends serverSwitch {
             noIndexReason = TextParser.supports(response.url(), response.getMimeType());
         }
 
         // check X-YACY-Index-Control
         // With the X-YACY-Index-Control header set to "no-index" a client could disallow
         // yacy to index the response returned as answer to a request
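
Note: the X-YACY-Index-Control comment above describes a client-side opt-out from indexing. A minimal sketch of such a gate, assuming `responseHeader()` exposes the raw headers as a string map (the accessor name is taken from the debug comment in a later hunk; the actual check in Switchboard may differ):

    // hypothetical sketch, not the code under review: honor a client's
    // "X-YACY-Index-Control: no-index" request by recording a no-index reason
    final String indexControl = (response.responseHeader() == null)
            ? null : response.responseHeader().get("X-YACY-Index-Control");
    if ("no-index".equalsIgnoreCase(indexControl)) {
        noIndexReason = "client requested no-index via X-YACY-Index-Control";
    }
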
@@ -1583,43 +1581,43 @@ public final class Switchboard extends serverSwitch {
         return new indexingQueueEntry(in.process, in.queueEntry, document, null);
     }
 
-    private Document parseDocument(Response entry) throws InterruptedException {
+    private Document parseDocument(Response response) throws InterruptedException {
         Document document = null;
-        final EventOrigin processCase = entry.processCase(peers.mySeed().hash);
+        final EventOrigin processCase = response.processCase(peers.mySeed().hash);
         if (this.log.isFine()) log.logFine("processResourceStack processCase=" + processCase +
-                ", depth=" + entry.depth() +
-                ", maxDepth=" + ((entry.profile() == null) ? "null" : Integer.toString(entry.profile().depth())) +
-                ", must-match=" + ((entry.profile() == null) ? "null" : entry.profile().mustMatchPattern().toString()) +
-                ", must-not-match=" + ((entry.profile() == null) ? "null" : entry.profile().mustNotMatchPattern().toString()) +
-                ", initiatorHash=" + entry.initiator() +
+                ", depth=" + response.depth() +
+                ", maxDepth=" + ((response.profile() == null) ? "null" : Integer.toString(response.profile().depth())) +
+                ", must-match=" + ((response.profile() == null) ? "null" : response.profile().mustMatchPattern().toString()) +
+                ", must-not-match=" + ((response.profile() == null) ? "null" : response.profile().mustNotMatchPattern().toString()) +
+                ", initiatorHash=" + response.initiator() +
                 //", responseHeader=" + ((entry.responseHeader() == null) ? "null" : entry.responseHeader().toString()) +
-                ", url=" + entry.url()); // DEBUG
+                ", url=" + response.url()); // DEBUG
 
         // PARSE CONTENT
         final long parsingStartTime = System.currentTimeMillis();
         byte[] b = null;
         try {
             // fetch the document
-            b = Cache.getContent(entry.url());
+            b = Cache.getContent(response.url());
             if (b == null) {
-                this.log.logWarning("the resource '" + entry.url() + "' is missing in the cache.");
-                addURLtoErrorDB(entry.url(), entry.referrerHash(), entry.initiator(), entry.name(), "missing");
+                this.log.logWarning("the resource '" + response.url() + "' is missing in the cache.");
+                addURLtoErrorDB(response.url(), response.referrerHash(), response.initiator(), response.name(), "missing");
                 return null;
             }
         } catch (IOException e) {
-            this.log.logWarning("Unable fetch the resource '" + entry.url() + "'. from the cache: " + e.getMessage());
-            addURLtoErrorDB(entry.url(), entry.referrerHash(), entry.initiator(), entry.name(), e.getMessage());
+            this.log.logWarning("Unable fetch the resource '" + response.url() + "'. from the cache: " + e.getMessage());
+            addURLtoErrorDB(response.url(), response.referrerHash(), response.initiator(), response.name(), e.getMessage());
             return null;
         }
 
         try {
             // parse the document
-            document = TextParser.parseSource(entry.url(), entry.getMimeType(), entry.getCharacterEncoding(), b);
+            document = TextParser.parseSource(response.url(), response.getMimeType(), response.getCharacterEncoding(), b);
             assert(document != null) : "Unexpected error. Parser returned null.";
         } catch (final ParserException e) {
-            this.log.logWarning("Unable to parse the resource '" + entry.url() + "'. " + e.getMessage(), e);
-            addURLtoErrorDB(entry.url(), entry.referrerHash(), entry.initiator(), entry.name(), e.getMessage());
+            this.log.logWarning("Unable to parse the resource '" + response.url() + "'. " + e.getMessage(), e);
+            addURLtoErrorDB(response.url(), response.referrerHash(), response.initiator(), response.name(), e.getMessage());
             if (document != null) {
                 document.close();
                 document = null;
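
The hunk above is a mechanical rename of the parameter `entry` to `response`; control flow is unchanged. Stripped of logging and error-DB bookkeeping, the path reduces to this (condensed from the lines above, not a drop-in replacement):

    // fetch the cached body, then hand it to the text parser
    final byte[] b = Cache.getContent(response.url());   // null if not in the cache
    final Document document = TextParser.parseSource(
            response.url(),
            response.getMimeType(),
            response.getCharacterEncoding(),
            b);                                          // throws ParserException on failure
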
@@ -1630,43 +1628,48 @@ public final class Switchboard extends serverSwitch {
         final long parsingEndTime = System.currentTimeMillis();
 
         // get the document date
-        final Date docDate = entry.lastModified();
+        final Date docDate = response.lastModified();
 
         // put anchors on crawl stack
         final long stackStartTime = System.currentTimeMillis();
         if (
                 ((processCase == EventOrigin.PROXY_LOAD) || (processCase == EventOrigin.LOCAL_CRAWLING)) &&
-                ((entry.profile() == null) || (entry.depth() < entry.profile().depth()))
+                ((response.profile() == null) || (response.depth() < response.profile().depth()))
            ) {
             // get the hyperlinks
             final Map<DigestURI, String> hl = document.getHyperlinks();
-            final Iterator<Map.Entry<DigestURI, String>> i = hl.entrySet().iterator();
+
+            // add all images also to the crawl stack
+            for (ImageEntry imageReference : document.getImages().values()) {
+                hl.put(imageReference.url(), imageReference.alt());
+            }
 
             // insert those hyperlinks to the crawler
             DigestURI nextUrl;
-            Map.Entry<DigestURI, String> nextEntry;
-            while (i.hasNext()) {
+            for (Map.Entry<DigestURI, String> nextEntry : hl.entrySet()) {
                 // check for interruption
                 checkInterruption();
 
-                // fetching the next hyperlink
-                nextEntry = i.next();
-
                 // process the next hyperlink
                 nextUrl = nextEntry.getKey();
                 String u = nextUrl.toNormalform(true, true);
                 if (!(u.startsWith("http") || u.startsWith("ftp"))) continue;
 
                 // enqueue the hyperlink into the pre-notice-url db
                 crawlStacker.enqueueEntry(new Request(
-                        entry.initiator(),
+                        response.initiator(),
                         nextUrl,
-                        entry.url().hash(),
+                        response.url().hash(),
                         nextEntry.getValue(),
                         null,
                         docDate,
-                        entry.profile().handle(),
-                        entry.depth() + 1,
+                        response.profile().handle(),
+                        response.depth() + 1,
                         0,
                         0
                         ));
             }
             final long stackEndTime = System.currentTimeMillis();
-            if (log.isInfo()) log.logInfo("CRAWL: ADDED " + hl.size() + " LINKS FROM " + entry.url().toNormalform(false, true) +
+            if (log.isInfo()) log.logInfo("CRAWL: ADDED " + hl.size() + " LINKS FROM " + response.url().toNormalform(false, true) +
                     ", STACKING TIME = " + (stackEndTime - stackStartTime) +
                     ", PARSING TIME = " + (parsingEndTime - parsingStartTime));
         }
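
Two functional changes sit in this hunk besides the rename: image references are merged into the hyperlink map before stacking, so images are enqueued for crawling like anchors (with the alt text serving as the link name), and the explicit Iterator is replaced by an enhanced for loop. Note that `hl.size()` in the log line now counts images as well as hyperlinks. The merge step in isolation:

    // images join the crawl stack alongside hyperlinks; alt text becomes the link name
    final Map<DigestURI, String> hl = document.getHyperlinks();
    for (final ImageEntry imageReference : document.getImages().values()) {
        hl.put(imageReference.url(), imageReference.alt());
    }
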
@@ -1807,68 +1810,6 @@ public final class Switchboard extends serverSwitch {
         }
     }
 
-    // method for index deletion
-    public int removeAllUrlReferences(Segment indexSegment, final DigestURI url, final boolean fetchOnline) {
-        return removeAllUrlReferences(indexSegment, url.hash(), fetchOnline);
-    }
-
-    public int removeAllUrlReferences(Segment indexSegment, final String urlhash, final boolean fetchOnline) {
-        // find all the words in a specific resource and remove the url reference from every word index
-        // finally, delete the url entry
-        if (urlhash == null) return 0;
-
-        // determine the url string
-        final URIMetadataRow entry = indexSegment.urlMetadata().load(urlhash, null, 0);
-        if (entry == null) return 0;
-        final URIMetadataRow.Components metadata = entry.metadata();
-        if (metadata.url() == null) return 0;
-
-        InputStream resourceContent = null;
-        try {
-            // get the resource content
-            Object[] resource = null;
-            try {
-                resource = loader.getResource(metadata.url(), fetchOnline, 10000, true, false);
-            } catch (IOException e) {
-                Log.logWarning("removeAllUrlReferences", "cannot load: " + e.getMessage());
-            }
-            if (resource == null) {
-                // delete just the url entry
-                indexSegment.urlMetadata().remove(urlhash);
-                return 0;
-            } else {
-                resourceContent = (InputStream) resource[0];
-                final Long resourceContentLength = (Long) resource[1];
-
-                // parse the resource
-                final Document document = LoaderDispatcher.parseDocument(metadata.url(), resourceContentLength.longValue(), resourceContent, null);
-
-                // get the word set
-                Set<String> words = null;
-                try {
-                    words = new Condenser(document, true, true).words().keySet();
-                } catch (final UnsupportedEncodingException e) {
-                    e.printStackTrace();
-                }
-
-                // delete all word references
-                int count = 0;
-                if (words != null) count = indexSegment.termIndex().remove(Word.words2hashes(words), urlhash);
-
-                // finally delete the url entry itself
-                indexSegment.urlMetadata().remove(urlhash);
-                return count;
-            }
-        } catch (final ParserException e) {
-            return 0;
-        } catch (IOException e) {
-            e.printStackTrace();
-            return 0;
-        } finally {
-            if (resourceContent != null) try { resourceContent.close(); } catch (final Exception e) { /* ignore this */ }
-        }
-    }
-
     public int adminAuthenticated(final RequestHeader requestHeader) {
         // authorization for localhost, only if flag is set to grant localhost access as admin
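
The deleted `removeAllUrlReferences` helper re-loaded a resource, re-parsed it, and removed the URL reference from every word posting before deleting the URL row itself. Condensed from the removed lines, its core was:

    // re-derive the document's word set, then unlink this URL from each word's index
    final Set<String> words = new Condenser(document, true, true).words().keySet();
    final int count = indexSegment.termIndex().remove(Word.words2hashes(words), urlhash);
    indexSegment.urlMetadata().remove(urlhash);          // finally drop the URL entry itself
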