From 52470d0de4cd7ab95e33846a10e84b0d8142f6a4 Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 22 Oct 2009 22:38:04 +0000 Subject: [PATCH] - fix for xls parser - fix for image parser - temporary integration of images as document types in the crawler and indexer for testing of the image parser git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6435 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/IndexControlRWIs_p.java | 4 +- htroot/IndexControlURLs_p.java | 2 +- .../de/anomic/crawler/retrieval/Response.java | 8 +- source/de/anomic/search/Segment.java | 69 +++++++ source/de/anomic/search/Switchboard.java | 127 ++++-------- source/net/yacy/document/TextParser.java | 6 +- .../document/parser/html/ContentScraper.java | 4 + .../parser/images/genericImageParser.java | 11 +- .../net/yacy/document/parser/xlsParser.java | 182 ++++++++++-------- 9 files changed, 222 insertions(+), 191 deletions(-) diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java index 22ad4c7ac..215d0433b 100644 --- a/htroot/IndexControlRWIs_p.java +++ b/htroot/IndexControlRWIs_p.java @@ -163,7 +163,7 @@ public class IndexControlRWIs_p { index = null; } if (delurlref) { - for (i = 0; i < urlx.length; i++) sb.removeAllUrlReferences(segment, urlx[i], true); + for (i = 0; i < urlx.length; i++) segment.removeAllUrlReferences(urlx[i], sb.loader, true); } if (delurl || delurlref) { for (i = 0; i < urlx.length; i++) { @@ -180,7 +180,7 @@ public class IndexControlRWIs_p { // delete selected URLs if (post.containsKey("keyhashdelete")) try { if (delurlref) { - for (i = 0; i < urlx.length; i++) sb.removeAllUrlReferences(segment, urlx[i], true); + for (i = 0; i < urlx.length; i++) segment.removeAllUrlReferences(urlx[i], sb.loader, true); } if (delurl || delurlref) { for (i = 0; i < urlx.length; i++) { diff --git a/htroot/IndexControlURLs_p.java b/htroot/IndexControlURLs_p.java index 72b871e58..58e884c85 100644 --- a/htroot/IndexControlURLs_p.java +++ b/htroot/IndexControlURLs_p.java @@ -136,7 +136,7 @@ public class IndexControlURLs_p { prop.put("result", " "); if (post.containsKey("urlhashdeleteall")) { - i = sb.removeAllUrlReferences(segment, urlhash, true); + i = segment.removeAllUrlReferences(urlhash, sb.loader, true); prop.put("result", "Deleted URL and " + i + " references from " + i + " word indexes."); prop.put("lurlexport", 0); prop.put("reload", 0); diff --git a/source/de/anomic/crawler/retrieval/Response.java b/source/de/anomic/crawler/retrieval/Response.java index e5162aa8a..bde4ea013 100755 --- a/source/de/anomic/crawler/retrieval/Response.java +++ b/source/de/anomic/crawler/retrieval/Response.java @@ -670,14 +670,16 @@ public class Response { // -ranges in request // we checked that in shallStoreCache - // a picture cannot be indexed + // check if pictures can be indexed if (responseHeader != null) { final String mimeType = responseHeader.mime(); - if (Classification.isPictureMime(mimeType)) { return "Media_Content_(Picture)"; } String parserError = TextParser.supportsMime(mimeType); if (parserError != null) { return "Media_Content, parser error: " + parserError; } } - if (Classification.isMediaExtension(url().getFileExtension())) { return "Media_Content_(forbidden)"; } + if (Classification.isMediaExtension(url().getFileExtension()) && + !Classification.isImageExtension((url().getFileExtension()))) { + return "Media_Content_(forbidden)"; + } // -if-modified-since in request // if the page is fresh at the very moment we can index it diff --git a/source/de/anomic/search/Segment.java 
b/source/de/anomic/search/Segment.java index a340bb0b6..e967bc727 100644 --- a/source/de/anomic/search/Segment.java +++ b/source/de/anomic/search/Segment.java @@ -28,14 +28,18 @@ package de.anomic.search; import java.io.File; import java.io.IOException; +import java.io.InputStream; +import java.io.UnsupportedEncodingException; import java.util.Date; import java.util.HashSet; import java.util.Iterator; import java.util.Map; +import java.util.Set; import java.util.TreeSet; import net.yacy.document.Condenser; import net.yacy.document.Document; +import net.yacy.document.ParserException; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.navigation.NavigationReference; @@ -53,6 +57,7 @@ import net.yacy.kelondro.rwi.ReferenceContainer; import net.yacy.kelondro.rwi.ReferenceFactory; import net.yacy.kelondro.util.ISO639; import net.yacy.repository.Blacklist; +import net.yacy.repository.LoaderDispatcher; import de.anomic.crawler.retrieval.Response; @@ -349,6 +354,70 @@ public class Segment { return newEntry; } + + // method for index deletion + public int removeAllUrlReferences(final DigestURI url, LoaderDispatcher loader, final boolean fetchOnline) { + return removeAllUrlReferences(url.hash(), loader, fetchOnline); + } + + public int removeAllUrlReferences(final String urlhash, LoaderDispatcher loader, final boolean fetchOnline) { + // find all the words in a specific resource and remove the url reference from every word index + // finally, delete the url entry + + if (urlhash == null) return 0; + // determine the url string + final URIMetadataRow entry = urlMetadata().load(urlhash, null, 0); + if (entry == null) return 0; + final URIMetadataRow.Components metadata = entry.metadata(); + if (metadata.url() == null) return 0; + + InputStream resourceContent = null; + try { + // get the resource content + Object[] resource = null; + try { + resource = loader.getResource(metadata.url(), fetchOnline, 10000, true, false); + } catch (IOException e) { + Log.logWarning("removeAllUrlReferences", "cannot load: " + e.getMessage()); + } + if (resource == null) { + // delete just the url entry + urlMetadata().remove(urlhash); + return 0; + } else { + resourceContent = (InputStream) resource[0]; + final Long resourceContentLength = (Long) resource[1]; + + // parse the resource + final Document document = LoaderDispatcher.parseDocument(metadata.url(), resourceContentLength.longValue(), resourceContent, null); + + // get the word set + Set words = null; + try { + words = new Condenser(document, true, true).words().keySet(); + } catch (final UnsupportedEncodingException e) { + e.printStackTrace(); + } + + // delete all word references + int count = 0; + if (words != null) count = termIndex().remove(Word.words2hashes(words), urlhash); + + // finally delete the url entry itself + urlMetadata().remove(urlhash); + return count; + } + } catch (final ParserException e) { + return 0; + } catch (IOException e) { + e.printStackTrace(); + return 0; + } finally { + if (resourceContent != null) try { resourceContent.close(); } catch (final Exception e) {/* ignore this */} + } + } + + // The Cleaner class was provided as "UrldbCleaner" by Hydrox public synchronized ReferenceCleaner getReferenceCleaner(final byte[] startHash) { return new ReferenceCleaner(startHash); diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index d3b9762c3..eff2b66a4 100644 --- a/source/de/anomic/search/Switchboard.java +++ 
b/source/de/anomic/search/Switchboard.java @@ -89,7 +89,6 @@ import java.io.BufferedInputStream; import java.io.File; import java.io.FileInputStream; import java.io.IOException; -import java.io.InputStream; import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.security.NoSuchAlgorithmException; @@ -105,7 +104,6 @@ import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Properties; -import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; import java.util.concurrent.ConcurrentHashMap; @@ -119,6 +117,7 @@ import net.yacy.document.ParserException; import net.yacy.document.content.DCEntry; import net.yacy.document.content.RSSMessage; import net.yacy.document.content.file.SurrogateReader; +import net.yacy.document.parser.html.ImageEntry; import net.yacy.document.parser.xml.RSSFeed; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; @@ -1169,7 +1168,6 @@ public final class Switchboard extends serverSwitch { noIndexReason = TextParser.supports(response.url(), response.getMimeType()); } - // check X-YACY-Index-Control // With the X-YACY-Index-Control header set to "no-index" a client could disallow // yacy to index the response returned as answer to a request @@ -1583,43 +1581,43 @@ public final class Switchboard extends serverSwitch { return new indexingQueueEntry(in.process, in.queueEntry, document, null); } - private Document parseDocument(Response entry) throws InterruptedException { + private Document parseDocument(Response response) throws InterruptedException { Document document = null; - final EventOrigin processCase = entry.processCase(peers.mySeed().hash); + final EventOrigin processCase = response.processCase(peers.mySeed().hash); if (this.log.isFine()) log.logFine("processResourceStack processCase=" + processCase + - ", depth=" + entry.depth() + - ", maxDepth=" + ((entry.profile() == null) ? "null" : Integer.toString(entry.profile().depth())) + - ", must-match=" + ((entry.profile() == null) ? "null" : entry.profile().mustMatchPattern().toString()) + - ", must-not-match=" + ((entry.profile() == null) ? "null" : entry.profile().mustNotMatchPattern().toString()) + - ", initiatorHash=" + entry.initiator() + + ", depth=" + response.depth() + + ", maxDepth=" + ((response.profile() == null) ? "null" : Integer.toString(response.profile().depth())) + + ", must-match=" + ((response.profile() == null) ? "null" : response.profile().mustMatchPattern().toString()) + + ", must-not-match=" + ((response.profile() == null) ? "null" : response.profile().mustNotMatchPattern().toString()) + + ", initiatorHash=" + response.initiator() + //", responseHeader=" + ((entry.responseHeader() == null) ? 
"null" : entry.responseHeader().toString()) + - ", url=" + entry.url()); // DEBUG + ", url=" + response.url()); // DEBUG // PARSE CONTENT final long parsingStartTime = System.currentTimeMillis(); byte[] b = null; try { // fetch the document - b = Cache.getContent(entry.url()); + b = Cache.getContent(response.url()); if (b == null) { - this.log.logWarning("the resource '" + entry.url() + "' is missing in the cache."); - addURLtoErrorDB(entry.url(), entry.referrerHash(), entry.initiator(), entry.name(), "missing"); + this.log.logWarning("the resource '" + response.url() + "' is missing in the cache."); + addURLtoErrorDB(response.url(), response.referrerHash(), response.initiator(), response.name(), "missing"); return null; } } catch (IOException e) { - this.log.logWarning("Unable fetch the resource '" + entry.url() + "'. from the cache: " + e.getMessage()); - addURLtoErrorDB(entry.url(), entry.referrerHash(), entry.initiator(), entry.name(), e.getMessage()); + this.log.logWarning("Unable fetch the resource '" + response.url() + "'. from the cache: " + e.getMessage()); + addURLtoErrorDB(response.url(), response.referrerHash(), response.initiator(), response.name(), e.getMessage()); return null; } try { // parse the document - document = TextParser.parseSource(entry.url(), entry.getMimeType(), entry.getCharacterEncoding(), b); + document = TextParser.parseSource(response.url(), response.getMimeType(), response.getCharacterEncoding(), b); assert(document != null) : "Unexpected error. Parser returned null."; } catch (final ParserException e) { - this.log.logWarning("Unable to parse the resource '" + entry.url() + "'. " + e.getMessage(), e); - addURLtoErrorDB(entry.url(), entry.referrerHash(), entry.initiator(), entry.name(), e.getMessage()); + this.log.logWarning("Unable to parse the resource '" + response.url() + "'. 
" + e.getMessage(), e); + addURLtoErrorDB(response.url(), response.referrerHash(), response.initiator(), response.name(), e.getMessage()); if (document != null) { document.close(); document = null; @@ -1630,43 +1628,48 @@ public final class Switchboard extends serverSwitch { final long parsingEndTime = System.currentTimeMillis(); // get the document date - final Date docDate = entry.lastModified(); + final Date docDate = response.lastModified(); // put anchors on crawl stack final long stackStartTime = System.currentTimeMillis(); if ( ((processCase == EventOrigin.PROXY_LOAD) || (processCase == EventOrigin.LOCAL_CRAWLING)) && - ((entry.profile() == null) || (entry.depth() < entry.profile().depth())) + ((response.profile() == null) || (response.depth() < response.profile().depth())) ) { + // get the hyperlinks final Map hl = document.getHyperlinks(); - final Iterator> i = hl.entrySet().iterator(); + + // add all images also to the crawl stack + for (ImageEntry imageReference : document.getImages().values()) { + hl.put(imageReference.url(), imageReference.alt()); + } + + // insert those hyperlinks to the crawler DigestURI nextUrl; - Map.Entry nextEntry; - while (i.hasNext()) { + for (Map.Entry nextEntry : hl.entrySet()) { // check for interruption checkInterruption(); - // fetching the next hyperlink - nextEntry = i.next(); + // process the next hyperlink nextUrl = nextEntry.getKey(); String u = nextUrl.toNormalform(true, true); if (!(u.startsWith("http") || u.startsWith("ftp"))) continue; // enqueue the hyperlink into the pre-notice-url db crawlStacker.enqueueEntry(new Request( - entry.initiator(), + response.initiator(), nextUrl, - entry.url().hash(), + response.url().hash(), nextEntry.getValue(), null, docDate, - entry.profile().handle(), - entry.depth() + 1, + response.profile().handle(), + response.depth() + 1, 0, 0 )); } final long stackEndTime = System.currentTimeMillis(); - if (log.isInfo()) log.logInfo("CRAWL: ADDED " + hl.size() + " LINKS FROM " + entry.url().toNormalform(false, true) + + if (log.isInfo()) log.logInfo("CRAWL: ADDED " + hl.size() + " LINKS FROM " + response.url().toNormalform(false, true) + ", STACKING TIME = " + (stackEndTime-stackStartTime) + ", PARSING TIME = " + (parsingEndTime-parsingStartTime)); } @@ -1807,68 +1810,6 @@ public final class Switchboard extends serverSwitch { } } - // method for index deletion - public int removeAllUrlReferences(Segment indexSegment, final DigestURI url, final boolean fetchOnline) { - return removeAllUrlReferences(indexSegment, url.hash(), fetchOnline); - } - - public int removeAllUrlReferences(Segment indexSegment, final String urlhash, final boolean fetchOnline) { - // find all the words in a specific resource and remove the url reference from every word index - // finally, delete the url entry - - if (urlhash == null) return 0; - // determine the url string - final URIMetadataRow entry = indexSegment.urlMetadata().load(urlhash, null, 0); - if (entry == null) return 0; - final URIMetadataRow.Components metadata = entry.metadata(); - if (metadata.url() == null) return 0; - - InputStream resourceContent = null; - try { - // get the resource content - Object[] resource = null; - try { - resource = loader.getResource(metadata.url(), fetchOnline, 10000, true, false); - } catch (IOException e) { - Log.logWarning("removeAllUrlReferences", "cannot load: " + e.getMessage()); - } - if (resource == null) { - // delete just the url entry - indexSegment.urlMetadata().remove(urlhash); - return 0; - } else { - resourceContent = (InputStream) 
resource[0]; - final Long resourceContentLength = (Long) resource[1]; - - // parse the resource - final Document document = LoaderDispatcher.parseDocument(metadata.url(), resourceContentLength.longValue(), resourceContent, null); - - // get the word set - Set words = null; - try { - words = new Condenser(document, true, true).words().keySet(); - } catch (final UnsupportedEncodingException e) { - e.printStackTrace(); - } - - // delete all word references - int count = 0; - if (words != null) count = indexSegment.termIndex().remove(Word.words2hashes(words), urlhash); - - // finally delete the url entry itself - indexSegment.urlMetadata().remove(urlhash); - return count; - } - } catch (final ParserException e) { - return 0; - } catch (IOException e) { - e.printStackTrace(); - return 0; - } finally { - if (resourceContent != null) try { resourceContent.close(); } catch (final Exception e) {/* ignore this */} - } - } - public int adminAuthenticated(final RequestHeader requestHeader) { // authorization for localhost, only if flag is set to grant localhost access as admin diff --git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java index 80f84289c..81a7ab1fc 100644 --- a/source/net/yacy/document/TextParser.java +++ b/source/net/yacy/document/TextParser.java @@ -239,13 +239,13 @@ public final class TextParser { * check if the parser supports the given content. * @param url * @param mimeType - * @return returns null if the content is supportet. If the content is not supported, return a error string. + * @return returns null if the content is supported. If the content is not supported, return a error string. */ public static String supports(final DigestURI url, String mimeType) { try { // try to get a parser. If this works, we don't need the parser itself, we just return null to show that everything is ok. - idiomParser(url, mimeType); - return null; + List idioms = idiomParser(url, mimeType); + return (idioms == null || idioms.size() == 0) ? "no parser found" : null; } catch (ParserException e) { // in case that a parser is not available, return a error string describing the problem. 
return e.getMessage(); diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 394011a4f..69179a675 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -327,6 +327,10 @@ public class ContentScraper extends AbstractScraper implements Scraper { return anchors; } + /** + * get all images + * @return a map of + */ public HashMap getImages() { // this resturns a String(absolute url)/htmlFilterImageEntry - relation return images; diff --git a/source/net/yacy/document/parser/images/genericImageParser.java b/source/net/yacy/document/parser/images/genericImageParser.java index cfc5fa747..1ba0b8100 100644 --- a/source/net/yacy/document/parser/images/genericImageParser.java +++ b/source/net/yacy/document/parser/images/genericImageParser.java @@ -81,25 +81,26 @@ public class genericImageParser extends AbstractParser implements Idiom { throw new ParserException(e.getMessage(), location); } + /* // scan the image int height = image.getHeight(); int width = image.getWidth(); Raster raster = image.getData(); int[] pixel = raster.getPixel(0, 0, (int[])null); - long[] average = new long[]{0, 0, 0}; + long[] average = new long[pixel.length]; + for (int i = 0; i < average.length; i++) average[i] = 0L; int pc = 0; for (int x = width / 4; x < 3 * width / 4; x = x + 2) { for (int y = height / 4; y < 3 * height / 4; y = y + 2) { pixel = raster.getPixel(x, y, pixel); - average[0] += pixel[0]; - average[1] += pixel[1]; - average[2] += pixel[2]; + for (int i = 0; i < average.length; i++) average[i] += pixel[i]; pc++; } } - + */ // get image properties String [] propNames = image.getPropertyNames(); + if (propNames == null) propNames = new String[0]; StringBuilder sb = new StringBuilder(propNames.length * 80); for (String propName: propNames) { sb.append(propName).append(" = ").append(image.getProperty(propName)).append(" .\n"); diff --git a/source/net/yacy/document/parser/xlsParser.java b/source/net/yacy/document/parser/xlsParser.java index bc86a2ada..955c9253d 100644 --- a/source/net/yacy/document/parser/xlsParser.java +++ b/source/net/yacy/document/parser/xlsParser.java @@ -40,20 +40,13 @@ import net.yacy.kelondro.data.meta.DigestURI; import org.apache.poi.hssf.eventusermodel.HSSFEventFactory; import org.apache.poi.hssf.eventusermodel.HSSFListener; import org.apache.poi.hssf.eventusermodel.HSSFRequest; -import org.apache.poi.hssf.record.LabelSSTRecord; import org.apache.poi.hssf.record.NumberRecord; import org.apache.poi.hssf.record.Record; import org.apache.poi.hssf.record.SSTRecord; import org.apache.poi.poifs.filesystem.POIFSFileSystem; -public class xlsParser extends AbstractParser implements Idiom, HSSFListener { - - //StringBuilder for parsed text - private StringBuilder sbFoundStrings = null; - - //sstrecord needed for event parsing - private SSTRecord sstrec; +public class xlsParser extends AbstractParser implements Idiom { /** * a list of mime types that are supported by this parser class @@ -85,59 +78,7 @@ public class xlsParser extends AbstractParser implements Idiom, HSSFListener { public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { - try { - //generate new StringBuilder for parsing - sbFoundStrings = new StringBuilder(); - - //create a new org.apache.poi.poifs.filesystem.Filesystem - final POIFSFileSystem poifs = new 
POIFSFileSystem(source); - //get the Workbook (excel part) stream in a InputStream - final InputStream din = poifs.createDocumentInputStream("Workbook"); - //construct out HSSFRequest object - final HSSFRequest req = new HSSFRequest(); - //lazy listen for ALL records with the listener shown above - req.addListenerForAllRecords(this); - //create our event factory - final HSSFEventFactory factory = new HSSFEventFactory(); - //process our events based on the document input stream - factory.processEvents(req, din); - //close our document input stream (don't want to leak these!) - din.close(); - - //now the parsed strings are in the StringBuilder, now convert them to a String - final String contents = sbFoundStrings.toString().trim(); - - /* - * create the plasmaParserDocument for the database - * and set shortText and bodyText properly - */ - final Document theDoc = new Document( - location, - mimeType, - "UTF-8", - null, - null, - location.getFile(), - "", // TODO: AUTHOR - null, - null, - contents.getBytes("UTF-8"), - null, - null); - return theDoc; - } catch (final Exception e) { - if (e instanceof InterruptedException) throw (InterruptedException) e; - - /* - * an unexpected error occurred, log it and throw a ParserException - */ - e.printStackTrace(); - final String errorMsg = "Unable to parse the xls document '" + location + "':" + e.getMessage(); - this.theLogger.logSevere(errorMsg); - throw new ParserException(errorMsg, location); - } finally { - sbFoundStrings = null; - } + return new XLSHSSFListener().parse(location, mimeType, charset, source); } public Set supportedMimeTypes() { @@ -153,34 +94,107 @@ public class xlsParser extends AbstractParser implements Idiom, HSSFListener { //nothing to do super.reset(); } + + + public class XLSHSSFListener implements HSSFListener { + + //StringBuilder for parsed text + private final StringBuilder sbFoundStrings; + - public void processRecord(final Record record) { - switch (record.getSid()){ - case NumberRecord.sid: { - final NumberRecord numrec = (NumberRecord) record; - sbFoundStrings.append(numrec.getValue()); - break; + public XLSHSSFListener() { + this.sbFoundStrings = new StringBuilder(100); + } + + /* + * parses the source documents and returns a Document containing + * all extracted information about the parsed document + */ + public Document parse(final DigestURI location, final String mimeType, + final String charset, final InputStream source) throws ParserException, + InterruptedException { + try { + + //create a new org.apache.poi.poifs.filesystem.Filesystem + final POIFSFileSystem poifs = new POIFSFileSystem(source); + //get the Workbook (excel part) stream in a InputStream + final InputStream din = poifs.createDocumentInputStream("Workbook"); + //construct out HSSFRequest object + final HSSFRequest req = new HSSFRequest(); + //lazy listen for ALL records with the listener shown above + req.addListenerForAllRecords(this); + //create our event factory + final HSSFEventFactory factory = new HSSFEventFactory(); + //process our events based on the document input stream + factory.processEvents(req, din); + //close our document input stream (don't want to leak these!) 
+ din.close(); + + //now the parsed strings are in the StringBuilder, now convert them to a String + final String contents = sbFoundStrings.toString().trim(); + + /* + * create the plasmaParserDocument for the database + * and set shortText and bodyText properly + */ + final Document theDoc = new Document( + location, + mimeType, + "UTF-8", + null, + null, + location.getFile(), + "", // TODO: AUTHOR + null, + null, + contents.getBytes("UTF-8"), + null, + null); + return theDoc; + } catch (final Exception e) { + if (e instanceof InterruptedException) throw (InterruptedException) e; + + /* + * an unexpected error occurred, log it and throw a ParserException + */ + e.printStackTrace(); + final String errorMsg = "Unable to parse the xls document '" + location + "':" + e.getMessage(); + theLogger.logSevere(errorMsg); + throw new ParserException(errorMsg, location); } - //unique string records - case SSTRecord.sid: { - sstrec = (SSTRecord)record; - for (int k = 0; k < sstrec.getNumUniqueStrings(); k++){ - sbFoundStrings.append( sstrec.getString(k) ); - - //add line seperator - sbFoundStrings.append( "\n" ); + } + + public void processRecord(final Record record) { + SSTRecord sstrec = null; + switch (record.getSid()){ + case NumberRecord.sid: { + final NumberRecord numrec = (NumberRecord) record; + sbFoundStrings.append(numrec.getValue()); + break; } - break; + //unique string records + case SSTRecord.sid: { + sstrec = (SSTRecord) record; + for (int k = 0; k < sstrec.getNumUniqueStrings(); k++){ + sbFoundStrings.append( sstrec.getString(k) ); + + //add line seperator + sbFoundStrings.append( "\n" ); + } + break; + } + /* + case LabelSSTRecord.sid: { + final LabelSSTRecord lsrec = (LabelSSTRecord)record; + sbFoundStrings.append( sstrec.getString(lsrec.getSSTIndex()) ); + break; + } + */ } - case LabelSSTRecord.sid: { - final LabelSSTRecord lsrec = (LabelSSTRecord)record; - sbFoundStrings.append( sstrec.getString(lsrec.getSSTIndex()) ); - break; - } + //add line seperator + sbFoundStrings.append( "\n" ); } - - //add line seperator - sbFoundStrings.append( "\n" ); } + }
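
Note on the Segment refactoring above: removeAllUrlReferences() moves from Switchboard into Segment, so the index segment owns its own deletion logic and callers hand in the LoaderDispatcher explicitly instead of relying on Switchboard state. A minimal sketch of the new call shape, taken from the servlet hunks above (variable names as they appear in IndexControlURLs_p.java):

    // before: i = sb.removeAllUrlReferences(segment, urlhash, true);
    // after:  the segment performs the deletion itself and is handed the loader
    i = segment.removeAllUrlReferences(urlhash, sb.loader, true);
    prop.put("result", "Deleted URL and " + i + " references from " + i + " word indexes.");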
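
The Switchboard change that carries the "temporary integration of images" from the commit message merges the document's image references into the hyperlink map before the links are stacked for crawling, so image URLs are enqueued like ordinary anchors. A sketch of that step, assuming the hyperlink map is keyed by DigestURI with the anchor/alt text as value (the generic type parameters are not visible in the hunk above):

    // collect the hyperlinks and add all images to the same map,
    // so image URLs reach the crawl stack like ordinary anchors
    final Map<DigestURI, String> hl = document.getHyperlinks();
    for (final ImageEntry imageReference : document.getImages().values()) {
        hl.put(imageReference.url(), imageReference.alt());
    }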
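
The xlsParser fix keeps the same event-driven Apache POI extraction (HSSFRequest/HSSFEventFactory) but moves the HSSFListener into a per-call inner class, so the shared xlsParser instance no longer holds mutable parse state between calls. The following standalone sketch shows that extraction pattern in isolation, using only the POI calls that appear in the patch; the class name and main() harness are illustrative and not part of YaCy:

    import java.io.FileInputStream;
    import java.io.InputStream;

    import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
    import org.apache.poi.hssf.eventusermodel.HSSFListener;
    import org.apache.poi.hssf.eventusermodel.HSSFRequest;
    import org.apache.poi.hssf.record.NumberRecord;
    import org.apache.poi.hssf.record.Record;
    import org.apache.poi.hssf.record.SSTRecord;
    import org.apache.poi.poifs.filesystem.POIFSFileSystem;

    // minimal event-driven XLS text extractor in the style of XLSHSSFListener
    public class XlsTextDump implements HSSFListener {

        private final StringBuilder text = new StringBuilder();

        public void processRecord(final Record record) {
            switch (record.getSid()) {
                case NumberRecord.sid:
                    // numeric cell values
                    text.append(((NumberRecord) record).getValue()).append('\n');
                    break;
                case SSTRecord.sid:
                    // the shared string table carries every unique string of the workbook
                    final SSTRecord sst = (SSTRecord) record;
                    for (int k = 0; k < sst.getNumUniqueStrings(); k++) {
                        text.append(sst.getString(k)).append('\n');
                    }
                    break;
            }
        }

        public static void main(final String[] args) throws Exception {
            final XlsTextDump listener = new XlsTextDump();
            final POIFSFileSystem poifs = new POIFSFileSystem(new FileInputStream(args[0]));
            final InputStream din = poifs.createDocumentInputStream("Workbook");
            final HSSFRequest req = new HSSFRequest();
            req.addListenerForAllRecords(listener);
            new HSSFEventFactory().processEvents(req, din);
            din.close();  // close the document input stream, as the parser does
            System.out.println(listener.text.toString().trim());
        }
    }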