From 52470d0de4cd7ab95e33846a10e84b0d8142f6a4 Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 22 Oct 2009 22:38:04 +0000 Subject: [PATCH] - fix for xls parser - fix for image parser - temporary integration of images as document types in the crawler and indexer for testing of the image parser git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6435 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/IndexControlRWIs_p.java | 4 +- htroot/IndexControlURLs_p.java | 2 +- .../de/anomic/crawler/retrieval/Response.java | 8 +- source/de/anomic/search/Segment.java | 69 +++++++ source/de/anomic/search/Switchboard.java | 127 ++++-------- source/net/yacy/document/TextParser.java | 6 +- .../document/parser/html/ContentScraper.java | 4 + .../parser/images/genericImageParser.java | 11 +- .../net/yacy/document/parser/xlsParser.java | 182 ++++++++++-------- 9 files changed, 222 insertions(+), 191 deletions(-) diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java index 22ad4c7ac..215d0433b 100644 --- a/htroot/IndexControlRWIs_p.java +++ b/htroot/IndexControlRWIs_p.java @@ -163,7 +163,7 @@ public class IndexControlRWIs_p { index = null; } if (delurlref) { - for (i = 0; i < urlx.length; i++) sb.removeAllUrlReferences(segment, urlx[i], true); + for (i = 0; i < urlx.length; i++) segment.removeAllUrlReferences(urlx[i], sb.loader, true); } if (delurl || delurlref) { for (i = 0; i < urlx.length; i++) { @@ -180,7 +180,7 @@ public class IndexControlRWIs_p { // delete selected URLs if (post.containsKey("keyhashdelete")) try { if (delurlref) { - for (i = 0; i < urlx.length; i++) sb.removeAllUrlReferences(segment, urlx[i], true); + for (i = 0; i < urlx.length; i++) segment.removeAllUrlReferences(urlx[i], sb.loader, true); } if (delurl || delurlref) { for (i = 0; i < urlx.length; i++) { diff --git a/htroot/IndexControlURLs_p.java b/htroot/IndexControlURLs_p.java index 72b871e58..58e884c85 100644 --- a/htroot/IndexControlURLs_p.java +++ b/htroot/IndexControlURLs_p.java @@ -136,7 +136,7 @@ public class IndexControlURLs_p { prop.put("result", " "); if (post.containsKey("urlhashdeleteall")) { - i = sb.removeAllUrlReferences(segment, urlhash, true); + i = segment.removeAllUrlReferences(urlhash, sb.loader, true); prop.put("result", "Deleted URL and " + i + " references from " + i + " word indexes."); prop.put("lurlexport", 0); prop.put("reload", 0); diff --git a/source/de/anomic/crawler/retrieval/Response.java b/source/de/anomic/crawler/retrieval/Response.java index e5162aa8a..bde4ea013 100755 --- a/source/de/anomic/crawler/retrieval/Response.java +++ b/source/de/anomic/crawler/retrieval/Response.java @@ -670,14 +670,16 @@ public class Response { // -ranges in request // we checked that in shallStoreCache - // a picture cannot be indexed + // check if pictures can be indexed if (responseHeader != null) { final String mimeType = responseHeader.mime(); - if (Classification.isPictureMime(mimeType)) { return "Media_Content_(Picture)"; } String parserError = TextParser.supportsMime(mimeType); if (parserError != null) { return "Media_Content, parser error: " + parserError; } } - if (Classification.isMediaExtension(url().getFileExtension())) { return "Media_Content_(forbidden)"; } + if (Classification.isMediaExtension(url().getFileExtension()) && + !Classification.isImageExtension((url().getFileExtension()))) { + return "Media_Content_(forbidden)"; + } // -if-modified-since in request // if the page is fresh at the very moment we can index it diff --git a/source/de/anomic/search/Segment.java 
b/source/de/anomic/search/Segment.java index a340bb0b6..e967bc727 100644 --- a/source/de/anomic/search/Segment.java +++ b/source/de/anomic/search/Segment.java @@ -28,14 +28,18 @@ package de.anomic.search; import java.io.File; import java.io.IOException; +import java.io.InputStream; +import java.io.UnsupportedEncodingException; import java.util.Date; import java.util.HashSet; import java.util.Iterator; import java.util.Map; +import java.util.Set; import java.util.TreeSet; import net.yacy.document.Condenser; import net.yacy.document.Document; +import net.yacy.document.ParserException; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.navigation.NavigationReference; @@ -53,6 +57,7 @@ import net.yacy.kelondro.rwi.ReferenceContainer; import net.yacy.kelondro.rwi.ReferenceFactory; import net.yacy.kelondro.util.ISO639; import net.yacy.repository.Blacklist; +import net.yacy.repository.LoaderDispatcher; import de.anomic.crawler.retrieval.Response; @@ -349,6 +354,70 @@ public class Segment { return newEntry; } + + // method for index deletion + public int removeAllUrlReferences(final DigestURI url, LoaderDispatcher loader, final boolean fetchOnline) { + return removeAllUrlReferences(url.hash(), loader, fetchOnline); + } + + public int removeAllUrlReferences(final String urlhash, LoaderDispatcher loader, final boolean fetchOnline) { + // find all the words in a specific resource and remove the url reference from every word index + // finally, delete the url entry + + if (urlhash == null) return 0; + // determine the url string + final URIMetadataRow entry = urlMetadata().load(urlhash, null, 0); + if (entry == null) return 0; + final URIMetadataRow.Components metadata = entry.metadata(); + if (metadata.url() == null) return 0; + + InputStream resourceContent = null; + try { + // get the resource content + Object[] resource = null; + try { + resource = loader.getResource(metadata.url(), fetchOnline, 10000, true, false); + } catch (IOException e) { + Log.logWarning("removeAllUrlReferences", "cannot load: " + e.getMessage()); + } + if (resource == null) { + // delete just the url entry + urlMetadata().remove(urlhash); + return 0; + } else { + resourceContent = (InputStream) resource[0]; + final Long resourceContentLength = (Long) resource[1]; + + // parse the resource + final Document document = LoaderDispatcher.parseDocument(metadata.url(), resourceContentLength.longValue(), resourceContent, null); + + // get the word set + Set words = null; + try { + words = new Condenser(document, true, true).words().keySet(); + } catch (final UnsupportedEncodingException e) { + e.printStackTrace(); + } + + // delete all word references + int count = 0; + if (words != null) count = termIndex().remove(Word.words2hashes(words), urlhash); + + // finally delete the url entry itself + urlMetadata().remove(urlhash); + return count; + } + } catch (final ParserException e) { + return 0; + } catch (IOException e) { + e.printStackTrace(); + return 0; + } finally { + if (resourceContent != null) try { resourceContent.close(); } catch (final Exception e) {/* ignore this */} + } + } + + // The Cleaner class was provided as "UrldbCleaner" by Hydrox public synchronized ReferenceCleaner getReferenceCleaner(final byte[] startHash) { return new ReferenceCleaner(startHash); diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index d3b9762c3..eff2b66a4 100644 --- a/source/de/anomic/search/Switchboard.java +++ 
b/source/de/anomic/search/Switchboard.java @@ -89,7 +89,6 @@ import java.io.BufferedInputStream; import java.io.File; import java.io.FileInputStream; import java.io.IOException; -import java.io.InputStream; import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.security.NoSuchAlgorithmException; @@ -105,7 +104,6 @@ import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Properties; -import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; import java.util.concurrent.ConcurrentHashMap; @@ -119,6 +117,7 @@ import net.yacy.document.ParserException; import net.yacy.document.content.DCEntry; import net.yacy.document.content.RSSMessage; import net.yacy.document.content.file.SurrogateReader; +import net.yacy.document.parser.html.ImageEntry; import net.yacy.document.parser.xml.RSSFeed; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; @@ -1169,7 +1168,6 @@ public final class Switchboard extends serverSwitch { noIndexReason = TextParser.supports(response.url(), response.getMimeType()); } - // check X-YACY-Index-Control // With the X-YACY-Index-Control header set to "no-index" a client could disallow // yacy to index the response returned as answer to a request @@ -1583,43 +1581,43 @@ public final class Switchboard extends serverSwitch { return new indexingQueueEntry(in.process, in.queueEntry, document, null); } - private Document parseDocument(Response entry) throws InterruptedException { + private Document parseDocument(Response response) throws InterruptedException { Document document = null; - final EventOrigin processCase = entry.processCase(peers.mySeed().hash); + final EventOrigin processCase = response.processCase(peers.mySeed().hash); if (this.log.isFine()) log.logFine("processResourceStack processCase=" + processCase + - ", depth=" + entry.depth() + - ", maxDepth=" + ((entry.profile() == null) ? "null" : Integer.toString(entry.profile().depth())) + - ", must-match=" + ((entry.profile() == null) ? "null" : entry.profile().mustMatchPattern().toString()) + - ", must-not-match=" + ((entry.profile() == null) ? "null" : entry.profile().mustNotMatchPattern().toString()) + - ", initiatorHash=" + entry.initiator() + + ", depth=" + response.depth() + + ", maxDepth=" + ((response.profile() == null) ? "null" : Integer.toString(response.profile().depth())) + + ", must-match=" + ((response.profile() == null) ? "null" : response.profile().mustMatchPattern().toString()) + + ", must-not-match=" + ((response.profile() == null) ? "null" : response.profile().mustNotMatchPattern().toString()) + + ", initiatorHash=" + response.initiator() + //", responseHeader=" + ((entry.responseHeader() == null) ? 
"null" : entry.responseHeader().toString()) + - ", url=" + entry.url()); // DEBUG + ", url=" + response.url()); // DEBUG // PARSE CONTENT final long parsingStartTime = System.currentTimeMillis(); byte[] b = null; try { // fetch the document - b = Cache.getContent(entry.url()); + b = Cache.getContent(response.url()); if (b == null) { - this.log.logWarning("the resource '" + entry.url() + "' is missing in the cache."); - addURLtoErrorDB(entry.url(), entry.referrerHash(), entry.initiator(), entry.name(), "missing"); + this.log.logWarning("the resource '" + response.url() + "' is missing in the cache."); + addURLtoErrorDB(response.url(), response.referrerHash(), response.initiator(), response.name(), "missing"); return null; } } catch (IOException e) { - this.log.logWarning("Unable fetch the resource '" + entry.url() + "'. from the cache: " + e.getMessage()); - addURLtoErrorDB(entry.url(), entry.referrerHash(), entry.initiator(), entry.name(), e.getMessage()); + this.log.logWarning("Unable fetch the resource '" + response.url() + "'. from the cache: " + e.getMessage()); + addURLtoErrorDB(response.url(), response.referrerHash(), response.initiator(), response.name(), e.getMessage()); return null; } try { // parse the document - document = TextParser.parseSource(entry.url(), entry.getMimeType(), entry.getCharacterEncoding(), b); + document = TextParser.parseSource(response.url(), response.getMimeType(), response.getCharacterEncoding(), b); assert(document != null) : "Unexpected error. Parser returned null."; } catch (final ParserException e) { - this.log.logWarning("Unable to parse the resource '" + entry.url() + "'. " + e.getMessage(), e); - addURLtoErrorDB(entry.url(), entry.referrerHash(), entry.initiator(), entry.name(), e.getMessage()); + this.log.logWarning("Unable to parse the resource '" + response.url() + "'. 
" + e.getMessage(), e); + addURLtoErrorDB(response.url(), response.referrerHash(), response.initiator(), response.name(), e.getMessage()); if (document != null) { document.close(); document = null; @@ -1630,43 +1628,48 @@ public final class Switchboard extends serverSwitch { final long parsingEndTime = System.currentTimeMillis(); // get the document date - final Date docDate = entry.lastModified(); + final Date docDate = response.lastModified(); // put anchors on crawl stack final long stackStartTime = System.currentTimeMillis(); if ( ((processCase == EventOrigin.PROXY_LOAD) || (processCase == EventOrigin.LOCAL_CRAWLING)) && - ((entry.profile() == null) || (entry.depth() < entry.profile().depth())) + ((response.profile() == null) || (response.depth() < response.profile().depth())) ) { + // get the hyperlinks final Map hl = document.getHyperlinks(); - final Iterator> i = hl.entrySet().iterator(); + + // add all images also to the crawl stack + for (ImageEntry imageReference : document.getImages().values()) { + hl.put(imageReference.url(), imageReference.alt()); + } + + // insert those hyperlinks to the crawler DigestURI nextUrl; - Map.Entry nextEntry; - while (i.hasNext()) { + for (Map.Entry nextEntry : hl.entrySet()) { // check for interruption checkInterruption(); - // fetching the next hyperlink - nextEntry = i.next(); + // process the next hyperlink nextUrl = nextEntry.getKey(); String u = nextUrl.toNormalform(true, true); if (!(u.startsWith("http") || u.startsWith("ftp"))) continue; // enqueue the hyperlink into the pre-notice-url db crawlStacker.enqueueEntry(new Request( - entry.initiator(), + response.initiator(), nextUrl, - entry.url().hash(), + response.url().hash(), nextEntry.getValue(), null, docDate, - entry.profile().handle(), - entry.depth() + 1, + response.profile().handle(), + response.depth() + 1, 0, 0 )); } final long stackEndTime = System.currentTimeMillis(); - if (log.isInfo()) log.logInfo("CRAWL: ADDED " + hl.size() + " LINKS FROM " + entry.url().toNormalform(false, true) + + if (log.isInfo()) log.logInfo("CRAWL: ADDED " + hl.size() + " LINKS FROM " + response.url().toNormalform(false, true) + ", STACKING TIME = " + (stackEndTime-stackStartTime) + ", PARSING TIME = " + (parsingEndTime-parsingStartTime)); } @@ -1807,68 +1810,6 @@ public final class Switchboard extends serverSwitch { } } - // method for index deletion - public int removeAllUrlReferences(Segment indexSegment, final DigestURI url, final boolean fetchOnline) { - return removeAllUrlReferences(indexSegment, url.hash(), fetchOnline); - } - - public int removeAllUrlReferences(Segment indexSegment, final String urlhash, final boolean fetchOnline) { - // find all the words in a specific resource and remove the url reference from every word index - // finally, delete the url entry - - if (urlhash == null) return 0; - // determine the url string - final URIMetadataRow entry = indexSegment.urlMetadata().load(urlhash, null, 0); - if (entry == null) return 0; - final URIMetadataRow.Components metadata = entry.metadata(); - if (metadata.url() == null) return 0; - - InputStream resourceContent = null; - try { - // get the resource content - Object[] resource = null; - try { - resource = loader.getResource(metadata.url(), fetchOnline, 10000, true, false); - } catch (IOException e) { - Log.logWarning("removeAllUrlReferences", "cannot load: " + e.getMessage()); - } - if (resource == null) { - // delete just the url entry - indexSegment.urlMetadata().remove(urlhash); - return 0; - } else { - resourceContent = (InputStream) 
resource[0]; - final Long resourceContentLength = (Long) resource[1]; - - // parse the resource - final Document document = LoaderDispatcher.parseDocument(metadata.url(), resourceContentLength.longValue(), resourceContent, null); - - // get the word set - Set words = null; - try { - words = new Condenser(document, true, true).words().keySet(); - } catch (final UnsupportedEncodingException e) { - e.printStackTrace(); - } - - // delete all word references - int count = 0; - if (words != null) count = indexSegment.termIndex().remove(Word.words2hashes(words), urlhash); - - // finally delete the url entry itself - indexSegment.urlMetadata().remove(urlhash); - return count; - } - } catch (final ParserException e) { - return 0; - } catch (IOException e) { - e.printStackTrace(); - return 0; - } finally { - if (resourceContent != null) try { resourceContent.close(); } catch (final Exception e) {/* ignore this */} - } - } - public int adminAuthenticated(final RequestHeader requestHeader) { // authorization for localhost, only if flag is set to grant localhost access as admin diff --git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java index 80f84289c..81a7ab1fc 100644 --- a/source/net/yacy/document/TextParser.java +++ b/source/net/yacy/document/TextParser.java @@ -239,13 +239,13 @@ public final class TextParser { * check if the parser supports the given content. * @param url * @param mimeType - * @return returns null if the content is supportet. If the content is not supported, return a error string. + * @return returns null if the content is supported. If the content is not supported, return a error string. */ public static String supports(final DigestURI url, String mimeType) { try { // try to get a parser. If this works, we don't need the parser itself, we just return null to show that everything is ok. - idiomParser(url, mimeType); - return null; + List idioms = idiomParser(url, mimeType); + return (idioms == null || idioms.size() == 0) ? "no parser found" : null; } catch (ParserException e) { // in case that a parser is not available, return a error string describing the problem. 
return e.getMessage(); diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 394011a4f..69179a675 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -327,6 +327,10 @@ public class ContentScraper extends AbstractScraper implements Scraper { return anchors; } + /** + * get all images + * @return a map of + */ public HashMap getImages() { // this resturns a String(absolute url)/htmlFilterImageEntry - relation return images; diff --git a/source/net/yacy/document/parser/images/genericImageParser.java b/source/net/yacy/document/parser/images/genericImageParser.java index cfc5fa747..1ba0b8100 100644 --- a/source/net/yacy/document/parser/images/genericImageParser.java +++ b/source/net/yacy/document/parser/images/genericImageParser.java @@ -81,25 +81,26 @@ public class genericImageParser extends AbstractParser implements Idiom { throw new ParserException(e.getMessage(), location); } + /* // scan the image int height = image.getHeight(); int width = image.getWidth(); Raster raster = image.getData(); int[] pixel = raster.getPixel(0, 0, (int[])null); - long[] average = new long[]{0, 0, 0}; + long[] average = new long[pixel.length]; + for (int i = 0; i < average.length; i++) average[i] = 0L; int pc = 0; for (int x = width / 4; x < 3 * width / 4; x = x + 2) { for (int y = height / 4; y < 3 * height / 4; y = y + 2) { pixel = raster.getPixel(x, y, pixel); - average[0] += pixel[0]; - average[1] += pixel[1]; - average[2] += pixel[2]; + for (int i = 0; i < average.length; i++) average[i] += pixel[i]; pc++; } } - + */ // get image properties String [] propNames = image.getPropertyNames(); + if (propNames == null) propNames = new String[0]; StringBuilder sb = new StringBuilder(propNames.length * 80); for (String propName: propNames) { sb.append(propName).append(" = ").append(image.getProperty(propName)).append(" .\n"); diff --git a/source/net/yacy/document/parser/xlsParser.java b/source/net/yacy/document/parser/xlsParser.java index bc86a2ada..955c9253d 100644 --- a/source/net/yacy/document/parser/xlsParser.java +++ b/source/net/yacy/document/parser/xlsParser.java @@ -40,20 +40,13 @@ import net.yacy.kelondro.data.meta.DigestURI; import org.apache.poi.hssf.eventusermodel.HSSFEventFactory; import org.apache.poi.hssf.eventusermodel.HSSFListener; import org.apache.poi.hssf.eventusermodel.HSSFRequest; -import org.apache.poi.hssf.record.LabelSSTRecord; import org.apache.poi.hssf.record.NumberRecord; import org.apache.poi.hssf.record.Record; import org.apache.poi.hssf.record.SSTRecord; import org.apache.poi.poifs.filesystem.POIFSFileSystem; -public class xlsParser extends AbstractParser implements Idiom, HSSFListener { - - //StringBuilder for parsed text - private StringBuilder sbFoundStrings = null; - - //sstrecord needed for event parsing - private SSTRecord sstrec; +public class xlsParser extends AbstractParser implements Idiom { /** * a list of mime types that are supported by this parser class @@ -85,59 +78,7 @@ public class xlsParser extends AbstractParser implements Idiom, HSSFListener { public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { - try { - //generate new StringBuilder for parsing - sbFoundStrings = new StringBuilder(); - - //create a new org.apache.poi.poifs.filesystem.Filesystem - final POIFSFileSystem poifs = new 
POIFSFileSystem(source); - //get the Workbook (excel part) stream in a InputStream - final InputStream din = poifs.createDocumentInputStream("Workbook"); - //construct out HSSFRequest object - final HSSFRequest req = new HSSFRequest(); - //lazy listen for ALL records with the listener shown above - req.addListenerForAllRecords(this); - //create our event factory - final HSSFEventFactory factory = new HSSFEventFactory(); - //process our events based on the document input stream - factory.processEvents(req, din); - //close our document input stream (don't want to leak these!) - din.close(); - - //now the parsed strings are in the StringBuilder, now convert them to a String - final String contents = sbFoundStrings.toString().trim(); - - /* - * create the plasmaParserDocument for the database - * and set shortText and bodyText properly - */ - final Document theDoc = new Document( - location, - mimeType, - "UTF-8", - null, - null, - location.getFile(), - "", // TODO: AUTHOR - null, - null, - contents.getBytes("UTF-8"), - null, - null); - return theDoc; - } catch (final Exception e) { - if (e instanceof InterruptedException) throw (InterruptedException) e; - - /* - * an unexpected error occurred, log it and throw a ParserException - */ - e.printStackTrace(); - final String errorMsg = "Unable to parse the xls document '" + location + "':" + e.getMessage(); - this.theLogger.logSevere(errorMsg); - throw new ParserException(errorMsg, location); - } finally { - sbFoundStrings = null; - } + return new XLSHSSFListener().parse(location, mimeType, charset, source); } public Set supportedMimeTypes() { @@ -153,34 +94,107 @@ public class xlsParser extends AbstractParser implements Idiom, HSSFListener { //nothing to do super.reset(); } + + + public class XLSHSSFListener implements HSSFListener { + + //StringBuilder for parsed text + private final StringBuilder sbFoundStrings; + - public void processRecord(final Record record) { - switch (record.getSid()){ - case NumberRecord.sid: { - final NumberRecord numrec = (NumberRecord) record; - sbFoundStrings.append(numrec.getValue()); - break; + public XLSHSSFListener() { + this.sbFoundStrings = new StringBuilder(100); + } + + /* + * parses the source documents and returns a Document containing + * all extracted information about the parsed document + */ + public Document parse(final DigestURI location, final String mimeType, + final String charset, final InputStream source) throws ParserException, + InterruptedException { + try { + + //create a new org.apache.poi.poifs.filesystem.Filesystem + final POIFSFileSystem poifs = new POIFSFileSystem(source); + //get the Workbook (excel part) stream in a InputStream + final InputStream din = poifs.createDocumentInputStream("Workbook"); + //construct out HSSFRequest object + final HSSFRequest req = new HSSFRequest(); + //lazy listen for ALL records with the listener shown above + req.addListenerForAllRecords(this); + //create our event factory + final HSSFEventFactory factory = new HSSFEventFactory(); + //process our events based on the document input stream + factory.processEvents(req, din); + //close our document input stream (don't want to leak these!) 
+ din.close(); + + //now the parsed strings are in the StringBuilder, now convert them to a String + final String contents = sbFoundStrings.toString().trim(); + + /* + * create the plasmaParserDocument for the database + * and set shortText and bodyText properly + */ + final Document theDoc = new Document( + location, + mimeType, + "UTF-8", + null, + null, + location.getFile(), + "", // TODO: AUTHOR + null, + null, + contents.getBytes("UTF-8"), + null, + null); + return theDoc; + } catch (final Exception e) { + if (e instanceof InterruptedException) throw (InterruptedException) e; + + /* + * an unexpected error occurred, log it and throw a ParserException + */ + e.printStackTrace(); + final String errorMsg = "Unable to parse the xls document '" + location + "':" + e.getMessage(); + theLogger.logSevere(errorMsg); + throw new ParserException(errorMsg, location); } - //unique string records - case SSTRecord.sid: { - sstrec = (SSTRecord)record; - for (int k = 0; k < sstrec.getNumUniqueStrings(); k++){ - sbFoundStrings.append( sstrec.getString(k) ); - - //add line seperator - sbFoundStrings.append( "\n" ); + } + + public void processRecord(final Record record) { + SSTRecord sstrec = null; + switch (record.getSid()){ + case NumberRecord.sid: { + final NumberRecord numrec = (NumberRecord) record; + sbFoundStrings.append(numrec.getValue()); + break; } - break; + //unique string records + case SSTRecord.sid: { + sstrec = (SSTRecord) record; + for (int k = 0; k < sstrec.getNumUniqueStrings(); k++){ + sbFoundStrings.append( sstrec.getString(k) ); + + //add line seperator + sbFoundStrings.append( "\n" ); + } + break; + } + /* + case LabelSSTRecord.sid: { + final LabelSSTRecord lsrec = (LabelSSTRecord)record; + sbFoundStrings.append( sstrec.getString(lsrec.getSSTIndex()) ); + break; + } + */ } - case LabelSSTRecord.sid: { - final LabelSSTRecord lsrec = (LabelSSTRecord)record; - sbFoundStrings.append( sstrec.getString(lsrec.getSSTIndex()) ); - break; - } + //add line seperator + sbFoundStrings.append( "\n" ); } - - //add line seperator - sbFoundStrings.append( "\n" ); } + }
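
Note on the Segment refactoring above: removeAllUrlReferences() moves from Switchboard into Segment, so the index segment owns its own deletion logic and callers hand in the LoaderDispatcher explicitly instead of relying on Switchboard state. A minimal sketch of the new call shape, taken from the servlet hunks above (variable names as they appear in IndexControlURLs_p.java):

    // before: i = sb.removeAllUrlReferences(segment, urlhash, true);
    // after:  the segment performs the deletion itself and is handed the loader
    i = segment.removeAllUrlReferences(urlhash, sb.loader, true);
    prop.put("result", "Deleted URL and " + i + " references from " + i + " word indexes.");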
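
The Switchboard change that carries the "temporary integration of images" from the commit message merges the document's image references into the hyperlink map before the links are stacked for crawling, so image URLs are enqueued like ordinary anchors. A sketch of that step, assuming the hyperlink map is keyed by DigestURI with the anchor/alt text as value (the generic type parameters are not visible in the hunk above):

    // collect the hyperlinks and add all images to the same map,
    // so image URLs reach the crawl stack like ordinary anchors
    final Map<DigestURI, String> hl = document.getHyperlinks();
    for (final ImageEntry imageReference : document.getImages().values()) {
        hl.put(imageReference.url(), imageReference.alt());
    }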
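
The xlsParser fix keeps the same event-driven Apache POI extraction (HSSFRequest/HSSFEventFactory) but moves the HSSFListener into a per-call inner class, so the shared xlsParser instance no longer holds mutable parse state between calls. The following standalone sketch shows that extraction pattern in isolation, using only the POI calls that appear in the patch; the class name and main() harness are illustrative and not part of YaCy:

    import java.io.FileInputStream;
    import java.io.InputStream;

    import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
    import org.apache.poi.hssf.eventusermodel.HSSFListener;
    import org.apache.poi.hssf.eventusermodel.HSSFRequest;
    import org.apache.poi.hssf.record.NumberRecord;
    import org.apache.poi.hssf.record.Record;
    import org.apache.poi.hssf.record.SSTRecord;
    import org.apache.poi.poifs.filesystem.POIFSFileSystem;

    // minimal event-driven XLS text extractor in the style of XLSHSSFListener
    public class XlsTextDump implements HSSFListener {

        private final StringBuilder text = new StringBuilder();

        public void processRecord(final Record record) {
            switch (record.getSid()) {
                case NumberRecord.sid:
                    // numeric cell values
                    text.append(((NumberRecord) record).getValue()).append('\n');
                    break;
                case SSTRecord.sid:
                    // the shared string table carries every unique string of the workbook
                    final SSTRecord sst = (SSTRecord) record;
                    for (int k = 0; k < sst.getNumUniqueStrings(); k++) {
                        text.append(sst.getString(k)).append('\n');
                    }
                    break;
            }
        }

        public static void main(final String[] args) throws Exception {
            final XlsTextDump listener = new XlsTextDump();
            final POIFSFileSystem poifs = new POIFSFileSystem(new FileInputStream(args[0]));
            final InputStream din = poifs.createDocumentInputStream("Workbook");
            final HSSFRequest req = new HSSFRequest();
            req.addListenerForAllRecords(listener);
            new HSSFEventFactory().processEvents(req, din);
            din.close();  // close the document input stream, as the parser does
            System.out.println(listener.text.toString().trim());
        }
    }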