- fix for xls parser
- fix for image parser
- temporary integration of images as document types in the crawler and indexer for testing of the image parser

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6435 6c8d7289-2bf4-0310-a012-ef5d649a1542
commit 52470d0de4
parent 5e8038ac4d
author orbiter

@@ -163,7 +163,7 @@ public class IndexControlRWIs_p {
                 index = null;
             }
             if (delurlref) {
-                for (i = 0; i < urlx.length; i++) sb.removeAllUrlReferences(segment, urlx[i], true);
+                for (i = 0; i < urlx.length; i++) segment.removeAllUrlReferences(urlx[i], sb.loader, true);
             }
             if (delurl || delurlref) {
                 for (i = 0; i < urlx.length; i++) {
@@ -180,7 +180,7 @@ public class IndexControlRWIs_p {
             // delete selected URLs
             if (post.containsKey("keyhashdelete")) try {
                 if (delurlref) {
-                    for (i = 0; i < urlx.length; i++) sb.removeAllUrlReferences(segment, urlx[i], true);
+                    for (i = 0; i < urlx.length; i++) segment.removeAllUrlReferences(urlx[i], sb.loader, true);
                 }
                 if (delurl || delurlref) {
                     for (i = 0; i < urlx.length; i++) {

@@ -136,7 +136,7 @@ public class IndexControlURLs_p {
         prop.put("result", " ");
         if (post.containsKey("urlhashdeleteall")) {
-            i = sb.removeAllUrlReferences(segment, urlhash, true);
+            i = segment.removeAllUrlReferences(urlhash, sb.loader, true);
             prop.put("result", "Deleted URL and " + i + " references from " + i + " word indexes.");
             prop.put("lurlexport", 0);
             prop.put("reload", 0);

@@ -670,14 +670,16 @@ public class Response {
         // -ranges in request
         // we checked that in shallStoreCache

-        // a picture cannot be indexed
+        // check if pictures can be indexed
         if (responseHeader != null) {
             final String mimeType = responseHeader.mime();
-            if (Classification.isPictureMime(mimeType)) { return "Media_Content_(Picture)"; }
             String parserError = TextParser.supportsMime(mimeType);
             if (parserError != null) { return "Media_Content, parser error: " + parserError; }
         }
-        if (Classification.isMediaExtension(url().getFileExtension())) { return "Media_Content_(forbidden)"; }
+        if (Classification.isMediaExtension(url().getFileExtension()) &&
+            !Classification.isImageExtension((url().getFileExtension()))) {
+            return "Media_Content_(forbidden)";
+        }

         // -if-modified-since in request
         // if the page is fresh at the very moment we can index it
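
In effect, the indexing gate in Response now behaves like the following sketch. This is a minimal illustration, not the full method; the helper name indexGateReason is hypothetical, while Classification and TextParser calls are the ones from the hunk above.

    // minimal sketch of the new gate, not the full Response logic;
    // indexGateReason is a hypothetical name for illustration
    String indexGateReason(final ResponseHeader responseHeader, final DigestURI url) {
        if (responseHeader != null) {
            // pictures are no longer rejected outright; like all other content,
            // they are checked against the available parsers
            final String parserError = TextParser.supportsMime(responseHeader.mime());
            if (parserError != null) return "Media_Content, parser error: " + parserError;
        }
        // media extensions remain forbidden unless they are image extensions
        final String ext = url.getFileExtension();
        if (Classification.isMediaExtension(ext) && !Classification.isImageExtension(ext)) {
            return "Media_Content_(forbidden)";
        }
        return null; // indexable as far as this check is concerned
    }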

@@ -28,14 +28,18 @@ package de.anomic.search;

 import java.io.File;
 import java.io.IOException;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
 import java.util.Date;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.Map;
+import java.util.Set;
 import java.util.TreeSet;

 import net.yacy.document.Condenser;
 import net.yacy.document.Document;
+import net.yacy.document.ParserException;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.meta.URIMetadataRow;
 import net.yacy.kelondro.data.navigation.NavigationReference;
@@ -53,6 +57,7 @@ import net.yacy.kelondro.rwi.ReferenceContainer;
 import net.yacy.kelondro.rwi.ReferenceFactory;
 import net.yacy.kelondro.util.ISO639;
 import net.yacy.repository.Blacklist;
+import net.yacy.repository.LoaderDispatcher;

 import de.anomic.crawler.retrieval.Response;
@@ -349,6 +354,70 @@ public class Segment {
         return newEntry;
     }

+    // method for index deletion
+    public int removeAllUrlReferences(final DigestURI url, LoaderDispatcher loader, final boolean fetchOnline) {
+        return removeAllUrlReferences(url.hash(), loader, fetchOnline);
+    }
+
+    public int removeAllUrlReferences(final String urlhash, LoaderDispatcher loader, final boolean fetchOnline) {
+        // find all the words in a specific resource and remove the url reference from every word index
+        // finally, delete the url entry
+        if (urlhash == null) return 0;
+        // determine the url string
+        final URIMetadataRow entry = urlMetadata().load(urlhash, null, 0);
+        if (entry == null) return 0;
+        final URIMetadataRow.Components metadata = entry.metadata();
+        if (metadata.url() == null) return 0;
+
+        InputStream resourceContent = null;
+        try {
+            // get the resource content
+            Object[] resource = null;
+            try {
+                resource = loader.getResource(metadata.url(), fetchOnline, 10000, true, false);
+            } catch (IOException e) {
+                Log.logWarning("removeAllUrlReferences", "cannot load: " + e.getMessage());
+            }
+            if (resource == null) {
+                // delete just the url entry
+                urlMetadata().remove(urlhash);
+                return 0;
+            } else {
+                resourceContent = (InputStream) resource[0];
+                final Long resourceContentLength = (Long) resource[1];
+
+                // parse the resource
+                final Document document = LoaderDispatcher.parseDocument(metadata.url(), resourceContentLength.longValue(), resourceContent, null);
+
+                // get the word set
+                Set<String> words = null;
+                try {
+                    words = new Condenser(document, true, true).words().keySet();
+                } catch (final UnsupportedEncodingException e) {
+                    e.printStackTrace();
+                }
+
+                // delete all word references
+                int count = 0;
+                if (words != null) count = termIndex().remove(Word.words2hashes(words), urlhash);
+
+                // finally delete the url entry itself
+                urlMetadata().remove(urlhash);
+                return count;
+            }
+        } catch (final ParserException e) {
+            return 0;
+        } catch (IOException e) {
+            e.printStackTrace();
+            return 0;
+        } finally {
+            if (resourceContent != null) try { resourceContent.close(); } catch (final Exception e) {/* ignore this */}
+        }
+    }
+
     // The Cleaner class was provided as "UrldbCleaner" by Hydrox
     public synchronized ReferenceCleaner getReferenceCleaner(final byte[] startHash) {
         return new ReferenceCleaner(startHash);
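
The servlet hunks above show the call sites after this move from Switchboard into Segment. A minimal usage sketch, assuming a Segment (segment) and a Switchboard-owned LoaderDispatcher (sb.loader) as in those hunks:

    // remove a URL and all its word references from one index segment;
    // returns the number of word references removed, or 0 if the
    // resource could not be loaded or parsed
    final int refs = segment.removeAllUrlReferences(urlhash, sb.loader, true);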

@@ -89,7 +89,6 @@ import java.io.BufferedInputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
-import java.io.InputStream;
 import java.io.UnsupportedEncodingException;
 import java.net.MalformedURLException;
 import java.security.NoSuchAlgorithmException;
@@ -105,7 +104,6 @@ import java.util.List;
 import java.util.Locale;
 import java.util.Map;
 import java.util.Properties;
-import java.util.Set;
 import java.util.TreeMap;
 import java.util.TreeSet;
 import java.util.concurrent.ConcurrentHashMap;
@@ -119,6 +117,7 @@ import net.yacy.document.ParserException;
 import net.yacy.document.content.DCEntry;
 import net.yacy.document.content.RSSMessage;
 import net.yacy.document.content.file.SurrogateReader;
+import net.yacy.document.parser.html.ImageEntry;
 import net.yacy.document.parser.xml.RSSFeed;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.meta.URIMetadataRow;
@@ -1169,7 +1168,6 @@ public final class Switchboard extends serverSwitch {
             noIndexReason = TextParser.supports(response.url(), response.getMimeType());
         }
-
         // check X-YACY-Index-Control
         // With the X-YACY-Index-Control header set to "no-index" a client could disallow
         // yacy to index the response returned as answer to a request
@@ -1583,43 +1581,43 @@ public final class Switchboard extends serverSwitch {
         return new indexingQueueEntry(in.process, in.queueEntry, document, null);
     }

-    private Document parseDocument(Response entry) throws InterruptedException {
+    private Document parseDocument(Response response) throws InterruptedException {
         Document document = null;
-        final EventOrigin processCase = entry.processCase(peers.mySeed().hash);
+        final EventOrigin processCase = response.processCase(peers.mySeed().hash);
         if (this.log.isFine()) log.logFine("processResourceStack processCase=" + processCase +
-                ", depth=" + entry.depth() +
-                ", maxDepth=" + ((entry.profile() == null) ? "null" : Integer.toString(entry.profile().depth())) +
-                ", must-match=" + ((entry.profile() == null) ? "null" : entry.profile().mustMatchPattern().toString()) +
-                ", must-not-match=" + ((entry.profile() == null) ? "null" : entry.profile().mustNotMatchPattern().toString()) +
-                ", initiatorHash=" + entry.initiator() +
+                ", depth=" + response.depth() +
+                ", maxDepth=" + ((response.profile() == null) ? "null" : Integer.toString(response.profile().depth())) +
+                ", must-match=" + ((response.profile() == null) ? "null" : response.profile().mustMatchPattern().toString()) +
+                ", must-not-match=" + ((response.profile() == null) ? "null" : response.profile().mustNotMatchPattern().toString()) +
+                ", initiatorHash=" + response.initiator() +
                 //", responseHeader=" + ((entry.responseHeader() == null) ? "null" : entry.responseHeader().toString()) +
-                ", url=" + entry.url()); // DEBUG
+                ", url=" + response.url()); // DEBUG

         // PARSE CONTENT
         final long parsingStartTime = System.currentTimeMillis();
         byte[] b = null;
         try {
             // fetch the document
-            b = Cache.getContent(entry.url());
+            b = Cache.getContent(response.url());
             if (b == null) {
-                this.log.logWarning("the resource '" + entry.url() + "' is missing in the cache.");
-                addURLtoErrorDB(entry.url(), entry.referrerHash(), entry.initiator(), entry.name(), "missing");
+                this.log.logWarning("the resource '" + response.url() + "' is missing in the cache.");
+                addURLtoErrorDB(response.url(), response.referrerHash(), response.initiator(), response.name(), "missing");
                 return null;
             }
         } catch (IOException e) {
-            this.log.logWarning("Unable fetch the resource '" + entry.url() + "'. from the cache: " + e.getMessage());
-            addURLtoErrorDB(entry.url(), entry.referrerHash(), entry.initiator(), entry.name(), e.getMessage());
+            this.log.logWarning("Unable fetch the resource '" + response.url() + "'. from the cache: " + e.getMessage());
+            addURLtoErrorDB(response.url(), response.referrerHash(), response.initiator(), response.name(), e.getMessage());
             return null;
         }

         try {
             // parse the document
-            document = TextParser.parseSource(entry.url(), entry.getMimeType(), entry.getCharacterEncoding(), b);
+            document = TextParser.parseSource(response.url(), response.getMimeType(), response.getCharacterEncoding(), b);
             assert(document != null) : "Unexpected error. Parser returned null.";
         } catch (final ParserException e) {
-            this.log.logWarning("Unable to parse the resource '" + entry.url() + "'. " + e.getMessage(), e);
-            addURLtoErrorDB(entry.url(), entry.referrerHash(), entry.initiator(), entry.name(), e.getMessage());
+            this.log.logWarning("Unable to parse the resource '" + response.url() + "'. " + e.getMessage(), e);
+            addURLtoErrorDB(response.url(), response.referrerHash(), response.initiator(), response.name(), e.getMessage());
             if (document != null) {
                 document.close();
                 document = null;
@@ -1630,43 +1628,48 @@ public final class Switchboard extends serverSwitch {
         final long parsingEndTime = System.currentTimeMillis();

         // get the document date
-        final Date docDate = entry.lastModified();
+        final Date docDate = response.lastModified();

         // put anchors on crawl stack
         final long stackStartTime = System.currentTimeMillis();
         if (
                 ((processCase == EventOrigin.PROXY_LOAD) || (processCase == EventOrigin.LOCAL_CRAWLING)) &&
-                ((entry.profile() == null) || (entry.depth() < entry.profile().depth()))
+                ((response.profile() == null) || (response.depth() < response.profile().depth()))
            ) {
+            // get the hyperlinks
             final Map<DigestURI, String> hl = document.getHyperlinks();
-            final Iterator<Map.Entry<DigestURI, String>> i = hl.entrySet().iterator();
+
+            // add all images also to the crawl stack
+            for (ImageEntry imageReference : document.getImages().values()) {
+                hl.put(imageReference.url(), imageReference.alt());
+            }
+
+            // insert those hyperlinks to the crawler
             DigestURI nextUrl;
-            Map.Entry<DigestURI, String> nextEntry;
-            while (i.hasNext()) {
+            for (Map.Entry<DigestURI, String> nextEntry : hl.entrySet()) {
                 // check for interruption
                 checkInterruption();
-                // fetching the next hyperlink
-                nextEntry = i.next();
+
+                // process the next hyperlink
                 nextUrl = nextEntry.getKey();
                 String u = nextUrl.toNormalform(true, true);
                 if (!(u.startsWith("http") || u.startsWith("ftp"))) continue;
                 // enqueue the hyperlink into the pre-notice-url db
                 crawlStacker.enqueueEntry(new Request(
-                        entry.initiator(),
+                        response.initiator(),
                         nextUrl,
-                        entry.url().hash(),
+                        response.url().hash(),
                         nextEntry.getValue(),
                         null,
                         docDate,
-                        entry.profile().handle(),
-                        entry.depth() + 1,
+                        response.profile().handle(),
+                        response.depth() + 1,
                         0,
                         0
                         ));
             }
             final long stackEndTime = System.currentTimeMillis();
-            if (log.isInfo()) log.logInfo("CRAWL: ADDED " + hl.size() + " LINKS FROM " + entry.url().toNormalform(false, true) +
+            if (log.isInfo()) log.logInfo("CRAWL: ADDED " + hl.size() + " LINKS FROM " + response.url().toNormalform(false, true) +
                     ", STACKING TIME = " + (stackEndTime-stackStartTime) +
                     ", PARSING TIME = " + (parsingEndTime-parsingStartTime));
         }
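
This hunk is where the "temporary integration of images as document types" from the commit message happens: image references are folded into the hyperlink map before stacking, so they pass through the same enqueue loop as anchors. The merge step in isolation, using the types from the diff:

    // images join the crawl stack like ordinary links: the image URL is
    // the key, the alt text stands in for anchor text; note that put()
    // overwrites the anchor text if the same URL is already linked
    final Map<DigestURI, String> hl = document.getHyperlinks();
    for (final ImageEntry imageReference : document.getImages().values()) {
        hl.put(imageReference.url(), imageReference.alt());
    }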
@@ -1807,68 +1810,6 @@ public final class Switchboard extends serverSwitch {
         }
     }

-    // method for index deletion
-    public int removeAllUrlReferences(Segment indexSegment, final DigestURI url, final boolean fetchOnline) {
-        return removeAllUrlReferences(indexSegment, url.hash(), fetchOnline);
-    }
-
-    public int removeAllUrlReferences(Segment indexSegment, final String urlhash, final boolean fetchOnline) {
-        // find all the words in a specific resource and remove the url reference from every word index
-        // finally, delete the url entry
-        if (urlhash == null) return 0;
-        // determine the url string
-        final URIMetadataRow entry = indexSegment.urlMetadata().load(urlhash, null, 0);
-        if (entry == null) return 0;
-        final URIMetadataRow.Components metadata = entry.metadata();
-        if (metadata.url() == null) return 0;
-
-        InputStream resourceContent = null;
-        try {
-            // get the resource content
-            Object[] resource = null;
-            try {
-                resource = loader.getResource(metadata.url(), fetchOnline, 10000, true, false);
-            } catch (IOException e) {
-                Log.logWarning("removeAllUrlReferences", "cannot load: " + e.getMessage());
-            }
-            if (resource == null) {
-                // delete just the url entry
-                indexSegment.urlMetadata().remove(urlhash);
-                return 0;
-            } else {
-                resourceContent = (InputStream) resource[0];
-                final Long resourceContentLength = (Long) resource[1];
-
-                // parse the resource
-                final Document document = LoaderDispatcher.parseDocument(metadata.url(), resourceContentLength.longValue(), resourceContent, null);
-
-                // get the word set
-                Set<String> words = null;
-                try {
-                    words = new Condenser(document, true, true).words().keySet();
-                } catch (final UnsupportedEncodingException e) {
-                    e.printStackTrace();
-                }
-
-                // delete all word references
-                int count = 0;
-                if (words != null) count = indexSegment.termIndex().remove(Word.words2hashes(words), urlhash);
-
-                // finally delete the url entry itself
-                indexSegment.urlMetadata().remove(urlhash);
-                return count;
-            }
-        } catch (final ParserException e) {
-            return 0;
-        } catch (IOException e) {
-            e.printStackTrace();
-            return 0;
-        } finally {
-            if (resourceContent != null) try { resourceContent.close(); } catch (final Exception e) {/* ignore this */}
-        }
-    }
-
     public int adminAuthenticated(final RequestHeader requestHeader) {

         // authorization for localhost, only if flag is set to grant localhost access as admin

@@ -239,13 +239,13 @@ public final class TextParser {
      * check if the parser supports the given content.
      * @param url
      * @param mimeType
-     * @return returns null if the content is supportet. If the content is not supported, return a error string.
+     * @return returns null if the content is supported. If the content is not supported, return an error string.
      */
     public static String supports(final DigestURI url, String mimeType) {
         try {
             // try to get a parser. If this works, we don't need the parser itself, we just return null to show that everything is ok.
-            idiomParser(url, mimeType);
-            return null;
+            List<Idiom> idioms = idiomParser(url, mimeType);
+            return (idioms == null || idioms.size() == 0) ? "no parser found" : null;
         } catch (ParserException e) {
             // in case that a parser is not available, return an error string describing the problem.
             return e.getMessage();
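
Before this change, supports() returned null whenever idiomParser() did not throw, even if no parser was found; now an empty parser list is reported as an error string. A caller-side sketch of the sharpened contract (the pattern matches the Switchboard hunk above):

    // a non-null reason means "do not index this response"
    final String noIndexReason = TextParser.supports(response.url(), response.getMimeType());
    if (noIndexReason != null) {
        // either "no parser found" or the message of a ParserException
    }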

@@ -327,6 +327,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
         return anchors;
     }

+    /**
+     * get all images
+     * @return a map of <urlhash, ImageEntry>
+     */
     public HashMap<String, ImageEntry> getImages() {
         // this resturns a String(absolute url)/htmlFilterImageEntry - relation
         return images;

@@ -81,25 +81,26 @@ public class genericImageParser extends AbstractParser implements Idiom {
             throw new ParserException(e.getMessage(), location);
         }

+        /*
         // scan the image
         int height = image.getHeight();
         int width = image.getWidth();
         Raster raster = image.getData();
         int[] pixel = raster.getPixel(0, 0, (int[])null);
-        long[] average = new long[]{0, 0, 0};
+        long[] average = new long[pixel.length];
+        for (int i = 0; i < average.length; i++) average[i] = 0L;
         int pc = 0;
         for (int x = width / 4; x < 3 * width / 4; x = x + 2) {
             for (int y = height / 4; y < 3 * height / 4; y = y + 2) {
                 pixel = raster.getPixel(x, y, pixel);
-                average[0] += pixel[0];
-                average[1] += pixel[1];
-                average[2] += pixel[2];
+                for (int i = 0; i < average.length; i++) average[i] += pixel[i];
                 pc++;
             }
         }
+        */

         // get image properties
         String [] propNames = image.getPropertyNames();
+        if (propNames == null) propNames = new String[0];
         StringBuilder sb = new StringBuilder(propNames.length * 80);
         for (String propName: propNames) {
             sb.append(propName).append(" = ").append(image.getProperty(propName)).append(" .\n");
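
The image-parser fix replaces the hard-coded three RGB accumulators with pixel.length, so grayscale or palette images with fewer raster bands no longer overrun the array; the whole scan is then commented out for now. A standalone sketch of the band-generic averaging, assuming a loaded BufferedImage named image:

    // channel-count-agnostic averaging over the center of the image
    final int width = image.getWidth();
    final int height = image.getHeight();
    final Raster raster = image.getData();
    int[] pixel = raster.getPixel(0, 0, (int[]) null); // length == number of bands
    final long[] average = new long[pixel.length];     // was: new long[]{0, 0, 0}
    int pc = 0;
    for (int x = width / 4; x < 3 * width / 4; x += 2) {
        for (int y = height / 4; y < 3 * height / 4; y += 2) {
            pixel = raster.getPixel(x, y, pixel);
            for (int i = 0; i < pixel.length; i++) average[i] += pixel[i];
            pc++;
        }
    }
    // average[i] / pc gives the mean value of band i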

@@ -40,20 +40,13 @@ import net.yacy.kelondro.data.meta.DigestURI;
 import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
 import org.apache.poi.hssf.eventusermodel.HSSFListener;
 import org.apache.poi.hssf.eventusermodel.HSSFRequest;
-import org.apache.poi.hssf.record.LabelSSTRecord;
 import org.apache.poi.hssf.record.NumberRecord;
 import org.apache.poi.hssf.record.Record;
 import org.apache.poi.hssf.record.SSTRecord;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;

-public class xlsParser extends AbstractParser implements Idiom, HSSFListener {
+public class xlsParser extends AbstractParser implements Idiom {

-    //StringBuilder for parsed text
-    private StringBuilder sbFoundStrings = null;
-
-    //sstrecord needed for event parsing
-    private SSTRecord sstrec;

     /**
      * a list of mime types that are supported by this parser class
@@ -82,12 +75,45 @@ public class xlsParser extends AbstractParser implements Idiom, HSSFListener {
      * parses the source documents and returns a plasmaParserDocument containing
      * all extracted information about the parsed document
      */
+    public Document parse(final DigestURI location, final String mimeType,
+            final String charset, final InputStream source) throws ParserException,
+            InterruptedException {
+        return new XLSHSSFListener().parse(location, mimeType, charset, source);
+    }
+
+    public Set<String> supportedMimeTypes() {
+        return SUPPORTED_MIME_TYPES;
+    }
+
+    public Set<String> supportedExtensions() {
+        return SUPPORTED_EXTENSIONS;
+    }
+
+    @Override
+    public void reset(){
+        //nothing to do
+        super.reset();
+    }
+
+    public class XLSHSSFListener implements HSSFListener {
+
+        //StringBuilder for parsed text
+        private final StringBuilder sbFoundStrings;
+
+        public XLSHSSFListener() {
+            this.sbFoundStrings = new StringBuilder(100);
+        }
+
+        /*
+         * parses the source documents and returns a Document containing
+         * all extracted information about the parsed document
+         */
         public Document parse(final DigestURI location, final String mimeType,
                 final String charset, final InputStream source) throws ParserException,
                 InterruptedException {
             try {
-                //generate new StringBuilder for parsing
-                sbFoundStrings = new StringBuilder();
-
                 //create a new org.apache.poi.poifs.filesystem.Filesystem
                 final POIFSFileSystem poifs = new POIFSFileSystem(source);
@@ -133,28 +159,13 @@ public class xlsParser extends AbstractParser implements Idiom, HSSFListener {
                 */
                 e.printStackTrace();
                 final String errorMsg = "Unable to parse the xls document '" + location + "':" + e.getMessage();
-                this.theLogger.logSevere(errorMsg);
+                theLogger.logSevere(errorMsg);
                 throw new ParserException(errorMsg, location);
-            } finally {
-                sbFoundStrings = null;
             }
         }

-    public Set<String> supportedMimeTypes() {
-        return SUPPORTED_MIME_TYPES;
-    }
-
-    public Set<String> supportedExtensions() {
-        return SUPPORTED_EXTENSIONS;
-    }
-
-    @Override
-    public void reset(){
-        //nothing to do
-        super.reset();
-    }
-
         public void processRecord(final Record record) {
+            SSTRecord sstrec = null;
             switch (record.getSid()){
                 case NumberRecord.sid: {
                     final NumberRecord numrec = (NumberRecord) record;
@@ -172,15 +183,18 @@ public class xlsParser extends AbstractParser implements Idiom, HSSFListener {
                     }
                     break;
                 }
+                /*
                 case LabelSSTRecord.sid: {
                     final LabelSSTRecord lsrec = (LabelSSTRecord)record;
                     sbFoundStrings.append( sstrec.getString(lsrec.getSSTIndex()) );
                     break;
                 }
+                */
             }

             //add line seperator
             sbFoundStrings.append( "\n" );
         }
     }
+}
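
The structural fix here: xlsParser itself used to implement HSSFListener and kept sbFoundStrings and sstrec as instance fields, so concurrent parses shared mutable state. Moving the listener into a per-call inner class gives every parse its own buffer. A sketch of the POI event flow such a listener plugs into (imports as in the diff; the "Workbook" stream name and the surrounding details are assumptions, not taken from this commit):

    // each parse() drives the event API with a fresh listener instance
    final POIFSFileSystem poifs = new POIFSFileSystem(source);
    final InputStream din = poifs.createDocumentInputStream("Workbook");
    final HSSFRequest req = new HSSFRequest();
    req.addListenerForAllRecords(new XLSHSSFListener()); // fresh per-parse state
    new HSSFEventFactory().processEvents(req, din);
    din.close();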
