- fix for xls parser

- fix for image parser
- temporary integration of images as document types in the crawler and indexer, for testing the image parser

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6435 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 15 years ago
parent 5e8038ac4d
commit 52470d0de4

@@ -163,7 +163,7 @@ public class IndexControlRWIs_p {
index = null;
}
if (delurlref) {
for (i = 0; i < urlx.length; i++) sb.removeAllUrlReferences(segment, urlx[i], true);
for (i = 0; i < urlx.length; i++) segment.removeAllUrlReferences(urlx[i], sb.loader, true);
}
if (delurl || delurlref) {
for (i = 0; i < urlx.length; i++) {
@@ -180,7 +180,7 @@ public class IndexControlRWIs_p {
// delete selected URLs
if (post.containsKey("keyhashdelete")) try {
if (delurlref) {
for (i = 0; i < urlx.length; i++) sb.removeAllUrlReferences(segment, urlx[i], true);
for (i = 0; i < urlx.length; i++) segment.removeAllUrlReferences(urlx[i], sb.loader, true);
}
if (delurl || delurlref) {
for (i = 0; i < urlx.length; i++) {

@@ -136,7 +136,7 @@ public class IndexControlURLs_p {
prop.put("result", " ");
if (post.containsKey("urlhashdeleteall")) {
i = sb.removeAllUrlReferences(segment, urlhash, true);
i = segment.removeAllUrlReferences(urlhash, sb.loader, true);
prop.put("result", "Deleted URL and " + i + " references from " + i + " word indexes.");
prop.put("lurlexport", 0);
prop.put("reload", 0);

@@ -670,14 +670,16 @@ public class Response {
// -ranges in request
// we checked that in shallStoreCache
// a picture cannot be indexed
// check if pictures can be indexed
if (responseHeader != null) {
final String mimeType = responseHeader.mime();
if (Classification.isPictureMime(mimeType)) { return "Media_Content_(Picture)"; }
String parserError = TextParser.supportsMime(mimeType);
if (parserError != null) { return "Media_Content, parser error: " + parserError; }
}
if (Classification.isMediaExtension(url().getFileExtension())) { return "Media_Content_(forbidden)"; }
if (Classification.isMediaExtension(url().getFileExtension()) &&
!Classification.isImageExtension((url().getFileExtension()))) {
return "Media_Content_(forbidden)";
}
// -if-modified-since in request
// if the page is fresh at the very moment we can index it
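This hunk relaxes the up-front media filter so that image URLs can reach the new image parser, while other media extensions are still refused. A condensed sketch of the extension rule (method names as above; which extensions count as media or image is an assumption about the Classification class):

    String ext = url().getFileExtension();                              // e.g. "jpg", "mp3"
    if (Classification.isMediaExtension(ext) && !Classification.isImageExtension(ext)) {
        return "Media_Content_(forbidden)";                             // mp3 stays forbidden, jpg falls through to the parsers
    }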

@@ -28,14 +28,18 @@ package de.anomic.search;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.document.ParserException;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.navigation.NavigationReference;
@@ -53,6 +57,7 @@ import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.rwi.ReferenceFactory;
import net.yacy.kelondro.util.ISO639;
import net.yacy.repository.Blacklist;
import net.yacy.repository.LoaderDispatcher;
import de.anomic.crawler.retrieval.Response;
@@ -349,6 +354,70 @@ public class Segment {
return newEntry;
}
// method for index deletion
public int removeAllUrlReferences(final DigestURI url, LoaderDispatcher loader, final boolean fetchOnline) {
return removeAllUrlReferences(url.hash(), loader, fetchOnline);
}
public int removeAllUrlReferences(final String urlhash, LoaderDispatcher loader, final boolean fetchOnline) {
// find all the words in a specific resource and remove the url reference from every word index
// finally, delete the url entry
if (urlhash == null) return 0;
// determine the url string
final URIMetadataRow entry = urlMetadata().load(urlhash, null, 0);
if (entry == null) return 0;
final URIMetadataRow.Components metadata = entry.metadata();
if (metadata.url() == null) return 0;
InputStream resourceContent = null;
try {
// get the resource content
Object[] resource = null;
try {
resource = loader.getResource(metadata.url(), fetchOnline, 10000, true, false);
} catch (IOException e) {
Log.logWarning("removeAllUrlReferences", "cannot load: " + e.getMessage());
}
if (resource == null) {
// delete just the url entry
urlMetadata().remove(urlhash);
return 0;
} else {
resourceContent = (InputStream) resource[0];
final Long resourceContentLength = (Long) resource[1];
// parse the resource
final Document document = LoaderDispatcher.parseDocument(metadata.url(), resourceContentLength.longValue(), resourceContent, null);
// get the word set
Set<String> words = null;
try {
words = new Condenser(document, true, true).words().keySet();
} catch (final UnsupportedEncodingException e) {
e.printStackTrace();
}
// delete all word references
int count = 0;
if (words != null) count = termIndex().remove(Word.words2hashes(words), urlhash);
// finally delete the url entry itself
urlMetadata().remove(urlhash);
return count;
}
} catch (final ParserException e) {
return 0;
} catch (IOException e) {
e.printStackTrace();
return 0;
} finally {
if (resourceContent != null) try { resourceContent.close(); } catch (final Exception e) {/* ignore this */}
}
}
// The Cleaner class was provided as "UrldbCleaner" by Hydrox
public synchronized ReferenceCleaner getReferenceCleaner(final byte[] startHash) {
return new ReferenceCleaner(startHash);
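Inside the moved method, the loader hands back the resource as an untyped pair; the casts above suggest the convention sketched here (an assumption drawn from this hunk, not a documented LoaderDispatcher contract):

    Object[] resource = loader.getResource(metadata.url(), fetchOnline, 10000, true, false);
    if (resource != null) {
        InputStream content = (InputStream) resource[0];    // element 0: the content stream
        long length = ((Long) resource[1]).longValue();     // element 1: the content length
    }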

@@ -89,7 +89,6 @@ import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.security.NoSuchAlgorithmException;
@@ -105,7 +104,6 @@ import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;
@@ -119,6 +117,7 @@ import net.yacy.document.ParserException;
import net.yacy.document.content.DCEntry;
import net.yacy.document.content.RSSMessage;
import net.yacy.document.content.file.SurrogateReader;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.document.parser.xml.RSSFeed;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
@@ -1169,7 +1168,6 @@ public final class Switchboard extends serverSwitch {
noIndexReason = TextParser.supports(response.url(), response.getMimeType());
}
// check X-YACY-Index-Control
// With the X-YACY-Index-Control header set to "no-index" a client could disallow
// yacy to index the response returned as answer to a request
@@ -1583,43 +1581,43 @@ public final class Switchboard extends serverSwitch {
return new indexingQueueEntry(in.process, in.queueEntry, document, null);
}
private Document parseDocument(Response entry) throws InterruptedException {
private Document parseDocument(Response response) throws InterruptedException {
Document document = null;
final EventOrigin processCase = entry.processCase(peers.mySeed().hash);
final EventOrigin processCase = response.processCase(peers.mySeed().hash);
if (this.log.isFine()) log.logFine("processResourceStack processCase=" + processCase +
", depth=" + entry.depth() +
", maxDepth=" + ((entry.profile() == null) ? "null" : Integer.toString(entry.profile().depth())) +
", must-match=" + ((entry.profile() == null) ? "null" : entry.profile().mustMatchPattern().toString()) +
", must-not-match=" + ((entry.profile() == null) ? "null" : entry.profile().mustNotMatchPattern().toString()) +
", initiatorHash=" + entry.initiator() +
", depth=" + response.depth() +
", maxDepth=" + ((response.profile() == null) ? "null" : Integer.toString(response.profile().depth())) +
", must-match=" + ((response.profile() == null) ? "null" : response.profile().mustMatchPattern().toString()) +
", must-not-match=" + ((response.profile() == null) ? "null" : response.profile().mustNotMatchPattern().toString()) +
", initiatorHash=" + response.initiator() +
//", responseHeader=" + ((entry.responseHeader() == null) ? "null" : entry.responseHeader().toString()) +
", url=" + entry.url()); // DEBUG
", url=" + response.url()); // DEBUG
// PARSE CONTENT
final long parsingStartTime = System.currentTimeMillis();
byte[] b = null;
try {
// fetch the document
b = Cache.getContent(entry.url());
b = Cache.getContent(response.url());
if (b == null) {
this.log.logWarning("the resource '" + entry.url() + "' is missing in the cache.");
addURLtoErrorDB(entry.url(), entry.referrerHash(), entry.initiator(), entry.name(), "missing");
this.log.logWarning("the resource '" + response.url() + "' is missing in the cache.");
addURLtoErrorDB(response.url(), response.referrerHash(), response.initiator(), response.name(), "missing");
return null;
}
} catch (IOException e) {
this.log.logWarning("Unable fetch the resource '" + entry.url() + "'. from the cache: " + e.getMessage());
addURLtoErrorDB(entry.url(), entry.referrerHash(), entry.initiator(), entry.name(), e.getMessage());
this.log.logWarning("Unable fetch the resource '" + response.url() + "'. from the cache: " + e.getMessage());
addURLtoErrorDB(response.url(), response.referrerHash(), response.initiator(), response.name(), e.getMessage());
return null;
}
try {
// parse the document
document = TextParser.parseSource(entry.url(), entry.getMimeType(), entry.getCharacterEncoding(), b);
document = TextParser.parseSource(response.url(), response.getMimeType(), response.getCharacterEncoding(), b);
assert(document != null) : "Unexpected error. Parser returned null.";
} catch (final ParserException e) {
this.log.logWarning("Unable to parse the resource '" + entry.url() + "'. " + e.getMessage(), e);
addURLtoErrorDB(entry.url(), entry.referrerHash(), entry.initiator(), entry.name(), e.getMessage());
this.log.logWarning("Unable to parse the resource '" + response.url() + "'. " + e.getMessage(), e);
addURLtoErrorDB(response.url(), response.referrerHash(), response.initiator(), response.name(), e.getMessage());
if (document != null) {
document.close();
document = null;
@@ -1630,43 +1628,48 @@ public final class Switchboard extends serverSwitch {
final long parsingEndTime = System.currentTimeMillis();
// get the document date
final Date docDate = entry.lastModified();
final Date docDate = response.lastModified();
// put anchors on crawl stack
final long stackStartTime = System.currentTimeMillis();
if (
((processCase == EventOrigin.PROXY_LOAD) || (processCase == EventOrigin.LOCAL_CRAWLING)) &&
((entry.profile() == null) || (entry.depth() < entry.profile().depth()))
((response.profile() == null) || (response.depth() < response.profile().depth()))
) {
// get the hyperlinks
final Map<DigestURI, String> hl = document.getHyperlinks();
final Iterator<Map.Entry<DigestURI, String>> i = hl.entrySet().iterator();
// add all images also to the crawl stack
for (ImageEntry imageReference : document.getImages().values()) {
hl.put(imageReference.url(), imageReference.alt());
}
// insert those hyperlinks to the crawler
DigestURI nextUrl;
Map.Entry<DigestURI, String> nextEntry;
while (i.hasNext()) {
for (Map.Entry<DigestURI, String> nextEntry : hl.entrySet()) {
// check for interruption
checkInterruption();
// fetching the next hyperlink
nextEntry = i.next();
// process the next hyperlink
nextUrl = nextEntry.getKey();
String u = nextUrl.toNormalform(true, true);
if (!(u.startsWith("http") || u.startsWith("ftp"))) continue;
// enqueue the hyperlink into the pre-notice-url db
crawlStacker.enqueueEntry(new Request(
entry.initiator(),
response.initiator(),
nextUrl,
entry.url().hash(),
response.url().hash(),
nextEntry.getValue(),
null,
docDate,
entry.profile().handle(),
entry.depth() + 1,
response.profile().handle(),
response.depth() + 1,
0,
0
));
}
final long stackEndTime = System.currentTimeMillis();
if (log.isInfo()) log.logInfo("CRAWL: ADDED " + hl.size() + " LINKS FROM " + entry.url().toNormalform(false, true) +
if (log.isInfo()) log.logInfo("CRAWL: ADDED " + hl.size() + " LINKS FROM " + response.url().toNormalform(false, true) +
", STACKING TIME = " + (stackEndTime-stackStartTime) +
", PARSING TIME = " + (parsingEndTime-parsingStartTime));
}
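This is the temporary image integration from the commit message: every image reference found in a parsed document is pushed onto the crawl stack next to the page's hyperlinks, with the alt text standing in for anchor text. A small illustration (the ImageEntry accessors are taken from the hunk above; the example page and URL are made up):

    // page http://example.net/ contains <img src="logo.png" alt="Company logo">
    final Map<DigestURI, String> links = document.getHyperlinks();
    for (ImageEntry imageReference : document.getImages().values()) {
        links.put(imageReference.url(), imageReference.alt());   // key http://example.net/logo.png, value "Company logo"
    }
    // crawlStacker.enqueueEntry(...) then treats logo.png like any other discovered link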
@@ -1807,68 +1810,6 @@ public final class Switchboard extends serverSwitch {
}
}
// method for index deletion
public int removeAllUrlReferences(Segment indexSegment, final DigestURI url, final boolean fetchOnline) {
return removeAllUrlReferences(indexSegment, url.hash(), fetchOnline);
}
public int removeAllUrlReferences(Segment indexSegment, final String urlhash, final boolean fetchOnline) {
// find all the words in a specific resource and remove the url reference from every word index
// finally, delete the url entry
if (urlhash == null) return 0;
// determine the url string
final URIMetadataRow entry = indexSegment.urlMetadata().load(urlhash, null, 0);
if (entry == null) return 0;
final URIMetadataRow.Components metadata = entry.metadata();
if (metadata.url() == null) return 0;
InputStream resourceContent = null;
try {
// get the resource content
Object[] resource = null;
try {
resource = loader.getResource(metadata.url(), fetchOnline, 10000, true, false);
} catch (IOException e) {
Log.logWarning("removeAllUrlReferences", "cannot load: " + e.getMessage());
}
if (resource == null) {
// delete just the url entry
indexSegment.urlMetadata().remove(urlhash);
return 0;
} else {
resourceContent = (InputStream) resource[0];
final Long resourceContentLength = (Long) resource[1];
// parse the resource
final Document document = LoaderDispatcher.parseDocument(metadata.url(), resourceContentLength.longValue(), resourceContent, null);
// get the word set
Set<String> words = null;
try {
words = new Condenser(document, true, true).words().keySet();
} catch (final UnsupportedEncodingException e) {
e.printStackTrace();
}
// delete all word references
int count = 0;
if (words != null) count = indexSegment.termIndex().remove(Word.words2hashes(words), urlhash);
// finally delete the url entry itself
indexSegment.urlMetadata().remove(urlhash);
return count;
}
} catch (final ParserException e) {
return 0;
} catch (IOException e) {
e.printStackTrace();
return 0;
} finally {
if (resourceContent != null) try { resourceContent.close(); } catch (final Exception e) {/* ignore this */}
}
}
public int adminAuthenticated(final RequestHeader requestHeader) {
// authorization for localhost, only if flag is set to grant localhost access as admin

@@ -239,13 +239,13 @@ public final class TextParser {
* check if the parser supports the given content.
* @param url
* @param mimeType
* @return returns null if the content is supportet. If the content is not supported, return a error string.
* @return returns null if the content is supported. If the content is not supported, return a error string.
*/
public static String supports(final DigestURI url, String mimeType) {
try {
// try to get a parser. If this works, we don't need the parser itself, we just return null to show that everything is ok.
idiomParser(url, mimeType);
return null;
List<Idiom> idioms = idiomParser(url, mimeType);
return (idioms == null || idioms.size() == 0) ? "no parser found" : null;
} catch (ParserException e) {
// in case that a parser is not available, return a error string describing the problem.
return e.getMessage();
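supports() is used by the Switchboard (see the hunk above) as the no-index reason, so returning a string for an empty idiom list means callers now get an explanation instead of a silent pass. A usage sketch:

    // null means the content type is parseable; any non-null string names the reason to skip it
    String noIndexReason = TextParser.supports(response.url(), response.getMimeType());
    if (noIndexReason != null) {
        // e.g. "no parser found" or a ParserException message: do not index this response
    }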

@@ -327,6 +327,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
return anchors;
}
/**
* get all images
* @return a map of <urlhash, ImageEntry>
*/
public HashMap<String, ImageEntry> getImages() {
// this resturns a String(absolute url)/htmlFilterImageEntry - relation
return images;

@@ -81,25 +81,26 @@ public class genericImageParser extends AbstractParser implements Idiom {
throw new ParserException(e.getMessage(), location);
}
/*
// scan the image
int height = image.getHeight();
int width = image.getWidth();
Raster raster = image.getData();
int[] pixel = raster.getPixel(0, 0, (int[])null);
long[] average = new long[]{0, 0, 0};
long[] average = new long[pixel.length];
for (int i = 0; i < average.length; i++) average[i] = 0L;
int pc = 0;
for (int x = width / 4; x < 3 * width / 4; x = x + 2) {
for (int y = height / 4; y < 3 * height / 4; y = y + 2) {
pixel = raster.getPixel(x, y, pixel);
average[0] += pixel[0];
average[1] += pixel[1];
average[2] += pixel[2];
for (int i = 0; i < average.length; i++) average[i] += pixel[i];
pc++;
}
}
*/
// get image properties
String [] propNames = image.getPropertyNames();
if (propNames == null) propNames = new String[0];
StringBuilder sb = new StringBuilder(propNames.length * 80);
for (String propName: propNames) {
sb.append(propName).append(" = ").append(image.getProperty(propName)).append(" .\n");
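The still commented-out colour scan used to assume exactly three channels (long[]{0, 0, 0}); the fix above sizes the accumulators from the sampled pixel, which also covers greyscale or ARGB rasters. A self-contained sketch of the corrected idea using only the standard Raster API (the sampling window follows the commented code; this is not part of the parser itself):

    import java.awt.image.BufferedImage;
    import java.awt.image.Raster;
    import java.io.File;
    import javax.imageio.ImageIO;

    public class AverageColorSketch {
        public static void main(final String[] args) throws Exception {
            final BufferedImage image = ImageIO.read(new File(args[0]));
            if (image == null) return; // unsupported format
            final Raster raster = image.getData();
            int[] pixel = raster.getPixel(0, 0, (int[]) null); // length = number of bands (1 greyscale, 3 RGB, 4 ARGB)
            final long[] average = new long[pixel.length];     // one accumulator per band instead of a fixed three
            int count = 0;
            // sample every second pixel in the central quarter of the image, as in the commented scan
            for (int x = image.getWidth() / 4; x < 3 * image.getWidth() / 4; x += 2) {
                for (int y = image.getHeight() / 4; y < 3 * image.getHeight() / 4; y += 2) {
                    pixel = raster.getPixel(x, y, pixel);
                    for (int i = 0; i < average.length; i++) average[i] += pixel[i];
                    count++;
                }
            }
            if (count == 0) return; // image too small to sample
            for (int i = 0; i < average.length; i++) {
                System.out.println("band " + i + " average: " + (average[i] / count));
            }
        }
    }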

@@ -40,20 +40,13 @@ import net.yacy.kelondro.data.meta.DigestURI;
import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
import org.apache.poi.hssf.eventusermodel.HSSFListener;
import org.apache.poi.hssf.eventusermodel.HSSFRequest;
import org.apache.poi.hssf.record.LabelSSTRecord;
import org.apache.poi.hssf.record.NumberRecord;
import org.apache.poi.hssf.record.Record;
import org.apache.poi.hssf.record.SSTRecord;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
public class xlsParser extends AbstractParser implements Idiom, HSSFListener {
//StringBuilder for parsed text
private StringBuilder sbFoundStrings = null;
//sstrecord needed for event parsing
private SSTRecord sstrec;
public class xlsParser extends AbstractParser implements Idiom {
/**
* a list of mime types that are supported by this parser class
@@ -85,59 +78,7 @@ public class xlsParser extends AbstractParser implements Idiom, HSSFListener {
public Document parse(final DigestURI location, final String mimeType,
final String charset, final InputStream source) throws ParserException,
InterruptedException {
try {
//generate new StringBuilder for parsing
sbFoundStrings = new StringBuilder();
//create a new org.apache.poi.poifs.filesystem.Filesystem
final POIFSFileSystem poifs = new POIFSFileSystem(source);
//get the Workbook (excel part) stream in a InputStream
final InputStream din = poifs.createDocumentInputStream("Workbook");
//construct out HSSFRequest object
final HSSFRequest req = new HSSFRequest();
//lazy listen for ALL records with the listener shown above
req.addListenerForAllRecords(this);
//create our event factory
final HSSFEventFactory factory = new HSSFEventFactory();
//process our events based on the document input stream
factory.processEvents(req, din);
//close our document input stream (don't want to leak these!)
din.close();
//now the parsed strings are in the StringBuilder, now convert them to a String
final String contents = sbFoundStrings.toString().trim();
/*
* create the plasmaParserDocument for the database
* and set shortText and bodyText properly
*/
final Document theDoc = new Document(
location,
mimeType,
"UTF-8",
null,
null,
location.getFile(),
"", // TODO: AUTHOR
null,
null,
contents.getBytes("UTF-8"),
null,
null);
return theDoc;
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
/*
* an unexpected error occurred, log it and throw a ParserException
*/
e.printStackTrace();
final String errorMsg = "Unable to parse the xls document '" + location + "':" + e.getMessage();
this.theLogger.logSevere(errorMsg);
throw new ParserException(errorMsg, location);
} finally {
sbFoundStrings = null;
}
return new XLSHSSFListener().parse(location, mimeType, charset, source);
}
public Set<String> supportedMimeTypes() {
@@ -153,34 +94,107 @@ public class xlsParser extends AbstractParser implements Idiom, HSSFListener {
//nothing to do
super.reset();
}
public class XLSHSSFListener implements HSSFListener {
//StringBuilder for parsed text
private final StringBuilder sbFoundStrings;
public void processRecord(final Record record) {
switch (record.getSid()){
case NumberRecord.sid: {
final NumberRecord numrec = (NumberRecord) record;
sbFoundStrings.append(numrec.getValue());
break;
public XLSHSSFListener() {
this.sbFoundStrings = new StringBuilder(100);
}
/*
* parses the source documents and returns a Document containing
* all extracted information about the parsed document
*/
public Document parse(final DigestURI location, final String mimeType,
final String charset, final InputStream source) throws ParserException,
InterruptedException {
try {
//create a new org.apache.poi.poifs.filesystem.Filesystem
final POIFSFileSystem poifs = new POIFSFileSystem(source);
//get the Workbook (excel part) stream in a InputStream
final InputStream din = poifs.createDocumentInputStream("Workbook");
//construct out HSSFRequest object
final HSSFRequest req = new HSSFRequest();
//lazy listen for ALL records with the listener shown above
req.addListenerForAllRecords(this);
//create our event factory
final HSSFEventFactory factory = new HSSFEventFactory();
//process our events based on the document input stream
factory.processEvents(req, din);
//close our document input stream (don't want to leak these!)
din.close();
//now the parsed strings are in the StringBuilder, now convert them to a String
final String contents = sbFoundStrings.toString().trim();
/*
* create the plasmaParserDocument for the database
* and set shortText and bodyText properly
*/
final Document theDoc = new Document(
location,
mimeType,
"UTF-8",
null,
null,
location.getFile(),
"", // TODO: AUTHOR
null,
null,
contents.getBytes("UTF-8"),
null,
null);
return theDoc;
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
/*
* an unexpected error occurred, log it and throw a ParserException
*/
e.printStackTrace();
final String errorMsg = "Unable to parse the xls document '" + location + "':" + e.getMessage();
theLogger.logSevere(errorMsg);
throw new ParserException(errorMsg, location);
}
//unique string records
case SSTRecord.sid: {
sstrec = (SSTRecord)record;
for (int k = 0; k < sstrec.getNumUniqueStrings(); k++){
sbFoundStrings.append( sstrec.getString(k) );
//add line seperator
sbFoundStrings.append( "\n" );
}
public void processRecord(final Record record) {
SSTRecord sstrec = null;
switch (record.getSid()){
case NumberRecord.sid: {
final NumberRecord numrec = (NumberRecord) record;
sbFoundStrings.append(numrec.getValue());
break;
}
break;
//unique string records
case SSTRecord.sid: {
sstrec = (SSTRecord) record;
for (int k = 0; k < sstrec.getNumUniqueStrings(); k++){
sbFoundStrings.append( sstrec.getString(k) );
//add line seperator
sbFoundStrings.append( "\n" );
}
break;
}
/*
case LabelSSTRecord.sid: {
final LabelSSTRecord lsrec = (LabelSSTRecord)record;
sbFoundStrings.append( sstrec.getString(lsrec.getSSTIndex()) );
break;
}
*/
}
case LabelSSTRecord.sid: {
final LabelSSTRecord lsrec = (LabelSSTRecord)record;
sbFoundStrings.append( sstrec.getString(lsrec.getSSTIndex()) );
break;
}
//add line seperator
sbFoundStrings.append( "\n" );
}
//add line seperator
sbFoundStrings.append( "\n" );
}
}
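The xls fix replaces the parser-as-listener design, where a single sbFoundStrings field was shared across calls and nulled in a finally block, with a fresh XLSHSSFListener per parse call, so repeated or concurrent parses cannot leak text into each other. A usage sketch under that assumption (locationA/locationB are DigestURI values obtained elsewhere; the no-arg constructor, file names and mime string are placeholders):

    // one xlsParser instance can now be reused; each call builds its own listener and buffer
    final xlsParser parser = new xlsParser();
    final Document first  = parser.parse(locationA, "application/msexcel", "UTF-8", new FileInputStream("a.xls"));
    final Document second = parser.parse(locationB, "application/msexcel", "UTF-8", new FileInputStream("b.xls"));
    // before this change both calls appended into the same StringBuilder field of the parser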
