*) Parser now throws a ParserException instead of returning null on parsing errors (e.g. as needed by the snippet fetcher)

*) better logging of parser failures
*) simplified usage of plasmaParser through the switchboard
*) restructuring of the crawler
   - the crawler now returns an error message if it is used in sync mode (e.g. by the snippet fetcher)
*) snippet-fetcher: more verbose error messages
*) serverByteBuffer.java: added a new function append(String, encoding)
*) serverFileUtils.java: added functions to copy only a given number of bytes between streams (see the sketch after this list)
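Neither helper's implementation appears in the diff below, so here is a minimal sketch of what they plausibly look like. The sketch class names are hypothetical; only the copy(InputStream, File, long) call shape is confirmed by the tar/zip parser changes in this commit, and the append(String, charset) signature is an assumption based on the changelog entry above.

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;

final class serverFileUtilsSketch {
    // copy at most 'count' bytes from source to dest; a negative count copies everything
    public static void copy(InputStream source, OutputStream dest, long count) throws IOException {
        byte[] buffer = new byte[4096];
        long remaining = count;
        while (remaining != 0) {
            int chunk = (remaining < 0) ? buffer.length : (int) Math.min(remaining, buffer.length);
            int len = source.read(buffer, 0, chunk);
            if (len < 0) break; // end of stream reached before 'count' bytes
            dest.write(buffer, 0, len);
            if (remaining > 0) remaining -= len;
        }
        dest.flush();
    }

    // same, but writing into a file (this is the variant the tar/zip parsers call)
    public static void copy(InputStream source, File dest, long count) throws IOException {
        FileOutputStream fos = null;
        try {
            fos = new FileOutputStream(dest);
            copy(source, fos, count);
        } finally {
            if (fos != null) try { fos.close(); } catch (Exception e) {/* ignore this */}
        }
    }
}

final class serverByteBufferSketch {
    private byte[] buffer = new byte[0];

    // append the bytes of a string encoded with the given charset
    public serverByteBufferSketch append(String s, String charset) throws UnsupportedEncodingException {
        return append(s.getBytes(charset));
    }

    public serverByteBufferSketch append(byte[] bytes) {
        byte[] grown = new byte[this.buffer.length + bytes.length];
        System.arraycopy(this.buffer, 0, grown, 0, this.buffer.length);
        System.arraycopy(bytes, 0, grown, this.buffer.length, bytes.length);
        this.buffer = grown;
        return this;
    }
}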


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2641 6c8d7289-2bf4-0310-a012-ef5d649a1542

@ -51,11 +51,11 @@ import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import de.anomic.net.URL;
import de.anomic.data.bookmarksDB;
import de.anomic.data.listManager;
import de.anomic.data.bookmarksDB.Tag;
import de.anomic.http.httpHeader;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSwitchboard;

@ -53,9 +53,11 @@ Unable to find URL Entry in DB
:: <!-- 3 -->
Invalid URL
:: <!-- 4 -->
Unable to download resource content.
Unable to download resource content.<br>
<tt>#[errorText]#</tt>
:: <!-- 5 -->
Unable to parse resource content.
Unable to parse resource content.<br>
<tt>#[errorText]#</tt>
:: <!-- 6 -->
Unsupported protocol.
#(/error)#

@ -57,6 +57,8 @@ import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.plasma.crawler.plasmaCrawlerException;
import de.anomic.plasma.parser.ParserException;
import de.anomic.plasma.plasmaCrawlLURL.Entry;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@ -83,174 +85,185 @@ public class ViewFile {
serverObjects prop = new serverObjects();
plasmaSwitchboard sb = (plasmaSwitchboard)env;
if (post.containsKey("words"))
if (post != null && post.containsKey("words"))
try {
prop.put("error_words",URLEncoder.encode((String) post.get("words"), "UTF-8"));
} catch (UnsupportedEncodingException e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
// ignore this. this should not occur
}
if (post != null) {
// getting the url hash from which the content should be loaded
String urlHash = post.get("urlHash","");
if (urlHash.equals("")) {
prop.put("error",1);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
String viewMode = post.get("viewMode","sentences");
// getting the urlEntry that belongs to the url hash
Entry urlEntry = null;
urlEntry = sb.urlPool.loadedURL.load(urlHash, null);
if (urlEntry == null) {
prop.put("error",2);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
// getting the url hash from which the content should be loaded
String urlHash = post.get("urlHash","");
if (urlHash.equals("")) {
prop.put("error",1);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
// getting the url that belongs to the entry
URL url = urlEntry.url();
if (url == null) {
prop.put("error",3);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
String viewMode = post.get("viewMode","sentences");
// getting the urlEntry that belongs to the url hash
Entry urlEntry = null;
urlEntry = sb.urlPool.loadedURL.load(urlHash, null);
if (urlEntry == null) {
prop.put("error",2);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
// getting the url that belongs to the entry
URL url = urlEntry.url();
if (url == null) {
prop.put("error",3);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
// loading the resource content as byte array
byte[] resource = null;
IResourceInfo resInfo = null;
String resMime = null;
try {
// trying to load the resource body
resource = sb.cacheManager.loadResourceContent(url);
// if the resource body was not cached we try to load it from web
if (resource == null) {
plasmaHTCache.Entry entry = null;
try {
entry = sb.snippetCache.loadResourceFromWeb(url, 5000);
} catch (plasmaCrawlerException e) {
prop.put("error",4);
prop.put("error_errorText",e.getMessage());
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
// loading the resource content as byte array
byte[] resource = null;
IResourceInfo resInfo = null;
String resMime = null;
try {
// trying to load the resource body
resource = sb.cacheManager.loadResourceContent(url);
if (entry != null) {
resInfo = entry.getDocumentInfo();
resource = sb.cacheManager.loadResourceContent(url);
}
// if the resource body was not cached we try to load it from web
if (resource == null) {
plasmaHTCache.Entry entry = sb.snippetCache.loadResourceFromWeb(url, 5000);
prop.put("error",4);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
}
if (entry != null) {
resInfo = entry.getDocumentInfo();
resource = sb.cacheManager.loadResourceContent(url);
// try to load resource metadata
if (resInfo == null) {
// try to load the metadata from cache
try {
resInfo = sb.cacheManager.loadResourceInfo(urlEntry.url());
} catch (Exception e) { /* ignore this */}
// if the metadata were not cached try to load it from web
if (resInfo == null) {
String protocol = url.getProtocol();
if (!((protocol.equals("http") || protocol.equals("https")))) {
prop.put("error",6);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
if (resource == null) {
httpHeader responseHeader = httpc.whead(url,url.getHost(),5000,null,null,sb.remoteProxyConfig);
if (responseHeader == null) {
prop.put("error",4);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
resMime = responseHeader.mime();
}
} else {
resMime = resInfo.getMimeType();
}
} catch (IOException e) {
prop.put("error",4);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
if (viewMode.equals("plain")) {
String content = new String(resource);
content = content.replaceAll("<","&lt;")
.replaceAll(">","&gt;")
.replaceAll("\"","&quot;")
.replaceAll("\n","<br>")
.replaceAll("\t","&nbsp;&nbsp;&nbsp;&nbsp;");
// try to load resource metadata
if (resInfo == null) {
// try to load the metadata from cache
try {
resInfo = sb.cacheManager.loadResourceInfo(urlEntry.url());
} catch (Exception e) { /* ignore this */}
// if the metadata were not cached try to load it from web
if (resInfo == null) {
String protocol = url.getProtocol();
if (!((protocol.equals("http") || protocol.equals("https")))) {
prop.put("error",6);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
httpHeader responseHeader = httpc.whead(url,url.getHost(),5000,null,null,sb.remoteProxyConfig);
if (responseHeader == null) {
prop.put("error",4);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
resMime = responseHeader.mime();
}
} else {
resMime = resInfo.getMimeType();
}
} catch (IOException e) {
if (url == null) {
prop.put("error",4);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
}
if (viewMode.equals("plain")) {
String content = new String(resource);
content = content.replaceAll("<","&lt;")
.replaceAll(">","&gt;")
.replaceAll("\"","&quot;")
.replaceAll("\n","<br>")
.replaceAll("\t","&nbsp;&nbsp;&nbsp;&nbsp;");
prop.put("error",0);
prop.put("viewMode",VIEW_MODE_AS_PLAIN_TEXT);
prop.put("viewMode_plainText",content);
} else if (viewMode.equals("parsed") || viewMode.equals("sentences") || viewMode.equals("iframe")) {
// parsing the resource content
plasmaParserDocument document = sb.snippetCache.parseDocument(url, resource,resInfo);
prop.put("error",0);
prop.put("viewMode",VIEW_MODE_AS_PLAIN_TEXT);
prop.put("viewMode_plainText",content);
} else if (viewMode.equals("parsed") || viewMode.equals("sentences") || viewMode.equals("iframe")) {
// parsing the resource content
plasmaParserDocument document = null;
try {
document = sb.snippetCache.parseDocument(url, resource,resInfo);
if (document == null) {
prop.put("error",5);
prop.put("error_errorText","Unknown error");
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
resMime = document.getMimeType();
if (viewMode.equals("parsed")) {
String content = new String(document.getText());
content = wikiCode.replaceHTML(content); //added by Marc Nause
content = content.replaceAll("\n","<br>")
.replaceAll("\t","&nbsp;&nbsp;&nbsp;&nbsp;");
prop.put("viewMode",VIEW_MODE_AS_PARSED_TEXT);
prop.put("viewMode_parsedText",content);
} else if (viewMode.equals("iframe")) {
prop.put("viewMode",VIEW_MODE_AS_IFRAME);
prop.put("viewMode_url",url.toString());
} else {
prop.put("viewMode",VIEW_MODE_AS_PARSED_SENTENCES);
String[] sentences = document.getSentences();
boolean dark = true;
for (int i=0; i < sentences.length; i++) {
String currentSentence = wikiCode.replaceHTML(sentences[i]);
// Search word highlighting
String words = post.get("words",null);
if (words != null) {
try {
words = URLDecoder.decode(words,"UTF-8");
} catch (UnsupportedEncodingException e) {}
String[] wordArray = words.substring(1,words.length()-1).split(",");
for (int j=0; j < wordArray.length; j++) {
String currentWord = wordArray[j].trim();
currentSentence = currentSentence.replaceAll(currentWord,
"<b style=\"color: black; background-color: rgb(" + highlightingColors[j%6] + ");\">" + currentWord + "</b>");
}
}
} catch (ParserException e) {
prop.put("error",5);
prop.put("error_errorText",e.getMessage());
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
resMime = document.getMimeType();
prop.put("viewMode_sentences_" + i + "_nr",Integer.toString(i+1));
prop.put("viewMode_sentences_" + i + "_text",currentSentence);
prop.put("viewMode_sentences_" + i + "_dark",((dark) ? 1 : 0) ); dark=!dark;
if (viewMode.equals("parsed")) {
String content = new String(document.getText());
content = wikiCode.replaceHTML(content); //added by Marc Nause
content = content.replaceAll("\n","<br>")
.replaceAll("\t","&nbsp;&nbsp;&nbsp;&nbsp;");
prop.put("viewMode",VIEW_MODE_AS_PARSED_TEXT);
prop.put("viewMode_parsedText",content);
} else if (viewMode.equals("iframe")) {
prop.put("viewMode",VIEW_MODE_AS_IFRAME);
prop.put("viewMode_url",url.toString());
} else {
prop.put("viewMode",VIEW_MODE_AS_PARSED_SENTENCES);
String[] sentences = document.getSentences();
boolean dark = true;
for (int i=0; i < sentences.length; i++) {
String currentSentence = wikiCode.replaceHTML(sentences[i]);
// Search word highlighting
String words = post.get("words",null);
if (words != null) {
try {
words = URLDecoder.decode(words,"UTF-8");
} catch (UnsupportedEncodingException e) {}
String[] wordArray = words.substring(1,words.length()-1).split(",");
for (int j=0; j < wordArray.length; j++) {
String currentWord = wordArray[j].trim();
currentSentence = currentSentence.replaceAll(currentWord,
"<b style=\"color: black; background-color: rgb(" + highlightingColors[j%6] + ");\">" + currentWord + "</b>");
}
}
prop.put("viewMode_sentences",sentences.length);
}
}
prop.put("error",0);
prop.put("error_url",url.toString());
prop.put("error_hash",urlHash);
prop.put("error_wordCount",Integer.toString(urlEntry.wordCount()));
prop.put("error_desc",urlEntry.descr());
prop.put("error_size",urlEntry.size());
prop.put("error_mimeType",resMime);
}
prop.put("viewMode_sentences_" + i + "_nr",Integer.toString(i+1));
prop.put("viewMode_sentences_" + i + "_text",currentSentence);
prop.put("viewMode_sentences_" + i + "_dark",((dark) ? 1 : 0) ); dark=!dark;
}
prop.put("viewMode_sentences",sentences.length);
}
}
prop.put("error",0);
prop.put("error_url",url.toString());
prop.put("error_hash",urlHash);
prop.put("error_wordCount",Integer.toString(urlEntry.wordCount()));
prop.put("error_desc",urlEntry.descr());
prop.put("error_size",urlEntry.size());
prop.put("error_mimeType",resMime);
return prop;
}

@ -56,6 +56,7 @@ import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.http.httpHeader;
import de.anomic.kelondro.kelondroMSetTools;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSearchImages;
@ -64,7 +65,6 @@ import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaSearchRankingProfile;
import de.anomic.plasma.plasmaSearchTimingProfile;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.net.URL;
import de.anomic.server.serverCore;
import de.anomic.server.serverDate;
import de.anomic.server.serverObjects;
@ -192,13 +192,15 @@ public class yacysearch {
plasmaCrawlLURL.Entry urlentry = sb.urlPool.loadedURL.load(recommendHash, null);
if (urlentry != null) {
plasmaParserDocument document = sb.snippetCache.retrieveDocument(urlentry.url(), true);
// create a news message
HashMap map = new HashMap();
map.put("url", urlentry.url().toNormalform().replace(',', '|'));
map.put("title", urlentry.descr().replace(',', ' '));
map.put("description", ((document == null) ? urlentry.descr() : document.getMainLongTitle()).replace(',', ' '));
map.put("tags", ((document == null) ? "" : document.getKeywords(' ')));
yacyCore.newsPool.publishMyNews(new yacyNewsRecord("stippadd", map));
if (document != null) {
// create a news message
HashMap map = new HashMap();
map.put("url", urlentry.url().toNormalform().replace(',', '|'));
map.put("title", urlentry.descr().replace(',', ' '));
map.put("description", ((document == null) ? urlentry.descr() : document.getMainLongTitle()).replace(',', ' '));
map.put("tags", ((document == null) ? "" : document.getKeywords(' ')));
yacyCore.newsPool.publishMyNews(new yacyNewsRecord("stippadd", map));
}
}
}

@ -93,6 +93,8 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
protected plasmaCrawlProfile.entry profile;
protected boolean acceptAllContent;
protected String errorMessage;
/**
* The crawler thread pool
*/
@ -186,6 +188,8 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
}
public void execute() {
plasmaHTCache.Entry loadedResource = null;
try {
// setting threadname
this.setName(plasmaCrawlWorker.threadBaseName + "_" + this.url);
@ -194,15 +198,23 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
init();
// loading resource
plasmaHTCache.Entry resource = load();
loadedResource = load();
} catch (IOException e) {
//throw e;
} finally {
// setting the error message (if available)
if (this.errorMessage != null) {
this.theMsg.setError(this.errorMessage);
}
// store a reference to the result in the message object
// this is e.g. needed by the snippet fetcher
this.theMsg.setResult(resource);
} catch (IOException e) {
//throw e;
} finally {
// Note: this is always called, even on empty results.
// Otherwise the caller will block forever
this.theMsg.setResult(loadedResource);
// signal that this worker thread has finished the job
this.done = true;
}
}
@ -256,9 +268,13 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
this.startdate = 0;
this.profile = null;
this.acceptAllContent = false;
this.errorMessage = null;
}
protected void addURLtoErrorDB(String failreason) {
protected void addURLtoErrorDB(String failreason) {
// remember error message
this.errorMessage = failreason;
// convert the referrer URL into a hash value
String referrerHash = (this.refererURLString==null)?null:indexURL.urlHash(this.refererURLString);

@ -0,0 +1,9 @@
package de.anomic.plasma.crawler;
import java.io.IOException;
public class plasmaCrawlerException extends IOException {
public plasmaCrawlerException(String errorMsg) {
super(errorMsg);
}
}
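As a standalone illustration of the behavioral change (a hypothetical demo, not part of the commit): the synchronous loader now throws this exception instead of returning null, so the snippet fetcher can show the reason for a failed download.

// hypothetical demo of the new sync-mode error reporting
import java.io.IOException;

public class SyncLoadDemo {
    static class plasmaCrawlerException extends IOException {
        public plasmaCrawlerException(String errorMsg) { super(errorMsg); }
    }

    // stand-in for plasmaCrawlLoader.loadSync: throws instead of returning null
    static Object loadSync(String url) throws plasmaCrawlerException {
        throw new plasmaCrawlerException("no entry available: " + url);
    }

    public static void main(String[] args) {
        try {
            loadSync("http://example.net/");
        } catch (plasmaCrawlerException e) {
            // callers such as the snippet fetcher can now display the cause
            System.out.println("download failed: " + e.getMessage());
        }
    }
}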

@ -49,6 +49,7 @@ import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import de.anomic.net.URL;
@ -93,12 +94,35 @@ public abstract class AbstractParser implements Parser{
this.libxDependencies = libxDependencies;
}
/**
* Check if the parser was interrupted.
* @throws InterruptedException if the parser was interrupted
*/
public static final void checkInterruption() throws InterruptedException {
Thread currentThread = Thread.currentThread();
if ((currentThread instanceof serverThread) && ((serverThread)currentThread).shutdownInProgress()) throw new InterruptedException("Shutdown in progress ...");
if (currentThread.isInterrupted()) throw new InterruptedException("Shutdown in progress ...");
}
public final File createTempFile(String name) throws IOException {
String parserClassName = this.getClass().getName();
int idx = parserClassName.lastIndexOf(".");
if (idx != -1) {
parserClassName = parserClassName.substring(idx+1);
}
// getting the file extension
idx = name.lastIndexOf("/");
String fileName = (idx != -1) ? name.substring(idx+1) : name;
idx = fileName.lastIndexOf(".");
String fileExt = (idx > -1) ? fileName.substring(idx+1) : "";
// creates the temp file
File tempFile = File.createTempFile(parserClassName + "_" + ((idx>-1)?fileName.substring(0,idx):fileName), (fileExt.length()>0)?"."+fileExt:fileExt);
return tempFile;
}
/**
* Parsing a document available as byte array.
* @param location the origin of the document
@ -119,14 +143,17 @@ public abstract class AbstractParser implements Parser{
) throws ParserException, InterruptedException {
ByteArrayInputStream contentInputStream = null;
try {
// convert the byte array into a stream
contentInputStream = new ByteArrayInputStream(source);
// parse the stream
return this.parse(location,mimeType,charset,contentInputStream);
} finally {
if (contentInputStream != null) {
try {
contentInputStream.close();
contentInputStream = null;
} catch (Exception e){}
} catch (Exception e){ /* ignore this */}
}
}
}
@ -151,12 +178,15 @@ public abstract class AbstractParser implements Parser{
) throws ParserException, InterruptedException {
BufferedInputStream contentInputStream = null;
try {
// create a stream from the file
contentInputStream = new BufferedInputStream(new FileInputStream(sourceFile));
// parse the stream
return this.parse(location, mimeType, charset, contentInputStream);
} catch (FileNotFoundException e) {
throw new ParserException(e.getMessage());
throw new ParserException("Unexpected error while parsing file. " + e.getMessage(),location);
} finally {
if (contentInputStream != null) try{contentInputStream.close();}catch(Exception e){}
if (contentInputStream != null) try{contentInputStream.close();}catch(Exception e){/* ignore this */}
}
}
@ -201,6 +231,6 @@ public abstract class AbstractParser implements Parser{
* Return the name of the parser
*/
public String getName() {
return parserName;
return this.parserName;
}
}

@ -44,24 +44,45 @@
package de.anomic.plasma.parser;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlEURL;
public class ParserException extends Exception
{
private String errorCode = null;
private URL url = null;
private static final long serialVersionUID = 1L;
public ParserException() {
super();
}
public ParserException(String message) {
public ParserException(String message, URL url) {
this(message,url,plasmaCrawlEURL.DENIED_PARSER_ERROR);
}
public ParserException(String message, URL url, String errorCode) {
super(message);
this.errorCode = errorCode;
this.url = url;
}
public ParserException(String message, Throwable cause) {
public ParserException(String message, URL url, Throwable cause) {
this(message,url,cause,plasmaCrawlEURL.DENIED_PARSER_ERROR);
}
public ParserException(String message, URL url, Throwable cause, String errorCode) {
super(message, cause);
this.errorCode = errorCode;
this.url = url;
}
public ParserException(Throwable cause) {
super(cause);
public String getErrorCode() {
return this.errorCode;
}
public URL getURL() {
return this.url;
}
}
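A minimal usage sketch, assembled from the parser hunks later in this diff (both lines appear there; this is not additional changed code): parsers attach the failing URL and, optionally, an EURL error code when they throw, and call sites can recover both for logging or the error DB.

// inside a parser: reject unsupported content with a specific error code
throw new ParserException(errorMsg, location, plasmaCrawlEURL.DENIED_WRONG_MIMETYPE_OR_EXT);

// at a call site: URL and error code survive for logging
try {
    theDoc = theParser.parseSource(location, entryMime, null, tempFile);
} catch (ParserException e) {
    this.theLogger.logInfo("Unable to parse '" + e.getURL() + "'. " + e.getErrorCode());
}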

@ -80,7 +80,7 @@ public class bzipParser extends AbstractParser implements Parser {
public bzipParser() {
super(LIBX_DEPENDENCIES);
parserName = "Bzip 2 UNIX Compressed File Parser";
this.parserName = "Bzip 2 UNIX Compressed File Parser";
}
public Hashtable getSupportedMimeTypes() {
@ -129,7 +129,9 @@ public class bzipParser extends AbstractParser implements Parser {
return theParser.parseSource(location,null,null,tempFile);
} catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
throw new ParserException("Unable to parse the gzip content. " + e.getMessage());
if (e instanceof ParserException) throw (ParserException) e;
throw new ParserException("Unexpected error while parsing bzip file. " + e.getMessage(),location);
} finally {
if (tempFile != null) tempFile.delete();
}

@ -75,7 +75,7 @@ implements Parser {
public docParser() {
super(LIBX_DEPENDENCIES);
parserName = "Word Document Parser";
this.parserName = "Word Document Parser";
}
public plasmaParserDocument parse(URL location, String mimeType, String charset,
@ -99,14 +99,16 @@ implements Parser {
null,
null,
null,
contents.getBytes(),
contents.getBytes("UTF-8"),
null,
null);
return theDoc;
} catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
throw new ParserException("Unable to parse the doc content. " + e.getMessage());
if (e instanceof ParserException) throw (ParserException) e;
throw new ParserException("Unexpected error while parsing doc file. " + e.getMessage(),location);
}
}

@ -76,7 +76,7 @@ public class gzipParser extends AbstractParser implements Parser {
public gzipParser() {
super(LIBX_DEPENDENCIES);
parserName = "GNU Zip Compressed Archive Parser";
this.parserName = "GNU Zip Compressed Archive Parser";
}
public Hashtable getSupportedMimeTypes() {
@ -113,7 +113,9 @@ public class gzipParser extends AbstractParser implements Parser {
return theParser.parseSource(location,null,null,tempFile);
} catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
throw new ParserException("Unable to parse the gzip content. " + e.getMessage());
if (e instanceof ParserException) throw (ParserException) e;
throw new ParserException("Unexpected error while parsing gzip file. " + e.getMessage(),location);
} finally {
if (tempFile != null) tempFile.delete();
}

@ -44,6 +44,7 @@
package de.anomic.plasma.parser.mimeType;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import de.anomic.net.URL;
import java.util.Collection;
@ -99,7 +100,7 @@ implements Parser {
public mimeTypeParser() {
super(LIBX_DEPENDENCIES);
parserName = "MimeType Parser";
this.parserName = "MimeType Parser";
}
public String getMimeType (File sourceFile) {
@ -142,8 +143,8 @@ implements Parser {
threadLoopDetection.put(Thread.currentThread(),new Integer(loopDepth.intValue()+1));
// deactivating the logging for jMimeMagic
Logger theLogger = Logger.getLogger("net.sf.jmimemagic");
theLogger.setLevel(Level.OFF);
Logger jmimeMagicLogger = Logger.getLogger("net.sf.jmimemagic");
jmimeMagicLogger.setLevel(Level.OFF);
Magic theMagic = new Magic();
MagicMatch match = theMagic.getMagicMatch(sourceFile);
@ -160,8 +161,8 @@ implements Parser {
}
// to avoid loops we have to test if the mimetype has changed ...
if (this.getSupportedMimeTypes().containsKey(mimeType)) return null;
if (orgMimeType.equals(mimeType)) return null;
if (this.getSupportedMimeTypes().containsKey(mimeType)) throw new ParserException("Unable to detect mimetype of resource.",location);
if (orgMimeType.equals(mimeType)) throw new ParserException("Unable to detect mimetype of resource.",location);
// check for interruption
checkInterruption();
@ -170,11 +171,13 @@ implements Parser {
plasmaParser theParser = new plasmaParser();
return theParser.parseSource(location,mimeType,charset,sourceFile);
}
return null;
throw new ParserException("Unable to detect mimetype of resource.",location);
} catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
return null;
if (e instanceof ParserException) throw (ParserException) e;
throw new ParserException("Unexpected error while detect mimetype of resource. " + e.getMessage(),location);
} finally {
Integer loopDepth = (Integer) threadLoopDetection.get(Thread.currentThread());
if (loopDepth.intValue() <= 1) {
@ -186,14 +189,14 @@ implements Parser {
}
public plasmaParserDocument parse(URL location, String mimeType,String charset,
InputStream source) throws ParserException {
InputStream source) throws ParserException, InterruptedException {
File dstFile = null;
try {
dstFile = File.createTempFile("mimeTypeParser",".tmp");
serverFileUtils.copy(source,dstFile);
return parse(location,mimeType,charset,dstFile);
} catch (Exception e) {
return null;
} catch (IOException e) {
throw new ParserException("Unexpected error while detect mimetype of resource. " + e.getMessage(),location);
} finally {
if (dstFile != null) {dstFile.delete();}
}

@ -84,7 +84,7 @@ public class odtParser extends AbstractParser implements Parser {
public odtParser() {
super(LIBX_DEPENDENCIES);
parserName = "OASIS OpenDocument V2 Text Document Parser";
this.parserName = "OASIS OpenDocument V2 Text Document Parser";
}
public Hashtable getSupportedMimeTypes() {
@ -96,7 +96,7 @@ public class odtParser extends AbstractParser implements Parser {
try {
byte[] docContent = null;
String docDescription = null;
String docKeywords = null;
String docKeywordStr = null;
String docShortTitle = null;
String docLongTitle = null;
@ -125,7 +125,7 @@ public class odtParser extends AbstractParser implements Parser {
ODFMetaFileAnalyzer metaAnalyzer = new ODFMetaFileAnalyzer();
OpenDocumentMetadata metaData = metaAnalyzer.analyzeMetaData(zipFileEntryStream);
docDescription = metaData.getDescription();
docKeywords = metaData.getKeyword();
docKeywordStr = metaData.getKeyword();
docShortTitle = metaData.getTitle();
docLongTitle = metaData.getSubject();
@ -149,11 +149,16 @@ public class odtParser extends AbstractParser implements Parser {
}
}
// split the keywords
String[] docKeywords = null;
if (docKeywordStr != null) docKeywords = docKeywordStr.split(" |,");
// create the parser document
return new plasmaParserDocument(
location,
mimeType,
"UTF-8",
docKeywords.split(" |,"),
docKeywords,
docShortTitle,
docLongTitle,
null,
@ -163,13 +168,13 @@ public class odtParser extends AbstractParser implements Parser {
null);
} catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
throw new ParserException("Unable to parse the odt content. " + e.getMessage());
} catch (Error e) {
throw new ParserException("Unable to parse the odt content. " + e.getMessage());
if (e instanceof ParserException) throw (ParserException) e;
throw new ParserException("Unexpected error while parsing odt file. " + e.getMessage(),location);
}
}
public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException {
public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
File dest = null;
try {
// creating a tempfile
@ -182,9 +187,12 @@ public class odtParser extends AbstractParser implements Parser {
// parsing the content
return parse(location, mimeType, charset, dest);
} catch (Exception e) {
throw new ParserException("Unable to parse the odt document. " + e.getMessage());
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof ParserException) throw (ParserException) e;
throw new ParserException("Unexpected error while parsing odt file. " + e.getMessage(),location);
} finally {
if (dest != null) try { dest.delete(); } catch (Exception e){}
if (dest != null) try { dest.delete(); } catch (Exception e){/* ignore this */}
}
}

@ -78,7 +78,7 @@ public class pdfParser extends AbstractParser implements Parser {
public pdfParser() {
super(LIBX_DEPENDENCIES);
parserName = "Acrobat Portable Document Parser";
this.parserName = "Acrobat Portable Document Parser";
}
public Hashtable getSupportedMimeTypes() {
@ -98,7 +98,7 @@ public class pdfParser extends AbstractParser implements Parser {
// Logger theLogger = Logger.getLogger("org.pdfbox");
// theLogger.setLevel(Level.INFO);
String docTitle = null, docSubject = null, /*docAuthor = null,*/ docKeyWords = null;
String docTitle = null, docSubject = null, /*docAuthor = null,*/ docKeywordStr = null;
// check for interruption
checkInterruption();
@ -120,7 +120,7 @@ public class pdfParser extends AbstractParser implements Parser {
docTitle = theDocInfo.getTitle();
docSubject = theDocInfo.getSubject();
//docAuthor = theDocInfo.getAuthor();
docKeyWords = theDocInfo.getKeywords();
docKeywordStr = theDocInfo.getKeywords();
}
serverByteBuffer out = new serverByteBuffer();
@ -142,18 +142,14 @@ public class pdfParser extends AbstractParser implements Parser {
replaceAll("\t"," ");
}
/*
* public document(URL location, String mimeType,
String keywords, String shortTitle, String longTitle,
String[] sections, String abstrct,
byte[] text, Map anchors, Map images) {
*
*/
String[] docKeywords = null;
if (docKeywordStr != null) docKeywords = docKeywordStr.split(" |,");
plasmaParserDocument theDoc = new plasmaParserDocument(
location,
mimeType,
"UTF-8",
docKeyWords.split(" |,"),
docKeywords,
docSubject,
docTitle,
null,
@ -166,10 +162,12 @@ public class pdfParser extends AbstractParser implements Parser {
}
catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
throw new ParserException("Unable to parse the pdf content. " + e.getMessage(),e);
if (e instanceof ParserException) throw (ParserException) e;
throw new ParserException("Unexpected error while parsing pdf file. " + e.getMessage(),location);
} finally {
if (theDocument != null) try { theDocument.close(); } catch (Exception e) {}
if (writer != null) try { writer.close(); } catch (Exception e) {}
if (theDocument != null) try { theDocument.close(); } catch (Exception e) {/* ignore this */}
if (writer != null) try { writer.close(); } catch (Exception e) {/* ignore this */}
Thread.currentThread().setPriority(Thread.NORM_PRIORITY);
}
}

@ -84,7 +84,7 @@ public class rpmParser extends AbstractParser implements Parser {
public rpmParser() {
super(LIBX_DEPENDENCIES);
parserName = "rpm Parser";
this.parserName = "rpm Parser";
}
public Hashtable getSupportedMimeTypes() {
@ -126,12 +126,12 @@ public class rpmParser extends AbstractParser implements Parser {
// getting the next tag
DataTypeIf tag = rpmFile.getTag(headerNames[i]);
if (tag != null) {
content.append(headerNames[i])
.append(": ")
.append(tag.toString())
.append("\n");
}
if (tag == null) continue;
content.append(headerNames[i])
.append(": ")
.append(tag.toString())
.append("\n");
if (headerNames[i].equals("N")) name = tag.toString();
else if (headerNames[i].equals("SUMMARY")) summary = tag.toString();
@ -153,16 +153,18 @@ public class rpmParser extends AbstractParser implements Parser {
summary,
null,
description,
content.toString().getBytes(),
content.toString().getBytes("UTF-8"),
anchors,
null);
return theDoc;
} catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
throw new ParserException("Unable to parse the rpm file. " + e.getMessage());
if (e instanceof ParserException) throw (ParserException) e;
throw new ParserException("Unexpected error while parsing rpm file. " + e.getMessage(),location);
} finally {
if (rpmFile != null) try { rpmFile.close(); } catch (Exception e) {}
if (rpmFile != null) try { rpmFile.close(); } catch (Exception e) {/* ignore this */}
}
}

@ -98,7 +98,7 @@ public class rssParser extends AbstractParser implements Parser {
public rssParser() {
super(LIBX_DEPENDENCIES);
parserName = "Rich Site Summary/Atom Feed Parser";
this.parserName = "Rich Site Summary/Atom Feed Parser";
}
public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
@ -149,7 +149,7 @@ public class rssParser extends AbstractParser implements Parser {
anchors.put(itemURL.toString(),itemTitle);
if ((text.length() != 0) && (text.byteAt(text.length() - 1) != 32)) text.append((byte) 32);
text.append(new serverCharBuffer(htmlFilterAbstractScraper.stripAll(new serverCharBuffer(itemDescr.toCharArray()))).trim()).append(' '); // TODO: this does not work for utf-8
text.append(new serverCharBuffer(htmlFilterAbstractScraper.stripAll(new serverCharBuffer(itemDescr.toCharArray()))).trim().toString()).append(' ');
String itemContent = item.getElementValue("content");
if ((itemContent != null) && (itemContent.length() > 0)) {
@ -183,11 +183,6 @@ public class rssParser extends AbstractParser implements Parser {
}
}
/* (URL location, String mimeType,
String keywords, String shortTitle, String longTitle,
String[] sections, String abstrct,
byte[] text, Map anchors, Map images)
*/
plasmaParserDocument theDoc = new plasmaParserDocument(
location,
mimeType,
@ -205,7 +200,9 @@ public class rssParser extends AbstractParser implements Parser {
} catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
throw new ParserException("Unable to parse the rss file. " + e.getMessage());
if (e instanceof ParserException) throw (ParserException) e;
throw new ParserException("Unexpected error while parsing rss file." + e.getMessage(),location);
}
}

@ -77,7 +77,7 @@ implements Parser {
public rtfParser() {
super(LIBX_DEPENDENCIES);
parserName = "Rich Text Format Parser";
this.parserName = "Rich Text Format Parser";
}
public plasmaParserDocument parse(URL location, String mimeType, String charset,
@ -113,7 +113,9 @@ implements Parser {
}
catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
throw new ParserException("Unable to parse the rdf content. " + e.getMessage());
if (e instanceof ParserException) throw (ParserException) e;
throw new ParserException("Unexpected error while parsing rtf resource." + e.getMessage(),location);
}
}

@ -87,7 +87,7 @@ public class tarParser extends AbstractParser implements Parser {
public tarParser() {
super(LIBX_DEPENDENCIES);
parserName = "Tape Archive File Parser";
this.parserName = "Tape Archive File Parser";
}
public Hashtable getSupportedMimeTypes() {
@ -128,12 +128,11 @@ public class tarParser extends AbstractParser implements Parser {
// skip directories
if (entry.isDirectory()) continue;
// Get the entry name
int idx = -1;
// Get the short entry name
String entryName = entry.getName();
idx = entryName.lastIndexOf("/");
if (idx != -1) entryName = entryName.substring(idx+1);
idx = entryName.lastIndexOf(".");
// getting the entry file extension
int idx = entryName.lastIndexOf(".");
String entryExt = (idx > -1) ? entryName.substring(idx+1) : "";
// trying to determine the mimeType per file extension
@ -143,19 +142,21 @@ public class tarParser extends AbstractParser implements Parser {
plasmaParserDocument theDoc = null;
File tempFile = null;
try {
byte[] buf = new byte[(int) entry.getSize()];
/*int bytesRead =*/ tin.read(buf);
tempFile = File.createTempFile("tarParser_" + ((idx>-1)?entryName.substring(0,idx):entryName), (entryExt.length()>0)?"."+entryExt:entryExt);
serverFileUtils.write(buf, tempFile);
// create the temp file
tempFile = createTempFile(entryName);
// copy the data into the file
serverFileUtils.copy(tin,tempFile,entry.getSize());
// check for interruption
checkInterruption();
// parsing the content
theDoc = theParser.parseSource(new URL(tempFile),entryMime,null,tempFile);
theDoc = theParser.parseSource(new URL(location,"#" + entryName),entryMime,null,tempFile);
} catch (ParserException e) {
this.theLogger.logInfo("Unable to parse tar file entry '" + entryName + "'. " + e.getErrorCode());
} finally {
if (tempFile != null) try {tempFile.delete(); } catch(Exception ex){}
if (tempFile != null) try {tempFile.delete(); } catch(Exception ex){/* ignore this */}
}
if (theDoc == null) continue;
@ -200,7 +201,9 @@ public class tarParser extends AbstractParser implements Parser {
docImages);
} catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
throw new ParserException("Unable to parse the zip content. " + e.getMessage());
if (e instanceof ParserException) throw (ParserException) e;
throw new ParserException("Unexpected error while parsing tar resource. " + e.getMessage(),location);
}
}

@ -215,7 +215,7 @@ public class vcfParser extends AbstractParser implements Parser {
URL newURL = new URL(value);
anchors.put(newURL.toString(),newURL.toString());
//parsedData.put(key,value);
} catch (MalformedURLException ex) {}
} catch (MalformedURLException ex) {/* ignore this */}
} else if (
!key.equalsIgnoreCase("BEGIN") &&
!key.equalsIgnoreCase("END") &&
@ -255,12 +255,10 @@ public class vcfParser extends AbstractParser implements Parser {
return theDoc;
} catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof ParserException) throw (ParserException) e;
String errorMsg = "Unable to parse the vcard content. " + e.getMessage();
this.theLogger.logSevere(errorMsg);
throw new ParserException(errorMsg);
} finally {
}
throw new ParserException("Unexpected error while parsing vcf resource. " + e.getMessage(),location);
}
}
public void reset() {

@ -43,9 +43,8 @@
package de.anomic.plasma.parser.zip;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.InputStream;
import de.anomic.net.URL;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Hashtable;
@ -55,12 +54,14 @@ import java.util.TreeSet;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.parser.AbstractParser;
import de.anomic.plasma.parser.Parser;
import de.anomic.plasma.parser.ParserException;
import de.anomic.server.serverByteBuffer;
import de.anomic.server.serverFileUtils;
public class zipParser extends AbstractParser implements Parser {
@ -84,7 +85,7 @@ public class zipParser extends AbstractParser implements Parser {
public zipParser() {
super(LIBX_DEPENDENCIES);
parserName = "Compressed Archive File Parser";
this.parserName = "Compressed Archive File Parser";
}
public Hashtable getSupportedMimeTypes() {
@ -110,29 +111,39 @@ public class zipParser extends AbstractParser implements Parser {
ZipEntry entry;
ZipInputStream zippedContent = new ZipInputStream(source);
while ((entry = zippedContent.getNextEntry()) !=null) {
// check for interruption
checkInterruption();
// skip directories
if (entry.isDirectory()) continue;
// Get the entry name
String entryName = entry.getName();
int idx = entryName.lastIndexOf(".");
String entryExt = (idx > -1) ? entryName.substring(idx+1) : null;
// trying to determine the mimeType per file extension
String entryMime = plasmaParser.getMimeTypeByFileExt(entryExt);
// getting the entry content
ByteArrayOutputStream bos = new ByteArrayOutputStream();
byte[] buf = new byte[(int) entry.getSize()];
/*int bytesRead =*/ zippedContent.read(buf);
bos.write(buf);
byte[] ut = bos.toByteArray();
// getting the file extension
String entryExt = (idx > -1) ? entryName.substring(idx+1) : "";
// check for interruption
checkInterruption();
// trying to determine the mimeType per file extension
String entryMime = plasmaParser.getMimeTypeByFileExt(entryExt);
// parsing the content
plasmaParserDocument theDoc = theParser.parseSource(location,entryMime,null, ut);
plasmaParserDocument theDoc = null;
File tempFile = null;
try {
// create the temp file
tempFile = createTempFile(entryName);
// copy the data into the file
serverFileUtils.copy(zippedContent,tempFile,entry.getSize());
// parsing the zip file entry
theDoc = theParser.parseSource(new URL(location,"#" + entryName),entryMime,null, tempFile);
} catch (ParserException e) {
this.theLogger.logInfo("Unable to parse zip file entry '" + entryName + "'. " + e.getErrorCode());
} finally {
if (tempFile != null) try {tempFile.delete(); } catch(Exception ex){/* ignore this */}
}
if (theDoc == null) continue;
// merging all documents together
@ -157,11 +168,7 @@ public class zipParser extends AbstractParser implements Parser {
docImages.addAll(theDoc.getImages());
}
/* (URL location, String mimeType,
String keywords, String shortTitle, String longTitle,
String[] sections, String abstrct,
byte[] text, Map anchors, Map images)
*/
return new plasmaParserDocument(
location,
mimeType,
@ -176,9 +183,9 @@ public class zipParser extends AbstractParser implements Parser {
docImages);
} catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
throw new ParserException("Unable to parse the zip content. " + e.getMessage());
} catch (Error e) {
throw new ParserException("Unable to parse the zip content. " + e.getMessage());
if (e instanceof ParserException) throw (ParserException) e;
throw new ParserException("Unexpected error while parsing zip resource. " + e.getMessage(),location);
}
}

@ -52,6 +52,7 @@ import org.apache.commons.pool.impl.GenericObjectPool;
import de.anomic.net.URL;
import de.anomic.plasma.crawler.plasmaCrawlWorker;
import de.anomic.plasma.crawler.plasmaCrawlerException;
import de.anomic.plasma.crawler.plasmaCrawlerFactory;
import de.anomic.plasma.crawler.plasmaCrawlerMsgQueue;
import de.anomic.plasma.crawler.plasmaCrawlerPool;
@ -83,7 +84,7 @@ public final class plasmaCrawlLoader extends Thread {
// supported protocols
// TODO: change this, e.g. by loading settings from file
this.supportedProtocols = new HashSet(Arrays.asList(new String[]{"http","https" /* ,"ftp" */}));
this.supportedProtocols = new HashSet(Arrays.asList(new String[]{"http","https"/* ,"ftp" */}));
// configuring the crawler messagequeue
this.theQueue = new plasmaCrawlerMsgQueue();
@ -99,6 +100,8 @@ public final class plasmaCrawlLoader extends Thread {
// The maximum number of idle connections connections in the pool
// 0 = no limit.
this.crawlerPoolConfig.maxIdle = Integer.parseInt(switchboard.getConfig("crawler.MaxIdleThreads","7"));
// minIdle configuration not possible for keyedObjectPools
//this.crawlerPoolConfig.minIdle = Integer.parseInt(switchboard.getConfig("crawler.MinIdleThreads","5"));
// block indefinitely
@ -216,7 +219,7 @@ public final class plasmaCrawlLoader extends Thread {
int depth,
plasmaCrawlProfile.entry profile,
int timeout
) {
) throws plasmaCrawlerException {
plasmaHTCache.Entry result = null;
if (!this.crawlwerPool.isClosed) {
@ -241,11 +244,17 @@ public final class plasmaCrawlLoader extends Thread {
this.execute(theMsg);
// wait for the crawl job result
result = theMsg.waitForResult();
result = theMsg.waitForResult();
} catch (Exception e) {
this.log.logSevere("plasmaCrawlLoader.loadSync", e);
this.log.logSevere("plasmaCrawlLoader.loadSync: Unexpected error", e);
throw new plasmaCrawlerException("Unexpected error: " + e.getMessage());
}
// check if an error has occured
if (result == null) {
String errorMsg = theMsg.getError();
throw new plasmaCrawlerException(errorMsg);
}
}
// return the result

@ -59,6 +59,7 @@ public final class plasmaCrawlLoaderMessage {
private serverSemaphore resultSync = null;
private plasmaHTCache.Entry result;
private String errorMessage;
// loadParallel(URL url, String referer, String initiator, int depth, plasmaCrawlProfile.entry profile) {
public plasmaCrawlLoaderMessage(
@ -86,6 +87,14 @@ public final class plasmaCrawlLoaderMessage {
this.result = null;
}
public void setError(String errorMessage) {
this.errorMessage = errorMessage;
}
public String getError() {
return this.errorMessage;
}
public void setResult(plasmaHTCache.Entry theResult) {
// store the result
this.result = theResult;

@ -73,12 +73,14 @@ import de.anomic.htmlFilter.htmlFilterInputStream;
import de.anomic.htmlFilter.htmlFilterWriter;
import de.anomic.http.httpHeader;
import de.anomic.http.httpc;
import de.anomic.index.indexURL;
import de.anomic.net.URL;
import de.anomic.plasma.parser.Parser;
import de.anomic.plasma.parser.ParserException;
import de.anomic.plasma.parser.ParserInfo;
import de.anomic.server.serverFileUtils;
import de.anomic.server.logging.serverLog;
import de.anomic.tools.bitfield;
public final class plasmaParser {
public static final String PARSER_MODE_PROXY = "PROXY";
@ -407,7 +409,7 @@ public final class plasmaParser {
if (neededLibx != null) {
for (int libxId=0; libxId < neededLibx.length; libxId++) {
if (javaClassPath.indexOf(neededLibx[libxId]) == -1) {
throw new ParserException("Missing dependency detected: '" + neededLibx[libxId] + "'.");
throw new Exception("Missing dependency detected: '" + neededLibx[libxId] + "'.");
}
neededLibxBuf.append(neededLibx[libxId])
.append(",");
@ -464,42 +466,67 @@ public final class plasmaParser {
// closing the parser object pool
try {
theParserPool.close();
} catch (Exception e) { }
} catch (Exception e) {/* ignore this */}
}
public plasmaParserDocument parseSource(URL location, String mimeType, String charset, byte[] source) throws InterruptedException {
public plasmaParserDocument parseSource(URL location, String mimeType, String charset, byte[] source)
throws InterruptedException, ParserException {
File tempFile = null;
try {
// creating a temp file to store the byte array
tempFile = File.createTempFile("parseSource", ".tmp");
serverFileUtils.write(source, tempFile);
// parsing the temp file
return parseSource(location, mimeType, charset, tempFile);
} catch (Exception e) {
// Interrupted- and Parser-Exceptions should pass through
if (e instanceof InterruptedException) throw (InterruptedException) e;
serverLog.logSevere("PARSER", "parseSource1: " + e.getMessage(), e);
return null;
if (e instanceof ParserException) throw (ParserException) e;
// log unexpected error
this.theLogger.logSevere("Unexpected exception in parseSource1: " + e.getMessage(), e);
throw new ParserException("Unexpected exception while parsing " + location,location, e);
} finally {
if (tempFile != null) try { tempFile.delete(); } catch (Exception ex){}
if (tempFile != null) try { tempFile.delete(); } catch (Exception ex){/* ignore this */}
}
}
public plasmaParserDocument parseSource(URL location, String mimeType, String documentCharset, File sourceFile) throws InterruptedException {
public plasmaParserDocument parseSource(URL location, String theMimeType, String theDocumentCharset, File sourceFile)
throws InterruptedException, ParserException {
Parser theParser = null;
String mimeType = null;
try {
// getting the mimetype of the document
mimeType = getRealMimeType(mimeType);
mimeType = getRealMimeType(theMimeType);
// getting the file extension of the document
String fileExt = getFileExt(location);
// getting the charset of the document
if (documentCharset == null)
// TODO: do a charset detection here ....
documentCharset = "ISO-8859-1";
// TODO: do a charset detection here ....
String documentCharset = (theDocumentCharset == null) ? "ISO-8859-1" : theDocumentCharset;
// testing if parsing is supported for this resource
if (!plasmaParser.supportedContent(location,mimeType)) {
String errorMsg = "No parser available to parse mimetype";
this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_WRONG_MIMETYPE_OR_EXT);
}
// testing if the resource is not empty
if (!(sourceFile.exists() && sourceFile.canRead() && sourceFile.length() > 0)) {
String errorMsg = "No resource content available.";
this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT);
}
if (this.theLogger.isFine())
this.theLogger.logFine("Parsing " + location + " with mimeType '" + mimeType +
this.theLogger.logInfo("Parsing " + location + " with mimeType '" + mimeType +
"' and file extension '" + fileExt + "'.");
/*
@ -555,26 +582,43 @@ public final class plasmaParser {
theParser = this.getParser(mimeType);
// if a parser was found we use it ...
plasmaParserDocument doc = null;
if (theParser != null) {
return theParser.parse(location, mimeType,documentCharset,sourceFile);
doc = theParser.parse(location, mimeType,documentCharset,sourceFile);
} else if (realtimeParsableMimeTypesContains(mimeType)) {
return parseHtml(location, mimeType, documentCharset, sourceFile);
doc = parseHtml(location, mimeType, documentCharset, sourceFile);
} else {
serverLog.logWarning("PARSER", "parseSource2: wrong mime type");
return null;
String errorMsg = "No parser available to parse mimetype";
this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_WRONG_MIMETYPE_OR_EXT);
}
// check result
if (doc == null) {
String errorMsg = "Unexpected error. Parser returned null.";
this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg,location);
}
return doc;
} catch (Exception e) {
// Interrupted- and Parser-Exceptions should pass through
if (e instanceof InterruptedException) throw (InterruptedException) e;
serverLog.logSevere("PARSER", "parseSource2: " + e.getMessage(), e);
return null;
if (e instanceof ParserException) throw (ParserException) e;
// log unexpected error
String errorMsg = "Unexpected exception. " + e.getMessage();
this.theLogger.logSevere("Unable to parse '" + location + "'. " + errorMsg, e);
throw new ParserException(errorMsg,location,e);
} finally {
if (theParser != null) {
try { plasmaParser.theParserPool.returnObject(mimeType, theParser); } catch (Exception e) { }
try { plasmaParser.theParserPool.returnObject(mimeType, theParser); } catch (Exception e) { /* ignore this */}
}
}
}
private plasmaParserDocument parseHtml(URL location, String mimeType, String documentCharset, File sourceFile) throws IOException {
private plasmaParserDocument parseHtml(URL location, String mimeType, String documentCharset, File sourceFile) throws IOException, ParserException {
// ...otherwise we make a scraper and transformer
FileInputStream fileIn = new FileInputStream(sourceFile);
@ -596,8 +640,9 @@ public final class plasmaParser {
//serverFileUtils.copy(sourceFile, hfos);
//hfos.close();
if (writer.binarySuspect()) {
this.theLogger.logInfo("Binary data found in URL " + location);
return null;
String errorMsg = "Binary data found in resource";
this.theLogger.logSevere("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg,location);
}
return transformScraper(location, mimeType, documentCharset, scraper);
}

@ -43,6 +43,8 @@ package de.anomic.plasma;
import java.net.MalformedURLException;
import de.anomic.net.URL;
import de.anomic.plasma.parser.ParserException;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeSet;
@ -60,10 +62,16 @@ public final class plasmaSearchImages {
if (maxTime > 10) {
byte[] res = sc.getResource(url, true, (int) maxTime);
if (res != null) {
plasmaParserDocument document = sc.parseDocument(url, res);
plasmaParserDocument document = null;
try {
document = sc.parseDocument(url, res);
} catch (ParserException e) {
// parsing failed
}
if (document == null) return;
// add the image links
if (document != null) this.addAll(document.getImages());
this.addAll(document.getImages());
// add also links from pages one step deeper, if depth > 0
if (depth > 0) {

@ -45,6 +45,8 @@ package de.anomic.plasma;
import java.io.IOException;
import de.anomic.net.URL;
import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.plasma.crawler.plasmaCrawlerException;
import de.anomic.plasma.parser.ParserException;
import java.util.Enumeration;
import java.util.HashMap;
@ -164,30 +166,51 @@ public class plasmaSnippetCache {
return new Snippet(line, source, null);
}
/* ===========================================================================
* LOADING RESOURCE DATA
* =========================================================================== */
// if the snippet is not in the cache, we can try to get it from the htcache
byte[] resource = null;
IResourceInfo docInfo = null;
try {
// trying to load the resource from the cache
resource = this.cacheManager.loadResourceContent(url);
if ((fetchOnline) && (resource == null)) {
docInfo = this.cacheManager.loadResourceInfo(url);
// if not found try to download it
if ((resource == null) && (fetchOnline)) {
// download resource using the crawler
plasmaHTCache.Entry entry = loadResourceFromWeb(url, 5000);
// getting resource metadata (e.g. the http headers for http resources)
if (entry != null) {
docInfo = entry.getDocumentInfo();
}
// now the resource should be stored in the cache, load body
resource = this.cacheManager.loadResourceContent(url);
if (resource == null) {
//System.out.println("cannot load document for URL " + url);
return new Snippet(null, ERROR_RESOURCE_LOADING, "error loading resource from web, cacheManager returned NULL");
}
source = SOURCE_WEB;
}
} catch (IOException e) {
e.printStackTrace();
} catch (Exception e) {
if (!(e instanceof plasmaCrawlerException)) e.printStackTrace();
return new Snippet(null, ERROR_SOURCE_LOADING, "error loading resource from web: " + e.getMessage());
}
if (resource == null) {
//System.out.println("cannot load document for URL " + url);
return new Snippet(null, ERROR_RESOURCE_LOADING, "error loading resource from web, cacheManager returned NULL");
}
plasmaParserDocument document = parseDocument(url, resource, docInfo);
/* ===========================================================================
* PARSING RESOURCE
* =========================================================================== */
plasmaParserDocument document = null;
try {
document = parseDocument(url, resource, docInfo);
} catch (ParserException e) {
return new Snippet(null, ERROR_PARSER_FAILED, e.getMessage()); // cannot be parsed
}
if (document == null) return new Snippet(null, ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed
//System.out.println("loaded document for URL " + url);
String[] sentences = document.getSentences();
//System.out.println("----" + url.toString()); for (int l = 0; l < sentences.length; l++) System.out.println(sentences[l]);
@ -196,6 +219,9 @@ public class plasmaSnippetCache {
return new Snippet(null, ERROR_PARSER_NO_LINES, "parser returned no sentences");
}
/* ===========================================================================
* COMPUTE SNIPPET
* =========================================================================== */
// we have found a parseable non-empty file: use the lines
line = computeSnippet(sentences, queryhashes, 8 + 6 * queryhashes.size(), snippetMaxLength);
//System.out.println("loaded snippet for URL " + url + ": " + line);
@ -207,22 +233,48 @@ public class plasmaSnippetCache {
return new Snippet(line, source, null);
}
/**
* Tries to load and parse a resource specified by its URL.
* If the resource is not stored in the cache and fetchOnline is set,
* this function tries to download the resource from the web.
*
* @param url the URL of the resource
* @param fetchOnline specifies if the resource should be loaded from the web if it is not available in the cache
* @return the parsed document as {@link plasmaParserDocument}
*/
public plasmaParserDocument retrieveDocument(URL url, boolean fetchOnline) {
byte[] resource = null;
IResourceInfo docInfo = null;
try {
// trying to load the resource body from cache
resource = this.cacheManager.loadResourceContent(url);
// if not available try to load resource from web
if ((fetchOnline) && (resource == null)) {
// download resource using crawler
plasmaHTCache.Entry entry = loadResourceFromWeb(url, 5000);
// fetching metadata of the resource (e.g. http headers for http resource)
if (entry != null) docInfo = entry.getDocumentInfo();
// getting the resource body from the cache
resource = this.cacheManager.loadResourceContent(url);
} else {
// trying to load resource metadata
docInfo = this.cacheManager.loadResourceInfo(url);
}
} catch (IOException e) {
e.printStackTrace();
// parsing document
if (resource == null) return null;
return parseDocument(url, resource, docInfo);
} catch (ParserException e) {
this.log.logWarning("Unable to parse resource. " + e.getMessage());
return null;
} catch (Exception e) {
this.log.logWarning("Unexpected error while retrieving document. " + e.getMessage(),e);
return null;
}
if (resource == null) return null;
return parseDocument(url, resource, docInfo);
}
public void storeToCache(String wordhashes, String urlhash, String snippet) {
@ -374,11 +426,11 @@ public class plasmaSnippetCache {
return map;
}
public plasmaParserDocument parseDocument(URL url, byte[] resource) {
public plasmaParserDocument parseDocument(URL url, byte[] resource) throws ParserException {
return parseDocument(url, resource, null);
}
public plasmaParserDocument parseDocument(URL url, byte[] resource, IResourceInfo docInfo) {
public plasmaParserDocument parseDocument(URL url, byte[] resource, IResourceInfo docInfo) throws ParserException {
try {
if (resource == null) return null;
@ -425,9 +477,15 @@ public class plasmaSnippetCache {
public byte[] getResource(URL url, boolean fetchOnline, int socketTimeout) {
// load the url as resource from the web
try {
// trying to load the resource body from cache
byte[] resource = cacheManager.loadResourceContent(url);
// if the content is not available in cache try to download it from web
if ((fetchOnline) && (resource == null)) {
// try to download the resource using a crawler
loadResourceFromWeb(url, (socketTimeout < 0) ? -1 : socketTimeout);
// get the content from cache
resource = cacheManager.loadResourceContent(url);
}
return resource;
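A hedged sketch of how getResource and the now-throwing parseDocument combine, mirroring the call in removeAllUrlReferences further down; the surrounding variables are assumptions:

try {
    // cache first, then web if fetchOnline is true, with a 10 second socket timeout
    byte[] content = snippetCache.getResource(url, true, 10000);
    if (content != null) {
        plasmaParserDocument document = snippetCache.parseDocument(url, content);
    }
} catch (ParserException e) {
    // new checked exception: each caller decides whether a parse failure is fatal
}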
@ -436,7 +494,7 @@ public class plasmaSnippetCache {
}
}
public plasmaHTCache.Entry loadResourceFromWeb(URL url, int socketTimeout) throws IOException {
public plasmaHTCache.Entry loadResourceFromWeb(URL url, int socketTimeout) throws plasmaCrawlerException {
plasmaHTCache.Entry result = this.sb.cacheLoader.loadSync(
url,
@ -144,6 +144,7 @@ import de.anomic.kelondro.kelondroMSetTools;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.kelondro.kelondroMapTable;
import de.anomic.plasma.dbImport.dbImportManager;
import de.anomic.plasma.parser.ParserException;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverAbstractSwitch;
import de.anomic.server.serverCodings;
@ -1392,7 +1393,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
}
private plasmaParserDocument parseResource(plasmaSwitchboardQueue.Entry entry, String initiatorHash) throws InterruptedException {
private plasmaParserDocument parseResource(plasmaSwitchboardQueue.Entry entry, String initiatorHash) throws InterruptedException, ParserException {
plasmaParserDocument document = null;
// the mimetype of this entry
@ -1402,29 +1403,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// the parser logger
serverLog parserLogger = parser.getLogger();
// if the document content is supported we can start to parse the content
if (plasmaParser.supportedContent(
entry.url(),
mimeType)
){
if ((entry.cacheFile().exists()) && (entry.cacheFile().length() > 0)) {
parserLogger.logFine("'" + entry.normalizedURLString() + "' is not parsed yet, parsing now from File");
document = parser.parseSource(entry.url(), mimeType, charset, entry.cacheFile());
} else {
parserLogger.logFine("'" + entry.normalizedURLString() + "' cannot be parsed, no resource available");
addURLtoErrorDB(entry.url(), entry.referrerHash(), initiatorHash, entry.anchorName(), plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT, new bitfield(indexURL.urlFlagLength));
}
if (document == null) {
parserLogger.logSevere("'" + entry.normalizedURLString() + "' parse failure");
addURLtoErrorDB(entry.url(), entry.referrerHash(), initiatorHash, entry.anchorName(), plasmaCrawlEURL.DENIED_PARSER_ERROR, new bitfield(indexURL.urlFlagLength));
}
} else {
parserLogger.logFine("'" + entry.normalizedURLString() + "'. Unsupported mimeType '" + ((mimeType == null) ? "null" : mimeType) + "'.");
addURLtoErrorDB(entry.url(), entry.referrerHash(), initiatorHash, entry.anchorName(), plasmaCrawlEURL.DENIED_WRONG_MIMETYPE_OR_EXT, new bitfield(indexURL.urlFlagLength));
}
checkInterruption();
return document;
// parse the document
return parseResource(entry.url(), mimeType, charset, entry.cacheFile());
}
public plasmaParserDocument parseResource(URL location, String mimeType, String documentCharset, File sourceFile) throws InterruptedException, ParserException {
plasmaParserDocument doc = parser.parseSource(location, mimeType, documentCharset, sourceFile);
assert(doc != null) : "Unexpected error. Parser returned null.";
return doc;
}
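A sketch of the new public parseResource helper in use — the mime type, charset, and file are placeholders; e.getErrorCode() is the same accessor used in processResourceStack below:

try {
    plasmaParserDocument doc = sb.parseResource(location, "text/html", "UTF-8", cacheFile);
} catch (ParserException e) {
    // e.getErrorCode() maps to a plasmaCrawlEURL error constant for the error DB
} catch (InterruptedException e) {
    // shutdown in progress
}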
private void processResourceStack(plasmaSwitchboardQueue.Entry entry) throws InterruptedException {
@ -1471,8 +1457,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
plasmaParserDocument document = null;
parsingStartTime = System.currentTimeMillis();
try {
document = this.parseResource(entry, initiatorPeerHash);
if (document == null) return;
} catch (ParserException e) {
this.log.logInfo("Unable to parse the resource '" + entry.url() + "'. " + e.getMessage());
addURLtoErrorDB(entry.url(), entry.referrerHash(), initiatorPeerHash, entry.anchorName(), e.getErrorCode(), new bitfield(indexURL.urlFlagLength));
return;
}
parsingEndTime = System.currentTimeMillis();
@ -2172,16 +2164,22 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// determine the url string
plasmaCrawlLURL.Entry entry = urlPool.loadedURL.load(urlhash, null);
if (entry == null) return 0;
URL url = entry.url();
if (url == null) return 0;
// get set of words
// Set words = plasmaCondenser.getWords(getText(getResource(url, fetchOnline)));
Iterator witer = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline, 10000)).getText());
// delete all word references
int count = removeReferences(urlhash, witer);
// finally delete the url entry itself
urlPool.loadedURL.remove(urlhash);
return count;
try {
// get set of words
// Set words = plasmaCondenser.getWords(getText(getResource(url, fetchOnline)));
Iterator witer = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline, 10000)).getText());
// delete all word references
int count = removeReferences(urlhash, witer);
// finally delete the url entry itself
urlPool.loadedURL.remove(urlhash);
return count;
} catch (ParserException e) {
return 0;
}
}
public int removeReferences(URL url, Set words) {
@ -188,6 +188,10 @@ public final class serverByteBuffer extends OutputStream {
public serverByteBuffer append(String s) {
return append(s.getBytes());
}
public serverByteBuffer append(String s, String charset) throws UnsupportedEncodingException {
return append(s.getBytes(charset));
}
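A short usage sketch for the new encoding-aware append — the default constructor is an assumption; the existing append(String) falls back to the platform encoding:

serverByteBuffer buffer = new serverByteBuffer();
try {
    buffer.append("snippet text with umlauts: äöü", "UTF-8");
} catch (UnsupportedEncodingException e) {
    // only thrown for unknown charset names
}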
public serverByteBuffer append(serverByteBuffer bb) {
return append(bb.buffer, bb.offset, bb.length);
@ -73,24 +73,39 @@ import de.anomic.kelondro.kelondroRowSet;
public final class serverFileUtils {
private static final int DEFAULT_BUFFER_SIZE = 4096;
public static long copy(InputStream source, OutputStream dest) throws IOException {
return copy(source, dest, -1);
}
/**
* Copies an InputStream to an OutputStream.
* @param source InputStream
* @param dest OutputStream
* @param count the total amount of bytes to copy
* @return Total number of bytes copied.
*
* @see copy(InputStream source, File dest)
* @see copyRange(File source, OutputStream dest, int start)
* @see copy(File source, OutputStream dest)
* @see copy(File source, File dest)
*/
public static int copy(InputStream source, OutputStream dest) throws IOException {
byte[] buffer = new byte[4096];
public static long copy(InputStream source, OutputStream dest, long count) throws IOException {
byte[] buffer = new byte[DEFAULT_BUFFER_SIZE];
int chunkSize = (int) ((count > 0) ? Math.min(count, DEFAULT_BUFFER_SIZE) : DEFAULT_BUFFER_SIZE);
int c, total = 0;
while ((c = source.read(buffer)) > 0) {
int c; long total = 0;
while ((c = source.read(buffer,0,chunkSize)) > 0) {
dest.write(buffer, 0, c);
dest.flush();
total += c;
if (count > 0) {
chunkSize = (int)Math.min(count-total,DEFAULT_BUFFER_SIZE);
if (chunkSize == 0) break;
}
}
dest.flush();
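A sketch of the bounded copy above — the file name is a placeholder; a count <= 0 preserves the old copy-everything behaviour:

InputStream in = new FileInputStream("resource.bin");
try {
    ByteArrayOutputStream head = new ByteArrayOutputStream();
    long copied = serverFileUtils.copy(in, head, 1024); // copies at most the first 1024 bytes
} finally {
    in.close();
}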
@ -165,21 +180,26 @@ public final class serverFileUtils {
}
return count;
}
public static void copy(InputStream source, File dest) throws IOException {
copy(source,dest,-1);
}
/**
* Copies an InputStream to a File.
* @param source InputStream
* @param dest File
* @param count the amount of bytes to copy
* @see copy(InputStream source, OutputStream dest)
* @see copyRange(File source, OutputStream dest, int start)
* @see copy(File source, OutputStream dest)
* @see copy(File source, File dest)
*/
public static void copy(InputStream source, File dest) throws IOException {
public static void copy(InputStream source, File dest, long count) throws IOException {
FileOutputStream fos = null;
try {
fos = new FileOutputStream(dest);
copy(source, fos);
copy(source, fos, count);
} finally {
if (fos != null) try {fos.close();} catch (Exception e) {}
}
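And the File-targeted variant, e.g. for materializing only a prefix of a stream on disk — the paths are placeholders:

InputStream in = new FileInputStream("source.dat"); // placeholder path
try {
    serverFileUtils.copy(in, new File("preview.part"), 4096); // keeps only the first 4096 bytes
} finally {
    in.close();
}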
@ -201,7 +221,7 @@ public final class serverFileUtils {
fis = new FileInputStream(source);
long skipped = fis.skip(start);
if (skipped != start) throw new IllegalStateException("Unable to skip '" + start + "' bytes. Only '" + skipped + "' bytes skipped.");
copy(fis, dest);
copy(fis, dest,-1);
} finally {
if (fis != null) try { fis.close(); } catch (Exception e) {}
}
@ -220,28 +240,33 @@ public final class serverFileUtils {
InputStream fis = null;
try {
fis = new FileInputStream(source);
copy(fis, dest);
copy(fis, dest, -1);
} finally {
if (fis != null) try { fis.close(); } catch (Exception e) {}
}
}
public static void copy(File source, File dest) throws IOException {
copy(source,dest,-1);
}
/**
* Copies a File to a File.
* @param source File
* @param dest File
* @param count the amount of bytes to copy
* @see copy(InputStream source, OutputStream dest)
* @see copy(InputStream source, File dest)
* @see copyRange(File source, OutputStream dest, int start)
* @see copy(File source, OutputStream dest)
*/
public static void copy(File source, File dest) throws IOException {
public static void copy(File source, File dest, long count) throws IOException {
FileInputStream fis = null;
FileOutputStream fos = null;
try {
fis = new FileInputStream(source);
fos = new FileOutputStream(dest);
copy(fis, fos);
copy(fis, fos, count);
} finally {
if (fis != null) try {fis.close();} catch (Exception e) {}
if (fos != null) try {fos.close();} catch (Exception e) {}
@ -250,7 +275,7 @@ public final class serverFileUtils {
public static byte[] read(InputStream source) throws IOException {
ByteArrayOutputStream baos = new ByteArrayOutputStream();
copy(source, baos);
copy(source, baos, -1);
baos.close();
return baos.toByteArray();
}
@ -309,7 +334,7 @@ public final class serverFileUtils {
}
public static void write(byte[] source, OutputStream dest) throws IOException {
copy(new ByteArrayInputStream(source), dest);
copy(new ByteArrayInputStream(source), dest, -1);
}
public static void write(byte[] source, File dest) throws IOException {