added new configuration property "crawler.embedLinksAsDocuments". If this is switched on (which is now the default), all embedded image, audio and video links from all parsed documents are added to the search index as individual documents. This will increase the search index size dramatically, but it also enables a much faster image, audio and video search. If the flag is switched on, the index entries are additionally stored to a Solr index, provided that Solr indexing is enabled as well.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7931 6c8d7289-2bf4-0310-a012-ef5d649a1542
branch: pull/1/head
author: orbiter
parent: e02bfbde56
commit: 49e5ca579f

@ -704,6 +704,10 @@ crawler.file.maxFileSize=100000000
# maximum number of crawler threads
crawler.MaxActiveThreads = 200
# flag: consider all embedded image/audio/video document links
# from all crawled documents as their own documents
crawler.embedLinksAsDocuments = true
# maximum size of indexing queue
indexer.slots = 100
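
For illustration, a minimal sketch (not part of the commit) of how a caller can read the new flag and hand it to the parser; the serverSwitch instance `sb` and the surrounding call site are assumptions, while getConfigBool and the extended parseSource signature follow the changes in this diff:

    // hypothetical caller; 'sb' is an assumed serverSwitch/Switchboard instance
    final boolean embedLinksAsDocuments = sb.getConfigBool("crawler.embedLinksAsDocuments", false);
    final Document[] documents = TextParser.parseSource(
            response.url(),
            response.getMimeType(),
            response.getCharacterEncoding(),
            response.getContent(),
            embedLinksAsDocuments); // true: embedded image/audio/video links become their own documents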

@ -9,7 +9,7 @@
// $LastChangedBy$
//
// LICENSE
//
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
@ -43,7 +43,7 @@ import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.ResultURLs.EventOrigin;
public class Response {
// doctypes:
public static final char DT_PDFPS = 'p';
public static final char DT_TEXT = 't';
@ -65,7 +65,7 @@ public class Response {
private final CrawlProfile profile;
private byte[] content;
private int status; // tracker indexing status, see status defs below
// doctype calculation
public static char docType(final DigestURI url) {
final String path = url.getPath().toLowerCase();
@ -136,14 +136,14 @@ public class Response {
//zip = application/zip
return doctype;
}
public static final int QUEUE_STATE_FRESH = 0;
public static final int QUEUE_STATE_PARSING = 1;
public static final int QUEUE_STATE_CONDENSING = 2;
public static final int QUEUE_STATE_STRUCTUREANALYSIS = 3;
public static final int QUEUE_STATE_INDEXSTORAGE = 4;
public static final int QUEUE_STATE_FINISHED = 5;
public Response(
final Request request,
final RequestHeader requestHeader,
@ -160,7 +160,7 @@ public class Response {
this.status = QUEUE_STATE_FRESH;
this.content = content;
}
public Response(final Request request, final CrawlProfile profile) {
this.request = request;
// request and response headers may be zero in case that we process surrogates
@ -172,7 +172,7 @@ public class Response {
this.status = QUEUE_STATE_FRESH;
this.content = request.url().toTokens().getBytes();
}
public Response(
final Request request,
final RequestHeader requestHeader,
@ -185,15 +185,15 @@ public class Response {
public void updateStatus(final int newStatus) {
this.status = newStatus;
}
public ResponseHeader getResponseHeader() {
return this.responseHeader;
}
public int getStatus() {
return this.status;
}
public String name() {
// the anchor name; can be either the text inside the anchor tag or the
// page description after loading of the page
@ -203,7 +203,7 @@ public class Response {
public DigestURI url() {
return this.request.url();
}
public char docType() {
char doctype = docType(getMimeType());
if (doctype == DT_UNKNOWN) doctype = docType(url());
@ -212,21 +212,21 @@ public class Response {
public Date lastModified() {
Date docDate = null;
if (responseHeader != null) {
docDate = responseHeader.lastModified();
if (docDate == null) docDate = responseHeader.date();
if (this.responseHeader != null) {
docDate = this.responseHeader.lastModified();
if (docDate == null) docDate = this.responseHeader.date();
}
if (docDate == null && request != null) docDate = request.appdate();
if (docDate == null) docDate = new Date(GenericFormatter.correctedUTCTime());
if (docDate == null && this.request != null) docDate = this.request.appdate();
if (docDate == null) docDate = new Date(GenericFormatter.correctedUTCTime());
return docDate;
}
public String language() {
// please avoid this method if a condenser document is available, because the condenser has a built-in language detection
// this here is only a guess using the TLD
return this.url().language();
return url().language();
}
public CrawlProfile profile() {
@ -272,9 +272,9 @@ public class Response {
*/
public String shallStoreCacheForProxy() {
String crawlerReason = shallStoreCacheForCrawler();
final String crawlerReason = shallStoreCacheForCrawler();
if (crawlerReason != null) return crawlerReason;
// check profile (disabled: we will check this in the plasmaSwitchboard)
// if (!this.profile.storeHTCache()) { return "storage_not_wanted"; }
@ -285,19 +285,19 @@ public class Response {
// -CGI access in request
// CGI access makes the page very individual, and therefore not usable
// in caches
if (this.url().isPOST() && this.profile != null && !this.profile.crawlingQ()) {
if (url().isPOST() && this.profile != null && !this.profile.crawlingQ()) {
return "dynamic_post";
}
if (this.url().isCGI()) {
if (url().isCGI()) {
return "dynamic_cgi";
}
if (this.url().isLocal()) {
if (url().isLocal()) {
return "local_URL_no_cache_needed";
}
if (responseHeader != null) {
if (this.responseHeader != null) {
// -if-modified-since in request
// we do not care about if-modified-since, because this case only occurs if the
@ -315,7 +315,7 @@ public class Response {
// -pragma in response
// if we have a pragma non-cache, we don't cache. usually if this is wanted from
// the server, it makes sense
String cacheControl = responseHeader.get(HeaderFramework.PRAGMA);
String cacheControl = this.responseHeader.get(HeaderFramework.PRAGMA);
if (cacheControl != null && cacheControl.trim().toUpperCase().equals("NO-CACHE")) { return "controlled_no_cache"; }
// -expires in response
@ -324,12 +324,12 @@ public class Response {
// -cache-control in response
// the cache-control has many value options.
cacheControl = responseHeader.get(HeaderFramework.CACHE_CONTROL);
cacheControl = this.responseHeader.get(HeaderFramework.CACHE_CONTROL);
if (cacheControl != null) {
cacheControl = cacheControl.trim().toUpperCase();
if (cacheControl.startsWith("MAX-AGE=")) {
// we need also the load date
final Date date = responseHeader.date();
final Date date = this.responseHeader.date();
if (date == null) return "stale_no_date_given_in_response";
try {
final long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // milliseconds to live
@ -349,35 +349,35 @@ public class Response {
public String shallStoreCacheForCrawler() {
// check storage size: all files will be handled in RAM before storage, so they must not exceed
// a given size, which we consider as 10MB
if (this.size() > 10 * 1024L * 1024L) return "too_large_for_caching_" + this.size();
if (size() > 10 * 1024L * 1024L) return "too_large_for_caching_" + size();
// check status code
if (!validResponseStatus()) {
return "bad_status_" + this.responseStatus;
}
if (requestHeader != null) {
if (this.requestHeader != null) {
// -authorization cases in request
// authorization makes pages very individual, and therefore we cannot use the
// content in the cache
if (requestHeader.containsKey(RequestHeader.AUTHORIZATION)) { return "personalized"; }
if (this.requestHeader.containsKey(RequestHeader.AUTHORIZATION)) { return "personalized"; }
// -ranges in request and response
// we do not cache partial content
if (requestHeader.containsKey(HeaderFramework.RANGE)) { return "partial_request"; }
if (this.requestHeader.containsKey(HeaderFramework.RANGE)) { return "partial_request"; }
}
if (responseHeader != null) {
if (this.responseHeader != null) {
// -ranges in request and response
// we do not cache partial content
if (responseHeader.containsKey(HeaderFramework.CONTENT_RANGE)) { return "partial_response"; }
// we do not cache partial content
if (this.responseHeader.containsKey(HeaderFramework.CONTENT_RANGE)) { return "partial_response"; }
}
return null;
}
/**
* decide upon header information if a specific file should be taken from
* the cache or not
*
*
* @return whether the file should be taken from the cache
*/
public boolean isFreshForProxy() {
@ -385,27 +385,27 @@ public class Response {
// -CGI access in request
// CGI access makes the page very individual, and therefore not usable
// in caches
if (this.url().isPOST()) {
if (url().isPOST()) {
return false;
}
if (this.url().isCGI()) {
if (url().isCGI()) {
return false;
}
String cacheControl;
if (requestHeader != null) {
if (this.requestHeader != null) {
// -authorization cases in request
if (requestHeader.containsKey(RequestHeader.AUTHORIZATION)) { return false; }
if (this.requestHeader.containsKey(RequestHeader.AUTHORIZATION)) { return false; }
// -ranges in request
// we do not cache partial content
if (requestHeader.containsKey(HeaderFramework.RANGE)) { return false; }
if (this.requestHeader.containsKey(HeaderFramework.RANGE)) { return false; }
// if the client requests an un-cached copy of the resource ...
cacheControl = requestHeader.get(HeaderFramework.PRAGMA);
cacheControl = this.requestHeader.get(HeaderFramework.PRAGMA);
if (cacheControl != null && cacheControl.trim().toUpperCase().equals("NO-CACHE")) { return false; }
cacheControl = requestHeader.get(HeaderFramework.CACHE_CONTROL);
cacheControl = this.requestHeader.get(HeaderFramework.CACHE_CONTROL);
if (cacheControl != null) {
cacheControl = cacheControl.trim().toUpperCase();
if (cacheControl.startsWith("NO-CACHE") || cacheControl.startsWith("MAX-AGE=0")) { return false; }
@ -414,14 +414,14 @@ public class Response {
// -if-modified-since in request
// The entity has to be transferred only if it has
// been modified since the date given by the If-Modified-Since header.
if (requestHeader.containsKey(RequestHeader.IF_MODIFIED_SINCE)) {
if (this.requestHeader.containsKey(RequestHeader.IF_MODIFIED_SINCE)) {
// checking this only makes sense if the cached response contains
// a Last-Modified field. If the field does not exist, we go the safe way
if (!responseHeader.containsKey(HeaderFramework.LAST_MODIFIED)) { return false; }
if (!this.responseHeader.containsKey(HeaderFramework.LAST_MODIFIED)) { return false; }
// parse date
Date d1, d2;
d2 = responseHeader.lastModified(); if (d2 == null) { d2 = new Date(GenericFormatter.correctedUTCTime()); }
d1 = requestHeader.ifModifiedSince(); if (d1 == null) { d1 = new Date(GenericFormatter.correctedUTCTime()); }
d2 = this.responseHeader.lastModified(); if (d2 == null) { d2 = new Date(GenericFormatter.correctedUTCTime()); }
d1 = this.requestHeader.ifModifiedSince(); if (d1 == null) { d1 = new Date(GenericFormatter.correctedUTCTime()); }
// finally, we shall treat the cache as stale if the modification time is after the if-.. time
if (d2.after(d1)) { return false; }
}
@ -433,48 +433,48 @@ public class Response {
// but we think that pictures can still be considered as fresh
// -set-cookie in cached response
// this is a similar case as for COOKIE.
if (requestHeader.containsKey(RequestHeader.COOKIE) ||
responseHeader.containsKey(HeaderFramework.SET_COOKIE) ||
responseHeader.containsKey(HeaderFramework.SET_COOKIE2)) {
if (this.requestHeader.containsKey(RequestHeader.COOKIE) ||
this.responseHeader.containsKey(HeaderFramework.SET_COOKIE) ||
this.responseHeader.containsKey(HeaderFramework.SET_COOKIE2)) {
return false; // too strong
}
}
}
if (responseHeader != null) {
if (this.responseHeader != null) {
// -pragma in cached response
// logically, we would not need to care about no-cache pragmas in cached response headers,
// because they cannot exist since they are not written to the cache.
// So this IF should always fail..
cacheControl = responseHeader.get(HeaderFramework.PRAGMA);
cacheControl = this.responseHeader.get(HeaderFramework.PRAGMA);
if (cacheControl != null && cacheControl.trim().toUpperCase().equals("NO-CACHE")) { return false; }
// see for documentation also:
// http://www.web-caching.com/cacheability.html
// http://vancouver-webpages.com/CacheNow/
// look for freshness information
// if we don't have any freshness indication, we treat the file as stale.
// no handle for freshness control:
// -expires in cached response
// the expires value gives us a very easy hint when the cache is stale
final Date expires = responseHeader.expires();
final Date expires = this.responseHeader.expires();
if (expires != null) {
// System.out.println("EXPIRES-TEST: expires=" + expires + ", NOW=" + serverDate.correctedGMTDate() + ", url=" + url);
if (expires.before(new Date(GenericFormatter.correctedUTCTime()))) { return false; }
}
final Date lastModified = responseHeader.lastModified();
cacheControl = responseHeader.get(HeaderFramework.CACHE_CONTROL);
final Date lastModified = this.responseHeader.lastModified();
cacheControl = this.responseHeader.get(HeaderFramework.CACHE_CONTROL);
if (cacheControl == null && lastModified == null && expires == null) { return false; }
// -lastModified in cached response
// we can apply a TTL (Time To Live) heuristic here. We call the time delta between the last read
// of the file and the last modified date as the age of the file. If we consider the file as
// middle-aged then, the maximum TTL would be cache-creation plus age.
// This would be a TTL factor of 100% we want no more than 10% TTL, so that a 10 month old cache
// file may only be treated as fresh for one more month, not more.
Date date = responseHeader.date();
Date date = this.responseHeader.date();
if (lastModified != null) {
if (date == null) { date = new Date(GenericFormatter.correctedUTCTime()); }
final long age = date.getTime() - lastModified.getTime();
@ -484,7 +484,7 @@ public class Response {
// therefore the cache is stale, if serverDate.correctedGMTDate().getTime() - d2.getTime() > age/10
if (GenericFormatter.correctedUTCTime() - date.getTime() > age / 10) { return false; }
}
// -cache-control in cached response
// the cache-control has many value options.
if (cacheControl != null) {
@ -510,17 +510,17 @@ public class Response {
}
}
}
return true;
}
/**
* decide upon header information if a specific file should be indexed
* this method returns null if the answer is 'YES'!
* if the answer is 'NO' (do not index), it returns a string with the reason
* to reject the crawling demand in clear text
*
*
* This function is used by plasmaSwitchboard#processResourceStack
*/
public final String shallIndexCacheForProxy() {
@ -530,7 +530,7 @@ public class Response {
// check profile
if (!profile().indexText() && !profile().indexMedia()) {
return "indexing not allowed - indexText and indexMedia not set (for proxy = " + profile.name()+ ")";
return "indexing not allowed - indexText and indexMedia not set (for proxy = " + this.profile.name()+ ")";
}
// -CGI access in request
@ -556,7 +556,7 @@ public class Response {
return "Media_Content_(forbidden)";
}
*/
// -cookies in request
// unfortunately, we cannot index pages which have been requested with a cookie
// because the returned content may be special for the client
@ -565,19 +565,19 @@ public class Response {
return "Dynamic_(Requested_With_Cookie)";
}
if (responseHeader != null) {
if (this.responseHeader != null) {
// -set-cookie in response
// the set-cookie from the server does not indicate that the content is special
// thus we do not care about it here for indexing
// thus we do not care about it here for indexing
// a picture cannot be indexed
final String mimeType = responseHeader.mime();
final String mimeType = this.responseHeader.mime();
/*
if (Classification.isPictureMime(mimeType)) {
return "Media_Content_(Picture)";
}
*/
String parserError = TextParser.supportsMime(mimeType);
final String parserError = TextParser.supportsMime(mimeType);
if (parserError != null) {
return "Media_Content, no parser: " + parserError;
}
@ -585,9 +585,9 @@ public class Response {
// -if-modified-since in request
// if the page is fresh at the very moment we can index it
final Date ifModifiedSince = this.requestHeader.ifModifiedSince();
if ((ifModifiedSince != null) && (responseHeader.containsKey(HeaderFramework.LAST_MODIFIED))) {
if ((ifModifiedSince != null) && (this.responseHeader.containsKey(HeaderFramework.LAST_MODIFIED))) {
// parse date
Date d = responseHeader.lastModified();
Date d = this.responseHeader.lastModified();
if (d == null) {
d = new Date(GenericFormatter.correctedUTCTime());
}
@ -599,8 +599,8 @@ public class Response {
}
// -pragma in cached response
if (responseHeader.containsKey(HeaderFramework.PRAGMA) &&
(responseHeader.get(HeaderFramework.PRAGMA)).toUpperCase().equals("NO-CACHE")) {
if (this.responseHeader.containsKey(HeaderFramework.PRAGMA) &&
(this.responseHeader.get(HeaderFramework.PRAGMA)).toUpperCase().equals("NO-CACHE")) {
return "Denied_(pragma_no_cache)";
}
@ -613,7 +613,7 @@ public class Response {
// the expires value gives us a very easy hint when the cache is stale
// sometimes, the expires date is set to the past to prevent that a page is cached
// we use that information to see if we should index it
final Date expires = responseHeader.expires();
final Date expires = this.responseHeader.expires();
if (expires != null && expires.before(new Date(GenericFormatter.correctedUTCTime()))) {
return "Stale_(Expired)";
}
@ -624,7 +624,7 @@ public class Response {
// -cache-control in cached response
// the cache-control has many value options.
String cacheControl = responseHeader.get(HeaderFramework.CACHE_CONTROL);
String cacheControl = this.responseHeader.get(HeaderFramework.CACHE_CONTROL);
if (cacheControl != null) {
cacheControl = cacheControl.trim().toUpperCase();
/* we have the following cases for cache-control:
@ -641,7 +641,7 @@ public class Response {
// // ok, do nothing
} else if (cacheControl.startsWith("MAX-AGE=")) {
// we need also the load date
final Date date = responseHeader.date();
final Date date = this.responseHeader.date();
if (date == null) {
return "Stale_(no_date_given_in_response)";
}
@ -675,7 +675,7 @@ public class Response {
// check profile
if (!profile().indexText() && !profile().indexMedia()) {
return "indexing not allowed - indexText and indexMedia not set (for crawler = " + profile.name() + ")";
return "indexing not allowed - indexText and indexMedia not set (for crawler = " + this.profile.name() + ")";
}
// -CGI access in request
@ -692,9 +692,9 @@ public class Response {
// we checked that in shallStoreCache
// check if document can be indexed
if (responseHeader != null) {
final String mimeType = responseHeader.mime();
String parserError = TextParser.supportsMime(mimeType);
if (this.responseHeader != null) {
final String mimeType = this.responseHeader.mime();
final String parserError = TextParser.supportsMime(mimeType);
if (parserError != null && TextParser.supportsExtension(url()) != null) return "no parser available: " + parserError;
}
/*
@ -703,7 +703,7 @@ public class Response {
return "Media_Content_(forbidden)";
}
*/
// -if-modified-since in request
// if the page is fresh at the very moment we can index it
// -> this does not apply for the crawler
@ -739,36 +739,36 @@ public class Response {
return null;
}
public String getMimeType() {
if (responseHeader == null) return null;
String mimeType = responseHeader.mime();
if (this.responseHeader == null) return null;
String mimeType = this.responseHeader.mime();
mimeType = mimeType.trim().toLowerCase();
final int pos = mimeType.indexOf(';');
return ((pos < 0) ? mimeType : mimeType.substring(0, pos));
return ((pos < 0) ? mimeType : mimeType.substring(0, pos));
}
public String getCharacterEncoding() {
if (responseHeader == null) return null;
return responseHeader.getCharacterEncoding();
if (this.responseHeader == null) return null;
return this.responseHeader.getCharacterEncoding();
}
public DigestURI referrerURL() {
if (requestHeader == null) return null;
if (this.requestHeader == null) return null;
try {
String r = requestHeader.get(RequestHeader.REFERER, null);
final String r = this.requestHeader.get(RequestHeader.REFERER, null);
if (r == null) return null;
return new DigestURI(r);
} catch (final Exception e) {
return null;
}
}
public byte[] referrerHash() {
if (requestHeader == null) return null;
String u = requestHeader.get(RequestHeader.REFERER, "");
if (this.requestHeader == null) return null;
final String u = this.requestHeader.get(RequestHeader.REFERER, "");
if (u == null || u.length() == 0) return null;
try {
return new DigestURI(u).hash();
@ -776,27 +776,27 @@ public class Response {
return null;
}
}
public boolean validResponseStatus() {
return (responseStatus == null) ? false : responseStatus.startsWith("200") || responseStatus.startsWith("203");
return (this.responseStatus == null) ? false : this.responseStatus.startsWith("200") || this.responseStatus.startsWith("203");
}
public Date ifModifiedSince() {
return (requestHeader == null) ? null : requestHeader.ifModifiedSince();
return (this.requestHeader == null) ? null : this.requestHeader.ifModifiedSince();
}
public boolean requestWithCookie() {
return (requestHeader == null) ? false : requestHeader.containsKey(RequestHeader.COOKIE);
return (this.requestHeader == null) ? false : this.requestHeader.containsKey(RequestHeader.COOKIE);
}
public boolean requestProhibitsIndexing() {
return (requestHeader == null)
? false
: requestHeader.containsKey(HeaderFramework.X_YACY_INDEX_CONTROL) &&
(requestHeader.get(HeaderFramework.X_YACY_INDEX_CONTROL)).toUpperCase().equals("NO-INDEX");
return (this.requestHeader == null)
? false
: this.requestHeader.containsKey(HeaderFramework.X_YACY_INDEX_CONTROL) &&
(this.requestHeader.get(HeaderFramework.X_YACY_INDEX_CONTROL)).toUpperCase().equals("NO-INDEX");
}
public EventOrigin processCase(String mySeedHash) {
public EventOrigin processCase(final String mySeedHash) {
// we must distinguish the following cases: resource-load was initiated by
// 1) global crawling: the index is extern, not here (not possible here)
// 2) result of search queries, some indexes are here (not possible here)
@ -818,13 +818,13 @@ public class Response {
}
return processCase;
}
public Document[] parse() throws Parser.Failure {
String supportError = TextParser.supports(url(), this.responseHeader == null ? null : this.responseHeader.mime());
final String supportError = TextParser.supports(url(), this.responseHeader == null ? null : this.responseHeader.mime());
if (supportError != null) throw new Parser.Failure("no parser support:" + supportError, url());
try {
return TextParser.parseSource(url(), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? "UTF-8" : this.responseHeader.getCharacterEncoding(), this.content);
} catch (Exception e) {
return TextParser.parseSource(url(), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? "UTF-8" : this.responseHeader.getCharacterEncoding(), this.content, false);
} catch (final Exception e) {
return null;
}

@ -35,7 +35,6 @@ import java.util.Date;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import net.yacy.cora.document.UTF8;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
@ -53,26 +52,26 @@ import net.yacy.kelondro.logging.Log;
*
*/
public class DocumentIndex extends Segment {
private static final RankingProfile textRankingDefault = new RankingProfile(ContentDomain.TEXT);
//private Bitfield zeroConstraint = new Bitfield(4);
private static DigestURI poison;
static {
try {
poison = new DigestURI("file://.");
} catch (MalformedURLException e) {}
} catch (final MalformedURLException e) {}
}
BlockingQueue<DigestURI> queue; // a queue of document ID's
private Worker[] worker;
private final Worker[] worker;
CallbackListener callback;
static final ThreadGroup workerThreadGroup = new ThreadGroup("workerThreadGroup");
public DocumentIndex(final File segmentPath, CallbackListener callback, int cachesize) throws IOException {
public DocumentIndex(final File segmentPath, final CallbackListener callback, final int cachesize) throws IOException {
super(new Log("DocumentIndex"), segmentPath, cachesize, targetFileSize * 4 - 1, false, false);
int cores = Runtime.getRuntime().availableProcessors() + 1;
final int cores = Runtime.getRuntime().availableProcessors() + 1;
this.callback = callback;
this.queue = new LinkedBlockingQueue<DigestURI>(cores * 300);
this.worker = new Worker[cores];
@ -81,46 +80,48 @@ public class DocumentIndex extends Segment {
this.worker[i].start();
}
}
class Worker extends Thread {
public Worker(int count) {
public Worker(final int count) {
super(workerThreadGroup, "query-" + count);
}
@Override
public void run() {
DigestURI f;
URIMetadataRow resultRow;
URIMetadataRow[] resultRows;
try {
while ((f = queue.take()) != poison) try {
resultRow = add(f);
if (callback != null) {
if (resultRow == null) {
callback.fail(f, "result is null");
} else {
callback.commit(f, resultRow);
while ((f = DocumentIndex.this.queue.take()) != poison) try {
resultRows = add(f);
for (final URIMetadataRow resultRow: resultRows) {
if (DocumentIndex.this.callback != null) {
if (resultRow == null) {
DocumentIndex.this.callback.fail(f, "result is null");
} else {
DocumentIndex.this.callback.commit(f, resultRow);
}
}
}
} catch (IOException e) {
} catch (final IOException e) {
if (e.getMessage().indexOf("cannot parse") < 0) Log.logException(e);
callback.fail(f, e.getMessage());
DocumentIndex.this.callback.fail(f, e.getMessage());
}
} catch (InterruptedException e) {}
} catch (final InterruptedException e) {}
}
}
/**
* get the number of pending documents in the indexing queue
*/
public int pending() {
return this.queue.size();
}
public void clearQueue() {
this.queue.clear();
}
private URIMetadataRow add(DigestURI url) throws IOException {
private URIMetadataRow[] add(final DigestURI url) throws IOException {
if (url == null) throw new IOException("file = null");
if (url.isDirectory()) throw new IOException("file should be a document, not a path");
if (!url.canRead()) throw new IOException("cannot read file");
@ -128,17 +129,20 @@ public class DocumentIndex extends Segment {
long length;
try {
length = url.length();
} catch (Exception e) {
} catch (final Exception e) {
length = -1;
}
try {
documents = TextParser.parseSource(url, null, null, length, url.getInputStream(null, -1));
} catch (Exception e) {
documents = TextParser.parseSource(url, null, null, length, url.getInputStream(null, -1), true);
} catch (final Exception e) {
throw new IOException("cannot parse " + url.toString() + ": " + e.getMessage());
}
Document document = Document.mergeDocuments(url, null, documents);
final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib);
return super.storeDocument(
//Document document = Document.mergeDocuments(url, null, documents);
final URIMetadataRow[] rows = new URIMetadataRow[documents.length];
int c = 0;
for (final Document document: documents) {
final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib);
rows[c++] = super.storeDocument(
url,
null,
new Date(url.lastModified()),
@ -149,25 +153,27 @@ public class DocumentIndex extends Segment {
null,
DocumentIndex.class.getName() + ".add"
);
}
return rows;
}
/**
* add a file or a directory of files to the index
* If the given file is a path to a directory, the complete sub-tree is indexed
* @param start
*/
public void addConcurrent(DigestURI start) throws IOException {
public void addConcurrent(final DigestURI start) throws IOException {
assert (start != null);
assert (start.canRead()) : start.toString();
if (!start.isDirectory()) {
try {
this.queue.put(start);
} catch (InterruptedException e) {}
} catch (final InterruptedException e) {}
return;
}
String[] s = start.list();
final String[] s = start.list();
DigestURI w;
for (String t: s) {
for (final String t: s) {
try {
w = new DigestURI(start, t);
if (w.canRead() && !w.isHidden()) {
@ -176,31 +182,31 @@ public class DocumentIndex extends Segment {
} else {
try {
this.queue.put(w);
} catch (InterruptedException e) {}
} catch (final InterruptedException e) {}
}
}
} catch (MalformedURLException e1) {
} catch (final MalformedURLException e1) {
Log.logException(e1);
}
}
}
/**
* do a full-text search of a given string and return a specific number of results
* @param querystring
* @param count
* @return a list of files that contain the given string
*/
public ArrayList<DigestURI> find(String querystring, int count) {
*/
public ArrayList<DigestURI> find(final String querystring, int count) {
// make a query and start a search
QueryParams query = new QueryParams(querystring, count, null, this, textRankingDefault, "DocumentIndex");
ReferenceOrder order = new ReferenceOrder(query.ranking, UTF8.getBytes(query.targetlang));
RankingProcess rankedCache = new RankingProcess(query, order, SearchEvent.max_results_preparation);
final QueryParams query = new QueryParams(querystring, count, null, this, textRankingDefault, "DocumentIndex");
final ReferenceOrder order = new ReferenceOrder(query.ranking, UTF8.getBytes(query.targetlang));
final RankingProcess rankedCache = new RankingProcess(query, order, SearchEvent.max_results_preparation);
rankedCache.start();
// search is running; retrieve results
URIMetadataRow row;
ArrayList<DigestURI> files = new ArrayList<DigestURI>();
final ArrayList<DigestURI> files = new ArrayList<DigestURI>();
Components metadata;
while ((row = rankedCache.takeURL(false, 1000)) != null) {
metadata = row.metadata();
@ -211,7 +217,7 @@ public class DocumentIndex extends Segment {
}
return files;
}
/**
* close the index.
* This terminates all worker threads and then closes the segment.
@ -219,27 +225,27 @@ public class DocumentIndex extends Segment {
@Override
public void close() {
// send termination signal to worker threads
for (int i = 0; i < this.worker.length; i++) {
for (final Worker element : this.worker) {
try {
this.queue.put(poison);
} catch (InterruptedException e) {}
} catch (final InterruptedException e) {}
}
// wait for termination
for (int i = 0; i < this.worker.length; i++) {
for (final Worker element : this.worker) {
try {
this.worker[i].join();
} catch (InterruptedException e) {}
element.join();
} catch (final InterruptedException e) {}
}
// close the segment
super.close();
}
public interface CallbackListener {
public void commit(DigestURI f, URIMetadataRow resultRow);
public void fail(DigestURI f, String failReason);
}
public static void main(String[] args) {
public static void main(final String[] args) {
// first argument: path to segment
// second argument: either 'add' or 'search'
// third and further arguments exist only in case the second argument is 'search': these are then the search words
@ -249,37 +255,37 @@ public class DocumentIndex extends Segment {
// DocumentIndex yacyindex search steht
System.setProperty("java.awt.headless", "true");
if (args.length < 3) return;
File segmentPath = new File(args[0]);
final File segmentPath = new File(args[0]);
System.out.println("using index files at " + segmentPath.getAbsolutePath());
CallbackListener callback = new CallbackListener() {
public void commit(DigestURI f, URIMetadataRow resultRow) {
final CallbackListener callback = new CallbackListener() {
public void commit(final DigestURI f, final URIMetadataRow resultRow) {
System.out.println("indexed: " + f.toString());
}
public void fail(DigestURI f, String failReason) {
public void fail(final DigestURI f, final String failReason) {
System.out.println("not indexed " + f.toString() + ": " + failReason);
}
};
try {
if (args[1].equals("add")) {
DigestURI f = new DigestURI(args[2]);
DocumentIndex di = new DocumentIndex(segmentPath, callback, 100000);
final DigestURI f = new DigestURI(args[2]);
final DocumentIndex di = new DocumentIndex(segmentPath, callback, 100000);
di.addConcurrent(f);
di.close();
} else {
String query = "";
for (int i = 2; i < args.length; i++) query += args[i];
query.trim();
DocumentIndex di = new DocumentIndex(segmentPath, callback, 100000);
ArrayList<DigestURI> results = di.find(query, 100);
for (DigestURI f: results) {
final DocumentIndex di = new DocumentIndex(segmentPath, callback, 100000);
final ArrayList<DigestURI> results = di.find(query, 100);
for (final DigestURI f: results) {
if (f != null) System.out.println(f.toString());
}
di.close();
}
} catch (IOException e) {
} catch (final IOException e) {
Log.logException(e);
}
//System.exit(0);
}
}

@ -1921,7 +1921,7 @@ public final class Switchboard extends serverSwitch {
assert response.getContent() != null;
try {
// parse the document
documents = TextParser.parseSource(response.url(), response.getMimeType(), response.getCharacterEncoding(), response.getContent());
documents = TextParser.parseSource(response.url(), response.getMimeType(), response.getCharacterEncoding(), response.getContent(), getConfigBool("crawler.embedLinksAsDocuments", false));
if (documents == null) {
throw new Parser.Failure("Parser returned null.", response.url());
}

@ -7,12 +7,12 @@
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
@ -29,27 +29,27 @@ import java.util.Set;
import net.yacy.kelondro.logging.Log;
public abstract class AbstractParser implements Parser {
protected final Log log = new Log("PARSER");
protected final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
protected final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
private final String name;
/**
* initialize a parser with a name
* @param name
*/
public AbstractParser(String name) {
public AbstractParser(final String name) {
this.name = name;
}
/**
* return the name of the parser
*/
public String getName() {
return this.name;
}
/**
* each parser must define a set of supported mime types
* @return a set of mime type strings that are supported
@ -57,7 +57,7 @@ public abstract class AbstractParser implements Parser {
public Set<String> supportedMimeTypes() {
return this.SUPPORTED_MIME_TYPES;
}
/**
* each parser must define a set of supported file extensions
* @return a set of file name extensions that are supported
@ -65,22 +65,22 @@ public abstract class AbstractParser implements Parser {
public Set<String> supportedExtensions() {
return this.SUPPORTED_EXTENSIONS;
}
/**
* check equivalence of parsers; this simply tests equality of parser names
* @param o
* @return
*/
public boolean equals(Object o) {
return this.getName().equals(((Parser) o).getName());
public boolean equals(final Object o) {
return getName().equals(((Parser) o).getName());
}
/**
* the hash code of a parser
* @return the hash code of the parser name string
*/
public int hashCode() {
return this.getName().hashCode();
return getName().hashCode();
}
}

@ -131,6 +131,10 @@ public class Document {
return this.parserObject;
}
public Set<String> getContentLanguages() {
return this.languages;
}
/**
* compute a set of languages that this document contains
* the language is not computed using a statistical analysis of the content, only from given metadata that came with the document

@ -31,6 +31,7 @@ import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
@ -58,6 +59,7 @@ import net.yacy.document.parser.vcfParser;
import net.yacy.document.parser.vsdParser;
import net.yacy.document.parser.xlsParser;
import net.yacy.document.parser.zipParser;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.document.parser.images.genericImageParser;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
@ -141,7 +143,8 @@ public final class TextParser {
final MultiProtocolURI location,
final String mimeType,
final String charset,
final File sourceFile
final File sourceFile,
final boolean multipleVirtualDocs
) throws InterruptedException, Parser.Failure {
BufferedInputStream sourceStream = null;
@ -154,7 +157,7 @@ public final class TextParser {
throw new Parser.Failure(errorMsg, location);
}
sourceStream = new BufferedInputStream(new FileInputStream(sourceFile));
docs = parseSource(location, mimeType, charset, sourceFile.length(), sourceStream);
docs = parseSource(location, mimeType, charset, sourceFile.length(), sourceStream, multipleVirtualDocs);
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e;
@ -164,6 +167,7 @@ public final class TextParser {
if (sourceStream != null) try { sourceStream.close(); } catch (final Exception ex) {}
}
for (final Document d: docs) { assert d.getText() != null; } // verify docs
return docs;
}
@ -171,7 +175,8 @@ public final class TextParser {
final MultiProtocolURI location,
String mimeType,
final String charset,
final byte[] content
final byte[] content,
final boolean multipleVirtualDocs
) throws Parser.Failure {
if (log.isFine()) log.logFine("Parsing '" + location + "' from byte-array");
mimeType = normalizeMimeType(mimeType);
@ -185,7 +190,12 @@ public final class TextParser {
}
assert !idioms.isEmpty() : "no parsers applied for url " + location.toNormalform(true, false);
return parseSource(location, mimeType, idioms, charset, content);
Document[] docs = parseSource(location, mimeType, idioms, charset, content);
// finally enrich the docs set with virtual docs from the enclosed documents
if (multipleVirtualDocs && docs.length == 1) docs = virtualDocs(docs[0]);
return docs;
}
public static Document[] parseSource(
@ -193,7 +203,8 @@ public final class TextParser {
String mimeType,
final String charset,
final long contentLength,
final InputStream sourceStream
final InputStream sourceStream,
final boolean multipleVirtualDocs
) throws Parser.Failure {
if (log.isFine()) log.logFine("Parsing '" + location + "' from stream");
mimeType = normalizeMimeType(mimeType);
@ -222,7 +233,12 @@ public final class TextParser {
} catch (final IOException e) {
throw new Parser.Failure(e.getMessage(), location);
}
return parseSource(location, mimeType, idioms, charset, b);
Document[] docs = parseSource(location, mimeType, idioms, charset, b);
// finally enrich the docs set with virtual docs from the enclosed documents
if (multipleVirtualDocs && docs.length == 1) docs = virtualDocs(docs[0]);
return docs;
}
private static Document[] parseSource(
@ -292,6 +308,7 @@ public final class TextParser {
}
}
for (final Document d: docs) { assert d.getText() != null : "mimeType = " + mimeType; } // verify docs
return docs;
}
@ -429,4 +446,73 @@ public final class TextParser {
if (grant) denyExtensionx.remove(ext); else denyExtensionx.put(ext, v);
}
/**
* produce virtual documents for each of the link that is contained in the document
* @param document
* @return
*/
public static Document[] virtualDocs(final Document document) {
final ArrayList<Document> docs = new ArrayList<Document>();
docs.add(document);
for (final Map.Entry<MultiProtocolURI, String> link: document.getApplinks().entrySet()) {
docs.add(genLinkDocs(docs, "application", link.getKey(), link.getValue(), document.getContentLanguages()));
}
for (final Map.Entry<MultiProtocolURI, String> link: document.getAudiolinks().entrySet()) {
docs.add(genLinkDocs(docs, "audio", link.getKey(), link.getValue(), document.getContentLanguages()));
}
for (final Map.Entry<MultiProtocolURI, String> link: document.getVideolinks().entrySet()) {
docs.add(genLinkDocs(docs, "video", link.getKey(), link.getValue(), document.getContentLanguages()));
}
for (final Entry<MultiProtocolURI, ImageEntry> link: document.getImages().entrySet()) {
docs.add(genImageDocs(docs, link.getValue()));
}
// finally return the list of documents
return docs.toArray(new Document[docs.size()]);
}
private final static Document genLinkDocs(final ArrayList<Document> docs, final String type, final MultiProtocolURI uri, final String descr, final Set<String> contentLanguages) {
//System.out.println("HTMLPARSER-LINK " + type + ": " + uri.toNormalform(true, false) + " / " + descr);
return new Document(
uri,
Classification.ext2mime(uri.getFileExtension()),
"UTF-8",
null,
contentLanguages,
null,
descr,
"",
"",
new String[]{descr},
type,
0.0f, 0.0f,
uri.toNormalform(false, false),
null,
null,
null,
false);
}
private final static Document genImageDocs(final ArrayList<Document> docs, final ImageEntry img) {
//System.out.println("HTMLPARSER-LINK image: " + img.url().toNormalform(true, false) + " / " + img.alt());
return new Document(
img.url(),
Classification.ext2mime(img.url().getFileExtension()),
"UTF-8",
null,
null,
null,
img.alt(),
"",
"",
new String[]{img.alt()},
"image",
0.0f, 0.0f,
img.url().toNormalform(false, false),
null,
null,
null,
false);
}
}
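
A usage sketch for the extended parseSource API (url, mime, charset and content are assumed placeholders, not names from the diff): when the last parameter is true and the parser produces a single document, the result is enriched with one virtual document per embedded application, audio, video and image link.

    // hypothetical call site; url, mime, charset and content are assumed to be in scope
    final Document[] docs = TextParser.parseSource(url, mime, charset, content, true);
    System.out.println("parsed " + docs.length + " documents"
            + " (1 page plus " + (docs.length - 1) + " virtual link documents)");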

@ -2,19 +2,19 @@
* MediawikiImporter
* Copyright 2008 by Michael Peter Christen
* First released 20.11.2008 at http://yacy.net
*
*
* This is a part of YaCy, a peer-to-peer based web search engine
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
@ -22,17 +22,6 @@
package net.yacy.document.importer;
import net.yacy.cora.document.UTF8;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.content.SurrogateReader;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.ByteBuffer;
import org.apache.tools.bzip2.CBZip2InputStream;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
@ -61,6 +50,17 @@ import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.zip.GZIPInputStream;
import net.yacy.cora.document.UTF8;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.content.SurrogateReader;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.ByteBuffer;
import org.apache.tools.bzip2.CBZip2InputStream;
import de.anomic.data.wiki.WikiCode;
import de.anomic.data.wiki.WikiParser;
@ -78,9 +78,9 @@ public class MediawikiImporter extends Thread implements Importer {
private static final byte[] pagestartb = UTF8.getBytes(pagestart);
private static final byte[] pageendb = UTF8.getBytes(pageend);
private static final int docspermbinxmlbz2 = 800; // documents per megabyte in an xml.bz2 mediawiki dump
public static Importer job; // if started from a servlet, this object is used to store the thread
public File sourcefile;
public File targetdir;
public int count;
@ -88,100 +88,100 @@ public class MediawikiImporter extends Thread implements Importer {
private final long docsize;
private final int approxdocs;
private String hostport, urlStub;
public MediawikiImporter(File sourcefile, File targetdir) {
public MediawikiImporter(final File sourcefile, final File targetdir) {
this.sourcefile = sourcefile;
this.docsize = sourcefile.length();
this.approxdocs = (int) (this.docsize * (long) docspermbinxmlbz2 / 1024L / 1024L);
this.approxdocs = (int) (this.docsize * docspermbinxmlbz2 / 1024L / 1024L);
this.targetdir = targetdir;
this.count = 0;
this.start = 0;
this.hostport = null;
this.urlStub = null;
}
public int count() {
return this.count;
}
public String source() {
return this.sourcefile.getAbsolutePath();
}
public String status() {
return "";
}
/**
* return the number of articles per second
* @return
*/
public int speed() {
if (count == 0) return 0;
return (int) ((long) count / Math.max(1L, runningTime() ));
if (this.count == 0) return 0;
return (int) (this.count / Math.max(1L, runningTime() ));
}
/**
* return the estimated remaining time until all records are processed, in seconds
* @return
*/
public long remainingTime() {
return Math.max(0, this.approxdocs - count) / Math.max(1, speed() );
return Math.max(0, this.approxdocs - this.count) / Math.max(1, speed() );
}
public long runningTime() {
return (System.currentTimeMillis() - start) / 1000L;
return (System.currentTimeMillis() - this.start) / 1000L;
}
public void run() {
this.start = System.currentTimeMillis();
try {
String targetstub = sourcefile.getName();
String targetstub = this.sourcefile.getName();
int p = targetstub.lastIndexOf("\\.");
if (p > 0) targetstub = targetstub.substring(0, p);
InputStream is = new BufferedInputStream(new FileInputStream(sourcefile), 1024 * 1024);
if (sourcefile.getName().endsWith(".bz2")) {
InputStream is = new BufferedInputStream(new FileInputStream(this.sourcefile), 1024 * 1024);
if (this.sourcefile.getName().endsWith(".bz2")) {
int b = is.read();
if (b != 'B') throw new IOException("Invalid bz2 content.");
b = is.read();
if (b != 'Z') throw new IOException("Invalid bz2 content.");
is = new CBZip2InputStream(is);
} else if (sourcefile.getName().endsWith(".gz")) {
} else if (this.sourcefile.getName().endsWith(".gz")) {
is = new GZIPInputStream(is);
}
BufferedReader r = new BufferedReader(new java.io.InputStreamReader(is, "UTF-8"), 4 * 1024 * 1024);
final BufferedReader r = new BufferedReader(new java.io.InputStreamReader(is, "UTF-8"), 4 * 1024 * 1024);
String t;
StringBuilder sb = new StringBuilder();
boolean page = false, text = false;
String title = null;
wikiparserrecord poison = newRecord();
int threads = Math.max(2, Runtime.getRuntime().availableProcessors() - 1);
BlockingQueue<wikiparserrecord> in = new ArrayBlockingQueue<wikiparserrecord>(threads * 10);
BlockingQueue<wikiparserrecord> out = new ArrayBlockingQueue<wikiparserrecord>(threads * 10);
ExecutorService service = Executors.newFixedThreadPool(threads + 1);
convertConsumer[] consumers = new convertConsumer[threads];
Future<?>[] consumerResults = new Future[threads];
final wikiparserrecord poison = newRecord();
final int threads = Math.max(2, Runtime.getRuntime().availableProcessors() - 1);
final BlockingQueue<wikiparserrecord> in = new ArrayBlockingQueue<wikiparserrecord>(threads * 10);
final BlockingQueue<wikiparserrecord> out = new ArrayBlockingQueue<wikiparserrecord>(threads * 10);
final ExecutorService service = Executors.newFixedThreadPool(threads + 1);
final convertConsumer[] consumers = new convertConsumer[threads];
final Future<?>[] consumerResults = new Future[threads];
for (int i = 0; i < threads; i++) {
consumers[i] = new convertConsumer(in, out, poison);
consumerResults[i] = service.submit(consumers[i]);
}
convertWriter writer = new convertWriter(out, poison, targetdir, targetstub);
Future<Integer> writerResult = service.submit(writer);
final convertWriter writer = new convertWriter(out, poison, this.targetdir, targetstub);
final Future<Integer> writerResult = service.submit(writer);
wikiparserrecord record;
int q;
while ((t = r.readLine()) != null) {
if ((p = t.indexOf("<base>")) >= 0 && (q = t.indexOf("</base>", p)) > 0) {
//urlStub = "http://" + lang + ".wikipedia.org/wiki/";
urlStub = t.substring(p + 6, q);
if (!urlStub.endsWith("/")) {
q = urlStub.lastIndexOf('/');
if (q > 0) urlStub = urlStub.substring(0, q + 1);
this.urlStub = t.substring(p + 6, q);
if (!this.urlStub.endsWith("/")) {
q = this.urlStub.lastIndexOf('/');
if (q > 0) this.urlStub = this.urlStub.substring(0, q + 1);
}
DigestURI uri = new DigestURI(urlStub);
hostport = uri.getHost();
if (uri.getPort() != 80) hostport += ":" + uri.getPort();
final DigestURI uri = new DigestURI(this.urlStub);
this.hostport = uri.getHost();
if (uri.getPort() != 80) this.hostport += ":" + uri.getPort();
continue;
}
if (t.indexOf(pagestart) >= 0) {
@ -192,7 +192,7 @@ public class MediawikiImporter extends Thread implements Importer {
text = page;
q = t.indexOf('>', p + textstart.length());
if (q > 0) {
int u = t.indexOf(textend, q + 1);
final int u = t.indexOf(textend, q + 1);
if (u > q) {
sb.append(t.substring(q + 1, u));
Log.logInfo("WIKITRANSLATION", "[INJECT] Title: " + title);
@ -200,11 +200,11 @@ public class MediawikiImporter extends Thread implements Importer {
Log.logInfo("WIKITRANSLATION", "ERROR: " + title + " has empty content");
continue;
}
record = newRecord(hostport, urlStub, title, sb);
record = newRecord(this.hostport, this.urlStub, title, sb);
try {
in.put(record);
this.count++;
} catch (InterruptedException e1) {
} catch (final InterruptedException e1) {
Log.logException(e1);
}
sb = new StringBuilder(200);
@ -222,11 +222,11 @@ public class MediawikiImporter extends Thread implements Importer {
Log.logInfo("WIKITRANSLATION", "ERROR: " + title + " has empty content");
continue;
}
record = newRecord(hostport, urlStub, title, sb);
record = newRecord(this.hostport, this.urlStub, title, sb);
try {
in.put(record);
this.count++;
} catch (InterruptedException e1) {
} catch (final InterruptedException e1) {
Log.logException(e1);
}
sb = new StringBuilder(200);
@ -248,7 +248,7 @@ public class MediawikiImporter extends Thread implements Importer {
}
}
r.close();
try {
for (int i = 0; i < threads; i++) {
in.put(poison);
@ -258,35 +258,35 @@ public class MediawikiImporter extends Thread implements Importer {
}
out.put(poison);
writerResult.get(10000, TimeUnit.MILLISECONDS);
} catch (InterruptedException e) {
} catch (final InterruptedException e) {
Log.logException(e);
} catch (ExecutionException e) {
} catch (final ExecutionException e) {
Log.logException(e);
} catch (TimeoutException e) {
} catch (final TimeoutException e) {
Log.logException(e);
} catch (Exception e) {
} catch (final Exception e) {
Log.logException(e);
}
} catch (IOException e) {
} catch (final IOException e) {
Log.logException(e);
} catch (Exception e) {
} catch (final Exception e) {
Log.logException(e);
}
}
public static void checkIndex(File mediawikixml) {
File idx = idxFromMediawikiXML(mediawikixml);
public static void checkIndex(final File mediawikixml) {
final File idx = idxFromMediawikiXML(mediawikixml);
if (idx.exists()) return;
new indexMaker(mediawikixml).start();
}
public static class indexMaker extends Thread {
File mediawikixml;
public indexMaker(File mediawikixml) {
public indexMaker(final File mediawikixml) {
this.mediawikixml = mediawikixml;
}
public void run() {
try {
createIndex(this.mediawikixml);
@ -296,24 +296,24 @@ public class MediawikiImporter extends Thread implements Importer {
}
}
}
public static File idxFromMediawikiXML(File mediawikixml) {
public static File idxFromMediawikiXML(final File mediawikixml) {
return new File(mediawikixml.getAbsolutePath() + ".idx.xml");
}
public static void createIndex(File dumpFile) throws IOException {
public static void createIndex(final File dumpFile) throws IOException {
// calculate md5
//String md5 = serverCodings.encodeMD5Hex(dumpFile);
// init reader, producer and consumer
PositionAwareReader in = new PositionAwareReader(dumpFile);
indexProducer producer = new indexProducer(100, idxFromMediawikiXML(dumpFile));
wikiConsumer consumer = new wikiConsumer(100, producer);
ExecutorService service = Executors.newFixedThreadPool(2);
Future<Integer> producerResult = service.submit(consumer);
Future<Integer> consumerResult = service.submit(producer);
final PositionAwareReader in = new PositionAwareReader(dumpFile);
final indexProducer producer = new indexProducer(100, idxFromMediawikiXML(dumpFile));
final wikiConsumer consumer = new wikiConsumer(100, producer);
final ExecutorService service = Executors.newFixedThreadPool(2);
final Future<Integer> producerResult = service.submit(consumer);
final Future<Integer> consumerResult = service.submit(producer);
service.shutdown();
// read the wiki dump
long start, stop;
while (in.seek(pagestartb)) {
@ -324,18 +324,18 @@ public class MediawikiImporter extends Thread implements Importer {
consumer.consume(new wikiraw(in.bytes(), start, stop));
in.resetBuffer();
}
// shut down the services
try {
consumer.consume(wikiConsumer.poison);
try {consumerResult.get(5000, TimeUnit.MILLISECONDS);} catch (TimeoutException e) {}
try {consumerResult.get(5000, TimeUnit.MILLISECONDS);} catch (final TimeoutException e) {}
producer.consume(indexProducer.poison);
if (!consumerResult.isDone()) consumerResult.get();
producerResult.get();
} catch (InterruptedException e) {
} catch (final InterruptedException e) {
Log.logException(e);
return;
} catch (ExecutionException e) {
} catch (final ExecutionException e) {
Log.logException(e);
return;
}
@ -348,120 +348,120 @@ public class MediawikiImporter extends Thread implements Importer {
PrintWriter out;
protected static wikisourcerecord poison = new wikisourcerecord("", 0, 0);
int count;
public indexProducer(int bufferCount, File indexFile) throws IOException {
entries = new ArrayBlockingQueue<wikisourcerecord>(bufferCount);
out = new PrintWriter(new BufferedWriter(new FileWriter(indexFile)));
count = 0;
out.println("<index>");
}
public void consume(wikisourcerecord b) {
public indexProducer(final int bufferCount, final File indexFile) throws IOException {
this.entries = new ArrayBlockingQueue<wikisourcerecord>(bufferCount);
this.out = new PrintWriter(new BufferedWriter(new FileWriter(indexFile)));
this.count = 0;
this.out.println("<index>");
}
public void consume(final wikisourcerecord b) {
try {
entries.put(b);
} catch (InterruptedException e) {
this.entries.put(b);
} catch (final InterruptedException e) {
Log.logException(e);
}
}
public Integer call() {
wikisourcerecord r;
try {
while(true) {
r = entries.take();
r = this.entries.take();
if (r == poison) {
Log.logInfo("WIKITRANSLATION", "producer / got poison");
break;
}
out.println(" <page start=\"" + r.start + "\" length=\"" + (r.end - r.start) + "\">");
out.println(" <title>" + r.title + "</title>");
out.println(" </page>");
this.out.println(" <page start=\"" + r.start + "\" length=\"" + (r.end - r.start) + "\">");
this.out.println(" <title>" + r.title + "</title>");
this.out.println(" </page>");
Log.logInfo("WIKITRANSLATION", "producer / record start: " + r.start + ", title : " + r.title);
count++;
this.count++;
}
} catch (InterruptedException e) {
} catch (final InterruptedException e) {
Log.logException(e);
}
entries.clear();
out.println("</index>");
out.close();
return Integer.valueOf(count);
this.entries.clear();
this.out.println("</index>");
this.out.close();
return Integer.valueOf(this.count);
}
}
private static class wikiConsumer implements Callable<Integer> {
private final BlockingQueue<wikiraw> entries;
protected static wikiraw poison = new wikiraw(new byte[0], 0, 0);
private final indexProducer producer;
private int count;
public wikiConsumer(int bufferCount, indexProducer producer) {
entries = new ArrayBlockingQueue<wikiraw>(bufferCount);
public wikiConsumer(final int bufferCount, final indexProducer producer) {
this.entries = new ArrayBlockingQueue<wikiraw>(bufferCount);
this.producer = producer;
count = 0;
this.count = 0;
}
public void consume(wikiraw b) {
public void consume(final wikiraw b) {
try {
entries.put(b);
} catch (InterruptedException e) {
this.entries.put(b);
} catch (final InterruptedException e) {
Log.logException(e);
}
}
public Integer call() {
wikisourcerecord r;
wikiraw c;
try {
while(true) {
c = entries.take();
c = this.entries.take();
if (c == poison) {
Log.logInfo("WIKITRANSLATION", "consumer / got poison");
break;
}
try {
r = new wikisourcerecord(c.b, c.start, c.end);
producer.consume(r);
this.producer.consume(r);
Log.logInfo("WIKITRANSLATION", "consumer / record start: " + r.start + ", title : " + r.title);
count++;
} catch (RuntimeException e) {}
this.count++;
} catch (final RuntimeException e) {}
}
} catch (InterruptedException e) {
} catch (final InterruptedException e) {
Log.logException(e);
}
entries.clear();
return Integer.valueOf(count);
this.entries.clear();
return Integer.valueOf(this.count);
}
}
private static class wikiraw {
public long start, end;
public byte[] b;
public wikiraw(byte[] b, long start, long end) {
public wikiraw(final byte[] b, final long start, final long end) {
this.b = b;
this.start = start;
this.end = end;
}
}
public static class wikisourcerecord {
public long start, end;
public String title;
public wikisourcerecord(String title, long start, long end) {
public wikisourcerecord(final String title, final long start, final long end) {
this.title = title;
this.start = start;
this.end = end;
}
public wikisourcerecord(byte[] chunk, long start, long end) {
public wikisourcerecord(final byte[] chunk, final long start, final long end) {
String s;
s = UTF8.String(chunk);
int t0 = s.indexOf("<title>");
final int t0 = s.indexOf("<title>");
if (t0 >= 0) {
int t1 = s.indexOf("</title>", t0);
final int t1 = s.indexOf("</title>", t0);
if (t1 >= 0) {
this.title = s.substring(t0 + 7, t1);
} else {
@ -470,7 +470,7 @@ public class MediawikiImporter extends Thread implements Importer {
} else {
throw new RuntimeException("no title start in record");
}
this.start = start;
this.end = end;
}
@ -478,16 +478,16 @@ public class MediawikiImporter extends Thread implements Importer {
public wikiparserrecord newRecord() {
return new wikiparserrecord(null, null, null, null);
}
public wikiparserrecord newRecord(String hostport, String urlStub, String title, StringBuilder sb) {
public wikiparserrecord newRecord(final String hostport, final String urlStub, final String title, final StringBuilder sb) {
return new wikiparserrecord(hostport, urlStub, title, sb);
}
public class wikiparserrecord {
public String title;
String source, html, hostport, urlStub;
DigestURI url;
Document document;
public wikiparserrecord(String hostport, String urlStub, String title, StringBuilder sb) {
public wikiparserrecord(final String hostport, final String urlStub, final String title, final StringBuilder sb) {
this.title = title;
this.hostport = hostport;
this.urlStub = urlStub;
@ -495,97 +495,97 @@ public class MediawikiImporter extends Thread implements Importer {
}
public void genHTML() throws IOException {
try {
WikiParser wparser = new WikiCode();
html = wparser.transform(hostport, source);
} catch (Exception e) {
final WikiParser wparser = new WikiCode();
this.html = wparser.transform(this.hostport, this.source);
} catch (final Exception e) {
Log.logException(e);
throw new IOException(e.getMessage());
}
}
public void genDocument() throws Parser.Failure {
try {
url = new DigestURI(urlStub + title);
Document[] parsed = TextParser.parseSource(url, "text/html", "UTF-8", UTF8.getBytes(html));
document = Document.mergeDocuments(url, "text/html", parsed);
this.url = new DigestURI(this.urlStub + this.title);
final Document[] parsed = TextParser.parseSource(this.url, "text/html", "UTF-8", UTF8.getBytes(this.html), false);
this.document = Document.mergeDocuments(this.url, "text/html", parsed);
// the wiki parser is not able to find the proper title in the source text, so it must be set here
document.setTitle(title);
} catch (MalformedURLException e1) {
this.document.setTitle(this.title);
} catch (final MalformedURLException e1) {
Log.logException(e1);
}
}
public void writeXML(OutputStreamWriter os) throws IOException {
document.writeXML(os, new Date());
public void writeXML(final OutputStreamWriter os) throws IOException {
this.document.writeXML(os, new Date());
}
}
private static class PositionAwareReader {
private final InputStream is;
private long seekpos;
private ByteBuffer bb;
public PositionAwareReader(File dumpFile) throws FileNotFoundException {
public PositionAwareReader(final File dumpFile) throws FileNotFoundException {
this.is = new BufferedInputStream(new FileInputStream(dumpFile), 64 *1024);
this.seekpos = 0;
this.bb = new ByteBuffer();
}
public void resetBuffer() {
if (bb.length() > 10 * 1024) bb = new ByteBuffer(); else bb.clear();
if (this.bb.length() > 10 * 1024) this.bb = new ByteBuffer(); else this.bb.clear();
}
public boolean seek(byte[] pattern) throws IOException {
public boolean seek(final byte[] pattern) throws IOException {
int pp = 0;
int c;
while ((c = is.read()) >= 0) {
seekpos++;
bb.append(c);
while ((c = this.is.read()) >= 0) {
this.seekpos++;
this.bb.append(c);
if (pattern[pp] == c) pp++; else pp = 0;
if (pp == pattern.length) return true;
}
return false;
}
public long pos() {
return seekpos;
return this.seekpos;
}
public byte[] bytes() {
return bb.getBytes();
return this.bb.getBytes();
}
public void close() {
try {
is.close();
} catch (IOException e) {
this.is.close();
} catch (final IOException e) {
Log.logException(e);
}
}
}
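
PositionAwareReader advances through the dump one byte at a time, keeping an absolute position counter and buffering everything read since the last resetBuffer(), so that the index builder can record where each <page ...> element starts. The following stand-alone sketch reproduces just the seek logic over an in-memory stream; the input text and pattern are made up and the class is not part of YaCy.

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;

public class SeekSketch {

    // returns the stream position just after the first occurrence of pattern, or -1 if not found
    static long seek(final InputStream is, final byte[] pattern) throws IOException {
        long pos = 0;
        int pp = 0;
        int c;
        while ((c = is.read()) >= 0) {
            pos++;
            // same partial-match bookkeeping as PositionAwareReader.seek(); note that, like the
            // original, a failed comparison resets pp without re-testing the current byte
            if (pattern[pp] == (byte) c) pp++; else pp = 0;
            if (pp == pattern.length) return pos;
        }
        return -1;
    }

    public static void main(final String[] args) throws IOException {
        final byte[] dump = "<mediawiki><page ><title>Foo</title></page></mediawiki>".getBytes("UTF-8");
        final long end = seek(new ByteArrayInputStream(dump), "<page ".getBytes("UTF-8"));
        // as in find(), the element start is the position after the match minus the pattern length
        System.out.println("start of <page element: " + (end - "<page ".length()));
    }
}
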
public static byte[] read(File f, long start, int len) {
byte[] b = new byte[len];
public static byte[] read(final File f, final long start, final int len) {
final byte[] b = new byte[len];
RandomAccessFile raf = null;
try {
raf = new RandomAccessFile(f, "r");
raf.seek(start);
raf.read(b);
} catch (IOException e) {
} catch (final IOException e) {
Log.logException(e);
return null;
} finally {
if (raf != null) try {
raf.close();
try{raf.getChannel().close();} catch (IOException e) {}
} catch (IOException e) { }
try{raf.getChannel().close();} catch (final IOException e) {}
} catch (final IOException e) { }
}
return b;
}
public static wikisourcerecord find(String title, File f) throws IOException {
PositionAwareReader in = new PositionAwareReader(f);
public static wikisourcerecord find(final String title, final File f) throws IOException {
final PositionAwareReader in = new PositionAwareReader(f);
long start;
String m = "<title>" + title + "</title>";
final String m = "<title>" + title + "</title>";
String s;
while (in.seek(UTF8.getBytes("<page "))) {
start = in.pos() - 6;
@ -607,56 +607,56 @@ public class MediawikiImporter extends Thread implements Importer {
p += 8;
q = s.indexOf('"', p + 1);
if (q < 0) return null;
int length = Integer.parseInt(s.substring(p, q));
final int length = Integer.parseInt(s.substring(p, q));
//Log.logInfo("WIKITRANSLATION", "start = " + start + ", length = " + length);
return new wikisourcerecord(title, start, start + length);
}
}
return null;
}
private static class convertConsumer implements Callable<Integer> {
private final BlockingQueue<wikiparserrecord> in, out;
private final wikiparserrecord poison;
public convertConsumer(BlockingQueue<wikiparserrecord> in, BlockingQueue<wikiparserrecord> out, wikiparserrecord poison) {
public convertConsumer(final BlockingQueue<wikiparserrecord> in, final BlockingQueue<wikiparserrecord> out, final wikiparserrecord poison) {
this.poison = poison;
this.in = in;
this.out = out;
}
public Integer call() {
wikiparserrecord record;
try {
while(true) {
record = in.take();
if (record == poison) {
record = this.in.take();
if (record == this.poison) {
Log.logInfo("WIKITRANSLATION", "convertConsumer / got poison");
break;
}
try {
record.genHTML();
record.genDocument();
out.put(record);
} catch (RuntimeException e) {
this.out.put(record);
} catch (final RuntimeException e) {
Log.logException(e);
} catch (Parser.Failure e) {
} catch (final Parser.Failure e) {
Log.logException(e);
} catch (IOException e) {
} catch (final IOException e) {
// TODO Auto-generated catch block
Log.logException(e);
}
}
} catch (InterruptedException e) {
} catch (final InterruptedException e) {
Log.logException(e);
}
Log.logInfo("WIKITRANSLATION", "*** convertConsumer has terminated");
return Integer.valueOf(0);
}
}
private static class convertWriter implements Callable<Integer> {
private final BlockingQueue<wikiparserrecord> in;
@ -666,12 +666,12 @@ public class MediawikiImporter extends Thread implements Importer {
private final File targetdir;
private int fc, rc;
private String outputfilename;
public convertWriter(
BlockingQueue<wikiparserrecord> in,
wikiparserrecord poison,
File targetdir,
String targetstub) {
final BlockingQueue<wikiparserrecord> in,
final wikiparserrecord poison,
final File targetdir,
final String targetstub) {
this.poison = poison;
this.in = in;
this.osw = null;
@ -681,63 +681,63 @@ public class MediawikiImporter extends Thread implements Importer {
this.rc = 0;
this.outputfilename = null;
}
public Integer call() {
wikiparserrecord record;
try {
while(true) {
record = in.take();
if (record == poison) {
record = this.in.take();
if (record == this.poison) {
Log.logInfo("WIKITRANSLATION", "convertConsumer / got poison");
break;
}
if (osw == null) {
if (this.osw == null) {
// start writing a new file
this.outputfilename = targetstub + "." + fc + ".xml.prt";
this.osw = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(new File(targetdir, outputfilename))), "UTF-8");
osw.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" + SurrogateReader.SURROGATES_MAIN_ELEMENT_OPEN + "\n");
this.outputfilename = this.targetstub + "." + this.fc + ".xml.prt";
this.osw = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(new File(this.targetdir, this.outputfilename))), "UTF-8");
this.osw.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" + SurrogateReader.SURROGATES_MAIN_ELEMENT_OPEN + "\n");
}
Log.logInfo("WIKITRANSLATION", "[CONSUME] Title: " + record.title);
record.document.writeXML(osw, new Date());
rc++;
if (rc >= 10000) {
osw.write("</surrogates>\n");
osw.close();
String finalfilename = targetstub + "." + fc + ".xml";
new File(targetdir, outputfilename).renameTo(new File(targetdir, finalfilename));
rc = 0;
fc++;
outputfilename = targetstub + "." + fc + ".xml.prt";
osw = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(new File(targetdir, outputfilename))), "UTF-8");
osw.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" + SurrogateReader.SURROGATES_MAIN_ELEMENT_OPEN + "\n");
record.document.writeXML(this.osw, new Date());
this.rc++;
if (this.rc >= 10000) {
this.osw.write("</surrogates>\n");
this.osw.close();
final String finalfilename = this.targetstub + "." + this.fc + ".xml";
new File(this.targetdir, this.outputfilename).renameTo(new File(this.targetdir, finalfilename));
this.rc = 0;
this.fc++;
this.outputfilename = this.targetstub + "." + this.fc + ".xml.prt";
this.osw = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(new File(this.targetdir, this.outputfilename))), "UTF-8");
this.osw.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" + SurrogateReader.SURROGATES_MAIN_ELEMENT_OPEN + "\n");
}
}
} catch (InterruptedException e) {
} catch (final InterruptedException e) {
Log.logException(e);
} catch (UnsupportedEncodingException e) {
} catch (final UnsupportedEncodingException e) {
Log.logException(e);
} catch (FileNotFoundException e) {
} catch (final FileNotFoundException e) {
Log.logException(e);
} catch (IOException e) {
} catch (final IOException e) {
Log.logException(e);
} finally {
try {
osw.write(SurrogateReader.SURROGATES_MAIN_ELEMENT_CLOSE + "\n");
osw.close();
String finalfilename = targetstub + "." + fc + ".xml";
new File(targetdir, outputfilename).renameTo(new File(targetdir, finalfilename));
} catch (IOException e) {
this.osw.write(SurrogateReader.SURROGATES_MAIN_ELEMENT_CLOSE + "\n");
this.osw.close();
final String finalfilename = this.targetstub + "." + this.fc + ".xml";
new File(this.targetdir, this.outputfilename).renameTo(new File(this.targetdir, finalfilename));
} catch (final IOException e) {
Log.logException(e);
}
}
Log.logInfo("WIKITRANSLATION", "*** convertWriter has terminated");
return Integer.valueOf(0);
}
}
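
convertWriter batches the generated surrogate documents into output files: records are appended to a working file named <stub>.<n>.xml.prt and, after 10,000 records, the file is closed and renamed to <stub>.<n>.xml so that downstream readers only ever see complete files. The reduced sketch below shows that rollover pattern with generic record strings; it omits the surrogate XML header and footer handling and uses invented names throughout.

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;

public class RolloverSketch {

    private static final int RECORDS_PER_FILE = 10000;

    private final File targetdir;
    private final String targetstub;
    private BufferedWriter out = null;
    private File workfile = null;
    private int fileCount = 0;
    private int recordCount = 0;

    public RolloverSketch(final File targetdir, final String targetstub) {
        this.targetdir = targetdir;
        this.targetstub = targetstub;
    }

    public void write(final String record) throws IOException {
        if (this.out == null) {
            // start a new working file with a .prt suffix so readers ignore it until it is complete
            this.workfile = new File(this.targetdir, this.targetstub + "." + this.fileCount + ".xml.prt");
            this.out = new BufferedWriter(new FileWriter(this.workfile));
        }
        this.out.write(record);
        this.out.newLine();
        if (++this.recordCount >= RECORDS_PER_FILE) roll();
    }

    private void roll() throws IOException {
        // close the working file and rename .prt -> .xml to publish it
        this.out.close();
        this.workfile.renameTo(new File(this.targetdir, this.targetstub + "." + this.fileCount + ".xml"));
        this.out = null;
        this.recordCount = 0;
        this.fileCount++;
    }

    public void close() throws IOException {
        if (this.out != null) roll();                 // publish the last, possibly partial, file
    }

    public static void main(final String[] args) throws IOException {
        final RolloverSketch writer = new RolloverSketch(new File("."), "sketch");
        for (int i = 0; i < 25000; i++) writer.write("<record n=\"" + i + "\"/>");
        writer.close();                               // yields sketch.0.xml, sketch.1.xml, sketch.2.xml
    }
}
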
public static void main(String[] s) {
public static void main(final String[] s) {
if (s.length == 0) {
Log.logInfo("WIKITRANSLATION", "usage:");
Log.logInfo("WIKITRANSLATION", " -index <wikipedia-dump>");
@ -751,47 +751,47 @@ public class MediawikiImporter extends Thread implements Importer {
// java -Xmx2000m -cp classes:lib/bzip2.jar de.anomic.tools.mediawikiIndex -convert DATA/HTCACHE/dewiki-20090311-pages-articles.xml.bz2 DATA/SURROGATES/in/ http://de.wikipedia.org/wiki/
if (s[0].equals("-convert") && s.length > 2) {
File sourcefile = new File(s[1]);
File targetdir = new File(s[2]);
final File sourcefile = new File(s[1]);
final File targetdir = new File(s[2]);
//String urlStub = s[3]; // i.e. http://de.wikipedia.org/wiki/
//String language = urlStub.substring(7,9);
try {
MediawikiImporter mi = new MediawikiImporter(sourcefile, targetdir);
final MediawikiImporter mi = new MediawikiImporter(sourcefile, targetdir);
mi.start();
mi.join();
} catch (InterruptedException e) {
} catch (final InterruptedException e) {
Log.logException(e);
}
}
if (s[0].equals("-index")) {
if (s[0].equals("-index")) {
try {
createIndex(new File(s[1]));
} catch (IOException e) {
} catch (final IOException e) {
Log.logException(e);
}
}
if (s[0].equals("-read")) {
long start = Integer.parseInt(s[1]);
int len = Integer.parseInt(s[2]);
final long start = Integer.parseInt(s[1]);
final int len = Integer.parseInt(s[2]);
System.out.println(UTF8.String(read(new File(s[3]), start, len)));
}
if (s[0].equals("-find")) {
try {
wikisourcerecord w = find(s[1], new File(s[2] + ".idx.xml"));
final wikisourcerecord w = find(s[1], new File(s[2] + ".idx.xml"));
if (w == null) {
Log.logInfo("WIKITRANSLATION", "not found");
} else {
System.out.println(UTF8.String(read(new File(s[2]), w.start, (int) (w.end - w.start))));
}
} catch (IOException e) {
} catch (final IOException e) {
Log.logException(e);
}
}
System.exit(0);
}
}

@ -1,4 +1,4 @@
//bzipParser.java
//bzipParser.java
//------------------------
//part of YaCy
//(C) by Michael Peter Christen; mc@yacy.net
@ -42,26 +42,26 @@ import org.apache.tools.bzip2.CBZip2InputStream;
public class bzipParser extends AbstractParser implements Parser {
public bzipParser() {
public bzipParser() {
super("Bzip 2 UNIX Compressed File Parser");
SUPPORTED_EXTENSIONS.add("bz2");
SUPPORTED_EXTENSIONS.add("tbz");
SUPPORTED_EXTENSIONS.add("tbz2");
SUPPORTED_MIME_TYPES.add("application/x-bzip2");
SUPPORTED_MIME_TYPES.add("application/bzip2");
SUPPORTED_MIME_TYPES.add("application/x-bz2");
SUPPORTED_MIME_TYPES.add("application/x-bzip");
SUPPORTED_MIME_TYPES.add("application/x-stuffit");
this.SUPPORTED_EXTENSIONS.add("bz2");
this.SUPPORTED_EXTENSIONS.add("tbz");
this.SUPPORTED_EXTENSIONS.add("tbz2");
this.SUPPORTED_MIME_TYPES.add("application/x-bzip2");
this.SUPPORTED_MIME_TYPES.add("application/bzip2");
this.SUPPORTED_MIME_TYPES.add("application/x-bz2");
this.SUPPORTED_MIME_TYPES.add("application/x-bzip");
this.SUPPORTED_MIME_TYPES.add("application/x-stuffit");
}
public Document[] parse(final MultiProtocolURI location, final String mimeType,
final String charset, final InputStream source)
throws Parser.Failure, InterruptedException {
File tempFile = null;
Document[] docs;
try {
try {
/*
* First we have to consume the first two chars from the stream, otherwise
* the bzip decompression will fail with a NullPointerException!
@ -73,31 +73,31 @@ public class bzipParser extends AbstractParser implements Parser {
b = source.read();
if (b != 'Z') {
throw new Exception("Invalid bz2 content.");
}
}
int read = 0;
final byte[] data = new byte[1024];
final CBZip2InputStream zippedContent = new CBZip2InputStream(source);
final CBZip2InputStream zippedContent = new CBZip2InputStream(source);
tempFile = File.createTempFile("bunzip","tmp");
tempFile.deleteOnExit();
// creating a temp file to store the uncompressed data
final FileOutputStream out = new FileOutputStream(tempFile);
// read the bzip2 stream and store it uncompressed in the temp file
while((read = zippedContent.read(data, 0, 1024)) != -1) {
out.write(data, 0, read);
}
zippedContent.close();
out.close();
// creating a new parser class to parse the unzipped content
docs = TextParser.parseSource(location, null, null, tempFile);
} catch (final Exception e) {
docs = TextParser.parseSource(location, null, null, tempFile, false);
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e;
throw new Parser.Failure("Unexpected error while parsing bzip file. " + e.getMessage(),location);
} finally {
if (tempFile != null) FileUtils.deletedelete(tempFile);
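
As the comment above notes, the two leading magic bytes 'B' and 'Z' must be read off the stream before it is handed to CBZip2InputStream, which expects to start at the byte following the magic. The hedged sketch below shows that pre-read against the same org.apache.tools.bzip2 class the parser uses; the file path and byte-counting main method are illustrative only, not YaCy code.

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.tools.bzip2.CBZip2InputStream;

public class Bz2OpenSketch {

    // open a .bz2 file the way bzipParser does: consume the 'B' and 'Z' magic bytes first
    public static InputStream openBz2(final String path) throws IOException {
        final InputStream raw = new BufferedInputStream(new FileInputStream(path));
        if (raw.read() != 'B' || raw.read() != 'Z') {
            raw.close();
            throw new IOException("not a bz2 stream: " + path);
        }
        return new CBZip2InputStream(raw);            // starts decoding right after the magic
    }

    public static void main(final String[] args) throws IOException {
        final InputStream in = openBz2("dewiki-pages-articles.xml.bz2"); // illustrative path
        final byte[] buf = new byte[1024];
        int n, total = 0;
        while ((n = in.read(buf)) != -1) total += n;
        in.close();
        System.out.println("uncompressed bytes: " + total);
    }
}
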

@ -1,4 +1,4 @@
//gzipParser.java
//gzipParser.java
//------------------------
//part of YaCy
//(C) by Michael Peter Christen; mc@yacy.net
@ -42,52 +42,52 @@ import net.yacy.kelondro.util.FileUtils;
public class gzipParser extends AbstractParser implements Parser {
public gzipParser() {
public gzipParser() {
super("GNU Zip Compressed Archive Parser");
SUPPORTED_EXTENSIONS.add("gz");
SUPPORTED_EXTENSIONS.add("tgz");
SUPPORTED_MIME_TYPES.add("application/x-gzip");
SUPPORTED_MIME_TYPES.add("application/gzip");
SUPPORTED_MIME_TYPES.add("application/x-gunzip");
SUPPORTED_MIME_TYPES.add("application/gzipped");
SUPPORTED_MIME_TYPES.add("application/gzip-compressed");
SUPPORTED_MIME_TYPES.add("gzip/document");
this.SUPPORTED_EXTENSIONS.add("gz");
this.SUPPORTED_EXTENSIONS.add("tgz");
this.SUPPORTED_MIME_TYPES.add("application/x-gzip");
this.SUPPORTED_MIME_TYPES.add("application/gzip");
this.SUPPORTED_MIME_TYPES.add("application/x-gunzip");
this.SUPPORTED_MIME_TYPES.add("application/gzipped");
this.SUPPORTED_MIME_TYPES.add("application/gzip-compressed");
this.SUPPORTED_MIME_TYPES.add("gzip/document");
}
public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
File tempFile = null;
Document[] docs = null;
try {
try {
int read = 0;
final byte[] data = new byte[1024];
final GZIPInputStream zippedContent = new GZIPInputStream(source);
tempFile = File.createTempFile("gunzip","tmp");
tempFile.deleteOnExit();
// creating a temp file to store the uncompressed data
final FileOutputStream out = new FileOutputStream(tempFile);
// reading the gzip file and storing it uncompressed
while ((read = zippedContent.read(data, 0, 1024)) != -1) {
out.write(data, 0, read);
}
zippedContent.close();
out.close();
// creating a new parser class to parse the unzipped content
docs = TextParser.parseSource(location,null,null,tempFile);
} catch (final Exception e) {
docs = TextParser.parseSource(location,null,null,tempFile, false);
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e;
throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(),location);
throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(),location);
} finally {
if (tempFile != null) FileUtils.deletedelete(tempFile);
}
return docs;
}
}

@ -32,20 +32,15 @@ import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.ArrayList;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Pattern;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.document.AbstractParser;
import net.yacy.document.Classification;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.document.parser.html.ScraperInputStream;
import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.util.FileUtils;
@ -96,78 +91,14 @@ public class htmlParser extends AbstractParser implements Parser {
try {
// first get a document from the parsed html
ContentScraper scraper = parseToScraper(location, documentCharset, sourceStream);
Document document = transformScraper(location, mimeType, documentCharset, scraper);
// then produce virtual documents for each of the link that is contained in the document!
ArrayList<Document> docs = new ArrayList<Document>();
docs.add(document);
for (Map.Entry<MultiProtocolURI, String> link: document.getApplinks().entrySet()) {
addLinkDocs(docs, "application", link.getKey(), link.getValue(), scraper);
}
for (Map.Entry<MultiProtocolURI, String> link: document.getAudiolinks().entrySet()) {
addLinkDocs(docs, "audio", link.getKey(), link.getValue(), scraper);
}
for (Map.Entry<MultiProtocolURI, String> link: document.getVideolinks().entrySet()) {
addLinkDocs(docs, "video", link.getKey(), link.getValue(), scraper);
}
for (Entry<MultiProtocolURI, ImageEntry> link: document.getImages().entrySet()) {
addImageDocs(docs, link.getValue());
}
// finally return the list of documents
return docs.toArray(new Document[docs.size()]);
final ContentScraper scraper = parseToScraper(location, documentCharset, sourceStream);
final Document document = transformScraper(location, mimeType, documentCharset, scraper);
return new Document[]{document};
} catch (final IOException e) {
throw new Parser.Failure("IOException in htmlParser: " + e.getMessage(), location);
}
}
private final static void addLinkDocs(ArrayList<Document> docs, String type, MultiProtocolURI uri, String descr, ContentScraper scraper) {
//System.out.println("HTMLPARSER-LINK " + type + ": " + uri.toNormalform(true, false) + " / " + descr);
final Document doc = new Document(
uri,
Classification.ext2mime(uri.getFileExtension()),
"UTF-8",
null,
scraper.getContentLanguages(),
null,
descr,
"",
"",
new String[]{descr},
type,
0.0f, 0.0f,
uri.toNormalform(false, false),
null,
null,
null,
false);
docs.add(doc);
}
private final static void addImageDocs(ArrayList<Document> docs, ImageEntry img) {
//System.out.println("HTMLPARSER-LINK image: " + img.url().toNormalform(true, false) + " / " + img.alt());
final Document doc = new Document(
img.url(),
Classification.ext2mime(img.url().getFileExtension()),
"UTF-8",
null,
null,
null,
img.alt(),
"",
"",
new String[]{img.alt()},
"image",
0.0f, 0.0f,
img.url().toNormalform(false, false),
null,
null,
null,
false);
docs.add(doc);
}
/**
* the transformScraper method transforms a scraper object into a document object
@ -211,7 +142,7 @@ public class htmlParser extends AbstractParser implements Parser {
scraper.indexingDenied());
//scraper.close();
ppd.setFavicon(scraper.getFavicon());
return ppd;
}

@ -1,10 +1,10 @@
// sevenzipParser.java
// sevenzipParser.java
// -------------------------------------
// part of YACY
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
//
//
// This file ist contributed by Franz Brausze
//
// $LastChangedDate$
@ -15,12 +15,12 @@
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
@ -40,7 +40,6 @@ import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
import SevenZip.ArchiveExtractCallback;
import SevenZip.IInStream;
import SevenZip.Archive.IInArchive;
@ -48,13 +47,13 @@ import SevenZip.Archive.SevenZipEntry;
import SevenZip.Archive.SevenZip.Handler;
public class sevenzipParser extends AbstractParser implements Parser {
public sevenzipParser() {
super("7zip Archive Parser");
SUPPORTED_EXTENSIONS.add("7z");
SUPPORTED_MIME_TYPES.add("application/x-7z-compressed");
this.SUPPORTED_EXTENSIONS.add("7z");
this.SUPPORTED_MIME_TYPES.add("application/x-7z-compressed");
}
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final IInStream source) throws Parser.Failure, InterruptedException {
final Document doc = new Document(
location,
@ -68,7 +67,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
null,
null,
null,
0.0f, 0.0f,
0.0f, 0.0f,
(Object)null,
null,
null,
@ -86,7 +85,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
super.log.logFine("processing archive contents...");
try {
archive.Extract(null, -1, 0, aec);
return doc;
return doc;
} catch (final IOException e) {
if (e.getCause() instanceof InterruptedException)
throw (InterruptedException)e.getCause();
@ -99,7 +98,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
try { archive.close(); } catch (final IOException e) { }
}
}
public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset,
final InputStream source) throws Parser.Failure, InterruptedException {
try {
@ -114,12 +113,12 @@ public class sevenzipParser extends AbstractParser implements Parser {
// wrapper class to redirect output of standard ArchiveExtractCallback to serverLog
// and parse the extracted content
private static class SZParserExtractCallback extends ArchiveExtractCallback {
private final Log log;
private ByteArrayOutputStream cfos = null;
private final Document doc;
private final String prefix;
public SZParserExtractCallback(final Log logger, final IInArchive handler,
final Document doc, final String prefix) {
super.Init(handler);
@ -127,7 +126,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
this.doc = doc;
this.prefix = prefix;
}
@Override
public void PrepareOperation(final int arg0) {
this.extractMode = (arg0 == IInArchive.NExtract_NAskMode_kExtract);
@ -143,7 +142,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
break;
}
}
@Override
public void SetOperationResult(final int arg0) throws IOException {
if (arg0 != IInArchive.NExtract_NOperationResult_kOK) {
@ -159,16 +158,16 @@ public class sevenzipParser extends AbstractParser implements Parser {
// throw new IOException("Unknown Error");
}
} else try {
if (this.cfos != null) {
// parse the file
Document[] theDocs;
// workaround for relative links in file, normally '#' shall be used behind the location, see
// below for reversion of the effects
final MultiProtocolURI url = MultiProtocolURI.newURL(doc.dc_source(), this.prefix + "/" + super.filePath);
final MultiProtocolURI url = MultiProtocolURI.newURL(this.doc.dc_source(), this.prefix + "/" + super.filePath);
final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1));
theDocs = TextParser.parseSource(url, mime, null, this.cfos.toByteArray());
theDocs = TextParser.parseSource(url, mime, null, this.cfos.toByteArray(), false);
this.doc.addSubDocuments(theDocs);
}
} catch (final Exception e) {
@ -177,7 +176,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
throw ex;
}
}
@Override
public OutputStream GetStream(final int index, final int askExtractMode) throws IOException {
final SevenZipEntry item = super.archiveHandler.getEntry(index);
@ -185,10 +184,10 @@ public class sevenzipParser extends AbstractParser implements Parser {
this.cfos = (item.isDirectory()) ? null : new ByteArrayOutputStream();
return this.cfos;
}
public String getCurrentFilePath() {
return super.filePath;
}
}
}

@ -11,12 +11,12 @@
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
@ -45,31 +45,31 @@ import org.apache.tools.tar.TarInputStream;
public class tarParser extends AbstractParser implements Parser {
public tarParser() {
super("Tape Archive File Parser");
SUPPORTED_EXTENSIONS.add("tar");
SUPPORTED_MIME_TYPES.add("application/x-tar");
SUPPORTED_MIME_TYPES.add("application/tar");
SUPPORTED_MIME_TYPES.add("applicaton/x-gtar");
SUPPORTED_MIME_TYPES.add("multipart/x-tar");
public tarParser() {
super("Tape Archive File Parser");
this.SUPPORTED_EXTENSIONS.add("tar");
this.SUPPORTED_MIME_TYPES.add("application/x-tar");
this.SUPPORTED_MIME_TYPES.add("application/tar");
this.SUPPORTED_MIME_TYPES.add("applicaton/x-gtar");
this.SUPPORTED_MIME_TYPES.add("multipart/x-tar");
}
public Document[] parse(final MultiProtocolURI url, final String mimeType, final String charset, InputStream source) throws Parser.Failure, InterruptedException {
final List<Document> docacc = new ArrayList<Document>();
Document[] subDocs = null;
final String ext = url.getFileExtension().toLowerCase();
if (ext.equals("gz") || ext.equals("tgz")) {
try {
source = new GZIPInputStream(source);
} catch (IOException e) {
} catch (final IOException e) {
throw new Parser.Failure("tar parser: " + e.getMessage(), url);
}
}
TarEntry entry;
final TarInputStream tis = new TarInputStream(source);
final TarInputStream tis = new TarInputStream(source);
File tmp = null;
// loop through the elements in the tar file and parse every single file inside
while (true) {
try {
@ -83,16 +83,16 @@ public class tarParser extends AbstractParser implements Parser {
try {
tmp = FileUtils.createTempFile(this.getClass(), name);
FileUtils.copy(tis, tmp, entry.getSize());
subDocs = TextParser.parseSource(MultiProtocolURI.newURL(url,"#" + name), mime, null, tmp);
subDocs = TextParser.parseSource(MultiProtocolURI.newURL(url,"#" + name), mime, null, tmp, false);
if (subDocs == null) continue;
for (final Document d: subDocs) docacc.add(d);
} catch (final Parser.Failure e) {
log.logWarning("tar parser entry " + name + ": " + e.getMessage());
this.log.logWarning("tar parser entry " + name + ": " + e.getMessage());
} finally {
if (tmp != null) FileUtils.deletedelete(tmp);
}
} catch (IOException e) {
log.logWarning("tar parser:" + e.getMessage());
} catch (final IOException e) {
this.log.logWarning("tar parser:" + e.getMessage());
break;
}
}

@ -87,7 +87,7 @@ public class zipParser extends AbstractParser implements Parser {
FileUtils.copy(zis, tmp, entry.getSize());
final MultiProtocolURI virtualURL = MultiProtocolURI.newURL(url, "#" + name);
//this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false));
docs = TextParser.parseSource(virtualURL, mime, null, tmp);
docs = TextParser.parseSource(virtualURL, mime, null, tmp, false);
if (docs == null) continue;
for (final Document d: docs) docacc.add(d);
} catch (final Parser.Failure e) {

@ -367,7 +367,7 @@ public final class LoaderDispatcher {
final String supportError = TextParser.supports(url, responseHeader.mime());
if (supportError != null) throw new IOException("no parser support: " + supportError);
try {
documents = TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), response.getContent());
documents = TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), response.getContent(), false);
if (documents == null) throw new IOException("document == null");
} catch (final Exception e) {
throw new IOException("parser error: " + e.getMessage());

@ -1,29 +1,30 @@
package de.anomic.document;
import static org.junit.Assert.*;
import static org.junit.matchers.JUnitMatchers.*;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.kelondro.data.meta.DigestURI;
import org.junit.Test;
import static org.junit.Assert.assertThat;
import static org.junit.matchers.JUnitMatchers.containsString;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.Reader;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.kelondro.data.meta.DigestURI;
import org.junit.Test;
public class ParserTest {
@Test public void testParsers() throws FileNotFoundException, Parser.Failure, MalformedURLException, UnsupportedEncodingException, IOException {
String[][] testFiles = new String[][] {
// meaning: filename in test/parsertest, mimetype, title, creator, description,
final String[][] testFiles = new String[][] {
// meaning: filename in test/parsertest, mimetype, title, creator, description,
new String[]{"umlaute_windows.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen", "", ""},
new String[]{"umlaute_windows.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation", "Folie 1", "", ""},
new String[]{"umlaute_linux.odt", "application/vnd.oasis.opendocument.text", "Münchner Hofbräuhaus", "", "Kommentar zum Hofbräuhaus"},
@ -34,26 +35,26 @@ public class ParserTest {
};
for (int i=0; i < testFiles.length; i++) {
String filename = "test/parsertest/" + testFiles[i][0];
File file = new File(filename);
String mimetype = testFiles[i][1];
DigestURI url = new DigestURI("http://localhost/"+filename);
for (final String[] testFile : testFiles) {
final String filename = "test/parsertest/" + testFile[0];
final File file = new File(filename);
final String mimetype = testFile[1];
final DigestURI url = new DigestURI("http://localhost/"+filename);
Document[] docs = TextParser.parseSource(url, mimetype, null, file.length(), new FileInputStream(file));
for (Document doc: docs) {
Reader content = new InputStreamReader(doc.getText(), doc.getCharset());
StringBuilder str = new StringBuilder();
final Document[] docs = TextParser.parseSource(url, mimetype, null, file.length(), new FileInputStream(file), true);
for (final Document doc: docs) {
final Reader content = new InputStreamReader(doc.getText(), doc.getCharset());
final StringBuilder str = new StringBuilder();
int c;
while( (c = content.read()) != -1 )
str.append((char)c);
System.out.println("Parsed " + filename + ": " + str);
assertThat(str.toString(), containsString("In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen"));
assertThat(doc.dc_title(), containsString(testFiles[i][2]));
assertThat(doc.dc_creator(), containsString(testFiles[i][3]));
assertThat(doc.dc_description(), containsString(testFiles[i][4]));
}
assertThat(doc.dc_title(), containsString(testFile[2]));
assertThat(doc.dc_creator(), containsString(testFile[3]));
assertThat(doc.dc_description(), containsString(testFile[4]));
}
}
}
}
