added new configuration property "crawler.embedLinksAsDocuments". If this is switched on (which is now the default), all embedded image, audio and video links from all parsed documents are added to the search index as individual documents. This will increase the search index size dramatically, but it also enables a much faster image, audio and video search. If the flag is switched on, the index entries are also stored to a Solr index, if Solr indexing is enabled as well.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7931 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 14 years ago
parent e02bfbde56
commit 49e5ca579f
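
To make the behavior concrete, here is a minimal sketch (not part of the commit; pageUrl and htmlBytes are assumed placeholder variables) of the new parser entry point introduced below: with the flag set, the byte-array variant of TextParser.parseSource returns the page document followed by one virtual document per embedded media link.

try {
    final Document[] docs = TextParser.parseSource(
            pageUrl,        // MultiProtocolURI of the crawled page (assumed placeholder)
            "text/html",
            "UTF-8",
            htmlBytes,      // raw page content (assumed placeholder)
            true);          // multipleVirtualDocs: expand embedded media links
    // docs[0] is the page itself; the remaining entries are virtual documents created by
    // TextParser.virtualDocs() for image, audio, video and application links
} catch (final Parser.Failure e) {
    // no parser supports the given mime type
}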

@ -704,6 +704,10 @@ crawler.file.maxFileSize=100000000
# maximum number of crawler threads
crawler.MaxActiveThreads = 200
# flag: consider all embedded image/audio/video document links
# from all crawled documents as its own document
crawler.embedLinksAsDocuments = true
# maximum size of indexing queue
indexer.slots = 100

@ -213,11 +213,11 @@ public class Response {
public Date lastModified() {
Date docDate = null;
if (responseHeader != null) {
docDate = responseHeader.lastModified();
if (docDate == null) docDate = responseHeader.date();
if (this.responseHeader != null) {
docDate = this.responseHeader.lastModified();
if (docDate == null) docDate = this.responseHeader.date();
}
if (docDate == null && request != null) docDate = request.appdate();
if (docDate == null && this.request != null) docDate = this.request.appdate();
if (docDate == null) docDate = new Date(GenericFormatter.correctedUTCTime());
return docDate;
@ -226,7 +226,7 @@ public class Response {
public String language() {
// please avoid this method if a condenser document is available, because the condenser has a built-in language detection
// this here is only a guess using the TLD
return this.url().language();
return url().language();
}
public CrawlProfile profile() {
@ -272,7 +272,7 @@ public class Response {
*/
public String shallStoreCacheForProxy() {
String crawlerReason = shallStoreCacheForCrawler();
final String crawlerReason = shallStoreCacheForCrawler();
if (crawlerReason != null) return crawlerReason;
// check profile (disabled: we will check this in the plasmaSwitchboard)
@ -285,19 +285,19 @@ public class Response {
// -CGI access in request
// CGI access makes the page very individual, and therefore not usable
// in caches
if (this.url().isPOST() && this.profile != null && !this.profile.crawlingQ()) {
if (url().isPOST() && this.profile != null && !this.profile.crawlingQ()) {
return "dynamic_post";
}
if (this.url().isCGI()) {
if (url().isCGI()) {
return "dynamic_cgi";
}
if (this.url().isLocal()) {
if (url().isLocal()) {
return "local_URL_no_cache_needed";
}
if (responseHeader != null) {
if (this.responseHeader != null) {
// -if-modified-since in request
// we do not care about if-modified-since, because this case only occurs if the
@ -315,7 +315,7 @@ public class Response {
// -pragma in response
// if we have a pragma non-cache, we don't cache. usually if this is wanted from
// the server, it makes sense
String cacheControl = responseHeader.get(HeaderFramework.PRAGMA);
String cacheControl = this.responseHeader.get(HeaderFramework.PRAGMA);
if (cacheControl != null && cacheControl.trim().toUpperCase().equals("NO-CACHE")) { return "controlled_no_cache"; }
// -expires in response
@ -324,12 +324,12 @@ public class Response {
// -cache-control in response
// the cache-control has many value options.
cacheControl = responseHeader.get(HeaderFramework.CACHE_CONTROL);
cacheControl = this.responseHeader.get(HeaderFramework.CACHE_CONTROL);
if (cacheControl != null) {
cacheControl = cacheControl.trim().toUpperCase();
if (cacheControl.startsWith("MAX-AGE=")) {
// we need also the load date
final Date date = responseHeader.date();
final Date date = this.responseHeader.date();
if (date == null) return "stale_no_date_given_in_response";
try {
final long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // milliseconds to live
@ -349,27 +349,27 @@ public class Response {
public String shallStoreCacheForCrawler() {
// check storage size: all files will be handled in RAM before storage, so they must not exceed
// a given size, which we consider as 1MB
if (this.size() > 10 * 1024L * 1024L) return "too_large_for_caching_" + this.size();
if (size() > 10 * 1024L * 1024L) return "too_large_for_caching_" + size();
// check status code
if (!validResponseStatus()) {
return "bad_status_" + this.responseStatus;
}
if (requestHeader != null) {
if (this.requestHeader != null) {
// -authorization cases in request
// authorization makes pages very individual, and therefore we cannot use the
// content in the cache
if (requestHeader.containsKey(RequestHeader.AUTHORIZATION)) { return "personalized"; }
if (this.requestHeader.containsKey(RequestHeader.AUTHORIZATION)) { return "personalized"; }
// -ranges in request and response
// we do not cache partial content
if (requestHeader.containsKey(HeaderFramework.RANGE)) { return "partial_request"; }
if (this.requestHeader.containsKey(HeaderFramework.RANGE)) { return "partial_request"; }
}
if (responseHeader != null) {
if (this.responseHeader != null) {
// -ranges in request and response
// we do not cache partial content
if (responseHeader.containsKey(HeaderFramework.CONTENT_RANGE)) { return "partial_response"; }
if (this.responseHeader.containsKey(HeaderFramework.CONTENT_RANGE)) { return "partial_response"; }
}
return null;
}
@ -385,27 +385,27 @@ public class Response {
// -CGI access in request
// CGI access makes the page very individual, and therefore not usable
// in caches
if (this.url().isPOST()) {
if (url().isPOST()) {
return false;
}
if (this.url().isCGI()) {
if (url().isCGI()) {
return false;
}
String cacheControl;
if (requestHeader != null) {
if (this.requestHeader != null) {
// -authorization cases in request
if (requestHeader.containsKey(RequestHeader.AUTHORIZATION)) { return false; }
if (this.requestHeader.containsKey(RequestHeader.AUTHORIZATION)) { return false; }
// -ranges in request
// we do not cache partial content
if (requestHeader.containsKey(HeaderFramework.RANGE)) { return false; }
if (this.requestHeader.containsKey(HeaderFramework.RANGE)) { return false; }
// if the client requests an un-cached copy of the resource ...
cacheControl = requestHeader.get(HeaderFramework.PRAGMA);
cacheControl = this.requestHeader.get(HeaderFramework.PRAGMA);
if (cacheControl != null && cacheControl.trim().toUpperCase().equals("NO-CACHE")) { return false; }
cacheControl = requestHeader.get(HeaderFramework.CACHE_CONTROL);
cacheControl = this.requestHeader.get(HeaderFramework.CACHE_CONTROL);
if (cacheControl != null) {
cacheControl = cacheControl.trim().toUpperCase();
if (cacheControl.startsWith("NO-CACHE") || cacheControl.startsWith("MAX-AGE=0")) { return false; }
@ -414,14 +414,14 @@ public class Response {
// -if-modified-since in request
// The entity has to be transferred only if it has
// been modified since the date given by the If-Modified-Since header.
if (requestHeader.containsKey(RequestHeader.IF_MODIFIED_SINCE)) {
if (this.requestHeader.containsKey(RequestHeader.IF_MODIFIED_SINCE)) {
// checking this only makes sense if the cached response contains
// a Last-Modified field. If the field does not exist, we go the safe way
if (!responseHeader.containsKey(HeaderFramework.LAST_MODIFIED)) { return false; }
if (!this.responseHeader.containsKey(HeaderFramework.LAST_MODIFIED)) { return false; }
// parse date
Date d1, d2;
d2 = responseHeader.lastModified(); if (d2 == null) { d2 = new Date(GenericFormatter.correctedUTCTime()); }
d1 = requestHeader.ifModifiedSince(); if (d1 == null) { d1 = new Date(GenericFormatter.correctedUTCTime()); }
d2 = this.responseHeader.lastModified(); if (d2 == null) { d2 = new Date(GenericFormatter.correctedUTCTime()); }
d1 = this.requestHeader.ifModifiedSince(); if (d1 == null) { d1 = new Date(GenericFormatter.correctedUTCTime()); }
// finally, we shall treat the cache as stale if the modification time is after the if-.. time
if (d2.after(d1)) { return false; }
}
@ -433,20 +433,20 @@ public class Response {
// but we think that pictures can still be considered as fresh
// -set-cookie in cached response
// this is a similar case as for COOKIE.
if (requestHeader.containsKey(RequestHeader.COOKIE) ||
responseHeader.containsKey(HeaderFramework.SET_COOKIE) ||
responseHeader.containsKey(HeaderFramework.SET_COOKIE2)) {
if (this.requestHeader.containsKey(RequestHeader.COOKIE) ||
this.responseHeader.containsKey(HeaderFramework.SET_COOKIE) ||
this.responseHeader.containsKey(HeaderFramework.SET_COOKIE2)) {
return false; // too strong
}
}
}
if (responseHeader != null) {
if (this.responseHeader != null) {
// -pragma in cached response
// logically, we would not need to care about no-cache pragmas in cached response headers,
// because they cannot exist since they are not written to the cache.
// So this IF should always fail..
cacheControl = responseHeader.get(HeaderFramework.PRAGMA);
cacheControl = this.responseHeader.get(HeaderFramework.PRAGMA);
if (cacheControl != null && cacheControl.trim().toUpperCase().equals("NO-CACHE")) { return false; }
// see for documentation also:
@ -459,13 +459,13 @@ public class Response {
// -expires in cached response
// the expires value gives us a very easy hint when the cache is stale
final Date expires = responseHeader.expires();
final Date expires = this.responseHeader.expires();
if (expires != null) {
// System.out.println("EXPIRES-TEST: expires=" + expires + ", NOW=" + serverDate.correctedGMTDate() + ", url=" + url);
if (expires.before(new Date(GenericFormatter.correctedUTCTime()))) { return false; }
}
final Date lastModified = responseHeader.lastModified();
cacheControl = responseHeader.get(HeaderFramework.CACHE_CONTROL);
final Date lastModified = this.responseHeader.lastModified();
cacheControl = this.responseHeader.get(HeaderFramework.CACHE_CONTROL);
if (cacheControl == null && lastModified == null && expires == null) { return false; }
// -lastModified in cached response
@ -474,7 +474,7 @@ public class Response {
// middle-aged then, the maximum TTL would be cache-creation plus age.
// This would be a TTL factor of 100%; we want no more than 10% TTL, so that a 10 month old cache
// file may only be treated as fresh for one more month, not more.
Date date = responseHeader.date();
Date date = this.responseHeader.date();
if (lastModified != null) {
if (date == null) { date = new Date(GenericFormatter.correctedUTCTime()); }
final long age = date.getTime() - lastModified.getTime();
@ -530,7 +530,7 @@ public class Response {
// check profile
if (!profile().indexText() && !profile().indexMedia()) {
return "indexing not allowed - indexText and indexMedia not set (for proxy = " + profile.name()+ ")";
return "indexing not allowed - indexText and indexMedia not set (for proxy = " + this.profile.name()+ ")";
}
// -CGI access in request
@ -565,19 +565,19 @@ public class Response {
return "Dynamic_(Requested_With_Cookie)";
}
if (responseHeader != null) {
if (this.responseHeader != null) {
// -set-cookie in response
// the set-cookie from the server does not indicate that the content is special
// thus we do not care about it here for indexing
// a picture cannot be indexed
final String mimeType = responseHeader.mime();
final String mimeType = this.responseHeader.mime();
/*
if (Classification.isPictureMime(mimeType)) {
return "Media_Content_(Picture)";
}
*/
String parserError = TextParser.supportsMime(mimeType);
final String parserError = TextParser.supportsMime(mimeType);
if (parserError != null) {
return "Media_Content, no parser: " + parserError;
}
@ -585,9 +585,9 @@ public class Response {
// -if-modified-since in request
// if the page is fresh at the very moment we can index it
final Date ifModifiedSince = this.requestHeader.ifModifiedSince();
if ((ifModifiedSince != null) && (responseHeader.containsKey(HeaderFramework.LAST_MODIFIED))) {
if ((ifModifiedSince != null) && (this.responseHeader.containsKey(HeaderFramework.LAST_MODIFIED))) {
// parse date
Date d = responseHeader.lastModified();
Date d = this.responseHeader.lastModified();
if (d == null) {
d = new Date(GenericFormatter.correctedUTCTime());
}
@ -599,8 +599,8 @@ public class Response {
}
// -pragma in cached response
if (responseHeader.containsKey(HeaderFramework.PRAGMA) &&
(responseHeader.get(HeaderFramework.PRAGMA)).toUpperCase().equals("NO-CACHE")) {
if (this.responseHeader.containsKey(HeaderFramework.PRAGMA) &&
(this.responseHeader.get(HeaderFramework.PRAGMA)).toUpperCase().equals("NO-CACHE")) {
return "Denied_(pragma_no_cache)";
}
@ -613,7 +613,7 @@ public class Response {
// the expires value gives us a very easy hint when the cache is stale
// sometimes, the expires date is set to the past to prevent that a page is cached
// we use that information to see if we should index it
final Date expires = responseHeader.expires();
final Date expires = this.responseHeader.expires();
if (expires != null && expires.before(new Date(GenericFormatter.correctedUTCTime()))) {
return "Stale_(Expired)";
}
@ -624,7 +624,7 @@ public class Response {
// -cache-control in cached response
// the cache-control has many value options.
String cacheControl = responseHeader.get(HeaderFramework.CACHE_CONTROL);
String cacheControl = this.responseHeader.get(HeaderFramework.CACHE_CONTROL);
if (cacheControl != null) {
cacheControl = cacheControl.trim().toUpperCase();
/* we have the following cases for cache-control:
@ -641,7 +641,7 @@ public class Response {
// // ok, do nothing
} else if (cacheControl.startsWith("MAX-AGE=")) {
// we need also the load date
final Date date = responseHeader.date();
final Date date = this.responseHeader.date();
if (date == null) {
return "Stale_(no_date_given_in_response)";
}
@ -675,7 +675,7 @@ public class Response {
// check profile
if (!profile().indexText() && !profile().indexMedia()) {
return "indexing not allowed - indexText and indexMedia not set (for crawler = " + profile.name() + ")";
return "indexing not allowed - indexText and indexMedia not set (for crawler = " + this.profile.name() + ")";
}
// -CGI access in request
@ -692,9 +692,9 @@ public class Response {
// we checked that in shallStoreCache
// check if document can be indexed
if (responseHeader != null) {
final String mimeType = responseHeader.mime();
String parserError = TextParser.supportsMime(mimeType);
if (this.responseHeader != null) {
final String mimeType = this.responseHeader.mime();
final String parserError = TextParser.supportsMime(mimeType);
if (parserError != null && TextParser.supportsExtension(url()) != null) return "no parser available: " + parserError;
}
/*
@ -741,9 +741,9 @@ public class Response {
}
public String getMimeType() {
if (responseHeader == null) return null;
if (this.responseHeader == null) return null;
String mimeType = responseHeader.mime();
String mimeType = this.responseHeader.mime();
mimeType = mimeType.trim().toLowerCase();
final int pos = mimeType.indexOf(';');
@ -751,14 +751,14 @@ public class Response {
}
public String getCharacterEncoding() {
if (responseHeader == null) return null;
return responseHeader.getCharacterEncoding();
if (this.responseHeader == null) return null;
return this.responseHeader.getCharacterEncoding();
}
public DigestURI referrerURL() {
if (requestHeader == null) return null;
if (this.requestHeader == null) return null;
try {
String r = requestHeader.get(RequestHeader.REFERER, null);
final String r = this.requestHeader.get(RequestHeader.REFERER, null);
if (r == null) return null;
return new DigestURI(r);
} catch (final Exception e) {
@ -767,8 +767,8 @@ public class Response {
}
public byte[] referrerHash() {
if (requestHeader == null) return null;
String u = requestHeader.get(RequestHeader.REFERER, "");
if (this.requestHeader == null) return null;
final String u = this.requestHeader.get(RequestHeader.REFERER, "");
if (u == null || u.length() == 0) return null;
try {
return new DigestURI(u).hash();
@ -778,25 +778,25 @@ public class Response {
}
public boolean validResponseStatus() {
return (responseStatus == null) ? false : responseStatus.startsWith("200") || responseStatus.startsWith("203");
return (this.responseStatus == null) ? false : this.responseStatus.startsWith("200") || this.responseStatus.startsWith("203");
}
public Date ifModifiedSince() {
return (requestHeader == null) ? null : requestHeader.ifModifiedSince();
return (this.requestHeader == null) ? null : this.requestHeader.ifModifiedSince();
}
public boolean requestWithCookie() {
return (requestHeader == null) ? false : requestHeader.containsKey(RequestHeader.COOKIE);
return (this.requestHeader == null) ? false : this.requestHeader.containsKey(RequestHeader.COOKIE);
}
public boolean requestProhibitsIndexing() {
return (requestHeader == null)
return (this.requestHeader == null)
? false
: requestHeader.containsKey(HeaderFramework.X_YACY_INDEX_CONTROL) &&
(requestHeader.get(HeaderFramework.X_YACY_INDEX_CONTROL)).toUpperCase().equals("NO-INDEX");
: this.requestHeader.containsKey(HeaderFramework.X_YACY_INDEX_CONTROL) &&
(this.requestHeader.get(HeaderFramework.X_YACY_INDEX_CONTROL)).toUpperCase().equals("NO-INDEX");
}
public EventOrigin processCase(String mySeedHash) {
public EventOrigin processCase(final String mySeedHash) {
// we must distinguish the following cases: resource-load was initiated by
// 1) global crawling: the index is extern, not here (not possible here)
// 2) result of search queries, some indexes are here (not possible here)
@ -820,11 +820,11 @@ public class Response {
}
public Document[] parse() throws Parser.Failure {
String supportError = TextParser.supports(url(), this.responseHeader == null ? null : this.responseHeader.mime());
final String supportError = TextParser.supports(url(), this.responseHeader == null ? null : this.responseHeader.mime());
if (supportError != null) throw new Parser.Failure("no parser support:" + supportError, url());
try {
return TextParser.parseSource(url(), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? "UTF-8" : this.responseHeader.getCharacterEncoding(), this.content);
} catch (Exception e) {
return TextParser.parseSource(url(), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? "UTF-8" : this.responseHeader.getCharacterEncoding(), this.content, false);
} catch (final Exception e) {
return null;
}
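Note that Response.parse() above passes false as the new last argument, so embedded links are not expanded on this path; whether the crawler expands them is decided in Switchboard from the configuration flag (see the Switchboard hunk further down). A small usage sketch of the unchanged public contract, assuming response is a populated Response instance:

try {
    final Document[] docs = response.parse();   // null if the content could not be parsed
    if (docs != null) {
        System.out.println("parsed " + docs.length + " document(s) from " + response.url());
    }
} catch (final Parser.Failure e) {
    // no parser supports the mime type of this response
}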

@ -35,7 +35,6 @@ import java.util.Date;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import net.yacy.cora.document.UTF8;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
@ -61,18 +60,18 @@ public class DocumentIndex extends Segment {
static {
try {
poison = new DigestURI("file://.");
} catch (MalformedURLException e) {}
} catch (final MalformedURLException e) {}
}
BlockingQueue<DigestURI> queue; // a queue of document ID's
private Worker[] worker;
private final Worker[] worker;
CallbackListener callback;
static final ThreadGroup workerThreadGroup = new ThreadGroup("workerThreadGroup");
public DocumentIndex(final File segmentPath, CallbackListener callback, int cachesize) throws IOException {
public DocumentIndex(final File segmentPath, final CallbackListener callback, final int cachesize) throws IOException {
super(new Log("DocumentIndex"), segmentPath, cachesize, targetFileSize * 4 - 1, false, false);
int cores = Runtime.getRuntime().availableProcessors() + 1;
final int cores = Runtime.getRuntime().availableProcessors() + 1;
this.callback = callback;
this.queue = new LinkedBlockingQueue<DigestURI>(cores * 300);
this.worker = new Worker[cores];
@ -83,29 +82,31 @@ public class DocumentIndex extends Segment {
}
class Worker extends Thread {
public Worker(int count) {
public Worker(final int count) {
super(workerThreadGroup, "query-" + count);
}
@Override
public void run() {
DigestURI f;
URIMetadataRow resultRow;
URIMetadataRow[] resultRows;
try {
while ((f = queue.take()) != poison) try {
resultRow = add(f);
if (callback != null) {
while ((f = DocumentIndex.this.queue.take()) != poison) try {
resultRows = add(f);
for (final URIMetadataRow resultRow: resultRows) {
if (DocumentIndex.this.callback != null) {
if (resultRow == null) {
callback.fail(f, "result is null");
DocumentIndex.this.callback.fail(f, "result is null");
} else {
callback.commit(f, resultRow);
DocumentIndex.this.callback.commit(f, resultRow);
}
}
}
} catch (IOException e) {
} catch (final IOException e) {
if (e.getMessage().indexOf("cannot parse") < 0) Log.logException(e);
callback.fail(f, e.getMessage());
DocumentIndex.this.callback.fail(f, e.getMessage());
}
} catch (InterruptedException e) {}
} catch (final InterruptedException e) {}
}
}
@ -120,7 +121,7 @@ public class DocumentIndex extends Segment {
this.queue.clear();
}
private URIMetadataRow add(DigestURI url) throws IOException {
private URIMetadataRow[] add(final DigestURI url) throws IOException {
if (url == null) throw new IOException("file = null");
if (url.isDirectory()) throw new IOException("file should be a document, not a path");
if (!url.canRead()) throw new IOException("cannot read file");
@ -128,17 +129,20 @@ public class DocumentIndex extends Segment {
long length;
try {
length = url.length();
} catch (Exception e) {
} catch (final Exception e) {
length = -1;
}
try {
documents = TextParser.parseSource(url, null, null, length, url.getInputStream(null, -1));
} catch (Exception e) {
documents = TextParser.parseSource(url, null, null, length, url.getInputStream(null, -1), true);
} catch (final Exception e) {
throw new IOException("cannot parse " + url.toString() + ": " + e.getMessage());
}
Document document = Document.mergeDocuments(url, null, documents);
//Document document = Document.mergeDocuments(url, null, documents);
final URIMetadataRow[] rows = new URIMetadataRow[documents.length];
int c = 0;
for (final Document document: documents) {
final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib);
return super.storeDocument(
rows[c++] = super.storeDocument(
url,
null,
new Date(url.lastModified()),
@ -150,24 +154,26 @@ public class DocumentIndex extends Segment {
DocumentIndex.class.getName() + ".add"
);
}
return rows;
}
/**
* add a file or a directory of files to the index
* If the given file is a path to a directory, the complete sub-tree is indexed
* @param start
*/
public void addConcurrent(DigestURI start) throws IOException {
public void addConcurrent(final DigestURI start) throws IOException {
assert (start != null);
assert (start.canRead()) : start.toString();
if (!start.isDirectory()) {
try {
this.queue.put(start);
} catch (InterruptedException e) {}
} catch (final InterruptedException e) {}
return;
}
String[] s = start.list();
final String[] s = start.list();
DigestURI w;
for (String t: s) {
for (final String t: s) {
try {
w = new DigestURI(start, t);
if (w.canRead() && !w.isHidden()) {
@ -176,10 +182,10 @@ public class DocumentIndex extends Segment {
} else {
try {
this.queue.put(w);
} catch (InterruptedException e) {}
} catch (final InterruptedException e) {}
}
}
} catch (MalformedURLException e1) {
} catch (final MalformedURLException e1) {
Log.logException(e1);
}
}
@ -191,16 +197,16 @@ public class DocumentIndex extends Segment {
* @param count
* @return a list of files that contain the given string
*/
public ArrayList<DigestURI> find(String querystring, int count) {
public ArrayList<DigestURI> find(final String querystring, int count) {
// make a query and start a search
QueryParams query = new QueryParams(querystring, count, null, this, textRankingDefault, "DocumentIndex");
ReferenceOrder order = new ReferenceOrder(query.ranking, UTF8.getBytes(query.targetlang));
RankingProcess rankedCache = new RankingProcess(query, order, SearchEvent.max_results_preparation);
final QueryParams query = new QueryParams(querystring, count, null, this, textRankingDefault, "DocumentIndex");
final ReferenceOrder order = new ReferenceOrder(query.ranking, UTF8.getBytes(query.targetlang));
final RankingProcess rankedCache = new RankingProcess(query, order, SearchEvent.max_results_preparation);
rankedCache.start();
// search is running; retrieve results
URIMetadataRow row;
ArrayList<DigestURI> files = new ArrayList<DigestURI>();
final ArrayList<DigestURI> files = new ArrayList<DigestURI>();
Components metadata;
while ((row = rankedCache.takeURL(false, 1000)) != null) {
metadata = row.metadata();
@ -219,16 +225,16 @@ public class DocumentIndex extends Segment {
@Override
public void close() {
// send termination signal to worker threads
for (int i = 0; i < this.worker.length; i++) {
for (final Worker element : this.worker) {
try {
this.queue.put(poison);
} catch (InterruptedException e) {}
} catch (final InterruptedException e) {}
}
// wait for termination
for (int i = 0; i < this.worker.length; i++) {
for (final Worker element : this.worker) {
try {
this.worker[i].join();
} catch (InterruptedException e) {}
element.join();
} catch (final InterruptedException e) {}
}
// close the segment
super.close();
@ -239,7 +245,7 @@ public class DocumentIndex extends Segment {
public void fail(DigestURI f, String failReason);
}
public static void main(String[] args) {
public static void main(final String[] args) {
// first argument: path to segment
// second argument: either 'add' or 'search'
// third and more arguments exists only in case that second argument is 'search': these are then the search words
@ -249,34 +255,34 @@ public class DocumentIndex extends Segment {
// DocumentIndex yacyindex search steht
System.setProperty("java.awt.headless", "true");
if (args.length < 3) return;
File segmentPath = new File(args[0]);
final File segmentPath = new File(args[0]);
System.out.println("using index files at " + segmentPath.getAbsolutePath());
CallbackListener callback = new CallbackListener() {
public void commit(DigestURI f, URIMetadataRow resultRow) {
final CallbackListener callback = new CallbackListener() {
public void commit(final DigestURI f, final URIMetadataRow resultRow) {
System.out.println("indexed: " + f.toString());
}
public void fail(DigestURI f, String failReason) {
public void fail(final DigestURI f, final String failReason) {
System.out.println("not indexed " + f.toString() + ": " + failReason);
}
};
try {
if (args[1].equals("add")) {
DigestURI f = new DigestURI(args[2]);
DocumentIndex di = new DocumentIndex(segmentPath, callback, 100000);
final DigestURI f = new DigestURI(args[2]);
final DocumentIndex di = new DocumentIndex(segmentPath, callback, 100000);
di.addConcurrent(f);
di.close();
} else {
String query = "";
for (int i = 2; i < args.length; i++) query += args[i];
query.trim();
DocumentIndex di = new DocumentIndex(segmentPath, callback, 100000);
ArrayList<DigestURI> results = di.find(query, 100);
for (DigestURI f: results) {
final DocumentIndex di = new DocumentIndex(segmentPath, callback, 100000);
final ArrayList<DigestURI> results = di.find(query, 100);
for (final DigestURI f: results) {
if (f != null) System.out.println(f.toString());
}
di.close();
}
} catch (IOException e) {
} catch (final IOException e) {
Log.logException(e);
}
//System.exit(0);
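Since add(DigestURI) now returns a URIMetadataRow[] (one row per real or virtual document), the worker invokes the callback once per row. A minimal listener sketch, mirroring the anonymous class in main() above:

final CallbackListener listener = new CallbackListener() {
    public void commit(final DigestURI f, final URIMetadataRow resultRow) {
        // may fire several times for a single input file when virtual documents are produced
        System.out.println("indexed entry for " + f.toString());
    }
    public void fail(final DigestURI f, final String failReason) {
        System.out.println("not indexed " + f.toString() + ": " + failReason);
    }
};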

@ -1921,7 +1921,7 @@ public final class Switchboard extends serverSwitch {
assert response.getContent() != null;
try {
// parse the document
documents = TextParser.parseSource(response.url(), response.getMimeType(), response.getCharacterEncoding(), response.getContent());
documents = TextParser.parseSource(response.url(), response.getMimeType(), response.getCharacterEncoding(), response.getContent(), getConfigBool("crawler.embedLinksAsDocuments", false));
if (documents == null) {
throw new Parser.Failure("Parser returned null.", response.url());
}

@ -39,7 +39,7 @@ public abstract class AbstractParser implements Parser {
* initialize a parser with a name
* @param name
*/
public AbstractParser(String name) {
public AbstractParser(final String name) {
this.name = name;
}
@ -71,8 +71,8 @@ public abstract class AbstractParser implements Parser {
* @param o
* @return
*/
public boolean equals(Object o) {
return this.getName().equals(((Parser) o).getName());
public boolean equals(final Object o) {
return getName().equals(((Parser) o).getName());
}
/**
@ -80,7 +80,7 @@ public abstract class AbstractParser implements Parser {
* @return the hash code of the parser name string
*/
public int hashCode() {
return this.getName().hashCode();
return getName().hashCode();
}
}

@ -131,6 +131,10 @@ public class Document {
return this.parserObject;
}
public Set<String> getContentLanguages() {
return this.languages;
}
/**
* compute a set of languages that this document contains
* the language is not computed using a statistical analysis of the content, only from given metadata that came with the document
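The new getter is what the parser uses to hand the parent page's language metadata down to each generated link document (see genLinkDocs in the TextParser hunk below); a trivial sketch, assuming pageDocument is an already parsed Document:

final Set<String> langs = pageDocument.getContentLanguages();  // metadata-derived languages, not statistical detection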

@ -31,6 +31,7 @@ import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
@ -58,6 +59,7 @@ import net.yacy.document.parser.vcfParser;
import net.yacy.document.parser.vsdParser;
import net.yacy.document.parser.xlsParser;
import net.yacy.document.parser.zipParser;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.document.parser.images.genericImageParser;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
@ -141,7 +143,8 @@ public final class TextParser {
final MultiProtocolURI location,
final String mimeType,
final String charset,
final File sourceFile
final File sourceFile,
final boolean multipleVirtualDocs
) throws InterruptedException, Parser.Failure {
BufferedInputStream sourceStream = null;
@ -154,7 +157,7 @@ public final class TextParser {
throw new Parser.Failure(errorMsg, location);
}
sourceStream = new BufferedInputStream(new FileInputStream(sourceFile));
docs = parseSource(location, mimeType, charset, sourceFile.length(), sourceStream);
docs = parseSource(location, mimeType, charset, sourceFile.length(), sourceStream, multipleVirtualDocs);
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e;
@ -164,6 +167,7 @@ public final class TextParser {
if (sourceStream != null) try { sourceStream.close(); } catch (final Exception ex) {}
}
for (final Document d: docs) { assert d.getText() != null; } // verify docs
return docs;
}
@ -171,7 +175,8 @@ public final class TextParser {
final MultiProtocolURI location,
String mimeType,
final String charset,
final byte[] content
final byte[] content,
final boolean multipleVirtualDocs
) throws Parser.Failure {
if (log.isFine()) log.logFine("Parsing '" + location + "' from byte-array");
mimeType = normalizeMimeType(mimeType);
@ -185,7 +190,12 @@ public final class TextParser {
}
assert !idioms.isEmpty() : "no parsers applied for url " + location.toNormalform(true, false);
return parseSource(location, mimeType, idioms, charset, content);
Document[] docs = parseSource(location, mimeType, idioms, charset, content);
// finally enrich the docs set with virtual docs from the enclosed documents
if (multipleVirtualDocs && docs.length == 1) docs = virtualDocs(docs[0]);
return docs;
}
public static Document[] parseSource(
@ -193,7 +203,8 @@ public final class TextParser {
String mimeType,
final String charset,
final long contentLength,
final InputStream sourceStream
final InputStream sourceStream,
final boolean multipleVirtualDocs
) throws Parser.Failure {
if (log.isFine()) log.logFine("Parsing '" + location + "' from stream");
mimeType = normalizeMimeType(mimeType);
@ -222,7 +233,12 @@ public final class TextParser {
} catch (final IOException e) {
throw new Parser.Failure(e.getMessage(), location);
}
return parseSource(location, mimeType, idioms, charset, b);
Document[] docs = parseSource(location, mimeType, idioms, charset, b);
// finally enrich the docs set with virtual docs from the enclosed documents
if (multipleVirtualDocs && docs.length == 1) docs = virtualDocs(docs[0]);
return docs;
}
private static Document[] parseSource(
@ -292,6 +308,7 @@ public final class TextParser {
}
}
for (final Document d: docs) { assert d.getText() != null : "mimeType = " + mimeType; } // verify docs
return docs;
}
@ -429,4 +446,73 @@ public final class TextParser {
if (grant) denyExtensionx.remove(ext); else denyExtensionx.put(ext, v);
}
/**
* produce virtual documents for each of the link that is contained in the document
* @param document
* @return
*/
public static Document[] virtualDocs(final Document document) {
final ArrayList<Document> docs = new ArrayList<Document>();
docs.add(document);
for (final Map.Entry<MultiProtocolURI, String> link: document.getApplinks().entrySet()) {
docs.add(genLinkDocs(docs, "application", link.getKey(), link.getValue(), document.getContentLanguages()));
}
for (final Map.Entry<MultiProtocolURI, String> link: document.getAudiolinks().entrySet()) {
docs.add(genLinkDocs(docs, "audio", link.getKey(), link.getValue(), document.getContentLanguages()));
}
for (final Map.Entry<MultiProtocolURI, String> link: document.getVideolinks().entrySet()) {
docs.add(genLinkDocs(docs, "video", link.getKey(), link.getValue(), document.getContentLanguages()));
}
for (final Entry<MultiProtocolURI, ImageEntry> link: document.getImages().entrySet()) {
docs.add(genImageDocs(docs, link.getValue()));
}
// finally return the list of documents
return docs.toArray(new Document[docs.size()]);
}
private final static Document genLinkDocs(final ArrayList<Document> docs, final String type, final MultiProtocolURI uri, final String descr, final Set<String> contentLanguages) {
//System.out.println("HTMLPARSER-LINK " + type + ": " + uri.toNormalform(true, false) + " / " + descr);
return new Document(
uri,
Classification.ext2mime(uri.getFileExtension()),
"UTF-8",
null,
contentLanguages,
null,
descr,
"",
"",
new String[]{descr},
type,
0.0f, 0.0f,
uri.toNormalform(false, false),
null,
null,
null,
false);
}
private final static Document genImageDocs(final ArrayList<Document> docs, final ImageEntry img) {
//System.out.println("HTMLPARSER-LINK image: " + img.url().toNormalform(true, false) + " / " + img.alt());
return new Document(
img.url(),
Classification.ext2mime(img.url().getFileExtension()),
"UTF-8",
null,
null,
null,
img.alt(),
"",
"",
new String[]{img.alt()},
"image",
0.0f, 0.0f,
img.url().toNormalform(false, false),
null,
null,
null,
false);
}
}
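virtualDocs() is public, so the expansion can also be applied to a document that was parsed elsewhere; a short sketch, assuming pageDocument is an already parsed Document:

final Document[] expanded = TextParser.virtualDocs(pageDocument);
// expanded[0] is the original document; every further entry is a virtual document whose URL is
// the media link, whose mime type is guessed from the file extension, and which carries the
// link text or image alt text as its description
System.out.println("original plus " + (expanded.length - 1) + " virtual document(s)");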

@ -22,17 +22,6 @@
package net.yacy.document.importer;
import net.yacy.cora.document.UTF8;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.content.SurrogateReader;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.ByteBuffer;
import org.apache.tools.bzip2.CBZip2InputStream;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
@ -61,6 +50,17 @@ import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.zip.GZIPInputStream;
import net.yacy.cora.document.UTF8;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.content.SurrogateReader;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.ByteBuffer;
import org.apache.tools.bzip2.CBZip2InputStream;
import de.anomic.data.wiki.WikiCode;
import de.anomic.data.wiki.WikiParser;
@ -90,10 +90,10 @@ public class MediawikiImporter extends Thread implements Importer {
private String hostport, urlStub;
public MediawikiImporter(File sourcefile, File targetdir) {
public MediawikiImporter(final File sourcefile, final File targetdir) {
this.sourcefile = sourcefile;
this.docsize = sourcefile.length();
this.approxdocs = (int) (this.docsize * (long) docspermbinxmlbz2 / 1024L / 1024L);
this.approxdocs = (int) (this.docsize * docspermbinxmlbz2 / 1024L / 1024L);
this.targetdir = targetdir;
this.count = 0;
this.start = 0;
@ -118,8 +118,8 @@ public class MediawikiImporter extends Thread implements Importer {
* @return
*/
public int speed() {
if (count == 0) return 0;
return (int) ((long) count / Math.max(1L, runningTime() ));
if (this.count == 0) return 0;
return (int) (this.count / Math.max(1L, runningTime() ));
}
/**
@ -127,61 +127,61 @@ public class MediawikiImporter extends Thread implements Importer {
* @return
*/
public long remainingTime() {
return Math.max(0, this.approxdocs - count) / Math.max(1, speed() );
return Math.max(0, this.approxdocs - this.count) / Math.max(1, speed() );
}
public long runningTime() {
return (System.currentTimeMillis() - start) / 1000L;
return (System.currentTimeMillis() - this.start) / 1000L;
}
public void run() {
this.start = System.currentTimeMillis();
try {
String targetstub = sourcefile.getName();
String targetstub = this.sourcefile.getName();
int p = targetstub.lastIndexOf("\\.");
if (p > 0) targetstub = targetstub.substring(0, p);
InputStream is = new BufferedInputStream(new FileInputStream(sourcefile), 1024 * 1024);
if (sourcefile.getName().endsWith(".bz2")) {
InputStream is = new BufferedInputStream(new FileInputStream(this.sourcefile), 1024 * 1024);
if (this.sourcefile.getName().endsWith(".bz2")) {
int b = is.read();
if (b != 'B') throw new IOException("Invalid bz2 content.");
b = is.read();
if (b != 'Z') throw new IOException("Invalid bz2 content.");
is = new CBZip2InputStream(is);
} else if (sourcefile.getName().endsWith(".gz")) {
} else if (this.sourcefile.getName().endsWith(".gz")) {
is = new GZIPInputStream(is);
}
BufferedReader r = new BufferedReader(new java.io.InputStreamReader(is, "UTF-8"), 4 * 1024 * 1024);
final BufferedReader r = new BufferedReader(new java.io.InputStreamReader(is, "UTF-8"), 4 * 1024 * 1024);
String t;
StringBuilder sb = new StringBuilder();
boolean page = false, text = false;
String title = null;
wikiparserrecord poison = newRecord();
int threads = Math.max(2, Runtime.getRuntime().availableProcessors() - 1);
BlockingQueue<wikiparserrecord> in = new ArrayBlockingQueue<wikiparserrecord>(threads * 10);
BlockingQueue<wikiparserrecord> out = new ArrayBlockingQueue<wikiparserrecord>(threads * 10);
ExecutorService service = Executors.newFixedThreadPool(threads + 1);
convertConsumer[] consumers = new convertConsumer[threads];
Future<?>[] consumerResults = new Future[threads];
final wikiparserrecord poison = newRecord();
final int threads = Math.max(2, Runtime.getRuntime().availableProcessors() - 1);
final BlockingQueue<wikiparserrecord> in = new ArrayBlockingQueue<wikiparserrecord>(threads * 10);
final BlockingQueue<wikiparserrecord> out = new ArrayBlockingQueue<wikiparserrecord>(threads * 10);
final ExecutorService service = Executors.newFixedThreadPool(threads + 1);
final convertConsumer[] consumers = new convertConsumer[threads];
final Future<?>[] consumerResults = new Future[threads];
for (int i = 0; i < threads; i++) {
consumers[i] = new convertConsumer(in, out, poison);
consumerResults[i] = service.submit(consumers[i]);
}
convertWriter writer = new convertWriter(out, poison, targetdir, targetstub);
Future<Integer> writerResult = service.submit(writer);
final convertWriter writer = new convertWriter(out, poison, this.targetdir, targetstub);
final Future<Integer> writerResult = service.submit(writer);
wikiparserrecord record;
int q;
while ((t = r.readLine()) != null) {
if ((p = t.indexOf("<base>")) >= 0 && (q = t.indexOf("</base>", p)) > 0) {
//urlStub = "http://" + lang + ".wikipedia.org/wiki/";
urlStub = t.substring(p + 6, q);
if (!urlStub.endsWith("/")) {
q = urlStub.lastIndexOf('/');
if (q > 0) urlStub = urlStub.substring(0, q + 1);
}
DigestURI uri = new DigestURI(urlStub);
hostport = uri.getHost();
if (uri.getPort() != 80) hostport += ":" + uri.getPort();
this.urlStub = t.substring(p + 6, q);
if (!this.urlStub.endsWith("/")) {
q = this.urlStub.lastIndexOf('/');
if (q > 0) this.urlStub = this.urlStub.substring(0, q + 1);
}
final DigestURI uri = new DigestURI(this.urlStub);
this.hostport = uri.getHost();
if (uri.getPort() != 80) this.hostport += ":" + uri.getPort();
continue;
}
if (t.indexOf(pagestart) >= 0) {
@ -192,7 +192,7 @@ public class MediawikiImporter extends Thread implements Importer {
text = page;
q = t.indexOf('>', p + textstart.length());
if (q > 0) {
int u = t.indexOf(textend, q + 1);
final int u = t.indexOf(textend, q + 1);
if (u > q) {
sb.append(t.substring(q + 1, u));
Log.logInfo("WIKITRANSLATION", "[INJECT] Title: " + title);
@ -200,11 +200,11 @@ public class MediawikiImporter extends Thread implements Importer {
Log.logInfo("WIKITRANSLATION", "ERROR: " + title + " has empty content");
continue;
}
record = newRecord(hostport, urlStub, title, sb);
record = newRecord(this.hostport, this.urlStub, title, sb);
try {
in.put(record);
this.count++;
} catch (InterruptedException e1) {
} catch (final InterruptedException e1) {
Log.logException(e1);
}
sb = new StringBuilder(200);
@ -222,11 +222,11 @@ public class MediawikiImporter extends Thread implements Importer {
Log.logInfo("WIKITRANSLATION", "ERROR: " + title + " has empty content");
continue;
}
record = newRecord(hostport, urlStub, title, sb);
record = newRecord(this.hostport, this.urlStub, title, sb);
try {
in.put(record);
this.count++;
} catch (InterruptedException e1) {
} catch (final InterruptedException e1) {
Log.logException(e1);
}
sb = new StringBuilder(200);
@ -258,24 +258,24 @@ public class MediawikiImporter extends Thread implements Importer {
}
out.put(poison);
writerResult.get(10000, TimeUnit.MILLISECONDS);
} catch (InterruptedException e) {
} catch (final InterruptedException e) {
Log.logException(e);
} catch (ExecutionException e) {
} catch (final ExecutionException e) {
Log.logException(e);
} catch (TimeoutException e) {
} catch (final TimeoutException e) {
Log.logException(e);
} catch (Exception e) {
} catch (final Exception e) {
Log.logException(e);
}
} catch (IOException e) {
} catch (final IOException e) {
Log.logException(e);
} catch (Exception e) {
} catch (final Exception e) {
Log.logException(e);
}
}
public static void checkIndex(File mediawikixml) {
File idx = idxFromMediawikiXML(mediawikixml);
public static void checkIndex(final File mediawikixml) {
final File idx = idxFromMediawikiXML(mediawikixml);
if (idx.exists()) return;
new indexMaker(mediawikixml).start();
}
@ -283,7 +283,7 @@ public class MediawikiImporter extends Thread implements Importer {
public static class indexMaker extends Thread {
File mediawikixml;
public indexMaker(File mediawikixml) {
public indexMaker(final File mediawikixml) {
this.mediawikixml = mediawikixml;
}
@ -297,21 +297,21 @@ public class MediawikiImporter extends Thread implements Importer {
}
}
public static File idxFromMediawikiXML(File mediawikixml) {
public static File idxFromMediawikiXML(final File mediawikixml) {
return new File(mediawikixml.getAbsolutePath() + ".idx.xml");
}
public static void createIndex(File dumpFile) throws IOException {
public static void createIndex(final File dumpFile) throws IOException {
// calculate md5
//String md5 = serverCodings.encodeMD5Hex(dumpFile);
// init reader, producer and consumer
PositionAwareReader in = new PositionAwareReader(dumpFile);
indexProducer producer = new indexProducer(100, idxFromMediawikiXML(dumpFile));
wikiConsumer consumer = new wikiConsumer(100, producer);
ExecutorService service = Executors.newFixedThreadPool(2);
Future<Integer> producerResult = service.submit(consumer);
Future<Integer> consumerResult = service.submit(producer);
final PositionAwareReader in = new PositionAwareReader(dumpFile);
final indexProducer producer = new indexProducer(100, idxFromMediawikiXML(dumpFile));
final wikiConsumer consumer = new wikiConsumer(100, producer);
final ExecutorService service = Executors.newFixedThreadPool(2);
final Future<Integer> producerResult = service.submit(consumer);
final Future<Integer> consumerResult = service.submit(producer);
service.shutdown();
// read the wiki dump
@ -328,14 +328,14 @@ public class MediawikiImporter extends Thread implements Importer {
// shut down the services
try {
consumer.consume(wikiConsumer.poison);
try {consumerResult.get(5000, TimeUnit.MILLISECONDS);} catch (TimeoutException e) {}
try {consumerResult.get(5000, TimeUnit.MILLISECONDS);} catch (final TimeoutException e) {}
producer.consume(indexProducer.poison);
if (!consumerResult.isDone()) consumerResult.get();
producerResult.get();
} catch (InterruptedException e) {
} catch (final InterruptedException e) {
Log.logException(e);
return;
} catch (ExecutionException e) {
} catch (final ExecutionException e) {
Log.logException(e);
return;
}
@ -349,18 +349,18 @@ public class MediawikiImporter extends Thread implements Importer {
protected static wikisourcerecord poison = new wikisourcerecord("", 0, 0);
int count;
public indexProducer(int bufferCount, File indexFile) throws IOException {
entries = new ArrayBlockingQueue<wikisourcerecord>(bufferCount);
out = new PrintWriter(new BufferedWriter(new FileWriter(indexFile)));
count = 0;
out.println("<index>");
public indexProducer(final int bufferCount, final File indexFile) throws IOException {
this.entries = new ArrayBlockingQueue<wikisourcerecord>(bufferCount);
this.out = new PrintWriter(new BufferedWriter(new FileWriter(indexFile)));
this.count = 0;
this.out.println("<index>");
}
public void consume(wikisourcerecord b) {
public void consume(final wikisourcerecord b) {
try {
entries.put(b);
} catch (InterruptedException e) {
this.entries.put(b);
} catch (final InterruptedException e) {
Log.logException(e);
}
}
@ -369,24 +369,24 @@ public class MediawikiImporter extends Thread implements Importer {
wikisourcerecord r;
try {
while(true) {
r = entries.take();
r = this.entries.take();
if (r == poison) {
Log.logInfo("WIKITRANSLATION", "producer / got poison");
break;
}
out.println(" <page start=\"" + r.start + "\" length=\"" + (r.end - r.start) + "\">");
out.println(" <title>" + r.title + "</title>");
out.println(" </page>");
this.out.println(" <page start=\"" + r.start + "\" length=\"" + (r.end - r.start) + "\">");
this.out.println(" <title>" + r.title + "</title>");
this.out.println(" </page>");
Log.logInfo("WIKITRANSLATION", "producer / record start: " + r.start + ", title : " + r.title);
count++;
this.count++;
}
} catch (InterruptedException e) {
} catch (final InterruptedException e) {
Log.logException(e);
}
entries.clear();
out.println("</index>");
out.close();
return Integer.valueOf(count);
this.entries.clear();
this.out.println("</index>");
this.out.close();
return Integer.valueOf(this.count);
}
}
@ -398,16 +398,16 @@ public class MediawikiImporter extends Thread implements Importer {
private final indexProducer producer;
private int count;
public wikiConsumer(int bufferCount, indexProducer producer) {
entries = new ArrayBlockingQueue<wikiraw>(bufferCount);
public wikiConsumer(final int bufferCount, final indexProducer producer) {
this.entries = new ArrayBlockingQueue<wikiraw>(bufferCount);
this.producer = producer;
count = 0;
this.count = 0;
}
public void consume(wikiraw b) {
public void consume(final wikiraw b) {
try {
entries.put(b);
} catch (InterruptedException e) {
this.entries.put(b);
} catch (final InterruptedException e) {
Log.logException(e);
}
}
@ -417,23 +417,23 @@ public class MediawikiImporter extends Thread implements Importer {
wikiraw c;
try {
while(true) {
c = entries.take();
c = this.entries.take();
if (c == poison) {
Log.logInfo("WIKITRANSLATION", "consumer / got poison");
break;
}
try {
r = new wikisourcerecord(c.b, c.start, c.end);
producer.consume(r);
this.producer.consume(r);
Log.logInfo("WIKITRANSLATION", "consumer / record start: " + r.start + ", title : " + r.title);
count++;
} catch (RuntimeException e) {}
this.count++;
} catch (final RuntimeException e) {}
}
} catch (InterruptedException e) {
} catch (final InterruptedException e) {
Log.logException(e);
}
entries.clear();
return Integer.valueOf(count);
this.entries.clear();
return Integer.valueOf(this.count);
}
}
@ -441,7 +441,7 @@ public class MediawikiImporter extends Thread implements Importer {
private static class wikiraw {
public long start, end;
public byte[] b;
public wikiraw(byte[] b, long start, long end) {
public wikiraw(final byte[] b, final long start, final long end) {
this.b = b;
this.start = start;
this.end = end;
@ -451,17 +451,17 @@ public class MediawikiImporter extends Thread implements Importer {
public static class wikisourcerecord {
public long start, end;
public String title;
public wikisourcerecord(String title, long start, long end) {
public wikisourcerecord(final String title, final long start, final long end) {
this.title = title;
this.start = start;
this.end = end;
}
public wikisourcerecord(byte[] chunk, long start, long end) {
public wikisourcerecord(final byte[] chunk, final long start, final long end) {
String s;
s = UTF8.String(chunk);
int t0 = s.indexOf("<title>");
final int t0 = s.indexOf("<title>");
if (t0 >= 0) {
int t1 = s.indexOf("</title>", t0);
final int t1 = s.indexOf("</title>", t0);
if (t1 >= 0) {
this.title = s.substring(t0 + 7, t1);
} else {
@ -478,7 +478,7 @@ public class MediawikiImporter extends Thread implements Importer {
public wikiparserrecord newRecord() {
return new wikiparserrecord(null, null, null, null);
}
public wikiparserrecord newRecord(String hostport, String urlStub, String title, StringBuilder sb) {
public wikiparserrecord newRecord(final String hostport, final String urlStub, final String title, final StringBuilder sb) {
return new wikiparserrecord(hostport, urlStub, title, sb);
}
@ -487,7 +487,7 @@ public class MediawikiImporter extends Thread implements Importer {
String source, html, hostport, urlStub;
DigestURI url;
Document document;
public wikiparserrecord(String hostport, String urlStub, String title, StringBuilder sb) {
public wikiparserrecord(final String hostport, final String urlStub, final String title, final StringBuilder sb) {
this.title = title;
this.hostport = hostport;
this.urlStub = urlStub;
@ -495,26 +495,26 @@ public class MediawikiImporter extends Thread implements Importer {
}
public void genHTML() throws IOException {
try {
WikiParser wparser = new WikiCode();
html = wparser.transform(hostport, source);
} catch (Exception e) {
final WikiParser wparser = new WikiCode();
this.html = wparser.transform(this.hostport, this.source);
} catch (final Exception e) {
Log.logException(e);
throw new IOException(e.getMessage());
}
}
public void genDocument() throws Parser.Failure {
try {
url = new DigestURI(urlStub + title);
Document[] parsed = TextParser.parseSource(url, "text/html", "UTF-8", UTF8.getBytes(html));
document = Document.mergeDocuments(url, "text/html", parsed);
this.url = new DigestURI(this.urlStub + this.title);
final Document[] parsed = TextParser.parseSource(this.url, "text/html", "UTF-8", UTF8.getBytes(this.html), false);
this.document = Document.mergeDocuments(this.url, "text/html", parsed);
// the wiki parser is not able to find the proper title in the source text, so it must be set here
document.setTitle(title);
} catch (MalformedURLException e1) {
this.document.setTitle(this.title);
} catch (final MalformedURLException e1) {
Log.logException(e1);
}
}
public void writeXML(OutputStreamWriter os) throws IOException {
document.writeXML(os, new Date());
public void writeXML(final OutputStreamWriter os) throws IOException {
this.document.writeXML(os, new Date());
}
}
@ -524,22 +524,22 @@ public class MediawikiImporter extends Thread implements Importer {
private long seekpos;
private ByteBuffer bb;
public PositionAwareReader(File dumpFile) throws FileNotFoundException {
public PositionAwareReader(final File dumpFile) throws FileNotFoundException {
this.is = new BufferedInputStream(new FileInputStream(dumpFile), 64 *1024);
this.seekpos = 0;
this.bb = new ByteBuffer();
}
public void resetBuffer() {
if (bb.length() > 10 * 1024) bb = new ByteBuffer(); else bb.clear();
if (this.bb.length() > 10 * 1024) this.bb = new ByteBuffer(); else this.bb.clear();
}
public boolean seek(byte[] pattern) throws IOException {
public boolean seek(final byte[] pattern) throws IOException {
int pp = 0;
int c;
while ((c = is.read()) >= 0) {
seekpos++;
bb.append(c);
while ((c = this.is.read()) >= 0) {
this.seekpos++;
this.bb.append(c);
if (pattern[pp] == c) pp++; else pp = 0;
if (pp == pattern.length) return true;
}
@ -547,45 +547,45 @@ public class MediawikiImporter extends Thread implements Importer {
}
public long pos() {
return seekpos;
return this.seekpos;
}
public byte[] bytes() {
return bb.getBytes();
return this.bb.getBytes();
}
public void close() {
try {
is.close();
} catch (IOException e) {
this.is.close();
} catch (final IOException e) {
Log.logException(e);
}
}
}
public static byte[] read(File f, long start, int len) {
byte[] b = new byte[len];
public static byte[] read(final File f, final long start, final int len) {
final byte[] b = new byte[len];
RandomAccessFile raf = null;
try {
raf = new RandomAccessFile(f, "r");
raf.seek(start);
raf.read(b);
} catch (IOException e) {
} catch (final IOException e) {
Log.logException(e);
return null;
} finally {
if (raf != null) try {
raf.close();
try{raf.getChannel().close();} catch (IOException e) {}
} catch (IOException e) { }
try{raf.getChannel().close();} catch (final IOException e) {}
} catch (final IOException e) { }
}
return b;
}
public static wikisourcerecord find(String title, File f) throws IOException {
PositionAwareReader in = new PositionAwareReader(f);
public static wikisourcerecord find(final String title, final File f) throws IOException {
final PositionAwareReader in = new PositionAwareReader(f);
long start;
String m = "<title>" + title + "</title>";
final String m = "<title>" + title + "</title>";
String s;
while (in.seek(UTF8.getBytes("<page "))) {
start = in.pos() - 6;
@ -607,7 +607,7 @@ public class MediawikiImporter extends Thread implements Importer {
p += 8;
q = s.indexOf('"', p + 1);
if (q < 0) return null;
int length = Integer.parseInt(s.substring(p, q));
final int length = Integer.parseInt(s.substring(p, q));
//Log.logInfo("WIKITRANSLATION", "start = " + start + ", length = " + length);
return new wikisourcerecord(title, start, start + length);
}
@ -620,7 +620,7 @@ public class MediawikiImporter extends Thread implements Importer {
private final BlockingQueue<wikiparserrecord> in, out;
private final wikiparserrecord poison;
public convertConsumer(BlockingQueue<wikiparserrecord> in, BlockingQueue<wikiparserrecord> out, wikiparserrecord poison) {
public convertConsumer(final BlockingQueue<wikiparserrecord> in, final BlockingQueue<wikiparserrecord> out, final wikiparserrecord poison) {
this.poison = poison;
this.in = in;
this.out = out;
@ -630,25 +630,25 @@ public class MediawikiImporter extends Thread implements Importer {
wikiparserrecord record;
try {
while(true) {
record = in.take();
if (record == poison) {
record = this.in.take();
if (record == this.poison) {
Log.logInfo("WIKITRANSLATION", "convertConsumer / got poison");
break;
}
try {
record.genHTML();
record.genDocument();
out.put(record);
} catch (RuntimeException e) {
this.out.put(record);
} catch (final RuntimeException e) {
Log.logException(e);
} catch (Parser.Failure e) {
} catch (final Parser.Failure e) {
Log.logException(e);
} catch (IOException e) {
} catch (final IOException e) {
// TODO Auto-generated catch block
Log.logException(e);
}
}
} catch (InterruptedException e) {
} catch (final InterruptedException e) {
Log.logException(e);
}
Log.logInfo("WIKITRANSLATION", "*** convertConsumer has terminated");
@ -668,10 +668,10 @@ public class MediawikiImporter extends Thread implements Importer {
private String outputfilename;
public convertWriter(
BlockingQueue<wikiparserrecord> in,
wikiparserrecord poison,
File targetdir,
String targetstub) {
final BlockingQueue<wikiparserrecord> in,
final wikiparserrecord poison,
final File targetdir,
final String targetstub) {
this.poison = poison;
this.in = in;
this.osw = null;
@ -686,48 +686,48 @@ public class MediawikiImporter extends Thread implements Importer {
wikiparserrecord record;
try {
while(true) {
record = in.take();
if (record == poison) {
record = this.in.take();
if (record == this.poison) {
Log.logInfo("WIKITRANSLATION", "convertConsumer / got poison");
break;
}
if (osw == null) {
if (this.osw == null) {
// start writing a new file
this.outputfilename = targetstub + "." + fc + ".xml.prt";
this.osw = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(new File(targetdir, outputfilename))), "UTF-8");
osw.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" + SurrogateReader.SURROGATES_MAIN_ELEMENT_OPEN + "\n");
this.outputfilename = this.targetstub + "." + this.fc + ".xml.prt";
this.osw = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(new File(this.targetdir, this.outputfilename))), "UTF-8");
this.osw.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" + SurrogateReader.SURROGATES_MAIN_ELEMENT_OPEN + "\n");
}
Log.logInfo("WIKITRANSLATION", "[CONSUME] Title: " + record.title);
record.document.writeXML(osw, new Date());
rc++;
if (rc >= 10000) {
osw.write("</surrogates>\n");
osw.close();
String finalfilename = targetstub + "." + fc + ".xml";
new File(targetdir, outputfilename).renameTo(new File(targetdir, finalfilename));
rc = 0;
fc++;
outputfilename = targetstub + "." + fc + ".xml.prt";
osw = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(new File(targetdir, outputfilename))), "UTF-8");
osw.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" + SurrogateReader.SURROGATES_MAIN_ELEMENT_OPEN + "\n");
}
}
} catch (InterruptedException e) {
record.document.writeXML(this.osw, new Date());
this.rc++;
if (this.rc >= 10000) {
this.osw.write("</surrogates>\n");
this.osw.close();
final String finalfilename = this.targetstub + "." + this.fc + ".xml";
new File(this.targetdir, this.outputfilename).renameTo(new File(this.targetdir, finalfilename));
this.rc = 0;
this.fc++;
this.outputfilename = this.targetstub + "." + this.fc + ".xml.prt";
this.osw = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(new File(this.targetdir, this.outputfilename))), "UTF-8");
this.osw.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" + SurrogateReader.SURROGATES_MAIN_ELEMENT_OPEN + "\n");
}
}
} catch (final InterruptedException e) {
Log.logException(e);
} catch (UnsupportedEncodingException e) {
} catch (final UnsupportedEncodingException e) {
Log.logException(e);
} catch (FileNotFoundException e) {
} catch (final FileNotFoundException e) {
Log.logException(e);
} catch (IOException e) {
} catch (final IOException e) {
Log.logException(e);
} finally {
try {
osw.write(SurrogateReader.SURROGATES_MAIN_ELEMENT_CLOSE + "\n");
osw.close();
String finalfilename = targetstub + "." + fc + ".xml";
new File(targetdir, outputfilename).renameTo(new File(targetdir, finalfilename));
} catch (IOException e) {
this.osw.write(SurrogateReader.SURROGATES_MAIN_ELEMENT_CLOSE + "\n");
this.osw.close();
final String finalfilename = this.targetstub + "." + this.fc + ".xml";
new File(this.targetdir, this.outputfilename).renameTo(new File(this.targetdir, finalfilename));
} catch (final IOException e) {
Log.logException(e);
}
}
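The run() method above also rotates its output: records are appended to a ".xml.prt" work file, and once the record counter reaches its limit the file is closed and renamed to its final ".xml" name, so a downstream importer never sees a half-written surrogate file. A compact sketch of that hand-over idea, with illustrative file names and without the real 10000-record batches:

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;

public class RotatingWriterSketch {
    public static void main(final String[] args) throws IOException {
        final File targetdir = new File("surrogates-out"); // placeholder directory
        targetdir.mkdirs();
        int fc = 0; // file counter, as in convertWriter
        for (int batch = 0; batch < 3; batch++) {
            final File work = new File(targetdir, "dump." + fc + ".xml.prt");
            final Writer osw = new OutputStreamWriter(new FileOutputStream(work), "UTF-8");
            osw.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<surrogates>\n");
            osw.write("<!-- stand-in for up to 10000 surrogate records -->\n");
            osw.write("</surrogates>\n");
            osw.close();
            // rename only after the file is complete: the hand-over to the importer
            work.renameTo(new File(targetdir, "dump." + fc + ".xml"));
            fc++;
        }
    }
}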
@ -737,7 +737,7 @@ public class MediawikiImporter extends Thread implements Importer {
}
public static void main(String[] s) {
public static void main(final String[] s) {
if (s.length == 0) {
Log.logInfo("WIKITRANSLATION", "usage:");
Log.logInfo("WIKITRANSLATION", " -index <wikipedia-dump>");
@ -751,15 +751,15 @@ public class MediawikiImporter extends Thread implements Importer {
// java -Xmx2000m -cp classes:lib/bzip2.jar de.anomic.tools.mediawikiIndex -convert DATA/HTCACHE/dewiki-20090311-pages-articles.xml.bz2 DATA/SURROGATES/in/ http://de.wikipedia.org/wiki/
if (s[0].equals("-convert") && s.length > 2) {
File sourcefile = new File(s[1]);
File targetdir = new File(s[2]);
final File sourcefile = new File(s[1]);
final File targetdir = new File(s[2]);
//String urlStub = s[3]; // i.e. http://de.wikipedia.org/wiki/
//String language = urlStub.substring(7,9);
try {
MediawikiImporter mi = new MediawikiImporter(sourcefile, targetdir);
final MediawikiImporter mi = new MediawikiImporter(sourcefile, targetdir);
mi.start();
mi.join();
} catch (InterruptedException e) {
} catch (final InterruptedException e) {
Log.logException(e);
}
}
@ -767,26 +767,26 @@ public class MediawikiImporter extends Thread implements Importer {
if (s[0].equals("-index")) {
try {
createIndex(new File(s[1]));
} catch (IOException e) {
} catch (final IOException e) {
Log.logException(e);
}
}
if (s[0].equals("-read")) {
long start = Integer.parseInt(s[1]);
int len = Integer.parseInt(s[2]);
final long start = Integer.parseInt(s[1]);
final int len = Integer.parseInt(s[2]);
System.out.println(UTF8.String(read(new File(s[3]), start, len)));
}
if (s[0].equals("-find")) {
try {
wikisourcerecord w = find(s[1], new File(s[2] + ".idx.xml"));
final wikisourcerecord w = find(s[1], new File(s[2] + ".idx.xml"));
if (w == null) {
Log.logInfo("WIKITRANSLATION", "not found");
} else {
System.out.println(UTF8.String(read(new File(s[2]), w.start, (int) (w.end - w.start))));
}
} catch (IOException e) {
} catch (final IOException e) {
Log.logException(e);
}

@ -45,14 +45,14 @@ public class bzipParser extends AbstractParser implements Parser {
public bzipParser() {
super("Bzip 2 UNIX Compressed File Parser");
SUPPORTED_EXTENSIONS.add("bz2");
SUPPORTED_EXTENSIONS.add("tbz");
SUPPORTED_EXTENSIONS.add("tbz2");
SUPPORTED_MIME_TYPES.add("application/x-bzip2");
SUPPORTED_MIME_TYPES.add("application/bzip2");
SUPPORTED_MIME_TYPES.add("application/x-bz2");
SUPPORTED_MIME_TYPES.add("application/x-bzip");
SUPPORTED_MIME_TYPES.add("application/x-stuffit");
this.SUPPORTED_EXTENSIONS.add("bz2");
this.SUPPORTED_EXTENSIONS.add("tbz");
this.SUPPORTED_EXTENSIONS.add("tbz2");
this.SUPPORTED_MIME_TYPES.add("application/x-bzip2");
this.SUPPORTED_MIME_TYPES.add("application/bzip2");
this.SUPPORTED_MIME_TYPES.add("application/x-bz2");
this.SUPPORTED_MIME_TYPES.add("application/x-bzip");
this.SUPPORTED_MIME_TYPES.add("application/x-stuffit");
}
public Document[] parse(final MultiProtocolURI location, final String mimeType,
@ -93,7 +93,7 @@ public class bzipParser extends AbstractParser implements Parser {
out.close();
// creating a new parser class to parse the unzipped content
docs = TextParser.parseSource(location, null, null, tempFile);
docs = TextParser.parseSource(location, null, null, tempFile, false);
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e;
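This bzip hunk, like the gzip, 7zip, tar, zip and LoaderDispatcher hunks below, now hands a trailing boolean to TextParser.parseSource(...). The parameter's name is not visible in these hunks; the archive parsers pass false for their nested content, while the adapted ParserTest at the end of this changeset passes true. A hedged sketch of such a call site, using only classes already imported in the files above:

import java.io.File;

import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.Document;
import net.yacy.document.TextParser;

public class ParseSourceSketch {
    public static Document[] parseUnpacked(final MultiProtocolURI location, final File tempFile) {
        try {
            // trailing "false" mirrors what the archive parsers pass for their unpacked entries;
            // the parameter name and exact semantics are not shown in this diff
            return TextParser.parseSource(location, null, null, tempFile, false);
        } catch (final Exception e) {
            // the real call sites rethrow InterruptedException / Parser.Failure; simplified here
            return null;
        }
    }
}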

@ -44,14 +44,14 @@ public class gzipParser extends AbstractParser implements Parser {
public gzipParser() {
super("GNU Zip Compressed Archive Parser");
SUPPORTED_EXTENSIONS.add("gz");
SUPPORTED_EXTENSIONS.add("tgz");
SUPPORTED_MIME_TYPES.add("application/x-gzip");
SUPPORTED_MIME_TYPES.add("application/gzip");
SUPPORTED_MIME_TYPES.add("application/x-gunzip");
SUPPORTED_MIME_TYPES.add("application/gzipped");
SUPPORTED_MIME_TYPES.add("application/gzip-compressed");
SUPPORTED_MIME_TYPES.add("gzip/document");
this.SUPPORTED_EXTENSIONS.add("gz");
this.SUPPORTED_EXTENSIONS.add("tgz");
this.SUPPORTED_MIME_TYPES.add("application/x-gzip");
this.SUPPORTED_MIME_TYPES.add("application/gzip");
this.SUPPORTED_MIME_TYPES.add("application/x-gunzip");
this.SUPPORTED_MIME_TYPES.add("application/gzipped");
this.SUPPORTED_MIME_TYPES.add("application/gzip-compressed");
this.SUPPORTED_MIME_TYPES.add("gzip/document");
}
public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
@ -78,7 +78,7 @@ public class gzipParser extends AbstractParser implements Parser {
out.close();
// creating a new parser class to parse the unzipped content
docs = TextParser.parseSource(location,null,null,tempFile);
docs = TextParser.parseSource(location,null,null,tempFile, false);
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e;

@ -32,20 +32,15 @@ import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.ArrayList;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Pattern;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.document.AbstractParser;
import net.yacy.document.Classification;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.document.parser.html.ScraperInputStream;
import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.util.FileUtils;
@ -96,79 +91,15 @@ public class htmlParser extends AbstractParser implements Parser {
try {
// first get a document from the parsed html
ContentScraper scraper = parseToScraper(location, documentCharset, sourceStream);
Document document = transformScraper(location, mimeType, documentCharset, scraper);
// then produce virtual documents for each of the link that is contained in the document!
ArrayList<Document> docs = new ArrayList<Document>();
docs.add(document);
for (Map.Entry<MultiProtocolURI, String> link: document.getApplinks().entrySet()) {
addLinkDocs(docs, "application", link.getKey(), link.getValue(), scraper);
}
for (Map.Entry<MultiProtocolURI, String> link: document.getAudiolinks().entrySet()) {
addLinkDocs(docs, "audio", link.getKey(), link.getValue(), scraper);
}
for (Map.Entry<MultiProtocolURI, String> link: document.getVideolinks().entrySet()) {
addLinkDocs(docs, "video", link.getKey(), link.getValue(), scraper);
}
for (Entry<MultiProtocolURI, ImageEntry> link: document.getImages().entrySet()) {
addImageDocs(docs, link.getValue());
}
final ContentScraper scraper = parseToScraper(location, documentCharset, sourceStream);
final Document document = transformScraper(location, mimeType, documentCharset, scraper);
// finally return the list of documents
return docs.toArray(new Document[docs.size()]);
return new Document[]{document};
} catch (final IOException e) {
throw new Parser.Failure("IOException in htmlParser: " + e.getMessage(), location);
}
}
private final static void addLinkDocs(ArrayList<Document> docs, String type, MultiProtocolURI uri, String descr, ContentScraper scraper) {
//System.out.println("HTMLPARSER-LINK " + type + ": " + uri.toNormalform(true, false) + " / " + descr);
final Document doc = new Document(
uri,
Classification.ext2mime(uri.getFileExtension()),
"UTF-8",
null,
scraper.getContentLanguages(),
null,
descr,
"",
"",
new String[]{descr},
type,
0.0f, 0.0f,
uri.toNormalform(false, false),
null,
null,
null,
false);
docs.add(doc);
}
private final static void addImageDocs(ArrayList<Document> docs, ImageEntry img) {
//System.out.println("HTMLPARSER-LINK image: " + img.url().toNormalform(true, false) + " / " + img.alt());
final Document doc = new Document(
img.url(),
Classification.ext2mime(img.url().getFileExtension()),
"UTF-8",
null,
null,
null,
img.alt(),
"",
"",
new String[]{img.alt()},
"image",
0.0f, 0.0f,
img.url().toNormalform(false, false),
null,
null,
null,
false);
docs.add(doc);
}
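With the per-link and per-image document generation removed here, an HTML page parsed by htmlParser now yields exactly the single Document built by parseToScraper/transformScraper. A hedged sketch of a caller, assuming htmlParser exposes the same Parser.parse(location, mimeType, charset, InputStream) signature as the other parsers in this changeset; the URL and HTML bytes are illustrative:

import java.io.ByteArrayInputStream;

import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.Document;
import net.yacy.document.parser.htmlParser;

public class HtmlParserSketch {
    public static void main(final String[] args) throws Exception {
        final MultiProtocolURI location = new MultiProtocolURI("http://localhost/test.html"); // assumed single-String constructor
        final byte[] html = "<html><head><title>t</title></head><body><img src=\"a.png\"/></body></html>".getBytes("UTF-8");
        final Document[] docs = new htmlParser().parse(location, "text/html", "UTF-8", new ByteArrayInputStream(html));
        // after this commit a single HTML page produces a single entry here
        System.out.println("documents: " + docs.length);
    }
}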
/**
* the transformScraper method transforms a scraper object into a document object
* @param location

@ -40,7 +40,6 @@ import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
import SevenZip.ArchiveExtractCallback;
import SevenZip.IInStream;
import SevenZip.Archive.IInArchive;
@ -51,8 +50,8 @@ public class sevenzipParser extends AbstractParser implements Parser {
public sevenzipParser() {
super("7zip Archive Parser");
SUPPORTED_EXTENSIONS.add("7z");
SUPPORTED_MIME_TYPES.add("application/x-7z-compressed");
this.SUPPORTED_EXTENSIONS.add("7z");
this.SUPPORTED_MIME_TYPES.add("application/x-7z-compressed");
}
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final IInStream source) throws Parser.Failure, InterruptedException {
@ -165,9 +164,9 @@ public class sevenzipParser extends AbstractParser implements Parser {
Document[] theDocs;
// workaround for relative links in file, normally '#' shall be used behind the location, see
// below for reversion of the effects
final MultiProtocolURI url = MultiProtocolURI.newURL(doc.dc_source(), this.prefix + "/" + super.filePath);
final MultiProtocolURI url = MultiProtocolURI.newURL(this.doc.dc_source(), this.prefix + "/" + super.filePath);
final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1));
theDocs = TextParser.parseSource(url, mime, null, this.cfos.toByteArray());
theDocs = TextParser.parseSource(url, mime, null, this.cfos.toByteArray(), false);
this.doc.addSubDocuments(theDocs);
}
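The workaround comment above describes the convention the archive parsers share: an entry inside an archive gets a virtual URL formed from the archive's own URL plus '#' and the entry name (or a prefix path in the 7zip case), so relative links inside the entry still resolve against the archive. A small hedged sketch; newURL(base, relative) and toNormalform(...) appear in the hunks of this changeset, while the single-String MultiProtocolURI constructor and the example URL are assumptions:

import net.yacy.cora.document.MultiProtocolURI;

public class VirtualEntryUrlSketch {
    public static void main(final String[] args) throws Exception {
        final MultiProtocolURI archive = new MultiProtocolURI("http://example.net/dump.tar"); // assumed constructor
        // same pattern as MultiProtocolURI.newURL(url, "#" + name) in the tar and zip parsers below
        final MultiProtocolURI entry = MultiProtocolURI.newURL(archive, "#page-0001.html");
        System.out.println(entry.toNormalform(false, false));
    }
}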

@ -47,11 +47,11 @@ public class tarParser extends AbstractParser implements Parser {
public tarParser() {
super("Tape Archive File Parser");
SUPPORTED_EXTENSIONS.add("tar");
SUPPORTED_MIME_TYPES.add("application/x-tar");
SUPPORTED_MIME_TYPES.add("application/tar");
SUPPORTED_MIME_TYPES.add("applicaton/x-gtar");
SUPPORTED_MIME_TYPES.add("multipart/x-tar");
this.SUPPORTED_EXTENSIONS.add("tar");
this.SUPPORTED_MIME_TYPES.add("application/x-tar");
this.SUPPORTED_MIME_TYPES.add("application/tar");
this.SUPPORTED_MIME_TYPES.add("applicaton/x-gtar");
this.SUPPORTED_MIME_TYPES.add("multipart/x-tar");
}
public Document[] parse(final MultiProtocolURI url, final String mimeType, final String charset, InputStream source) throws Parser.Failure, InterruptedException {
@ -62,7 +62,7 @@ public class tarParser extends AbstractParser implements Parser {
if (ext.equals("gz") || ext.equals("tgz")) {
try {
source = new GZIPInputStream(source);
} catch (IOException e) {
} catch (final IOException e) {
throw new Parser.Failure("tar parser: " + e.getMessage(), url);
}
}
@ -83,16 +83,16 @@ public class tarParser extends AbstractParser implements Parser {
try {
tmp = FileUtils.createTempFile(this.getClass(), name);
FileUtils.copy(tis, tmp, entry.getSize());
subDocs = TextParser.parseSource(MultiProtocolURI.newURL(url,"#" + name), mime, null, tmp);
subDocs = TextParser.parseSource(MultiProtocolURI.newURL(url,"#" + name), mime, null, tmp, false);
if (subDocs == null) continue;
for (final Document d: subDocs) docacc.add(d);
} catch (final Parser.Failure e) {
log.logWarning("tar parser entry " + name + ": " + e.getMessage());
this.log.logWarning("tar parser entry " + name + ": " + e.getMessage());
} finally {
if (tmp != null) FileUtils.deletedelete(tmp);
}
} catch (IOException e) {
log.logWarning("tar parser:" + e.getMessage());
} catch (final IOException e) {
this.log.logWarning("tar parser:" + e.getMessage());
break;
}
}

@ -87,7 +87,7 @@ public class zipParser extends AbstractParser implements Parser {
FileUtils.copy(zis, tmp, entry.getSize());
final MultiProtocolURI virtualURL = MultiProtocolURI.newURL(url, "#" + name);
//this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false));
docs = TextParser.parseSource(virtualURL, mime, null, tmp);
docs = TextParser.parseSource(virtualURL, mime, null, tmp, false);
if (docs == null) continue;
for (final Document d: docs) docacc.add(d);
} catch (final Parser.Failure e) {

@ -367,7 +367,7 @@ public final class LoaderDispatcher {
final String supportError = TextParser.supports(url, responseHeader.mime());
if (supportError != null) throw new IOException("no parser support: " + supportError);
try {
documents = TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), response.getContent());
documents = TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), response.getContent(), false);
if (documents == null) throw new IOException("document == null");
} catch (final Exception e) {
throw new IOException("parser error: " + e.getMessage());

@ -1,28 +1,29 @@
package de.anomic.document;
import static org.junit.Assert.*;
import static org.junit.matchers.JUnitMatchers.*;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.kelondro.data.meta.DigestURI;
import org.junit.Test;
import static org.junit.Assert.assertThat;
import static org.junit.matchers.JUnitMatchers.containsString;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.Reader;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.kelondro.data.meta.DigestURI;
import org.junit.Test;
public class ParserTest {
@Test public void testParsers() throws FileNotFoundException, Parser.Failure, MalformedURLException, UnsupportedEncodingException, IOException {
String[][] testFiles = new String[][] {
final String[][] testFiles = new String[][] {
// meaning: filename in test/parsertest, mimetype, title, creator, description,
new String[]{"umlaute_windows.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen", "", ""},
new String[]{"umlaute_windows.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation", "Folie 1", "", ""},
@ -34,25 +35,25 @@ public class ParserTest {
};
for (int i=0; i < testFiles.length; i++) {
String filename = "test/parsertest/" + testFiles[i][0];
File file = new File(filename);
String mimetype = testFiles[i][1];
DigestURI url = new DigestURI("http://localhost/"+filename);
for (final String[] testFile : testFiles) {
final String filename = "test/parsertest/" + testFile[0];
final File file = new File(filename);
final String mimetype = testFile[1];
final DigestURI url = new DigestURI("http://localhost/"+filename);
Document[] docs = TextParser.parseSource(url, mimetype, null, file.length(), new FileInputStream(file));
for (Document doc: docs) {
Reader content = new InputStreamReader(doc.getText(), doc.getCharset());
StringBuilder str = new StringBuilder();
final Document[] docs = TextParser.parseSource(url, mimetype, null, file.length(), new FileInputStream(file), true);
for (final Document doc: docs) {
final Reader content = new InputStreamReader(doc.getText(), doc.getCharset());
final StringBuilder str = new StringBuilder();
int c;
while( (c = content.read()) != -1 )
str.append((char)c);
System.out.println("Parsed " + filename + ": " + str);
assertThat(str.toString(), containsString("In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen"));
assertThat(doc.dc_title(), containsString(testFiles[i][2]));
assertThat(doc.dc_creator(), containsString(testFiles[i][3]));
assertThat(doc.dc_description(), containsString(testFiles[i][4]));
assertThat(doc.dc_title(), containsString(testFile[2]));
assertThat(doc.dc_creator(), containsString(testFile[3]));
assertThat(doc.dc_description(), containsString(testFile[4]));
}
}
}
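To run the adapted test outside an IDE, a plain JUnit 4 runner invocation such as the following could be used; the required classpath is not shown in this changeset and is therefore omitted, and the test still expects the sample files under test/parsertest in the working directory:

import org.junit.runner.JUnitCore;
import org.junit.runner.Result;
import org.junit.runner.notification.Failure;

public class RunParserTest {
    public static void main(final String[] args) {
        // runs de.anomic.document.ParserTest with the standard JUnit 4 runner
        final Result result = JUnitCore.runClasses(de.anomic.document.ParserTest.class);
        for (final Failure failure : result.getFailures()) {
            System.out.println(failure.toString());
        }
        System.out.println(result.wasSuccessful() ? "ParserTest passed" : "ParserTest failed");
    }
}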
