diff --git a/defaults/yacy.init b/defaults/yacy.init
index 44f45b7d1..eed964a98 100644
--- a/defaults/yacy.init
+++ b/defaults/yacy.init
@@ -704,6 +704,10 @@ crawler.file.maxFileSize=100000000
 # maximum number of crawler threads
 crawler.MaxActiveThreads = 200
 
+# flag: consider all embedded image/audio/video links
+# of all crawled documents as documents of their own
+crawler.embedLinksAsDocuments = true
+
 # maximum size of indexing queue
 indexer.slots = 100
 
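This flag is read in Switchboard.java further down in this patch; a minimal sketch of the resulting call path (the variable names here are illustrative, not part of the patch):

    // read the switch from yacy.init; getConfigBool() falls back to false if the key is missing
    final boolean embedLinks = sb.getConfigBool("crawler.embedLinksAsDocuments", false);
    // when true, TextParser.parseSource() appends one virtual document per embedded
    // image/audio/video link to the parser result (see the TextParser change below)
    final Document[] documents = TextParser.parseSource(url, mimeType, charset, content, embedLinks);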
diff --git a/source/de/anomic/crawler/retrieval/Response.java b/source/de/anomic/crawler/retrieval/Response.java
index 8fd3349d2..d36851e36 100755
--- a/source/de/anomic/crawler/retrieval/Response.java
+++ b/source/de/anomic/crawler/retrieval/Response.java
@@ -9,7 +9,7 @@
 // $LastChangedBy$
 //
 // LICENSE
-// 
+//
 // This program is free software; you can redistribute it and/or modify
 // it under the terms of the GNU General Public License as published by
 // the Free Software Foundation; either version 2 of the License, or
@@ -43,7 +43,7 @@ import de.anomic.crawler.CrawlProfile;
 import de.anomic.crawler.ResultURLs.EventOrigin;
 
 public class Response {
-    
+
     // doctypes:
     public static final char DT_PDFPS = 'p';
     public static final char DT_TEXT = 't';
@@ -65,7 +65,7 @@ public class Response {
     private final CrawlProfile profile;
     private byte[] content;
     private int status; // tracker indexing status, see status defs below
-    
+
     // doctype calculation
     public static char docType(final DigestURI url) {
         final String path = url.getPath().toLowerCase();
@@ -136,14 +136,14 @@ public class Response {
         //zip = application/zip
         return doctype;
     }
-    
+
     public static final int QUEUE_STATE_FRESH = 0;
     public static final int QUEUE_STATE_PARSING = 1;
     public static final int QUEUE_STATE_CONDENSING = 2;
     public static final int QUEUE_STATE_STRUCTUREANALYSIS = 3;
     public static final int QUEUE_STATE_INDEXSTORAGE = 4;
     public static final int QUEUE_STATE_FINISHED = 5;
-    
+
     public Response(
             final Request request,
             final RequestHeader requestHeader,
@@ -160,7 +160,7 @@ public class Response {
         this.status = QUEUE_STATE_FRESH;
         this.content = content;
     }
-    
+
     public Response(final Request request, final CrawlProfile profile) {
         this.request = request;
         // request and response headers may be zero in case that we process surrogates
@@ -172,7 +172,7 @@ public class Response {
         this.status = QUEUE_STATE_FRESH;
         this.content = request.url().toTokens().getBytes();
     }
-    
+
     public Response(
             final Request request,
             final RequestHeader requestHeader,
@@ -185,15 +185,15 @@ public class Response {
     public void updateStatus(final int newStatus) {
         this.status = newStatus;
     }
-    
+
     public ResponseHeader getResponseHeader() {
         return this.responseHeader;
     }
-    
+
     public int getStatus() {
         return this.status;
     }
-    
+
     public String name() {
         // the anchor name; can be either the text inside the anchor tag or the
         // page description after loading of the page
@@ -203,7 +203,7 @@ public class Response {
     public DigestURI url() {
         return this.request.url();
     }
-    
+
     public char docType() {
         char doctype = docType(getMimeType());
         if (doctype == DT_UNKNOWN) doctype = docType(url());
@@ -212,21 +212,21 @@ public class Response {
 
     public Date lastModified() {
         Date docDate = null;
-        
-        if (responseHeader != null) {
-            docDate = responseHeader.lastModified();
-            if (docDate == null) docDate = responseHeader.date();
+
+        if (this.responseHeader != null) {
+            docDate = this.responseHeader.lastModified();
+            if (docDate == null) docDate = this.responseHeader.date();
         }
-        if (docDate == null && request != null) docDate = request.appdate();
-        if (docDate == null) docDate = new Date(GenericFormatter.correctedUTCTime());
-        
+        if (docDate == null && this.request != null) docDate = this.request.appdate();
+        if (docDate == null) docDate = new Date(GenericFormatter.correctedUTCTime());
+
         return docDate;
     }
-    
+
     public String language() {
         // please avoid this method if a condenser document is available, because the condenser has a built-in language detection
         // this here is only a guess using the TLD
-        return this.url().language();
+        return url().language();
     }
 
     public CrawlProfile profile() {
@@ -272,9 +272,9 @@ public class Response {
      */
 
     public String shallStoreCacheForProxy() {
-        String crawlerReason = shallStoreCacheForCrawler();
+        final String crawlerReason = shallStoreCacheForCrawler();
         if (crawlerReason != null) return crawlerReason;
-        
+
         // check profile (disabled: we will check this in the plasmaSwitchboard)
         // if (!this.profile.storeHTCache()) { return "storage_not_wanted"; }
 
@@ -285,19 +285,19 @@ public class Response {
         // -CGI access in request
         // CGI access makes the page very individual, and therefore not usable
         // in caches
-        if (this.url().isPOST() && this.profile != null && !this.profile.crawlingQ()) {
+        if (url().isPOST() && this.profile != null && !this.profile.crawlingQ()) {
            return "dynamic_post";
         }
-        
-        if (this.url().isCGI()) {
+
+        if (url().isCGI()) {
             return "dynamic_cgi";
         }
-        
-        if (this.url().isLocal()) {
+
+        if (url().isLocal()) {
             return "local_URL_no_cache_needed";
         }
-        
-        if (responseHeader != null) {
+
+        if (this.responseHeader != null) {
 
             // -if-modified-since in request
             // we do not care about if-modified-since, because this case only occurs if the
@@ -315,7 +315,7 @@ public class Response {
             // -pragma in response
             // if we have a pragma non-cache, we don't cache. usually if this is wanted from
             // the server, it makes sense
-            String cacheControl = responseHeader.get(HeaderFramework.PRAGMA);
+            String cacheControl = this.responseHeader.get(HeaderFramework.PRAGMA);
             if (cacheControl != null && cacheControl.trim().toUpperCase().equals("NO-CACHE")) { return "controlled_no_cache"; }
 
             // -expires in response
@@ -324,12 +324,12 @@ public class Response {
 
             // -cache-control in response
             // the cache-control has many value options.
-            cacheControl = responseHeader.get(HeaderFramework.CACHE_CONTROL);
+            cacheControl = this.responseHeader.get(HeaderFramework.CACHE_CONTROL);
             if (cacheControl != null) {
                 cacheControl = cacheControl.trim().toUpperCase();
                 if (cacheControl.startsWith("MAX-AGE=")) {
                     // we need also the load date
-                    final Date date = responseHeader.date();
+                    final Date date = this.responseHeader.date();
                     if (date == null) return "stale_no_date_given_in_response";
                     try {
                         final long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // milliseconds to live
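For reference, the MAX-AGE arithmetic above with concrete numbers (a hand-computed illustration, not part of the patch):

    // Cache-Control: MAX-AGE=3600  ->  ttl = 1000 * Long.parseLong("3600") = 3600000 ms
    // with a response Date header of 12:00:00 the entry counts as fresh until 13:00:00;
    // the remainder of the method (not in this hunk) compares this ttl against the
    // corrected UTC time and reports the cache entry as stale once the ttl is exceeded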
@@ -349,35 +349,35 @@ public class Response {
     public String shallStoreCacheForCrawler() {
         // check storage size: all files will be handled in RAM before storage, so they must not exceed
         // a given size, which we set to 10MB here
-        if (this.size() > 10 * 1024L * 1024L) return "too_large_for_caching_" + this.size();
-        
+        if (size() > 10 * 1024L * 1024L) return "too_large_for_caching_" + size();
+
         // check status code
         if (!validResponseStatus()) { return "bad_status_" + this.responseStatus; }
 
-        if (requestHeader != null) {
+        if (this.requestHeader != null) {
             // -authorization cases in request
             // authorization makes pages very individual, and therefore we cannot use the
             // content in the cache
-            if (requestHeader.containsKey(RequestHeader.AUTHORIZATION)) { return "personalized"; }
+            if (this.requestHeader.containsKey(RequestHeader.AUTHORIZATION)) { return "personalized"; }
             // -ranges in request and response
             // we do not cache partial content
-            if (requestHeader.containsKey(HeaderFramework.RANGE)) { return "partial_request"; }
+            if (this.requestHeader.containsKey(HeaderFramework.RANGE)) { return "partial_request"; }
         }
-        
-        if (responseHeader != null) {
+
+        if (this.responseHeader != null) {
             // -ranges in request and response
-            // we do not cache partial content
-            if (responseHeader.containsKey(HeaderFramework.CONTENT_RANGE)) { return "partial_response"; }
+            // we do not cache partial content
+            if (this.responseHeader.containsKey(HeaderFramework.CONTENT_RANGE)) { return "partial_response"; }
         }
 
         return null;
     }
-    
+
     /**
      * decide upon header information if a specific file should be taken from
      * the cache or not
-     * 
+     *
      * @return whether the file should be taken from the cache
      */
     public boolean isFreshForProxy() {
@@ -385,27 +385,27 @@ public class Response {
         // -CGI access in request
         // CGI access makes the page very individual, and therefore not usable
         // in caches
-        if (this.url().isPOST()) {
+        if (url().isPOST()) {
             return false;
         }
 
-        if (this.url().isCGI()) {
+        if (url().isCGI()) {
             return false;
         }
 
         String cacheControl;
-        if (requestHeader != null) {
+        if (this.requestHeader != null) {
             // -authorization cases in request
-            if (requestHeader.containsKey(RequestHeader.AUTHORIZATION)) { return false; }
+            if (this.requestHeader.containsKey(RequestHeader.AUTHORIZATION)) { return false; }
 
             // -ranges in request
             // we do not cache partial content
-            if (requestHeader.containsKey(HeaderFramework.RANGE)) { return false; }
+            if (this.requestHeader.containsKey(HeaderFramework.RANGE)) { return false; }
 
             // if the client requests an un-cached copy of the resource ...
-            cacheControl = requestHeader.get(HeaderFramework.PRAGMA);
+            cacheControl = this.requestHeader.get(HeaderFramework.PRAGMA);
             if (cacheControl != null && cacheControl.trim().toUpperCase().equals("NO-CACHE")) { return false; }
 
-            cacheControl = requestHeader.get(HeaderFramework.CACHE_CONTROL);
+            cacheControl = this.requestHeader.get(HeaderFramework.CACHE_CONTROL);
             if (cacheControl != null) {
                 cacheControl = cacheControl.trim().toUpperCase();
                 if (cacheControl.startsWith("NO-CACHE") || cacheControl.startsWith("MAX-AGE=0")) { return false; }
@@ -414,14 +414,14 @@ public class Response {
             // -if-modified-since in request
             // The entity has to be transferred only if it has
             // been modified since the date given by the If-Modified-Since header.
-            if (requestHeader.containsKey(RequestHeader.IF_MODIFIED_SINCE)) {
+            if (this.requestHeader.containsKey(RequestHeader.IF_MODIFIED_SINCE)) {
                 // checking this only makes sense if the cached response contains
                 // a Last-Modified field. If the field does not exist, we go the safe way
-                if (!responseHeader.containsKey(HeaderFramework.LAST_MODIFIED)) { return false; }
+                if (!this.responseHeader.containsKey(HeaderFramework.LAST_MODIFIED)) { return false; }
                 // parse date
                 Date d1, d2;
-                d2 = responseHeader.lastModified(); if (d2 == null) { d2 = new Date(GenericFormatter.correctedUTCTime()); }
-                d1 = requestHeader.ifModifiedSince(); if (d1 == null) { d1 = new Date(GenericFormatter.correctedUTCTime()); }
+                d2 = this.responseHeader.lastModified(); if (d2 == null) { d2 = new Date(GenericFormatter.correctedUTCTime()); }
+                d1 = this.requestHeader.ifModifiedSince(); if (d1 == null) { d1 = new Date(GenericFormatter.correctedUTCTime()); }
                 // finally, we shall treat the cache as stale if the modification time is after the if-.. time
                 if (d2.after(d1)) { return false; }
             }
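A concrete reading of the If-Modified-Since test above (dates invented for illustration):

    // d2 = Last-Modified of the cached response, d1 = If-Modified-Since sent by the client;
    // e.g. d2 = 2011-03-02 10:00 and d1 = 2011-03-01 09:00  ->  d2.after(d1) == true:
    // the resource changed after the client's reference date, so the cached copy
    // must not be served as fresh and the method returns false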
@@ -433,48 +433,48 @@ public class Response {
             // but we think that pictures can still be considered as fresh
             // -set-cookie in cached response
             // this is a similar case as for COOKIE.
-            if (requestHeader.containsKey(RequestHeader.COOKIE) ||
-                responseHeader.containsKey(HeaderFramework.SET_COOKIE) ||
-                responseHeader.containsKey(HeaderFramework.SET_COOKIE2)) {
+            if (this.requestHeader.containsKey(RequestHeader.COOKIE) ||
+                this.responseHeader.containsKey(HeaderFramework.SET_COOKIE) ||
+                this.responseHeader.containsKey(HeaderFramework.SET_COOKIE2)) {
                 return false; // too strong
             }
         }
 
-        if (responseHeader != null) {
+        if (this.responseHeader != null) {
             // -pragma in cached response
             // logically, we would not need to care about no-cache pragmas in cached response headers,
             // because they cannot exist since they are not written to the cache.
             // So this IF should always fail..
-            cacheControl = responseHeader.get(HeaderFramework.PRAGMA);
+            cacheControl = this.responseHeader.get(HeaderFramework.PRAGMA);
             if (cacheControl != null && cacheControl.trim().toUpperCase().equals("NO-CACHE")) { return false; }
-            
+
             // see for documentation also:
             // http://www.web-caching.com/cacheability.html
             // http://vancouver-webpages.com/CacheNow/
-            
+
             // look for freshness information
            // if we don't have any freshness indication, we treat the file as stale.
            // no handle for freshness control:
-            
+
             // -expires in cached response
             // the expires value gives us a very easy hint when the cache is stale
-            final Date expires = responseHeader.expires();
+            final Date expires = this.responseHeader.expires();
             if (expires != null) {
             //  System.out.println("EXPIRES-TEST: expires=" + expires + ", NOW=" + serverDate.correctedGMTDate() + ", url=" + url);
                 if (expires.before(new Date(GenericFormatter.correctedUTCTime()))) { return false; }
             }
-            final Date lastModified = responseHeader.lastModified();
-            cacheControl = responseHeader.get(HeaderFramework.CACHE_CONTROL);
+            final Date lastModified = this.responseHeader.lastModified();
+            cacheControl = this.responseHeader.get(HeaderFramework.CACHE_CONTROL);
             if (cacheControl == null && lastModified == null && expires == null) { return false; }
-            
+
             // -lastModified in cached response
             // we can apply a TTL (Time To Live) heuristic here. We call the time delta between the last read
             // of the file and the last modified date as the age of the file. If we consider the file as
             // middle-aged, then the maximum TTL would be cache-creation plus age.
             // This would be a TTL factor of 100%; we want no more than 10% TTL, so that a 10 month old cache
             // file may only be treated as fresh for one more month, not more.
-            Date date = responseHeader.date();
+            Date date = this.responseHeader.date();
             if (lastModified != null) {
                 if (date == null) { date = new Date(GenericFormatter.correctedUTCTime()); }
                 final long age = date.getTime() - lastModified.getTime();
@@ -484,7 +484,7 @@ public class Response {
                 // therefore the cache is stale, if serverDate.correctedGMTDate().getTime() - d2.getTime() > age/10
                 if (GenericFormatter.correctedUTCTime() - date.getTime() > age / 10) { return false; }
             }
-            
+
             // -cache-control in cached response
             // the cache-control has many value options.
             if (cacheControl != null) {
@@ -510,17 +510,17 @@ public class Response {
                 }
             }
         }
-        
+
         return true;
     }
-    
+
     /**
      * decide upon header information if a specific file should be indexed
      * this method returns null if the answer is 'YES'!
* if the answer is 'NO' (do not index), it returns a string with the reason * to reject the crawling demand in clear text - * + * * This function is used by plasmaSwitchboard#processResourceStack */ public final String shallIndexCacheForProxy() { @@ -530,7 +530,7 @@ public class Response { // check profile if (!profile().indexText() && !profile().indexMedia()) { - return "indexing not allowed - indexText and indexMedia not set (for proxy = " + profile.name()+ ")"; + return "indexing not allowed - indexText and indexMedia not set (for proxy = " + this.profile.name()+ ")"; } // -CGI access in request @@ -556,7 +556,7 @@ public class Response { return "Media_Content_(forbidden)"; } */ - + // -cookies in request // unfortunately, we cannot index pages which have been requested with a cookie // because the returned content may be special for the client @@ -565,19 +565,19 @@ public class Response { return "Dynamic_(Requested_With_Cookie)"; } - if (responseHeader != null) { + if (this.responseHeader != null) { // -set-cookie in response // the set-cookie from the server does not indicate that the content is special - // thus we do not care about it here for indexing - + // thus we do not care about it here for indexing + // a picture cannot be indexed - final String mimeType = responseHeader.mime(); + final String mimeType = this.responseHeader.mime(); /* if (Classification.isPictureMime(mimeType)) { return "Media_Content_(Picture)"; } */ - String parserError = TextParser.supportsMime(mimeType); + final String parserError = TextParser.supportsMime(mimeType); if (parserError != null) { return "Media_Content, no parser: " + parserError; } @@ -585,9 +585,9 @@ public class Response { // -if-modified-since in request // if the page is fresh at the very moment we can index it final Date ifModifiedSince = this.requestHeader.ifModifiedSince(); - if ((ifModifiedSince != null) && (responseHeader.containsKey(HeaderFramework.LAST_MODIFIED))) { + if ((ifModifiedSince != null) && (this.responseHeader.containsKey(HeaderFramework.LAST_MODIFIED))) { // parse date - Date d = responseHeader.lastModified(); + Date d = this.responseHeader.lastModified(); if (d == null) { d = new Date(GenericFormatter.correctedUTCTime()); } @@ -599,8 +599,8 @@ public class Response { } // -pragma in cached response - if (responseHeader.containsKey(HeaderFramework.PRAGMA) && - (responseHeader.get(HeaderFramework.PRAGMA)).toUpperCase().equals("NO-CACHE")) { + if (this.responseHeader.containsKey(HeaderFramework.PRAGMA) && + (this.responseHeader.get(HeaderFramework.PRAGMA)).toUpperCase().equals("NO-CACHE")) { return "Denied_(pragma_no_cache)"; } @@ -613,7 +613,7 @@ public class Response { // the expires value gives us a very easy hint when the cache is stale // sometimes, the expires date is set to the past to prevent that a page is cached // we use that information to see if we should index it - final Date expires = responseHeader.expires(); + final Date expires = this.responseHeader.expires(); if (expires != null && expires.before(new Date(GenericFormatter.correctedUTCTime()))) { return "Stale_(Expired)"; } @@ -624,7 +624,7 @@ public class Response { // -cache-control in cached response // the cache-control has many value options. 
- String cacheControl = responseHeader.get(HeaderFramework.CACHE_CONTROL); + String cacheControl = this.responseHeader.get(HeaderFramework.CACHE_CONTROL); if (cacheControl != null) { cacheControl = cacheControl.trim().toUpperCase(); /* we have the following cases for cache-control: @@ -641,7 +641,7 @@ public class Response { // // ok, do nothing } else if (cacheControl.startsWith("MAX-AGE=")) { // we need also the load date - final Date date = responseHeader.date(); + final Date date = this.responseHeader.date(); if (date == null) { return "Stale_(no_date_given_in_response)"; } @@ -675,7 +675,7 @@ public class Response { // check profile if (!profile().indexText() && !profile().indexMedia()) { - return "indexing not allowed - indexText and indexMedia not set (for crawler = " + profile.name() + ")"; + return "indexing not allowed - indexText and indexMedia not set (for crawler = " + this.profile.name() + ")"; } // -CGI access in request @@ -692,9 +692,9 @@ public class Response { // we checked that in shallStoreCache // check if document can be indexed - if (responseHeader != null) { - final String mimeType = responseHeader.mime(); - String parserError = TextParser.supportsMime(mimeType); + if (this.responseHeader != null) { + final String mimeType = this.responseHeader.mime(); + final String parserError = TextParser.supportsMime(mimeType); if (parserError != null && TextParser.supportsExtension(url()) != null) return "no parser available: " + parserError; } /* @@ -703,7 +703,7 @@ public class Response { return "Media_Content_(forbidden)"; } */ - + // -if-modified-since in request // if the page is fresh at the very moment we can index it // -> this does not apply for the crawler @@ -739,36 +739,36 @@ public class Response { return null; } - + public String getMimeType() { - if (responseHeader == null) return null; - - String mimeType = responseHeader.mime(); + if (this.responseHeader == null) return null; + + String mimeType = this.responseHeader.mime(); mimeType = mimeType.trim().toLowerCase(); - + final int pos = mimeType.indexOf(';'); - return ((pos < 0) ? mimeType : mimeType.substring(0, pos)); + return ((pos < 0) ? mimeType : mimeType.substring(0, pos)); } - + public String getCharacterEncoding() { - if (responseHeader == null) return null; - return responseHeader.getCharacterEncoding(); + if (this.responseHeader == null) return null; + return this.responseHeader.getCharacterEncoding(); } - + public DigestURI referrerURL() { - if (requestHeader == null) return null; + if (this.requestHeader == null) return null; try { - String r = requestHeader.get(RequestHeader.REFERER, null); + final String r = this.requestHeader.get(RequestHeader.REFERER, null); if (r == null) return null; return new DigestURI(r); } catch (final Exception e) { return null; } } - + public byte[] referrerHash() { - if (requestHeader == null) return null; - String u = requestHeader.get(RequestHeader.REFERER, ""); + if (this.requestHeader == null) return null; + final String u = this.requestHeader.get(RequestHeader.REFERER, ""); if (u == null || u.length() == 0) return null; try { return new DigestURI(u).hash(); @@ -776,27 +776,27 @@ public class Response { return null; } } - + public boolean validResponseStatus() { - return (responseStatus == null) ? false : responseStatus.startsWith("200") || responseStatus.startsWith("203"); + return (this.responseStatus == null) ? 
false : this.responseStatus.startsWith("200") || this.responseStatus.startsWith("203"); } public Date ifModifiedSince() { - return (requestHeader == null) ? null : requestHeader.ifModifiedSince(); + return (this.requestHeader == null) ? null : this.requestHeader.ifModifiedSince(); } public boolean requestWithCookie() { - return (requestHeader == null) ? false : requestHeader.containsKey(RequestHeader.COOKIE); + return (this.requestHeader == null) ? false : this.requestHeader.containsKey(RequestHeader.COOKIE); } public boolean requestProhibitsIndexing() { - return (requestHeader == null) - ? false - : requestHeader.containsKey(HeaderFramework.X_YACY_INDEX_CONTROL) && - (requestHeader.get(HeaderFramework.X_YACY_INDEX_CONTROL)).toUpperCase().equals("NO-INDEX"); + return (this.requestHeader == null) + ? false + : this.requestHeader.containsKey(HeaderFramework.X_YACY_INDEX_CONTROL) && + (this.requestHeader.get(HeaderFramework.X_YACY_INDEX_CONTROL)).toUpperCase().equals("NO-INDEX"); } - - public EventOrigin processCase(String mySeedHash) { + + public EventOrigin processCase(final String mySeedHash) { // we must distinguish the following cases: resource-load was initiated by // 1) global crawling: the index is extern, not here (not possible here) // 2) result of search queries, some indexes are here (not possible here) @@ -818,13 +818,13 @@ public class Response { } return processCase; } - + public Document[] parse() throws Parser.Failure { - String supportError = TextParser.supports(url(), this.responseHeader == null ? null : this.responseHeader.mime()); + final String supportError = TextParser.supports(url(), this.responseHeader == null ? null : this.responseHeader.mime()); if (supportError != null) throw new Parser.Failure("no parser support:" + supportError, url()); try { - return TextParser.parseSource(url(), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? "UTF-8" : this.responseHeader.getCharacterEncoding(), this.content); - } catch (Exception e) { + return TextParser.parseSource(url(), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? 
"UTF-8" : this.responseHeader.getCharacterEncoding(), this.content, false); + } catch (final Exception e) { return null; } diff --git a/source/de/anomic/search/DocumentIndex.java b/source/de/anomic/search/DocumentIndex.java index 7394e2c85..cade93eb6 100644 --- a/source/de/anomic/search/DocumentIndex.java +++ b/source/de/anomic/search/DocumentIndex.java @@ -35,7 +35,6 @@ import java.util.Date; import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; - import net.yacy.cora.document.UTF8; import net.yacy.document.Condenser; import net.yacy.document.Document; @@ -53,26 +52,26 @@ import net.yacy.kelondro.logging.Log; * */ public class DocumentIndex extends Segment { - + private static final RankingProfile textRankingDefault = new RankingProfile(ContentDomain.TEXT); //private Bitfield zeroConstraint = new Bitfield(4); - + private static DigestURI poison; static { try { poison = new DigestURI("file://."); - } catch (MalformedURLException e) {} + } catch (final MalformedURLException e) {} } BlockingQueue queue; // a queue of document ID's - private Worker[] worker; + private final Worker[] worker; CallbackListener callback; static final ThreadGroup workerThreadGroup = new ThreadGroup("workerThreadGroup"); - - - public DocumentIndex(final File segmentPath, CallbackListener callback, int cachesize) throws IOException { + + + public DocumentIndex(final File segmentPath, final CallbackListener callback, final int cachesize) throws IOException { super(new Log("DocumentIndex"), segmentPath, cachesize, targetFileSize * 4 - 1, false, false); - int cores = Runtime.getRuntime().availableProcessors() + 1; + final int cores = Runtime.getRuntime().availableProcessors() + 1; this.callback = callback; this.queue = new LinkedBlockingQueue(cores * 300); this.worker = new Worker[cores]; @@ -81,46 +80,48 @@ public class DocumentIndex extends Segment { this.worker[i].start(); } } - + class Worker extends Thread { - public Worker(int count) { + public Worker(final int count) { super(workerThreadGroup, "query-" + count); } - + @Override public void run() { DigestURI f; - URIMetadataRow resultRow; + URIMetadataRow[] resultRows; try { - while ((f = queue.take()) != poison) try { - resultRow = add(f); - if (callback != null) { - if (resultRow == null) { - callback.fail(f, "result is null"); - } else { - callback.commit(f, resultRow); + while ((f = DocumentIndex.this.queue.take()) != poison) try { + resultRows = add(f); + for (final URIMetadataRow resultRow: resultRows) { + if (DocumentIndex.this.callback != null) { + if (resultRow == null) { + DocumentIndex.this.callback.fail(f, "result is null"); + } else { + DocumentIndex.this.callback.commit(f, resultRow); + } } } - } catch (IOException e) { + } catch (final IOException e) { if (e.getMessage().indexOf("cannot parse") < 0) Log.logException(e); - callback.fail(f, e.getMessage()); + DocumentIndex.this.callback.fail(f, e.getMessage()); } - } catch (InterruptedException e) {} + } catch (final InterruptedException e) {} } } - + /** * get the number of pending documents in the indexing queue */ public int pending() { return this.queue.size(); } - + public void clearQueue() { this.queue.clear(); } - - private URIMetadataRow add(DigestURI url) throws IOException { + + private URIMetadataRow[] add(final DigestURI url) throws IOException { if (url == null) throw new IOException("file = null"); if (url.isDirectory()) throw new IOException("file should be a document, not a path"); if (!url.canRead()) throw new IOException("cannot read file"); @@ 
@@ -128,17 +129,20 @@ public class DocumentIndex extends Segment {
         long length;
         try {
             length = url.length();
-        } catch (Exception e) {
+        } catch (final Exception e) {
             length = -1;
         }
         try {
-            documents = TextParser.parseSource(url, null, null, length, url.getInputStream(null, -1));
-        } catch (Exception e) {
+            documents = TextParser.parseSource(url, null, null, length, url.getInputStream(null, -1), true);
+        } catch (final Exception e) {
             throw new IOException("cannot parse " + url.toString() + ": " + e.getMessage());
         }
-        Document document = Document.mergeDocuments(url, null, documents);
-        final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib);
-        return super.storeDocument(
+        //Document document = Document.mergeDocuments(url, null, documents);
+        final URIMetadataRow[] rows = new URIMetadataRow[documents.length];
+        int c = 0;
+        for (final Document document: documents) {
+            final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib);
+            rows[c++] = super.storeDocument(
                 url,
                 null,
                 new Date(url.lastModified()),
@@ -149,25 +153,27 @@ public class DocumentIndex extends Segment {
                 null,
                 DocumentIndex.class.getName() + ".add"
                 );
+        }
+        return rows;
     }
-    
+
     /**
      * add a file or a directory of files to the index
      * If the given file is a path to a directory, the complete sub-tree is indexed
      * @param start
      */
-    public void addConcurrent(DigestURI start) throws IOException {
+    public void addConcurrent(final DigestURI start) throws IOException {
         assert (start != null);
         assert (start.canRead()) : start.toString();
         if (!start.isDirectory()) {
             try {
                 this.queue.put(start);
-            } catch (InterruptedException e) {}
+            } catch (final InterruptedException e) {}
             return;
         }
-        String[] s = start.list();
+        final String[] s = start.list();
         DigestURI w;
-        for (String t: s) {
+        for (final String t: s) {
             try {
                 w = new DigestURI(start, t);
                 if (w.canRead() && !w.isHidden()) {
@@ -176,31 +182,31 @@ public class DocumentIndex extends Segment {
                 } else {
                     try {
                         this.queue.put(w);
-                    } catch (InterruptedException e) {}
+                    } catch (final InterruptedException e) {}
                 }
             }
-            } catch (MalformedURLException e1) {
+            } catch (final MalformedURLException e1) {
                 Log.logException(e1);
             }
         }
     }
-    
+
     /**
      * do a full-text search of a given string and return a specific number of results
      * @param querystring
      * @param count
      * @return a list of files that contain the given string
-     */ 
-    public ArrayList<DigestURI> find(String querystring, int count) {
+     */
+    public ArrayList<DigestURI> find(final String querystring, int count) {
         // make a query and start a search
-        QueryParams query = new QueryParams(querystring, count, null, this, textRankingDefault, "DocumentIndex");
-        ReferenceOrder order = new ReferenceOrder(query.ranking, UTF8.getBytes(query.targetlang));
-        RankingProcess rankedCache = new RankingProcess(query, order, SearchEvent.max_results_preparation);
+        final QueryParams query = new QueryParams(querystring, count, null, this, textRankingDefault, "DocumentIndex");
+        final ReferenceOrder order = new ReferenceOrder(query.ranking, UTF8.getBytes(query.targetlang));
+        final RankingProcess rankedCache = new RankingProcess(query, order, SearchEvent.max_results_preparation);
         rankedCache.start();
-        
+
         // search is running; retrieve results
         URIMetadataRow row;
-        ArrayList files = new ArrayList();
+        final ArrayList<DigestURI> files = new ArrayList<DigestURI>();
         Components metadata;
         while ((row = rankedCache.takeURL(false, 1000)) != null) {
             metadata = row.metadata();
@@ -211,7 +217,7 @@ public class DocumentIndex extends Segment {
         }
         return files;
     }
-    
+
     /**
      * close the index.
      * This terminates all worker threads and then closes the segment.
@@ -219,27 +225,27 @@ public class DocumentIndex extends Segment {
     @Override
     public void close() {
         // send termination signal to worker threads
-        for (int i = 0; i < this.worker.length; i++) {
+        for (final Worker element : this.worker) {
             try {
                 this.queue.put(poison);
-            } catch (InterruptedException e) {}
+            } catch (final InterruptedException e) {}
         }
         // wait for termination
-        for (int i = 0; i < this.worker.length; i++) {
+        for (final Worker element : this.worker) {
             try {
-                this.worker[i].join();
-            } catch (InterruptedException e) {}
+                element.join();
+            } catch (final InterruptedException e) {}
         }
         // close the segment
         super.close();
     }
-    
+
     public interface CallbackListener {
         public void commit(DigestURI f, URIMetadataRow resultRow);
         public void fail(DigestURI f, String failReason);
     }
-    
-    public static void main(String[] args) {
+
+    public static void main(final String[] args) {
         // first argument: path to segment
         // second argument: either 'add' or 'search'
         // third and more arguments exist only in case that the second argument is 'search': these are then the search words
@@ -249,37 +255,37 @@ public class DocumentIndex extends Segment {
         // DocumentIndex yacyindex search steht
         System.setProperty("java.awt.headless", "true");
         if (args.length < 3) return;
-        File segmentPath = new File(args[0]);
+        final File segmentPath = new File(args[0]);
         System.out.println("using index files at " + segmentPath.getAbsolutePath());
-        CallbackListener callback = new CallbackListener() {
-            public void commit(DigestURI f, URIMetadataRow resultRow) {
+        final CallbackListener callback = new CallbackListener() {
+            public void commit(final DigestURI f, final URIMetadataRow resultRow) {
                 System.out.println("indexed: " + f.toString());
             }
-            public void fail(DigestURI f, String failReason) {
+            public void fail(final DigestURI f, final String failReason) {
                 System.out.println("not indexed " + f.toString() + ": " + failReason);
             }
         };
         try {
             if (args[1].equals("add")) {
-                DigestURI f = new DigestURI(args[2]);
-                DocumentIndex di = new DocumentIndex(segmentPath, callback, 100000);
+                final DigestURI f = new DigestURI(args[2]);
+                final DocumentIndex di = new DocumentIndex(segmentPath, callback, 100000);
                 di.addConcurrent(f);
                 di.close();
             } else {
                 String query = "";
                 for (int i = 2; i < args.length; i++) query += args[i];
                 query.trim();
-                DocumentIndex di = new DocumentIndex(segmentPath, callback, 100000);
-                ArrayList results = di.find(query, 100);
-                for (DigestURI f: results) {
+                final DocumentIndex di = new DocumentIndex(segmentPath, callback, 100000);
+                final ArrayList<DigestURI> results = di.find(query, 100);
+                for (final DigestURI f: results) {
                     if (f != null) System.out.println(f.toString());
                 }
                 di.close();
             }
-        } catch (IOException e) {
+        } catch (final IOException e) {
             Log.logException(e);
         }
         //System.exit(0);
     }
-    
 }
diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java
index 48f63e0be..e5ea8f545 100644
--- a/source/de/anomic/search/Switchboard.java
+++ b/source/de/anomic/search/Switchboard.java
@@ -1921,7 +1921,7 @@ public final class Switchboard extends serverSwitch {
                 assert response.getContent() != null;
                 try {
                     // parse the document
-                    documents = TextParser.parseSource(response.url(), response.getMimeType(), response.getCharacterEncoding(), response.getContent());
+                    documents = TextParser.parseSource(response.url(), response.getMimeType(), response.getCharacterEncoding(), response.getContent(), getConfigBool("crawler.embedLinksAsDocuments", false));
                     if (documents == null) {
                         throw new Parser.Failure("Parser returned null.", response.url());
                     }
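Note how the new boolean is threaded through the three call sites touched by this patch: the crawler lets the yacy.init flag decide, DocumentIndex hard-wires true (presumably because desktop search should return media files as results), and MediawikiImporter further below hard-wires false (a wiki dump needs no virtual media documents). In short:

    // Switchboard (crawler):     parseSource(..., response.getContent(), getConfigBool("crawler.embedLinksAsDocuments", false))
    // DocumentIndex (files):     parseSource(..., url.getInputStream(null, -1), true)
    // MediawikiImporter (dump):  parseSource(..., UTF8.getBytes(html), false)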
getConfigBool("crawler.embedLinksAsDocuments", false)); if (documents == null) { throw new Parser.Failure("Parser returned null.", response.url()); } diff --git a/source/net/yacy/document/AbstractParser.java b/source/net/yacy/document/AbstractParser.java index e168ff07a..3c82f2e01 100644 --- a/source/net/yacy/document/AbstractParser.java +++ b/source/net/yacy/document/AbstractParser.java @@ -7,12 +7,12 @@ * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. - * + * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program in the file lgpl21.txt * If not, see . @@ -29,27 +29,27 @@ import java.util.Set; import net.yacy.kelondro.logging.Log; public abstract class AbstractParser implements Parser { - + protected final Log log = new Log("PARSER"); protected final Set SUPPORTED_MIME_TYPES = new HashSet(); protected final Set SUPPORTED_EXTENSIONS = new HashSet(); private final String name; - + /** * initialize a parser with a name * @param name */ - public AbstractParser(String name) { + public AbstractParser(final String name) { this.name = name; } - + /** * return the name of the parser */ public String getName() { return this.name; } - + /** * each parser must define a set of supported mime types * @return a set of mime type strings that are supported @@ -57,7 +57,7 @@ public abstract class AbstractParser implements Parser { public Set supportedMimeTypes() { return this.SUPPORTED_MIME_TYPES; } - + /** * each parser must define a set of supported file extensions * @return a set of file name extensions that are supported @@ -65,22 +65,22 @@ public abstract class AbstractParser implements Parser { public Set supportedExtensions() { return this.SUPPORTED_EXTENSIONS; } - + /** * check equivalence of parsers; this simply tests equality of parser names * @param o * @return */ - public boolean equals(Object o) { - return this.getName().equals(((Parser) o).getName()); + public boolean equals(final Object o) { + return getName().equals(((Parser) o).getName()); } - + /** * the hash code of a parser * @return the hash code of the parser name string */ public int hashCode() { - return this.getName().hashCode(); + return getName().hashCode(); } } diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index d334c5a12..666d64358 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -131,6 +131,10 @@ public class Document { return this.parserObject; } + public Set getContentLanguages() { + return this.languages; + } + /** * compute a set of languages that this document contains * the language is not computed using a statistical analysis of the content, only from given metadata that came with the document diff --git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java index 569293d6a..018982820 100644 --- a/source/net/yacy/document/TextParser.java +++ b/source/net/yacy/document/TextParser.java @@ -31,6 +31,7 @@ import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Map.Entry; import java.util.Set; import 
diff --git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java
index 569293d6a..018982820 100644
--- a/source/net/yacy/document/TextParser.java
+++ b/source/net/yacy/document/TextParser.java
@@ -31,6 +31,7 @@ import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
+import java.util.Map.Entry;
 import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
@@ -58,6 +59,7 @@ import net.yacy.document.parser.vcfParser;
 import net.yacy.document.parser.vsdParser;
 import net.yacy.document.parser.xlsParser;
 import net.yacy.document.parser.zipParser;
+import net.yacy.document.parser.html.ImageEntry;
 import net.yacy.document.parser.images.genericImageParser;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.util.FileUtils;
@@ -141,7 +143,8 @@ public final class TextParser {
             final MultiProtocolURI location,
             final String mimeType,
             final String charset,
-            final File sourceFile
+            final File sourceFile,
+            final boolean multipleVirtualDocs
         ) throws InterruptedException, Parser.Failure {
 
         BufferedInputStream sourceStream = null;
@@ -154,7 +157,7 @@ public final class TextParser {
                 throw new Parser.Failure(errorMsg, location);
             }
             sourceStream = new BufferedInputStream(new FileInputStream(sourceFile));
-            docs = parseSource(location, mimeType, charset, sourceFile.length(), sourceStream);
+            docs = parseSource(location, mimeType, charset, sourceFile.length(), sourceStream, multipleVirtualDocs);
         } catch (final Exception e) {
             if (e instanceof InterruptedException) throw (InterruptedException) e;
             if (e instanceof Parser.Failure) throw (Parser.Failure) e;
@@ -164,6 +167,7 @@ public final class TextParser {
             if (sourceStream != null) try { sourceStream.close(); } catch (final Exception ex) {}
         }
         for (final Document d: docs) { assert d.getText() != null; } // verify docs
+
         return docs;
     }
 
@@ -171,7 +175,8 @@ public final class TextParser {
             final MultiProtocolURI location,
             String mimeType,
             final String charset,
-            final byte[] content
+            final byte[] content,
+            final boolean multipleVirtualDocs
         ) throws Parser.Failure {
         if (log.isFine()) log.logFine("Parsing '" + location + "' from byte-array");
         mimeType = normalizeMimeType(mimeType);
@@ -185,7 +190,12 @@ public final class TextParser {
         }
         assert !idioms.isEmpty() : "no parsers applied for url " + location.toNormalform(true, false);
 
-        return parseSource(location, mimeType, idioms, charset, content);
+        Document[] docs = parseSource(location, mimeType, idioms, charset, content);
+
+        // finally enrich the docs set with virtual docs from the embedded documents
+        if (multipleVirtualDocs && docs.length == 1) docs = virtualDocs(docs[0]);
+
+        return docs;
     }
 
     public static Document[] parseSource(
@@ -193,7 +203,8 @@ public final class TextParser {
             String mimeType,
             final String charset,
             final long contentLength,
-            final InputStream sourceStream
+            final InputStream sourceStream,
+            final boolean multipleVirtualDocs
         ) throws Parser.Failure {
         if (log.isFine()) log.logFine("Parsing '" + location + "' from stream");
         mimeType = normalizeMimeType(mimeType);
@@ -222,7 +233,12 @@ public final class TextParser {
         } catch (final IOException e) {
             throw new Parser.Failure(e.getMessage(), location);
         }
-        return parseSource(location, mimeType, idioms, charset, b);
+        Document[] docs = parseSource(location, mimeType, idioms, charset, b);
+
+        // finally enrich the docs set with virtual docs from the embedded documents
+        if (multipleVirtualDocs && docs.length == 1) docs = virtualDocs(docs[0]);
+
+        return docs;
     }
 
     private static Document[] parseSource(
@@ -292,6 +308,7 @@ public final class TextParser {
             }
         }
         for (final Document d: docs) { assert d.getText() != null : "mimeType = " + mimeType; } // verify docs
+
         return docs;
     }
 
@@ -429,4 +446,73 @@ public final class TextParser {
         if (grant) denyExtensionx.remove(ext); else denyExtensionx.put(ext, v);
     }
 
+    /**
+     * produce virtual documents for each link that is contained in the document
+     * @param document
+     * @return
+     */
+    public static Document[] virtualDocs(final Document document) {
+
+        final ArrayList<Document> docs = new ArrayList<Document>();
+        docs.add(document);
+        for (final Map.Entry<MultiProtocolURI, String> link: document.getApplinks().entrySet()) {
+            docs.add(genLinkDocs(docs, "application", link.getKey(), link.getValue(), document.getContentLanguages()));
+        }
+        for (final Map.Entry<MultiProtocolURI, String> link: document.getAudiolinks().entrySet()) {
+            docs.add(genLinkDocs(docs, "audio", link.getKey(), link.getValue(), document.getContentLanguages()));
+        }
+        for (final Map.Entry<MultiProtocolURI, String> link: document.getVideolinks().entrySet()) {
+            docs.add(genLinkDocs(docs, "video", link.getKey(), link.getValue(), document.getContentLanguages()));
+        }
+        for (final Entry<MultiProtocolURI, ImageEntry> link: document.getImages().entrySet()) {
+            docs.add(genImageDocs(docs, link.getValue()));
+        }
+
+        // finally return the list of documents
+        return docs.toArray(new Document[docs.size()]);
+    }
+
+    private final static Document genLinkDocs(final ArrayList<Document> docs, final String type, final MultiProtocolURI uri, final String descr, final Set<String> contentLanguages) {
+        //System.out.println("HTMLPARSER-LINK " + type + ": " + uri.toNormalform(true, false) + " / " + descr);
+        return new Document(
+                uri,
+                Classification.ext2mime(uri.getFileExtension()),
+                "UTF-8",
+                null,
+                contentLanguages,
+                null,
+                descr,
+                "",
+                "",
+                new String[]{descr},
+                type,
+                0.0f, 0.0f,
+                uri.toNormalform(false, false),
+                null,
+                null,
+                null,
+                false);
+    }
+
+    private final static Document genImageDocs(final ArrayList<Document> docs, final ImageEntry img) {
+        //System.out.println("HTMLPARSER-LINK image: " + img.url().toNormalform(true, false) + " / " + img.alt());
+        return new Document(
+                img.url(),
+                Classification.ext2mime(img.url().getFileExtension()),
+                "UTF-8",
+                null,
+                null,
+                null,
+                img.alt(),
+                "",
+                "",
+                new String[]{img.alt()},
+                "image",
+                0.0f, 0.0f,
+                img.url().toNormalform(false, false),
+                null,
+                null,
+                null,
+                false);
+    }
 }
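To make the effect of virtualDocs() concrete: an HTML page that embeds one MP3 link and one image now yields three documents instead of one (URLs invented for illustration):

    final Document[] docs = TextParser.parseSource(pageURI, "text/html", "UTF-8", content, true);
    // docs[0] = http://example.org/page.html   (the parsed page itself)
    // docs[1] = http://example.org/sound.mp3   (type "audio", text taken from the anchor description)
    // docs[2] = http://example.org/photo.jpg   (type "image", text taken from the alt attribute)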
diff --git a/source/net/yacy/document/importer/MediawikiImporter.java b/source/net/yacy/document/importer/MediawikiImporter.java
index 5ade909da..ef12b49e4 100644
--- a/source/net/yacy/document/importer/MediawikiImporter.java
+++ b/source/net/yacy/document/importer/MediawikiImporter.java
@@ -2,19 +2,19 @@
  * MediawikiImporter
  * Copyright 2008 by Michael Peter Christen
  * First released 20.11.2008 at http://yacy.net
- * 
+ *
  * This is a part of YaCy, a peer-to-peer based web search engine
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
- * 
+ *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  * Lesser General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program in the file lgpl21.txt
 * If not, see <http://www.gnu.org/licenses/>.
@@ -22,17 +22,6 @@ package net.yacy.document.importer; -import net.yacy.cora.document.UTF8; -import net.yacy.document.Document; -import net.yacy.document.Parser; -import net.yacy.document.TextParser; -import net.yacy.document.content.SurrogateReader; -import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.logging.Log; -import net.yacy.kelondro.util.ByteBuffer; - -import org.apache.tools.bzip2.CBZip2InputStream; - import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.BufferedReader; @@ -61,6 +50,17 @@ import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import java.util.zip.GZIPInputStream; +import net.yacy.cora.document.UTF8; +import net.yacy.document.Document; +import net.yacy.document.Parser; +import net.yacy.document.TextParser; +import net.yacy.document.content.SurrogateReader; +import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.logging.Log; +import net.yacy.kelondro.util.ByteBuffer; + +import org.apache.tools.bzip2.CBZip2InputStream; + import de.anomic.data.wiki.WikiCode; import de.anomic.data.wiki.WikiParser; @@ -78,9 +78,9 @@ public class MediawikiImporter extends Thread implements Importer { private static final byte[] pagestartb = UTF8.getBytes(pagestart); private static final byte[] pageendb = UTF8.getBytes(pageend); private static final int docspermbinxmlbz2 = 800; // documents per megabyte in a xml.bz2 mediawiki dump - + public static Importer job; // if started from a servlet, this object is used to store the thread - + public File sourcefile; public File targetdir; public int count; @@ -88,100 +88,100 @@ public class MediawikiImporter extends Thread implements Importer { private final long docsize; private final int approxdocs; private String hostport, urlStub; - - - public MediawikiImporter(File sourcefile, File targetdir) { + + + public MediawikiImporter(final File sourcefile, final File targetdir) { this.sourcefile = sourcefile; this.docsize = sourcefile.length(); - this.approxdocs = (int) (this.docsize * (long) docspermbinxmlbz2 / 1024L / 1024L); + this.approxdocs = (int) (this.docsize * docspermbinxmlbz2 / 1024L / 1024L); this.targetdir = targetdir; this.count = 0; this.start = 0; this.hostport = null; this.urlStub = null; } - + public int count() { return this.count; } - + public String source() { return this.sourcefile.getAbsolutePath(); } - + public String status() { return ""; } - + /** * return the number of articles per second * @return */ public int speed() { - if (count == 0) return 0; - return (int) ((long) count / Math.max(1L, runningTime() )); + if (this.count == 0) return 0; + return (int) (this.count / Math.max(1L, runningTime() )); } - + /** * return the remaining seconds for the completion of all records in milliseconds * @return */ public long remainingTime() { - return Math.max(0, this.approxdocs - count) / Math.max(1, speed() ); + return Math.max(0, this.approxdocs - this.count) / Math.max(1, speed() ); } - + public long runningTime() { - return (System.currentTimeMillis() - start) / 1000L; + return (System.currentTimeMillis() - this.start) / 1000L; } - + public void run() { this.start = System.currentTimeMillis(); try { - String targetstub = sourcefile.getName(); + String targetstub = this.sourcefile.getName(); int p = targetstub.lastIndexOf("\\."); if (p > 0) targetstub = targetstub.substring(0, p); - InputStream is = new BufferedInputStream(new FileInputStream(sourcefile), 1024 * 1024); - if (sourcefile.getName().endsWith(".bz2")) { + InputStream is 
= new BufferedInputStream(new FileInputStream(this.sourcefile), 1024 * 1024); + if (this.sourcefile.getName().endsWith(".bz2")) { int b = is.read(); if (b != 'B') throw new IOException("Invalid bz2 content."); b = is.read(); if (b != 'Z') throw new IOException("Invalid bz2 content."); is = new CBZip2InputStream(is); - } else if (sourcefile.getName().endsWith(".gz")) { + } else if (this.sourcefile.getName().endsWith(".gz")) { is = new GZIPInputStream(is); } - BufferedReader r = new BufferedReader(new java.io.InputStreamReader(is, "UTF-8"), 4 * 1024 * 1024); + final BufferedReader r = new BufferedReader(new java.io.InputStreamReader(is, "UTF-8"), 4 * 1024 * 1024); String t; StringBuilder sb = new StringBuilder(); boolean page = false, text = false; String title = null; - wikiparserrecord poison = newRecord(); - int threads = Math.max(2, Runtime.getRuntime().availableProcessors() - 1); - BlockingQueue in = new ArrayBlockingQueue(threads * 10); - BlockingQueue out = new ArrayBlockingQueue(threads * 10); - ExecutorService service = Executors.newFixedThreadPool(threads + 1); - convertConsumer[] consumers = new convertConsumer[threads]; - Future[] consumerResults = new Future[threads]; + final wikiparserrecord poison = newRecord(); + final int threads = Math.max(2, Runtime.getRuntime().availableProcessors() - 1); + final BlockingQueue in = new ArrayBlockingQueue(threads * 10); + final BlockingQueue out = new ArrayBlockingQueue(threads * 10); + final ExecutorService service = Executors.newFixedThreadPool(threads + 1); + final convertConsumer[] consumers = new convertConsumer[threads]; + final Future[] consumerResults = new Future[threads]; for (int i = 0; i < threads; i++) { consumers[i] = new convertConsumer(in, out, poison); consumerResults[i] = service.submit(consumers[i]); } - convertWriter writer = new convertWriter(out, poison, targetdir, targetstub); - Future writerResult = service.submit(writer); - + final convertWriter writer = new convertWriter(out, poison, this.targetdir, targetstub); + final Future writerResult = service.submit(writer); + wikiparserrecord record; int q; while ((t = r.readLine()) != null) { if ((p = t.indexOf("")) >= 0 && (q = t.indexOf("", p)) > 0) { //urlStub = "http://" + lang + ".wikipedia.org/wiki/"; - urlStub = t.substring(p + 6, q); - if (!urlStub.endsWith("/")) { - q = urlStub.lastIndexOf('/'); - if (q > 0) urlStub = urlStub.substring(0, q + 1); + this.urlStub = t.substring(p + 6, q); + if (!this.urlStub.endsWith("/")) { + q = this.urlStub.lastIndexOf('/'); + if (q > 0) this.urlStub = this.urlStub.substring(0, q + 1); } - DigestURI uri = new DigestURI(urlStub); - hostport = uri.getHost(); - if (uri.getPort() != 80) hostport += ":" + uri.getPort(); + final DigestURI uri = new DigestURI(this.urlStub); + this.hostport = uri.getHost(); + if (uri.getPort() != 80) this.hostport += ":" + uri.getPort(); continue; } if (t.indexOf(pagestart) >= 0) { @@ -192,7 +192,7 @@ public class MediawikiImporter extends Thread implements Importer { text = page; q = t.indexOf('>', p + textstart.length()); if (q > 0) { - int u = t.indexOf(textend, q + 1); + final int u = t.indexOf(textend, q + 1); if (u > q) { sb.append(t.substring(q + 1, u)); Log.logInfo("WIKITRANSLATION", "[INJECT] Title: " + title); @@ -200,11 +200,11 @@ public class MediawikiImporter extends Thread implements Importer { Log.logInfo("WIKITRANSLATION", "ERROR: " + title + " has empty content"); continue; } - record = newRecord(hostport, urlStub, title, sb); + record = newRecord(this.hostport, this.urlStub, title, sb); 
try { in.put(record); this.count++; - } catch (InterruptedException e1) { + } catch (final InterruptedException e1) { Log.logException(e1); } sb = new StringBuilder(200); @@ -222,11 +222,11 @@ public class MediawikiImporter extends Thread implements Importer { Log.logInfo("WIKITRANSLATION", "ERROR: " + title + " has empty content"); continue; } - record = newRecord(hostport, urlStub, title, sb); + record = newRecord(this.hostport, this.urlStub, title, sb); try { in.put(record); this.count++; - } catch (InterruptedException e1) { + } catch (final InterruptedException e1) { Log.logException(e1); } sb = new StringBuilder(200); @@ -248,7 +248,7 @@ public class MediawikiImporter extends Thread implements Importer { } } r.close(); - + try { for (int i = 0; i < threads; i++) { in.put(poison); @@ -258,35 +258,35 @@ public class MediawikiImporter extends Thread implements Importer { } out.put(poison); writerResult.get(10000, TimeUnit.MILLISECONDS); - } catch (InterruptedException e) { + } catch (final InterruptedException e) { Log.logException(e); - } catch (ExecutionException e) { + } catch (final ExecutionException e) { Log.logException(e); - } catch (TimeoutException e) { + } catch (final TimeoutException e) { Log.logException(e); - } catch (Exception e) { + } catch (final Exception e) { Log.logException(e); } - } catch (IOException e) { + } catch (final IOException e) { Log.logException(e); - } catch (Exception e) { + } catch (final Exception e) { Log.logException(e); } } - - public static void checkIndex(File mediawikixml) { - File idx = idxFromMediawikiXML(mediawikixml); + + public static void checkIndex(final File mediawikixml) { + final File idx = idxFromMediawikiXML(mediawikixml); if (idx.exists()) return; new indexMaker(mediawikixml).start(); } - + public static class indexMaker extends Thread { - + File mediawikixml; - public indexMaker(File mediawikixml) { + public indexMaker(final File mediawikixml) { this.mediawikixml = mediawikixml; } - + public void run() { try { createIndex(this.mediawikixml); @@ -296,24 +296,24 @@ public class MediawikiImporter extends Thread implements Importer { } } } - - public static File idxFromMediawikiXML(File mediawikixml) { + + public static File idxFromMediawikiXML(final File mediawikixml) { return new File(mediawikixml.getAbsolutePath() + ".idx.xml"); } - - public static void createIndex(File dumpFile) throws IOException { + + public static void createIndex(final File dumpFile) throws IOException { // calculate md5 //String md5 = serverCodings.encodeMD5Hex(dumpFile); - + // init reader, producer and consumer - PositionAwareReader in = new PositionAwareReader(dumpFile); - indexProducer producer = new indexProducer(100, idxFromMediawikiXML(dumpFile)); - wikiConsumer consumer = new wikiConsumer(100, producer); - ExecutorService service = Executors.newFixedThreadPool(2); - Future producerResult = service.submit(consumer); - Future consumerResult = service.submit(producer); + final PositionAwareReader in = new PositionAwareReader(dumpFile); + final indexProducer producer = new indexProducer(100, idxFromMediawikiXML(dumpFile)); + final wikiConsumer consumer = new wikiConsumer(100, producer); + final ExecutorService service = Executors.newFixedThreadPool(2); + final Future producerResult = service.submit(consumer); + final Future consumerResult = service.submit(producer); service.shutdown(); - + // read the wiki dump long start, stop; while (in.seek(pagestartb)) { @@ -324,18 +324,18 @@ public class MediawikiImporter extends Thread implements Importer { 
                     consumer.consume(new wikiraw(in.bytes(), start, stop));
                     in.resetBuffer();
                 }
-
+
                 // shut down the services
                 try {
                     consumer.consume(wikiConsumer.poison);
-                    try {consumerResult.get(5000, TimeUnit.MILLISECONDS);} catch (TimeoutException e) {}
+                    try {consumerResult.get(5000, TimeUnit.MILLISECONDS);} catch (final TimeoutException e) {}
                     producer.consume(indexProducer.poison);
                     if (!consumerResult.isDone()) consumerResult.get();
                     producerResult.get();
-                } catch (InterruptedException e) {
+                } catch (final InterruptedException e) {
                     Log.logException(e);
                     return;
-                } catch (ExecutionException e) {
+                } catch (final ExecutionException e) {
                     Log.logException(e);
                     return;
                 }
@@ -348,120 +348,120 @@ public class MediawikiImporter extends Thread implements Importer {
         PrintWriter out;
         protected static wikisourcerecord poison = new wikisourcerecord("", 0, 0);
         int count;
-
-        public indexProducer(int bufferCount, File indexFile) throws IOException {
-            entries = new ArrayBlockingQueue<wikisourcerecord>(bufferCount);
-            out = new PrintWriter(new BufferedWriter(new FileWriter(indexFile)));
-            count = 0;
-            out.println("<index>");
-
-        }
-
-        public void consume(wikisourcerecord b) {
+
+        public indexProducer(final int bufferCount, final File indexFile) throws IOException {
+            this.entries = new ArrayBlockingQueue<wikisourcerecord>(bufferCount);
+            this.out = new PrintWriter(new BufferedWriter(new FileWriter(indexFile)));
+            this.count = 0;
+            this.out.println("<index>");
+
+        }
+
+        public void consume(final wikisourcerecord b) {
             try {
-                entries.put(b);
-            } catch (InterruptedException e) {
+                this.entries.put(b);
+            } catch (final InterruptedException e) {
                 Log.logException(e);
             }
         }
-
+
         public Integer call() {
             wikisourcerecord r;
             try {
                 while(true) {
-                    r = entries.take();
+                    r = this.entries.take();
                     if (r == poison) {
                         Log.logInfo("WIKITRANSLATION", "producer / got poison");
                         break;
                     }
-                    out.println("  <page start=\"" + r.start + "\" length=\"" + (r.end - r.start) + "\">");
-                    out.println("    <title>" + r.title + "</title>");
-                    out.println("  </page>");
+                    this.out.println("  <page start=\"" + r.start + "\" length=\"" + (r.end - r.start) + "\">");
+                    this.out.println("    <title>" + r.title + "</title>");
+                    this.out.println("  </page>");
                     Log.logInfo("WIKITRANSLATION", "producer / record start: " + r.start + ", title : " + r.title);
-                    count++;
+                    this.count++;
                 }
-            } catch (InterruptedException e) {
+            } catch (final InterruptedException e) {
                 Log.logException(e);
             }
-            entries.clear();
-            out.println("</index>");
-            out.close();
-            return Integer.valueOf(count);
+            this.entries.clear();
+            this.out.println("</index>");
+            this.out.close();
+            return Integer.valueOf(this.count);
         }
-
+
     }
-
+
     private static class wikiConsumer implements Callable<Integer> {
 
         private final BlockingQueue<wikiraw> entries;
         protected static wikiraw poison = new wikiraw(new byte[0], 0, 0);
         private final indexProducer producer;
         private int count;
-
-        public wikiConsumer(int bufferCount, indexProducer producer) {
-            entries = new ArrayBlockingQueue<wikiraw>(bufferCount);
+
+        public wikiConsumer(final int bufferCount, final indexProducer producer) {
+            this.entries = new ArrayBlockingQueue<wikiraw>(bufferCount);
             this.producer = producer;
-            count = 0;
+            this.count = 0;
         }
-
-        public void consume(wikiraw b) {
+
+        public void consume(final wikiraw b) {
             try {
-                entries.put(b);
-            } catch (InterruptedException e) {
+                this.entries.put(b);
+            } catch (final InterruptedException e) {
                 Log.logException(e);
             }
         }
-
+
         public Integer call() {
             wikisourcerecord r;
             wikiraw c;
             try {
                 while(true) {
-                    c = entries.take();
+                    c = this.entries.take();
                     if (c == poison) {
                         Log.logInfo("WIKITRANSLATION", "consumer / got poison");
                         break;
                     }
                     try {
                         r = new wikisourcerecord(c.b, c.start, c.end);
-                        producer.consume(r);
+                        this.producer.consume(r);
                         Log.logInfo("WIKITRANSLATION", "consumer / record start: " + r.start + ", title : " + r.title);
-                        count++;
-                    } catch (RuntimeException e) {}
+                        this.count++;
+                    } catch (final RuntimeException e) {}
                 }
-            } catch (InterruptedException e) {
+            } catch (final InterruptedException e) {
                 Log.logException(e);
             }
-            entries.clear();
-            return Integer.valueOf(count);
+            this.entries.clear();
+            return Integer.valueOf(this.count);
         }
-
+
     }
 
     private static class wikiraw {
         public long start, end;
         public byte[] b;
-        public wikiraw(byte[] b, long start, long end) {
+        public wikiraw(final byte[] b, final long start, final long end) {
             this.b = b;
             this.start = start;
             this.end = end;
         }
     }
-
+
     public static class wikisourcerecord {
         public long start, end;
         public String title;
-        public wikisourcerecord(String title, long start, long end) {
+        public wikisourcerecord(final String title, final long start, final long end) {
             this.title = title;
             this.start = start;
             this.end = end;
         }
-        public wikisourcerecord(byte[] chunk, long start, long end) {
+        public wikisourcerecord(final byte[] chunk, final long start, final long end) {
             String s;
             s = UTF8.String(chunk);
-            int t0 = s.indexOf("<title>");
+            final int t0 = s.indexOf("<title>");
             if (t0 >= 0) {
-                int t1 = s.indexOf("</title>", t0);
+                final int t1 = s.indexOf("</title>", t0);
                 if (t1 >= 0) {
                     this.title = s.substring(t0 + 7, t1);
                 } else {
@@ -470,7 +470,7 @@ public class MediawikiImporter extends Thread implements Importer {
             } else {
                 throw new RuntimeException("no title start in record");
             }
-
+
             this.start = start;
             this.end = end;
         }
@@ -478,16 +478,16 @@ public class MediawikiImporter extends Thread implements Importer {
     public wikiparserrecord newRecord() {
         return new wikiparserrecord(null, null, null, null);
     }
-    public wikiparserrecord newRecord(String hostport, String urlStub, String title, StringBuilder sb) {
+    public wikiparserrecord newRecord(final String hostport, final String urlStub, final String title, final StringBuilder sb) {
         return new wikiparserrecord(hostport, urlStub, title, sb);
     }
-
+
     public class wikiparserrecord {
         public String title;
         String source, html, hostport, urlStub;
         DigestURI url;
         Document document;
-        public wikiparserrecord(String hostport, String urlStub, String title, StringBuilder sb) {
+        public wikiparserrecord(final String hostport, final String urlStub, final String title, final StringBuilder sb) {
             this.title = title;
             this.hostport = hostport;
             this.urlStub = urlStub;
@@ -495,97 +495,97 @@ public class MediawikiImporter extends Thread implements Importer {
         }
         public void genHTML() throws IOException {
             try {
-                WikiParser wparser = new WikiCode();
-                html = wparser.transform(hostport, source);
-            } catch (Exception e) {
+                final WikiParser wparser = new WikiCode();
+                this.html = wparser.transform(this.hostport, this.source);
+            } catch (final Exception e) {
                 Log.logException(e);
                 throw new IOException(e.getMessage());
             }
         }
         public void genDocument() throws Parser.Failure {
             try {
-                url = new DigestURI(urlStub + title);
-                Document[] parsed = TextParser.parseSource(url, "text/html", "UTF-8", UTF8.getBytes(html));
-                document = Document.mergeDocuments(url, "text/html", parsed);
+                this.url = new DigestURI(this.urlStub + this.title);
+                final Document[] parsed = TextParser.parseSource(this.url, "text/html", "UTF-8", UTF8.getBytes(this.html), false);
+                this.document = Document.mergeDocuments(this.url, "text/html", parsed);
                 // the wiki parser is not able to find the proper title in the source text, so it must be set here
-                document.setTitle(title);
-            } catch (MalformedURLException e1) {
+                this.document.setTitle(this.title);
+            } catch (final MalformedURLException e1) {
                 Log.logException(e1);
             }
         }
-        public void writeXML(OutputStreamWriter os) throws IOException {
-            document.writeXML(os, new Date());
+        public void writeXML(final OutputStreamWriter os) throws IOException {
+            this.document.writeXML(os, new Date());
         }
     }
-
+
     private static class PositionAwareReader {
-
+
         private final InputStream is;
         private long seekpos;
        private ByteBuffer bb;
-
-        public PositionAwareReader(File dumpFile) throws FileNotFoundException {
+
+        public PositionAwareReader(final File dumpFile) throws FileNotFoundException {
            this.is = new BufferedInputStream(new FileInputStream(dumpFile), 64 *1024);
            this.seekpos = 0;
            this.bb = new ByteBuffer();
        }
-
+
        public void resetBuffer() {
-            if (bb.length() > 10 * 1024) bb = new ByteBuffer(); else bb.clear();
+            if (this.bb.length() > 10 * 1024) this.bb = new ByteBuffer(); else this.bb.clear();
        }
-
-        public boolean seek(byte[] pattern) throws IOException {
+
+        public boolean seek(final byte[] pattern) throws IOException {
            int pp = 0;
            int c;
-            while ((c = is.read()) >= 0) {
-                seekpos++;
-                bb.append(c);
+            while ((c = this.is.read()) >= 0) {
+                this.seekpos++;
+                this.bb.append(c);
                if (pattern[pp] == c) pp++; else pp = 0;
                if (pp == pattern.length) return true;
            }
            return false;
        }
-
+
        public long pos() {
-            return seekpos;
+            return this.seekpos;
        }
-
+
        public byte[] bytes() {
-            return bb.getBytes();
+            return this.bb.getBytes();
        }
-
+
        public void close() {
            try {
-                is.close();
-            } catch (IOException e) {
+                this.is.close();
+            } catch (final IOException e) {
                Log.logException(e);
            }
        }
    }
 
-    public static byte[] read(File f, long start, int len) {
-        byte[] b = new byte[len];
+    public static byte[] read(final File f, final long start, final int len) {
+        final byte[] b = new byte[len];
         RandomAccessFile raf = null;
         try {
             raf = new RandomAccessFile(f, "r");
             raf.seek(start);
             raf.read(b);
-        } catch (IOException e) {
+        } catch (final IOException e) {
             Log.logException(e);
             return null;
         } finally {
             if (raf != null) try {
                 raf.close();
-                try{raf.getChannel().close();} catch (IOException e) {}
-            } catch (IOException e) { }
+                try{raf.getChannel().close();} catch (final IOException e) {}
+            } catch (final IOException e) { }
         }
         return b;
     }
-
-    public static wikisourcerecord find(String title, File f) throws IOException {
-        PositionAwareReader in = new PositionAwareReader(f);
+
+    public static wikisourcerecord find(final String title, final File f) throws IOException {
+        final PositionAwareReader in = new PositionAwareReader(f);
         long start;
-        String m = "<title>" + title + "</title>";
+        final String m = "<title>" + title + "</title>";
         String s;
         while (in.seek(UTF8.getBytes(" { private final BlockingQueue in;
@@ -666,12 +666,12 @@ public class MediawikiImporter extends Thread implements Importer {
         private final File targetdir;
         private int fc, rc;
         private String outputfilename;
-
+
         public convertWriter(
-                BlockingQueue<wikiparserrecord> in,
-                wikiparserrecord poison,
-                File targetdir,
-                String targetstub) {
+                final BlockingQueue<wikiparserrecord> in,
+                final wikiparserrecord poison,
+                final File targetdir,
+                final String targetstub) {
             this.poison = poison;
             this.in = in;
             this.osw = null;
@@ -681,63 +681,63 @@ public class MediawikiImporter extends Thread implements Importer {
             this.rc = 0;
             this.outputfilename = null;
         }
-
+
         public Integer call() {
             wikiparserrecord record;
             try {
                 while(true) {
-                    record = in.take();
-                    if (record == poison) {
+                    record = this.in.take();
+                    if (record == this.poison) {
                         Log.logInfo("WIKITRANSLATION", "convertConsumer / got poison");
                         break;
                     }
-
-                    if (osw == null) {
+
+                    if (this.osw == null) {
                         // start writing a new file
-                        this.outputfilename = targetstub + "." + fc + ".xml.prt";
-                        this.osw = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(new File(targetdir, outputfilename))), "UTF-8");
-                        osw.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" + SurrogateReader.SURROGATES_MAIN_ELEMENT_OPEN + "\n");
+                        this.outputfilename = this.targetstub + "." + this.fc + ".xml.prt";
+                        this.osw = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(new File(this.targetdir, this.outputfilename))), "UTF-8");
+                        this.osw.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" + SurrogateReader.SURROGATES_MAIN_ELEMENT_OPEN + "\n");
                     }
                     Log.logInfo("WIKITRANSLATION", "[CONSUME] Title: " + record.title);
-                    record.document.writeXML(osw, new Date());
-                    rc++;
-                    if (rc >= 10000) {
-                        osw.write("</surrogates>\n");
-                        osw.close();
-                        String finalfilename = targetstub + "." + fc + ".xml";
-                        new File(targetdir, outputfilename).renameTo(new File(targetdir, finalfilename));
-                        rc = 0;
-                        fc++;
-                        outputfilename = targetstub + "." + fc + ".xml.prt";
-                        osw = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(new File(targetdir, outputfilename))), "UTF-8");
-                        osw.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" + SurrogateReader.SURROGATES_MAIN_ELEMENT_OPEN + "\n");
+                    record.document.writeXML(this.osw, new Date());
+                    this.rc++;
+                    if (this.rc >= 10000) {
+                        this.osw.write("</surrogates>\n");
+                        this.osw.close();
+                        final String finalfilename = this.targetstub + "." + this.fc + ".xml";
+                        new File(this.targetdir, this.outputfilename).renameTo(new File(this.targetdir, finalfilename));
+                        this.rc = 0;
+                        this.fc++;
+                        this.outputfilename = this.targetstub + "." + this.fc + ".xml.prt";
+                        this.osw = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(new File(this.targetdir, this.outputfilename))), "UTF-8");
+                        this.osw.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" + SurrogateReader.SURROGATES_MAIN_ELEMENT_OPEN + "\n");
                     }
                 }
-            } catch (InterruptedException e) {
+            } catch (final InterruptedException e) {
                 Log.logException(e);
-            } catch (UnsupportedEncodingException e) {
+            } catch (final UnsupportedEncodingException e) {
                 Log.logException(e);
-            } catch (FileNotFoundException e) {
+            } catch (final FileNotFoundException e) {
                 Log.logException(e);
-            } catch (IOException e) {
+            } catch (final IOException e) {
                 Log.logException(e);
             } finally {
                 try {
-                    osw.write(SurrogateReader.SURROGATES_MAIN_ELEMENT_CLOSE + "\n");
-                    osw.close();
-                    String finalfilename = targetstub + "." + fc + ".xml";
-                    new File(targetdir, outputfilename).renameTo(new File(targetdir, finalfilename));
-                } catch (IOException e) {
+                    this.osw.write(SurrogateReader.SURROGATES_MAIN_ELEMENT_CLOSE + "\n");
+                    this.osw.close();
+                    final String finalfilename = this.targetstub + "." + this.fc + ".xml";
+                    new File(this.targetdir, this.outputfilename).renameTo(new File(this.targetdir, finalfilename));
+                } catch (final IOException e) {
                     Log.logException(e);
                 }
             }
             Log.logInfo("WIKITRANSLATION", "*** convertWriter has terminated");
             return Integer.valueOf(0);
         }
-
+
     }
-
-    public static void main(String[] s) {
+
+    public static void main(final String[] s) {
         if (s.length == 0) {
             Log.logInfo("WIKITRANSLATION", "usage:");
             Log.logInfo("WIKITRANSLATION", " -index <wikipedia-dump>");
@@ -751,47 +751,47 @@ public class MediawikiImporter extends Thread implements Importer {
         // java -Xmx2000m -cp classes:lib/bzip2.jar de.anomic.tools.mediawikiIndex -convert DATA/HTCACHE/dewiki-20090311-pages-articles.xml.bz2 DATA/SURROGATES/in/ http://de.wikipedia.org/wiki/
 
         if (s[0].equals("-convert") && s.length > 2) {
-            File sourcefile = new File(s[1]);
-            File targetdir = new File(s[2]);
+            final File sourcefile = new File(s[1]);
+            final File targetdir = new File(s[2]);
             //String urlStub = s[3]; // i.e. http://de.wikipedia.org/wiki/
             //String language = urlStub.substring(7,9);
             try {
-                MediawikiImporter mi = new MediawikiImporter(sourcefile, targetdir);
+                final MediawikiImporter mi = new MediawikiImporter(sourcefile, targetdir);
                 mi.start();
                 mi.join();
-            } catch (InterruptedException e) {
+            } catch (final InterruptedException e) {
                 Log.logException(e);
             }
         }
-
-        if (s[0].equals("-index")) {
+
+        if (s[0].equals("-index")) {
             try {
                 createIndex(new File(s[1]));
-            } catch (IOException e) {
+            } catch (final IOException e) {
                 Log.logException(e);
             }
         }
-
+
         if (s[0].equals("-read")) {
-            long start = Integer.parseInt(s[1]);
-            int  len   = Integer.parseInt(s[2]);
+            final long start = Integer.parseInt(s[1]);
+            final int  len   = Integer.parseInt(s[2]);
             System.out.println(UTF8.String(read(new File(s[3]), start, len)));
         }
-
+
         if (s[0].equals("-find")) {
             try {
-                wikisourcerecord w = find(s[1], new File(s[2] + ".idx.xml"));
+                final wikisourcerecord w = find(s[1], new File(s[2] + ".idx.xml"));
                 if (w == null) {
                     Log.logInfo("WIKITRANSLATION", "not found");
                 } else {
                     System.out.println(UTF8.String(read(new File(s[2]), w.start, (int) (w.end - w.start))));
                 }
-            } catch (IOException e) {
+            } catch (final IOException e) {
                 Log.logException(e);
             }
-
+
         }
         System.exit(0);
     }
-
+
 }
diff --git a/source/net/yacy/document/parser/bzipParser.java b/source/net/yacy/document/parser/bzipParser.java
index 7e010ff33..e2dba1b6c 100644
--- a/source/net/yacy/document/parser/bzipParser.java
+++ b/source/net/yacy/document/parser/bzipParser.java
@@ -1,4 +1,4 @@
-//bzipParser.java
+//bzipParser.java
 //------------------------
 //part of YaCy
 //(C) by Michael Peter Christen; mc@yacy.net
@@ -42,26 +42,26 @@ import org.apache.tools.bzip2.CBZip2InputStream;
 
 public class bzipParser extends AbstractParser implements Parser {
-
-    public bzipParser() {
+
+    public bzipParser() {
         super("Bzip 2 UNIX Compressed File Parser");
-        SUPPORTED_EXTENSIONS.add("bz2");
-        SUPPORTED_EXTENSIONS.add("tbz");
-        SUPPORTED_EXTENSIONS.add("tbz2");
-        SUPPORTED_MIME_TYPES.add("application/x-bzip2");
-        SUPPORTED_MIME_TYPES.add("application/bzip2");
-        SUPPORTED_MIME_TYPES.add("application/x-bz2");
-        SUPPORTED_MIME_TYPES.add("application/x-bzip");
-        SUPPORTED_MIME_TYPES.add("application/x-stuffit");
+        this.SUPPORTED_EXTENSIONS.add("bz2");
+        this.SUPPORTED_EXTENSIONS.add("tbz");
+        this.SUPPORTED_EXTENSIONS.add("tbz2");
+        this.SUPPORTED_MIME_TYPES.add("application/x-bzip2");
+        this.SUPPORTED_MIME_TYPES.add("application/bzip2");
+        this.SUPPORTED_MIME_TYPES.add("application/x-bz2");
+        this.SUPPORTED_MIME_TYPES.add("application/x-bzip");
+        this.SUPPORTED_MIME_TYPES.add("application/x-stuffit");
     }
-
+
     public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
-
+
         File tempFile = null;
         Document[] docs;
-        try {
+        try {
             /*
              * First we have to consume the first two char from the stream. Otherwise
              * the bzip decompression will fail with a nullpointerException!
@@ -73,31 +73,31 @@ public class bzipParser extends AbstractParser implements Parser {
             b = source.read();
             if (b != 'Z') {
                 throw new Exception("Invalid bz2 content.");
-            }
-
+            }
+
             int read = 0;
             final byte[] data = new byte[1024];
-            final CBZip2InputStream zippedContent = new CBZip2InputStream(source);
-
+            final CBZip2InputStream zippedContent = new CBZip2InputStream(source);
+
             tempFile = File.createTempFile("bunzip","tmp");
             tempFile.deleteOnExit();
-
+
             // creating a temp file to store the uncompressed data
             final FileOutputStream out = new FileOutputStream(tempFile);
-
+
             // reading gzip file and store it uncompressed
             while((read = zippedContent.read(data, 0, 1024)) != -1) {
                 out.write(data, 0, read);
             }
             zippedContent.close();
             out.close();
-
+
             // creating a new parser class to parse the unzipped content
-            docs = TextParser.parseSource(location, null, null, tempFile);
-        } catch (final Exception e) {
+            docs = TextParser.parseSource(location, null, null, tempFile, false);
+        } catch (final Exception e) {
             if (e instanceof InterruptedException) throw (InterruptedException) e;
             if (e instanceof Parser.Failure) throw (Parser.Failure) e;
-
+
             throw new Parser.Failure("Unexpected error while parsing bzip file. " + e.getMessage(),location);
         } finally {
             if (tempFile != null) FileUtils.deletedelete(tempFile);
diff --git a/source/net/yacy/document/parser/gzipParser.java b/source/net/yacy/document/parser/gzipParser.java
index f3452b6c3..0680b9e22 100644
--- a/source/net/yacy/document/parser/gzipParser.java
+++ b/source/net/yacy/document/parser/gzipParser.java
@@ -1,4 +1,4 @@
-//gzipParser.java
+//gzipParser.java
 //------------------------
 //part of YaCy
 //(C) by Michael Peter Christen; mc@yacy.net
@@ -42,52 +42,52 @@ import net.yacy.kelondro.util.FileUtils;
 
 public class gzipParser extends AbstractParser implements Parser {
 
-    public gzipParser() {
+    public gzipParser() {
         super("GNU Zip Compressed Archive Parser");
-        SUPPORTED_EXTENSIONS.add("gz");
-        SUPPORTED_EXTENSIONS.add("tgz");
-        SUPPORTED_MIME_TYPES.add("application/x-gzip");
-        SUPPORTED_MIME_TYPES.add("application/gzip");
-        SUPPORTED_MIME_TYPES.add("application/x-gunzip");
-        SUPPORTED_MIME_TYPES.add("application/gzipped");
-        SUPPORTED_MIME_TYPES.add("application/gzip-compressed");
-        SUPPORTED_MIME_TYPES.add("gzip/document");
+        this.SUPPORTED_EXTENSIONS.add("gz");
+        this.SUPPORTED_EXTENSIONS.add("tgz");
+        this.SUPPORTED_MIME_TYPES.add("application/x-gzip");
+        this.SUPPORTED_MIME_TYPES.add("application/gzip");
+        this.SUPPORTED_MIME_TYPES.add("application/x-gunzip");
+        this.SUPPORTED_MIME_TYPES.add("application/gzipped");
+        this.SUPPORTED_MIME_TYPES.add("application/gzip-compressed");
+        this.SUPPORTED_MIME_TYPES.add("gzip/document");
     }
-
+
     public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
-
+
         File tempFile = null;
         Document[] docs = null;
-        try {
+        try {
             int read = 0;
             final byte[] data = new byte[1024];
-
+
             final GZIPInputStream zippedContent = new GZIPInputStream(source);
-
+
             tempFile = File.createTempFile("gunzip","tmp");
             tempFile.deleteOnExit();
-
+
             // creating a temp file to store the uncompressed data
             final FileOutputStream out = new FileOutputStream(tempFile);
-
+
             // reading gzip file and store it uncompressed
             while ((read = zippedContent.read(data, 0, 1024)) != -1) {
                 out.write(data, 0, read);
             }
             zippedContent.close();
             out.close();
-
+
             // creating a new parser class to parse the unzipped content
-            docs = TextParser.parseSource(location,null,null,tempFile);
-        } catch (final Exception e) {
+            docs = TextParser.parseSource(location,null,null,tempFile, false);
+        } catch (final Exception e) {
             if (e instanceof InterruptedException) throw (InterruptedException) e;
             if (e instanceof Parser.Failure) throw (Parser.Failure) e;
-
-            throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(),location);
+
+            throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(),location);
         } finally {
             if (tempFile != null) FileUtils.deletedelete(tempFile);
         }
         return docs;
     }
-
+
 }
diff --git a/source/net/yacy/document/parser/htmlParser.java b/source/net/yacy/document/parser/htmlParser.java
index f248ad99d..69c9f078d 100644
--- a/source/net/yacy/document/parser/htmlParser.java
+++ b/source/net/yacy/document/parser/htmlParser.java
@@ -32,20 +32,15 @@ import java.net.MalformedURLException;
 import java.nio.charset.Charset;
 import java.nio.charset.IllegalCharsetNameException;
 import java.nio.charset.UnsupportedCharsetException;
-import java.util.ArrayList;
-import java.util.Map;
-import java.util.Map.Entry;
 import java.util.regex.Pattern;
 
 import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.document.AbstractParser;
-import net.yacy.document.Classification;
 import net.yacy.document.Document;
 import net.yacy.document.Parser;
 import net.yacy.document.parser.html.CharacterCoding;
 import net.yacy.document.parser.html.ContentScraper;
-import net.yacy.document.parser.html.ImageEntry;
 import net.yacy.document.parser.html.ScraperInputStream;
 import net.yacy.document.parser.html.TransformerWriter;
 import net.yacy.kelondro.util.FileUtils;
@@ -96,78 +91,14 @@ public class htmlParser extends AbstractParser implements Parser {
 
         try {
             // first get a document from the parsed html
-            ContentScraper scraper = parseToScraper(location, documentCharset, sourceStream);
-            Document document = transformScraper(location, mimeType, documentCharset, scraper);
-
-            // then produce virtual documents for each of the link that is contained in the document!
-            ArrayList<Document> docs = new ArrayList<Document>();
-            docs.add(document);
-            for (Map.Entry<MultiProtocolURI, String> link: document.getApplinks().entrySet()) {
-                addLinkDocs(docs, "application", link.getKey(), link.getValue(), scraper);
-            }
-            for (Map.Entry<MultiProtocolURI, String> link: document.getAudiolinks().entrySet()) {
-                addLinkDocs(docs, "audio", link.getKey(), link.getValue(), scraper);
-            }
-            for (Map.Entry<MultiProtocolURI, String> link: document.getVideolinks().entrySet()) {
-                addLinkDocs(docs, "video", link.getKey(), link.getValue(), scraper);
-            }
-            for (Entry<MultiProtocolURI, ImageEntry> link: document.getImages().entrySet()) {
-                addImageDocs(docs, link.getValue());
-            }
-
-
-            // finally return the list of documents
-            return docs.toArray(new Document[docs.size()]);
+            final ContentScraper scraper = parseToScraper(location, documentCharset, sourceStream);
+            final Document document = transformScraper(location, mimeType, documentCharset, scraper);
+
+            return new Document[]{document};
         } catch (final IOException e) {
             throw new Parser.Failure("IOException in htmlParser: " + e.getMessage(), location);
         }
     }
-
-    private final static void addLinkDocs(ArrayList<Document> docs, String type, MultiProtocolURI uri, String descr, ContentScraper scraper) {
-        //System.out.println("HTMLPARSER-LINK " + type + ": " + uri.toNormalform(true, false) + " / " + descr);
-        final Document doc = new Document(
-                uri,
-                Classification.ext2mime(uri.getFileExtension()),
-                "UTF-8",
-                null,
-                scraper.getContentLanguages(),
-                null,
-                descr,
-                "",
-                "",
-                new String[]{descr},
-                type,
-                0.0f, 0.0f,
-                uri.toNormalform(false, false),
-                null,
-                null,
-                null,
-                false);
-        docs.add(doc);
-    }
-
-    private final static void addImageDocs(ArrayList<Document> docs, ImageEntry img) {
-        //System.out.println("HTMLPARSER-LINK image: " + img.url().toNormalform(true, false) + " / " + img.alt());
-        final Document doc = new Document(
-                img.url(),
-                Classification.ext2mime(img.url().getFileExtension()),
-                "UTF-8",
-                null,
-                null,
-                null,
-                img.alt(),
-                "",
-                "",
-                new String[]{img.alt()},
-                "image",
-                0.0f, 0.0f,
-                img.url().toNormalform(false, false),
-                null,
-                null,
-                null,
-                false);
-        docs.add(doc);
-    }
 
     /**
      * the transformScraper method transforms a scraper object into a document object
@@ -211,7 +142,7 @@ public class htmlParser extends AbstractParser implements Parser {
             scraper.indexingDenied());
         //scraper.close();
         ppd.setFavicon(scraper.getFavicon());
-
+
         return ppd;
     }
diff --git a/source/net/yacy/document/parser/sevenzipParser.java b/source/net/yacy/document/parser/sevenzipParser.java
index e4d0d1d2c..d42c625c9 100644
--- a/source/net/yacy/document/parser/sevenzipParser.java
+++ b/source/net/yacy/document/parser/sevenzipParser.java
@@ -1,10 +1,10 @@
-// sevenzipParser.java
+// sevenzipParser.java
 // -------------------------------------
 // part of YACY
 // (C) by Michael Peter Christen; mc@yacy.net
 // first published on http://www.anomic.de
 // Frankfurt, Germany, 2004
-//
+//
 // This file ist contributed by Franz Brausze
 //
 // $LastChangedDate$
@@ -15,12 +15,12 @@
 // it under the terms of the GNU General Public License as published by
 // the Free Software Foundation; either version 2 of the License, or
 // (at your option) any later version.
-//
+//
 // This program is distributed in the hope that it will be useful,
 // but WITHOUT ANY WARRANTY; without even the implied warranty of
 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 // GNU General Public License for more details.
-//
+//
 // You should have received a copy of the GNU General Public License
 // along with this program; if not, write to the Free Software
 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
@@ -40,7 +40,6 @@ import net.yacy.document.Parser;
 import net.yacy.document.TextParser;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.util.FileUtils;
-
 import SevenZip.ArchiveExtractCallback;
 import SevenZip.IInStream;
 import SevenZip.Archive.IInArchive;
@@ -48,13 +47,13 @@ import SevenZip.Archive.SevenZipEntry;
 import SevenZip.Archive.SevenZip.Handler;
 
 public class sevenzipParser extends AbstractParser implements Parser {
-
+
     public sevenzipParser() {
         super("7zip Archive Parser");
-        SUPPORTED_EXTENSIONS.add("7z");
-        SUPPORTED_MIME_TYPES.add("application/x-7z-compressed");
+        this.SUPPORTED_EXTENSIONS.add("7z");
+        this.SUPPORTED_MIME_TYPES.add("application/x-7z-compressed");
     }
-
+
     public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final IInStream source) throws Parser.Failure, InterruptedException {
         final Document doc = new Document(
                 location,
@@ -68,7 +67,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
                 null,
                 null,
                 null,
-                0.0f, 0.0f,
+                0.0f, 0.0f,
                 (Object)null,
                 null,
                 null,
@@ -86,7 +85,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
         super.log.logFine("processing archive contents...");
         try {
             archive.Extract(null, -1, 0, aec);
-            return doc;
+            return doc;
         } catch (final IOException e) {
             if (e.getCause() instanceof InterruptedException)
                 throw (InterruptedException)e.getCause();
@@ -99,7 +98,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
             try { archive.close(); } catch (final IOException e) { }
         }
     }
-
+
     public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
         try {
@@ -114,12 +113,12 @@ public class sevenzipParser extends AbstractParser implements Parser {
     // wrapper class to redirect output of standard ArchiveExtractCallback to serverLog
     // and parse the extracted content
     private static class SZParserExtractCallback extends ArchiveExtractCallback {
-
+
         private final Log log;
         private ByteArrayOutputStream cfos = null;
         private final Document doc;
         private final String prefix;
-
+
         public SZParserExtractCallback(final Log logger, final IInArchive handler,
                 final Document doc, final String prefix) {
             super.Init(handler);
@@ -127,7 +126,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
             this.doc = doc;
             this.prefix = prefix;
         }
-
+
         @Override
         public void PrepareOperation(final int arg0) {
             this.extractMode = (arg0 == IInArchive.NExtract_NAskMode_kExtract);
@@ -143,7 +142,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
                 break;
             }
         }
-
+
         @Override
         public void SetOperationResult(final int arg0) throws IOException {
             if (arg0 != IInArchive.NExtract_NOperationResult_kOK) {
@@ -159,16 +158,16 @@ public class sevenzipParser extends AbstractParser implements Parser {
                     // throw new IOException("Unknown Error");
                 }
             } else try {
-
+
                 if (this.cfos != null) {
                     // parse the file
                     Document[] theDocs;
                     // workaround for relative links in file, normally '#' shall be used behind the location, see
                     // below for reversion of the effects
-                    final MultiProtocolURI url = MultiProtocolURI.newURL(doc.dc_source(), this.prefix + "/" + super.filePath);
+                    final MultiProtocolURI url = MultiProtocolURI.newURL(this.doc.dc_source(), this.prefix + "/" + super.filePath);
                     final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1));
-                    theDocs = TextParser.parseSource(url, mime, null, this.cfos.toByteArray());
-
+                    theDocs = TextParser.parseSource(url, mime, null, this.cfos.toByteArray(), false);
+
                     this.doc.addSubDocuments(theDocs);
                 }
             } catch (final Exception e) {
@@ -177,7 +176,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
                 throw ex;
             }
         }
-
+
         @Override
         public OutputStream GetStream(final int index, final int askExtractMode) throws IOException {
             final SevenZipEntry item = super.archiveHandler.getEntry(index);
@@ -185,10 +184,10 @@ public class sevenzipParser extends AbstractParser implements Parser {
             this.cfos = (item.isDirectory()) ? null : new ByteArrayOutputStream();
             return this.cfos;
         }
-
+
         public String getCurrentFilePath() {
             return super.filePath;
         }
     }
-
+
 }
diff --git a/source/net/yacy/document/parser/tarParser.java b/source/net/yacy/document/parser/tarParser.java
index 40d3cdd29..de59b926c 100644
--- a/source/net/yacy/document/parser/tarParser.java
+++ b/source/net/yacy/document/parser/tarParser.java
@@ -11,12 +11,12 @@
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
- *
+ *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  * Lesser General Public License for more details.
- *
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program in the file lgpl21.txt
 * If not, see <http://www.gnu.org/licenses/>.
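The tar hunk below repeats the pattern that runs through this whole patch: every parseSource call made recursively from inside another parser now passes an extra trailing false, while ParserTest at the end passes true. The call sites suggest that this boolean controls whether embedded image/audio/video links are expanded into virtual documents of their own, the behavior that the htmlParser hunk above removes from its hard-wired place. TextParser's own declaration is not part of this patch, so the following is only a toy sketch of the inferred contract; the class, the Doc type, and the parameter name embedLinksAsDocuments are illustrative stand-ins, not YaCy code.

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

// Toy model of the inferred parseSource contract. All names are stand-ins.
public class EmbedLinksSketch {

    // minimal stand-in for net.yacy.document.Document
    static final class Doc {
        final String url;
        final String type;
        final Map<String, String> mediaLinks = new LinkedHashMap<String, String>(); // url -> description
        Doc(final String url, final String type) { this.url = url; this.type = type; }
    }

    // mirrors the new trailing boolean: with false (archive parsers, recursive
    // calls) only the parsed document itself is returned; with true one extra
    // virtual document is emitted per embedded media link, as the removed
    // htmlParser block used to do unconditionally.
    static Doc[] parseSource(final String url, final boolean embedLinksAsDocuments) {
        final Doc document = new Doc(url, "text/html");
        document.mediaLinks.put(url + "/logo.png", "logo"); // pretend the parser found this link
        if (!embedLinksAsDocuments) return new Doc[] { document };
        final List<Doc> docs = new ArrayList<Doc>();
        docs.add(document);
        for (final Map.Entry<String, String> link : document.mediaLinks.entrySet()) {
            docs.add(new Doc(link.getKey(), "image")); // virtual document for the embedded link
        }
        return docs.toArray(new Doc[docs.size()]);
    }

    public static void main(final String[] args) {
        System.out.println(parseSource("http://example.org", false).length); // 1
        System.out.println(parseSource("http://example.org", true).length);  // 2
    }
}

Keeping the flag false on recursive calls means an archive entry's links never fan out into stub documents of their own; only the outermost caller, which knows the crawl configuration, switches the expansion on.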
@@ -45,31 +45,31 @@ import org.apache.tools.tar.TarInputStream;
 
 public class tarParser extends AbstractParser implements Parser {
 
-    public tarParser() {
-        super("Tape Archive File Parser");
-        SUPPORTED_EXTENSIONS.add("tar");
-        SUPPORTED_MIME_TYPES.add("application/x-tar");
-        SUPPORTED_MIME_TYPES.add("application/tar");
-        SUPPORTED_MIME_TYPES.add("applicaton/x-gtar");
-        SUPPORTED_MIME_TYPES.add("multipart/x-tar");
+    public tarParser() {
+        super("Tape Archive File Parser");
+        this.SUPPORTED_EXTENSIONS.add("tar");
+        this.SUPPORTED_MIME_TYPES.add("application/x-tar");
+        this.SUPPORTED_MIME_TYPES.add("application/tar");
+        this.SUPPORTED_MIME_TYPES.add("applicaton/x-gtar");
+        this.SUPPORTED_MIME_TYPES.add("multipart/x-tar");
     }
-
+
     public Document[] parse(final MultiProtocolURI url, final String mimeType, final String charset, InputStream source) throws Parser.Failure, InterruptedException {
-
+
         final List<Document> docacc = new ArrayList<Document>();
         Document[] subDocs = null;
         final String ext = url.getFileExtension().toLowerCase();
         if (ext.equals("gz") || ext.equals("tgz")) {
             try {
                 source = new GZIPInputStream(source);
-            } catch (IOException e) {
+            } catch (final IOException e) {
                 throw new Parser.Failure("tar parser: " + e.getMessage(), url);
             }
         }
         TarEntry entry;
-        final TarInputStream tis = new TarInputStream(source);
+        final TarInputStream tis = new TarInputStream(source);
         File tmp = null;
-
+
         // loop through the elements in the tar file and parse every single file inside
         while (true) {
             try {
@@ -83,16 +83,16 @@ public class tarParser extends AbstractParser implements Parser {
                 try {
                     tmp = FileUtils.createTempFile(this.getClass(), name);
                     FileUtils.copy(tis, tmp, entry.getSize());
-                    subDocs = TextParser.parseSource(MultiProtocolURI.newURL(url,"#" + name), mime, null, tmp);
+                    subDocs = TextParser.parseSource(MultiProtocolURI.newURL(url,"#" + name), mime, null, tmp, false);
                     if (subDocs == null) continue;
                     for (final Document d: subDocs) docacc.add(d);
                 } catch (final Parser.Failure e) {
-                    log.logWarning("tar parser entry " + name + ": " + e.getMessage());
+                    this.log.logWarning("tar parser entry " + name + ": " + e.getMessage());
                 } finally {
                     if (tmp != null) FileUtils.deletedelete(tmp);
                 }
-            } catch (IOException e) {
-                log.logWarning("tar parser:" + e.getMessage());
+            } catch (final IOException e) {
+                this.log.logWarning("tar parser:" + e.getMessage());
                 break;
             }
         }
diff --git a/source/net/yacy/document/parser/zipParser.java b/source/net/yacy/document/parser/zipParser.java
index df1d4875c..b216fe099 100644
--- a/source/net/yacy/document/parser/zipParser.java
+++ b/source/net/yacy/document/parser/zipParser.java
@@ -87,7 +87,7 @@ public class zipParser extends AbstractParser implements Parser {
                     FileUtils.copy(zis, tmp, entry.getSize());
                     final MultiProtocolURI virtualURL = MultiProtocolURI.newURL(url, "#" + name);
                     //this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false));
-                    docs = TextParser.parseSource(virtualURL, mime, null, tmp);
+                    docs = TextParser.parseSource(virtualURL, mime, null, tmp, false);
                     if (docs == null) continue;
                     for (final Document d: docs) docacc.add(d);
                 } catch (final Parser.Failure e) {
diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java
index 330579aad..8bcff3335 100644
--- a/source/net/yacy/repository/LoaderDispatcher.java
+++ b/source/net/yacy/repository/LoaderDispatcher.java
@@ -367,7 +367,7 @@ public final class LoaderDispatcher {
         final String supportError = TextParser.supports(url, responseHeader.mime());
         if (supportError != null) throw new IOException("no parser support: " + supportError);
         try {
-            documents = TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), response.getContent());
+            documents = TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), response.getContent(), false);
             if (documents == null) throw new IOException("document == null");
         } catch (final Exception e) {
             throw new IOException("parser error: " + e.getMessage());
diff --git a/test/de/anomic/document/ParserTest.java b/test/de/anomic/document/ParserTest.java
index 61901fcb6..b26529b53 100644
--- a/test/de/anomic/document/ParserTest.java
+++ b/test/de/anomic/document/ParserTest.java
@@ -1,29 +1,30 @@
 package de.anomic.document;
 
-import static org.junit.Assert.*;
-import static org.junit.matchers.JUnitMatchers.*;
-import net.yacy.document.Document;
-import net.yacy.document.Parser;
-import net.yacy.document.TextParser;
-import net.yacy.kelondro.data.meta.DigestURI;
-
-import org.junit.Test;
+import static org.junit.Assert.assertThat;
+import static org.junit.matchers.JUnitMatchers.containsString;
 
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileNotFoundException;
 import java.io.IOException;
-import java.io.Reader;
 import java.io.InputStreamReader;
+import java.io.Reader;
 import java.io.UnsupportedEncodingException;
 import java.net.MalformedURLException;
 
+import net.yacy.document.Document;
+import net.yacy.document.Parser;
+import net.yacy.document.TextParser;
+import net.yacy.kelondro.data.meta.DigestURI;
+
+import org.junit.Test;
+
 public class ParserTest {
 
     @Test public void testParsers() throws FileNotFoundException, Parser.Failure, MalformedURLException, UnsupportedEncodingException, IOException {
-        String[][] testFiles = new String[][] {
-            // meaning:  filename in test/parsertest, mimetype, title, creator, description,
+        final String[][] testFiles = new String[][] {
+            // meaning:  filename in test/parsertest, mimetype, title, creator, description,
             new String[]{"umlaute_windows.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen", "", ""},
             new String[]{"umlaute_windows.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation", "Folie 1", "", ""},
             new String[]{"umlaute_linux.odt", "application/vnd.oasis.opendocument.text", "Münchner Hofbräuhaus", "", "Kommentar zum Hofbräuhaus"},
@@ -34,26 +35,26 @@ public class ParserTest {
             };
 
-        for (int i=0; i < testFiles.length; i++) {
-            String filename = "test/parsertest/" + testFiles[i][0];
-            File file = new File(filename);
-            String mimetype = testFiles[i][1];
-            DigestURI url = new DigestURI("http://localhost/"+filename);
+        for (final String[] testFile : testFiles) {
+            final String filename = "test/parsertest/" + testFile[0];
+            final File file = new File(filename);
+            final String mimetype = testFile[1];
+            final DigestURI url = new DigestURI("http://localhost/"+filename);
 
-            Document[] docs = TextParser.parseSource(url, mimetype, null, file.length(), new FileInputStream(file));
-            for (Document doc: docs) {
-                Reader content = new InputStreamReader(doc.getText(), doc.getCharset());
-                StringBuilder str = new StringBuilder();
+            final Document[] docs = TextParser.parseSource(url, mimetype, null, file.length(), new FileInputStream(file), true);
+            for (final Document doc: docs) {
+                final Reader content = new InputStreamReader(doc.getText(), doc.getCharset());
+                final StringBuilder str = new StringBuilder();
                 int c;
                 while( (c = content.read()) != -1 )
                     str.append((char)c);
-
+
                 System.out.println("Parsed " + filename + ": " + str);
                 assertThat(str.toString(), containsString("In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen"));
-                assertThat(doc.dc_title(), containsString(testFiles[i][2]));
-                assertThat(doc.dc_creator(), containsString(testFiles[i][3]));
-                assertThat(doc.dc_description(), containsString(testFiles[i][4]));
-            }
+                assertThat(doc.dc_title(), containsString(testFile[2]));
+                assertThat(doc.dc_creator(), containsString(testFile[3]));
+                assertThat(doc.dc_description(), containsString(testFile[4]));
+            }
         }
     }
 }
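The counterpart to all of these call sites is the crawler.embedLinksAsDocuments entry added to defaults/yacy.init at the head of this patch. How the crawler reads that switch is not shown in this changeset, but since yacy.init is a plain key=value file, the wiring could look roughly like the sketch below; the file path, the java.util.Properties lookup, and the ConfigSketch class are assumptions for illustration, not code from this patch.

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;

// Sketch only: reads the new flag the way any key=value config file can be read.
public class ConfigSketch {

    public static void main(final String[] args) throws IOException {
        final Properties conf = new Properties();
        final InputStream in = new FileInputStream("defaults/yacy.init");
        try {
            // entries like "crawler.embedLinksAsDocuments = true" parse as properties
            conf.load(in);
        } finally {
            in.close();
        }
        final boolean embed = Boolean.parseBoolean(
                conf.getProperty("crawler.embedLinksAsDocuments", "true").trim());
        System.out.println("crawler.embedLinksAsDocuments = " + embed);
        // a top-level caller would then thread the flag through, in the spirit of:
        // documents = TextParser.parseSource(url, mime, charset, content, embed);
    }
}

Defaulting to true matches the shipped yacy.init, so embedded media links become their own indexable documents unless an operator turns the flag off.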