From 49e5ca579f8888a93859da39656ca51ff1fe3166 Mon Sep 17 00:00:00 2001 From: orbiter Date: Wed, 7 Sep 2011 10:08:57 +0000 Subject: [PATCH] added new configuration property "crawler.embedLinksAsDocuments". If this is switched on (it is now the default), all embedded image, audio and video links from all parsed documents are added to the search index as individual documents. This will increase the search index size dramatically but will also enable much faster image, audio and video search. If the flag is switched on and Solr indexing is enabled, the index entries are also stored in the Solr index. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7931 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- defaults/yacy.init | 4 + .../de/anomic/crawler/retrieval/Response.java | 238 ++++----- source/de/anomic/search/DocumentIndex.java | 142 ++--- source/de/anomic/search/Switchboard.java | 2 +- source/net/yacy/document/AbstractParser.java | 26 +- source/net/yacy/document/Document.java | 4 + source/net/yacy/document/TextParser.java | 98 +++- .../document/importer/MediawikiImporter.java | 486 +++++++++--------- .../net/yacy/document/parser/bzipParser.java | 48 +- .../net/yacy/document/parser/gzipParser.java | 46 +- .../net/yacy/document/parser/htmlParser.java | 79 +-- .../yacy/document/parser/sevenzipParser.java | 45 +- .../net/yacy/document/parser/tarParser.java | 36 +- .../net/yacy/document/parser/zipParser.java | 2 +- .../net/yacy/repository/LoaderDispatcher.java | 2 +- test/de/anomic/document/ParserTest.java | 51 +- 16 files changed, 670 insertions(+), 639 deletions(-) diff --git a/defaults/yacy.init b/defaults/yacy.init index 44f45b7d1..eed964a98 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -704,6 +704,10 @@ crawler.file.maxFileSize=100000000 # maximum number of crawler threads crawler.MaxActiveThreads = 200 +# flag: consider all embedded image/audio/video document links +# from all crawled documents as their own documents +crawler.embedLinksAsDocuments = true + # maximum size of indexing queue indexer.slots = 100 diff --git a/source/de/anomic/crawler/retrieval/Response.java b/source/de/anomic/crawler/retrieval/Response.java index 8fd3349d2..d36851e36 100755 --- a/source/de/anomic/crawler/retrieval/Response.java +++ b/source/de/anomic/crawler/retrieval/Response.java @@ -9,7 +9,7 @@ // $LastChangedBy$ // // LICENSE -// +// // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or @@ -43,7 +43,7 @@ import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.ResultURLs.EventOrigin; public class Response { - + // doctypes: public static final char DT_PDFPS = 'p'; public static final char DT_TEXT = 't'; @@ -65,7 +65,7 @@ public class Response { private final CrawlProfile profile; private byte[] content; private int status; // tracker indexing status, see status defs below - + // doctype calculation public static char docType(final DigestURI url) { final String path = url.getPath().toLowerCase(); @@ -136,14 +136,14 @@ public class Response { //zip = application/zip return doctype; } - + public static final int QUEUE_STATE_FRESH = 0; public static final int QUEUE_STATE_PARSING = 1; public static final int QUEUE_STATE_CONDENSING = 2; public static final int QUEUE_STATE_STRUCTUREANALYSIS = 3; public static final int QUEUE_STATE_INDEXSTORAGE = 4; public static final int QUEUE_STATE_FINISHED = 5; - + public Response( final
Request request, final RequestHeader requestHeader, @@ -160,7 +160,7 @@ public class Response { this.status = QUEUE_STATE_FRESH; this.content = content; } - + public Response(final Request request, final CrawlProfile profile) { this.request = request; // request and response headers may be zero in case that we process surrogates @@ -172,7 +172,7 @@ public class Response { this.status = QUEUE_STATE_FRESH; this.content = request.url().toTokens().getBytes(); } - + public Response( final Request request, final RequestHeader requestHeader, @@ -185,15 +185,15 @@ public class Response { public void updateStatus(final int newStatus) { this.status = newStatus; } - + public ResponseHeader getResponseHeader() { return this.responseHeader; } - + public int getStatus() { return this.status; } - + public String name() { // the anchor name; can be either the text inside the anchor tag or the // page description after loading of the page @@ -203,7 +203,7 @@ public class Response { public DigestURI url() { return this.request.url(); } - + public char docType() { char doctype = docType(getMimeType()); if (doctype == DT_UNKNOWN) doctype = docType(url()); @@ -212,21 +212,21 @@ public class Response { public Date lastModified() { Date docDate = null; - - if (responseHeader != null) { - docDate = responseHeader.lastModified(); - if (docDate == null) docDate = responseHeader.date(); + + if (this.responseHeader != null) { + docDate = this.responseHeader.lastModified(); + if (docDate == null) docDate = this.responseHeader.date(); } - if (docDate == null && request != null) docDate = request.appdate(); - if (docDate == null) docDate = new Date(GenericFormatter.correctedUTCTime()); - + if (docDate == null && this.request != null) docDate = this.request.appdate(); + if (docDate == null) docDate = new Date(GenericFormatter.correctedUTCTime()); + return docDate; } - + public String language() { // please avoid this method if a condenser document is available, because the condenser has a built-in language detection // this here is only a guess using the TLD - return this.url().language(); + return url().language(); } public CrawlProfile profile() { @@ -272,9 +272,9 @@ public class Response { */ public String shallStoreCacheForProxy() { - String crawlerReason = shallStoreCacheForCrawler(); + final String crawlerReason = shallStoreCacheForCrawler(); if (crawlerReason != null) return crawlerReason; - + // check profile (disabled: we will check this in the plasmaSwitchboard) // if (!this.profile.storeHTCache()) { return "storage_not_wanted"; } @@ -285,19 +285,19 @@ public class Response { // -CGI access in request // CGI access makes the page very individual, and therefore not usable // in caches - if (this.url().isPOST() && this.profile != null && !this.profile.crawlingQ()) { + if (url().isPOST() && this.profile != null && !this.profile.crawlingQ()) { return "dynamic_post"; } - - if (this.url().isCGI()) { + + if (url().isCGI()) { return "dynamic_cgi"; } - - if (this.url().isLocal()) { + + if (url().isLocal()) { return "local_URL_no_cache_needed"; } - - if (responseHeader != null) { + + if (this.responseHeader != null) { // -if-modified-since in request // we do not care about if-modified-since, because this case only occurres if the @@ -315,7 +315,7 @@ public class Response { // -pragma in response // if we have a pragma non-cache, we don't cache. 
usually if this is wanted from // the server, it makes sense - String cacheControl = responseHeader.get(HeaderFramework.PRAGMA); + String cacheControl = this.responseHeader.get(HeaderFramework.PRAGMA); if (cacheControl != null && cacheControl.trim().toUpperCase().equals("NO-CACHE")) { return "controlled_no_cache"; } // -expires in response @@ -324,12 +324,12 @@ public class Response { // -cache-control in response // the cache-control has many value options. - cacheControl = responseHeader.get(HeaderFramework.CACHE_CONTROL); + cacheControl = this.responseHeader.get(HeaderFramework.CACHE_CONTROL); if (cacheControl != null) { cacheControl = cacheControl.trim().toUpperCase(); if (cacheControl.startsWith("MAX-AGE=")) { // we need also the load date - final Date date = responseHeader.date(); + final Date date = this.responseHeader.date(); if (date == null) return "stale_no_date_given_in_response"; try { final long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // milliseconds to live @@ -349,35 +349,35 @@ public class Response { public String shallStoreCacheForCrawler() { // check storage size: all files will be handled in RAM before storage, so they must not exceed // a given size, which we consider as 1MB - if (this.size() > 10 * 1024L * 1024L) return "too_large_for_caching_" + this.size(); - + if (size() > 10 * 1024L * 1024L) return "too_large_for_caching_" + size(); + // check status code if (!validResponseStatus()) { return "bad_status_" + this.responseStatus; } - if (requestHeader != null) { + if (this.requestHeader != null) { // -authorization cases in request // authorization makes pages very individual, and therefore we cannot use the // content in the cache - if (requestHeader.containsKey(RequestHeader.AUTHORIZATION)) { return "personalized"; } + if (this.requestHeader.containsKey(RequestHeader.AUTHORIZATION)) { return "personalized"; } // -ranges in request and response // we do not cache partial content - if (requestHeader.containsKey(HeaderFramework.RANGE)) { return "partial_request"; } + if (this.requestHeader.containsKey(HeaderFramework.RANGE)) { return "partial_request"; } } - - if (responseHeader != null) { + + if (this.responseHeader != null) { // -ranges in request and response - // we do not cache partial content - if (responseHeader.containsKey(HeaderFramework.CONTENT_RANGE)) { return "partial_response"; } + // we do not cache partial content + if (this.responseHeader.containsKey(HeaderFramework.CONTENT_RANGE)) { return "partial_response"; } } return null; } - + /** * decide upon header information if a specific file should be taken from * the cache or not - * + * * @return whether the file should be taken from the cache */ public boolean isFreshForProxy() { @@ -385,27 +385,27 @@ public class Response { // -CGI access in request // CGI access makes the page very individual, and therefore not usable // in caches - if (this.url().isPOST()) { + if (url().isPOST()) { return false; } - if (this.url().isCGI()) { + if (url().isCGI()) { return false; } String cacheControl; - if (requestHeader != null) { + if (this.requestHeader != null) { // -authorization cases in request - if (requestHeader.containsKey(RequestHeader.AUTHORIZATION)) { return false; } + if (this.requestHeader.containsKey(RequestHeader.AUTHORIZATION)) { return false; } // -ranges in request // we do not cache partial content - if (requestHeader.containsKey(HeaderFramework.RANGE)) { return false; } + if (this.requestHeader.containsKey(HeaderFramework.RANGE)) { return false; } // if the client requests a 
un-cached copy of the resource ... - cacheControl = requestHeader.get(HeaderFramework.PRAGMA); + cacheControl = this.requestHeader.get(HeaderFramework.PRAGMA); if (cacheControl != null && cacheControl.trim().toUpperCase().equals("NO-CACHE")) { return false; } - cacheControl = requestHeader.get(HeaderFramework.CACHE_CONTROL); + cacheControl = this.requestHeader.get(HeaderFramework.CACHE_CONTROL); if (cacheControl != null) { cacheControl = cacheControl.trim().toUpperCase(); if (cacheControl.startsWith("NO-CACHE") || cacheControl.startsWith("MAX-AGE=0")) { return false; } @@ -414,14 +414,14 @@ public class Response { // -if-modified-since in request // The entity has to be transferred only if it has // been modified since the date given by the If-Modified-Since header. - if (requestHeader.containsKey(RequestHeader.IF_MODIFIED_SINCE)) { + if (this.requestHeader.containsKey(RequestHeader.IF_MODIFIED_SINCE)) { // checking this makes only sense if the cached response contains // a Last-Modified field. If the field does not exist, we go the safe way - if (!responseHeader.containsKey(HeaderFramework.LAST_MODIFIED)) { return false; } + if (!this.responseHeader.containsKey(HeaderFramework.LAST_MODIFIED)) { return false; } // parse date Date d1, d2; - d2 = responseHeader.lastModified(); if (d2 == null) { d2 = new Date(GenericFormatter.correctedUTCTime()); } - d1 = requestHeader.ifModifiedSince(); if (d1 == null) { d1 = new Date(GenericFormatter.correctedUTCTime()); } + d2 = this.responseHeader.lastModified(); if (d2 == null) { d2 = new Date(GenericFormatter.correctedUTCTime()); } + d1 = this.requestHeader.ifModifiedSince(); if (d1 == null) { d1 = new Date(GenericFormatter.correctedUTCTime()); } // finally, we shall treat the cache as stale if the modification time is after the if-.. time if (d2.after(d1)) { return false; } } @@ -433,48 +433,48 @@ public class Response { // but we think that pictures can still be considered as fresh // -set-cookie in cached response // this is a similar case as for COOKIE. - if (requestHeader.containsKey(RequestHeader.COOKIE) || - responseHeader.containsKey(HeaderFramework.SET_COOKIE) || - responseHeader.containsKey(HeaderFramework.SET_COOKIE2)) { + if (this.requestHeader.containsKey(RequestHeader.COOKIE) || + this.responseHeader.containsKey(HeaderFramework.SET_COOKIE) || + this.responseHeader.containsKey(HeaderFramework.SET_COOKIE2)) { return false; // too strong } } } - if (responseHeader != null) { + if (this.responseHeader != null) { // -pragma in cached response // logically, we would not need to care about no-cache pragmas in cached response headers, // because they cannot exist since they are not written to the cache. // So this IF should always fail.. - cacheControl = responseHeader.get(HeaderFramework.PRAGMA); + cacheControl = this.responseHeader.get(HeaderFramework.PRAGMA); if (cacheControl != null && cacheControl.trim().toUpperCase().equals("NO-CACHE")) { return false; } - + // see for documentation also: // http://www.web-caching.com/cacheability.html // http://vancouver-webpages.com/CacheNow/ - + // look for freshnes information // if we don't have any freshnes indication, we treat the file as stale. 
// no handle for freshness control: - + // -expires in cached response // the expires value gives us a very easy hint when the cache is stale - final Date expires = responseHeader.expires(); + final Date expires = this.responseHeader.expires(); if (expires != null) { // System.out.println("EXPIRES-TEST: expires=" + expires + ", NOW=" + serverDate.correctedGMTDate() + ", url=" + url); if (expires.before(new Date(GenericFormatter.correctedUTCTime()))) { return false; } } - final Date lastModified = responseHeader.lastModified(); - cacheControl = responseHeader.get(HeaderFramework.CACHE_CONTROL); + final Date lastModified = this.responseHeader.lastModified(); + cacheControl = this.responseHeader.get(HeaderFramework.CACHE_CONTROL); if (cacheControl == null && lastModified == null && expires == null) { return false; } - + // -lastModified in cached response // we can apply a TTL (Time To Live) heuristic here. We call the time delta between the last read // of the file and the last modified date as the age of the file. If we consider the file as // middel-aged then, the maximum TTL would be cache-creation plus age. // This would be a TTL factor of 100% we want no more than 10% TTL, so that a 10 month old cache // file may only be treated as fresh for one more month, not more. - Date date = responseHeader.date(); + Date date = this.responseHeader.date(); if (lastModified != null) { if (date == null) { date = new Date(GenericFormatter.correctedUTCTime()); } final long age = date.getTime() - lastModified.getTime(); @@ -484,7 +484,7 @@ public class Response { // therefore the cache is stale, if serverDate.correctedGMTDate().getTime() - d2.getTime() > age/10 if (GenericFormatter.correctedUTCTime() - date.getTime() > age / 10) { return false; } } - + // -cache-control in cached response // the cache-control has many value options. if (cacheControl != null) { @@ -510,17 +510,17 @@ public class Response { } } } - + return true; } - + /** * decide upon header information if a specific file should be indexed * this method returns null if the answer is 'YES'! 
* if the answer is 'NO' (do not index), it returns a string with the reason * to reject the crawling demand in clear text - * + * * This function is used by plasmaSwitchboard#processResourceStack */ public final String shallIndexCacheForProxy() { @@ -530,7 +530,7 @@ public class Response { // check profile if (!profile().indexText() && !profile().indexMedia()) { - return "indexing not allowed - indexText and indexMedia not set (for proxy = " + profile.name()+ ")"; + return "indexing not allowed - indexText and indexMedia not set (for proxy = " + this.profile.name()+ ")"; } // -CGI access in request @@ -556,7 +556,7 @@ public class Response { return "Media_Content_(forbidden)"; } */ - + // -cookies in request // unfortunately, we cannot index pages which have been requested with a cookie // because the returned content may be special for the client @@ -565,19 +565,19 @@ public class Response { return "Dynamic_(Requested_With_Cookie)"; } - if (responseHeader != null) { + if (this.responseHeader != null) { // -set-cookie in response // the set-cookie from the server does not indicate that the content is special - // thus we do not care about it here for indexing - + // thus we do not care about it here for indexing + // a picture cannot be indexed - final String mimeType = responseHeader.mime(); + final String mimeType = this.responseHeader.mime(); /* if (Classification.isPictureMime(mimeType)) { return "Media_Content_(Picture)"; } */ - String parserError = TextParser.supportsMime(mimeType); + final String parserError = TextParser.supportsMime(mimeType); if (parserError != null) { return "Media_Content, no parser: " + parserError; } @@ -585,9 +585,9 @@ public class Response { // -if-modified-since in request // if the page is fresh at the very moment we can index it final Date ifModifiedSince = this.requestHeader.ifModifiedSince(); - if ((ifModifiedSince != null) && (responseHeader.containsKey(HeaderFramework.LAST_MODIFIED))) { + if ((ifModifiedSince != null) && (this.responseHeader.containsKey(HeaderFramework.LAST_MODIFIED))) { // parse date - Date d = responseHeader.lastModified(); + Date d = this.responseHeader.lastModified(); if (d == null) { d = new Date(GenericFormatter.correctedUTCTime()); } @@ -599,8 +599,8 @@ public class Response { } // -pragma in cached response - if (responseHeader.containsKey(HeaderFramework.PRAGMA) && - (responseHeader.get(HeaderFramework.PRAGMA)).toUpperCase().equals("NO-CACHE")) { + if (this.responseHeader.containsKey(HeaderFramework.PRAGMA) && + (this.responseHeader.get(HeaderFramework.PRAGMA)).toUpperCase().equals("NO-CACHE")) { return "Denied_(pragma_no_cache)"; } @@ -613,7 +613,7 @@ public class Response { // the expires value gives us a very easy hint when the cache is stale // sometimes, the expires date is set to the past to prevent that a page is cached // we use that information to see if we should index it - final Date expires = responseHeader.expires(); + final Date expires = this.responseHeader.expires(); if (expires != null && expires.before(new Date(GenericFormatter.correctedUTCTime()))) { return "Stale_(Expired)"; } @@ -624,7 +624,7 @@ public class Response { // -cache-control in cached response // the cache-control has many value options. 
- String cacheControl = responseHeader.get(HeaderFramework.CACHE_CONTROL); + String cacheControl = this.responseHeader.get(HeaderFramework.CACHE_CONTROL); if (cacheControl != null) { cacheControl = cacheControl.trim().toUpperCase(); /* we have the following cases for cache-control: @@ -641,7 +641,7 @@ public class Response { // // ok, do nothing } else if (cacheControl.startsWith("MAX-AGE=")) { // we need also the load date - final Date date = responseHeader.date(); + final Date date = this.responseHeader.date(); if (date == null) { return "Stale_(no_date_given_in_response)"; } @@ -675,7 +675,7 @@ public class Response { // check profile if (!profile().indexText() && !profile().indexMedia()) { - return "indexing not allowed - indexText and indexMedia not set (for crawler = " + profile.name() + ")"; + return "indexing not allowed - indexText and indexMedia not set (for crawler = " + this.profile.name() + ")"; } // -CGI access in request @@ -692,9 +692,9 @@ public class Response { // we checked that in shallStoreCache // check if document can be indexed - if (responseHeader != null) { - final String mimeType = responseHeader.mime(); - String parserError = TextParser.supportsMime(mimeType); + if (this.responseHeader != null) { + final String mimeType = this.responseHeader.mime(); + final String parserError = TextParser.supportsMime(mimeType); if (parserError != null && TextParser.supportsExtension(url()) != null) return "no parser available: " + parserError; } /* @@ -703,7 +703,7 @@ public class Response { return "Media_Content_(forbidden)"; } */ - + // -if-modified-since in request // if the page is fresh at the very moment we can index it // -> this does not apply for the crawler @@ -739,36 +739,36 @@ public class Response { return null; } - + public String getMimeType() { - if (responseHeader == null) return null; - - String mimeType = responseHeader.mime(); + if (this.responseHeader == null) return null; + + String mimeType = this.responseHeader.mime(); mimeType = mimeType.trim().toLowerCase(); - + final int pos = mimeType.indexOf(';'); - return ((pos < 0) ? mimeType : mimeType.substring(0, pos)); + return ((pos < 0) ? mimeType : mimeType.substring(0, pos)); } - + public String getCharacterEncoding() { - if (responseHeader == null) return null; - return responseHeader.getCharacterEncoding(); + if (this.responseHeader == null) return null; + return this.responseHeader.getCharacterEncoding(); } - + public DigestURI referrerURL() { - if (requestHeader == null) return null; + if (this.requestHeader == null) return null; try { - String r = requestHeader.get(RequestHeader.REFERER, null); + final String r = this.requestHeader.get(RequestHeader.REFERER, null); if (r == null) return null; return new DigestURI(r); } catch (final Exception e) { return null; } } - + public byte[] referrerHash() { - if (requestHeader == null) return null; - String u = requestHeader.get(RequestHeader.REFERER, ""); + if (this.requestHeader == null) return null; + final String u = this.requestHeader.get(RequestHeader.REFERER, ""); if (u == null || u.length() == 0) return null; try { return new DigestURI(u).hash(); @@ -776,27 +776,27 @@ public class Response { return null; } } - + public boolean validResponseStatus() { - return (responseStatus == null) ? false : responseStatus.startsWith("200") || responseStatus.startsWith("203"); + return (this.responseStatus == null) ? 
false : this.responseStatus.startsWith("200") || this.responseStatus.startsWith("203"); } public Date ifModifiedSince() { - return (requestHeader == null) ? null : requestHeader.ifModifiedSince(); + return (this.requestHeader == null) ? null : this.requestHeader.ifModifiedSince(); } public boolean requestWithCookie() { - return (requestHeader == null) ? false : requestHeader.containsKey(RequestHeader.COOKIE); + return (this.requestHeader == null) ? false : this.requestHeader.containsKey(RequestHeader.COOKIE); } public boolean requestProhibitsIndexing() { - return (requestHeader == null) - ? false - : requestHeader.containsKey(HeaderFramework.X_YACY_INDEX_CONTROL) && - (requestHeader.get(HeaderFramework.X_YACY_INDEX_CONTROL)).toUpperCase().equals("NO-INDEX"); + return (this.requestHeader == null) + ? false + : this.requestHeader.containsKey(HeaderFramework.X_YACY_INDEX_CONTROL) && + (this.requestHeader.get(HeaderFramework.X_YACY_INDEX_CONTROL)).toUpperCase().equals("NO-INDEX"); } - - public EventOrigin processCase(String mySeedHash) { + + public EventOrigin processCase(final String mySeedHash) { // we must distinguish the following cases: resource-load was initiated by // 1) global crawling: the index is extern, not here (not possible here) // 2) result of search queries, some indexes are here (not possible here) @@ -818,13 +818,13 @@ public class Response { } return processCase; } - + public Document[] parse() throws Parser.Failure { - String supportError = TextParser.supports(url(), this.responseHeader == null ? null : this.responseHeader.mime()); + final String supportError = TextParser.supports(url(), this.responseHeader == null ? null : this.responseHeader.mime()); if (supportError != null) throw new Parser.Failure("no parser support:" + supportError, url()); try { - return TextParser.parseSource(url(), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? "UTF-8" : this.responseHeader.getCharacterEncoding(), this.content); - } catch (Exception e) { + return TextParser.parseSource(url(), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? 
"UTF-8" : this.responseHeader.getCharacterEncoding(), this.content, false); + } catch (final Exception e) { return null; } diff --git a/source/de/anomic/search/DocumentIndex.java b/source/de/anomic/search/DocumentIndex.java index 7394e2c85..cade93eb6 100644 --- a/source/de/anomic/search/DocumentIndex.java +++ b/source/de/anomic/search/DocumentIndex.java @@ -35,7 +35,6 @@ import java.util.Date; import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; - import net.yacy.cora.document.UTF8; import net.yacy.document.Condenser; import net.yacy.document.Document; @@ -53,26 +52,26 @@ import net.yacy.kelondro.logging.Log; * */ public class DocumentIndex extends Segment { - + private static final RankingProfile textRankingDefault = new RankingProfile(ContentDomain.TEXT); //private Bitfield zeroConstraint = new Bitfield(4); - + private static DigestURI poison; static { try { poison = new DigestURI("file://."); - } catch (MalformedURLException e) {} + } catch (final MalformedURLException e) {} } BlockingQueue queue; // a queue of document ID's - private Worker[] worker; + private final Worker[] worker; CallbackListener callback; static final ThreadGroup workerThreadGroup = new ThreadGroup("workerThreadGroup"); - - - public DocumentIndex(final File segmentPath, CallbackListener callback, int cachesize) throws IOException { + + + public DocumentIndex(final File segmentPath, final CallbackListener callback, final int cachesize) throws IOException { super(new Log("DocumentIndex"), segmentPath, cachesize, targetFileSize * 4 - 1, false, false); - int cores = Runtime.getRuntime().availableProcessors() + 1; + final int cores = Runtime.getRuntime().availableProcessors() + 1; this.callback = callback; this.queue = new LinkedBlockingQueue(cores * 300); this.worker = new Worker[cores]; @@ -81,46 +80,48 @@ public class DocumentIndex extends Segment { this.worker[i].start(); } } - + class Worker extends Thread { - public Worker(int count) { + public Worker(final int count) { super(workerThreadGroup, "query-" + count); } - + @Override public void run() { DigestURI f; - URIMetadataRow resultRow; + URIMetadataRow[] resultRows; try { - while ((f = queue.take()) != poison) try { - resultRow = add(f); - if (callback != null) { - if (resultRow == null) { - callback.fail(f, "result is null"); - } else { - callback.commit(f, resultRow); + while ((f = DocumentIndex.this.queue.take()) != poison) try { + resultRows = add(f); + for (final URIMetadataRow resultRow: resultRows) { + if (DocumentIndex.this.callback != null) { + if (resultRow == null) { + DocumentIndex.this.callback.fail(f, "result is null"); + } else { + DocumentIndex.this.callback.commit(f, resultRow); + } } } - } catch (IOException e) { + } catch (final IOException e) { if (e.getMessage().indexOf("cannot parse") < 0) Log.logException(e); - callback.fail(f, e.getMessage()); + DocumentIndex.this.callback.fail(f, e.getMessage()); } - } catch (InterruptedException e) {} + } catch (final InterruptedException e) {} } } - + /** * get the number of pending documents in the indexing queue */ public int pending() { return this.queue.size(); } - + public void clearQueue() { this.queue.clear(); } - - private URIMetadataRow add(DigestURI url) throws IOException { + + private URIMetadataRow[] add(final DigestURI url) throws IOException { if (url == null) throw new IOException("file = null"); if (url.isDirectory()) throw new IOException("file should be a document, not a path"); if (!url.canRead()) throw new IOException("cannot read file"); @@ 
-128,17 +129,20 @@ public class DocumentIndex extends Segment { long length; try { length = url.length(); - } catch (Exception e) { + } catch (final Exception e) { length = -1; } try { - documents = TextParser.parseSource(url, null, null, length, url.getInputStream(null, -1)); - } catch (Exception e) { + documents = TextParser.parseSource(url, null, null, length, url.getInputStream(null, -1), true); + } catch (final Exception e) { throw new IOException("cannot parse " + url.toString() + ": " + e.getMessage()); } - Document document = Document.mergeDocuments(url, null, documents); - final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib); - return super.storeDocument( + //Document document = Document.mergeDocuments(url, null, documents); + final URIMetadataRow[] rows = new URIMetadataRow[documents.length]; + int c = 0; + for (final Document document: documents) { + final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib); + rows[c++] = super.storeDocument( url, null, new Date(url.lastModified()), @@ -149,25 +153,27 @@ public class DocumentIndex extends Segment { null, DocumentIndex.class.getName() + ".add" ); + } + return rows; } - + /** * add a file or a directory of files to the index * If the given file is a path to a directory, the complete sub-tree is indexed * @param start */ - public void addConcurrent(DigestURI start) throws IOException { + public void addConcurrent(final DigestURI start) throws IOException { assert (start != null); assert (start.canRead()) : start.toString(); if (!start.isDirectory()) { try { this.queue.put(start); - } catch (InterruptedException e) {} + } catch (final InterruptedException e) {} return; } - String[] s = start.list(); + final String[] s = start.list(); DigestURI w; - for (String t: s) { + for (final String t: s) { try { w = new DigestURI(start, t); if (w.canRead() && !w.isHidden()) { @@ -176,31 +182,31 @@ public class DocumentIndex extends Segment { } else { try { this.queue.put(w); - } catch (InterruptedException e) {} + } catch (final InterruptedException e) {} } } - } catch (MalformedURLException e1) { + } catch (final MalformedURLException e1) { Log.logException(e1); } } } - + /** * do a full-text search of a given string and return a specific number of results * @param querystring * @param count * @return a list of files that contain the given string - */ - public ArrayList find(String querystring, int count) { + */ + public ArrayList find(final String querystring, int count) { // make a query and start a search - QueryParams query = new QueryParams(querystring, count, null, this, textRankingDefault, "DocumentIndex"); - ReferenceOrder order = new ReferenceOrder(query.ranking, UTF8.getBytes(query.targetlang)); - RankingProcess rankedCache = new RankingProcess(query, order, SearchEvent.max_results_preparation); + final QueryParams query = new QueryParams(querystring, count, null, this, textRankingDefault, "DocumentIndex"); + final ReferenceOrder order = new ReferenceOrder(query.ranking, UTF8.getBytes(query.targetlang)); + final RankingProcess rankedCache = new RankingProcess(query, order, SearchEvent.max_results_preparation); rankedCache.start(); - + // search is running; retrieve results URIMetadataRow row; - ArrayList files = new ArrayList(); + final ArrayList files = new ArrayList(); Components metadata; while ((row = rankedCache.takeURL(false, 1000)) != null) { metadata = row.metadata(); @@ -211,7 +217,7 @@ public class DocumentIndex extends Segment { } return files; } - + /** * 
close the index. * This terminates all worker threads and then closes the segment. @@ -219,27 +225,27 @@ public class DocumentIndex extends Segment { @Override public void close() { // send termination signal to worker threads - for (int i = 0; i < this.worker.length; i++) { + for (final Worker element : this.worker) { try { this.queue.put(poison); - } catch (InterruptedException e) {} + } catch (final InterruptedException e) {} } // wait for termination - for (int i = 0; i < this.worker.length; i++) { + for (final Worker element : this.worker) { try { - this.worker[i].join(); - } catch (InterruptedException e) {} + element.join(); + } catch (final InterruptedException e) {} } // close the segment super.close(); } - + public interface CallbackListener { public void commit(DigestURI f, URIMetadataRow resultRow); public void fail(DigestURI f, String failReason); } - - public static void main(String[] args) { + + public static void main(final String[] args) { // first argument: path to segment // second argument: either 'add' or 'search' // third and more arguments exists only in case that second argument is 'search': these are then the search words @@ -249,37 +255,37 @@ public class DocumentIndex extends Segment { // DocumentIndex yacyindex search steht System.setProperty("java.awt.headless", "true"); if (args.length < 3) return; - File segmentPath = new File(args[0]); + final File segmentPath = new File(args[0]); System.out.println("using index files at " + segmentPath.getAbsolutePath()); - CallbackListener callback = new CallbackListener() { - public void commit(DigestURI f, URIMetadataRow resultRow) { + final CallbackListener callback = new CallbackListener() { + public void commit(final DigestURI f, final URIMetadataRow resultRow) { System.out.println("indexed: " + f.toString()); } - public void fail(DigestURI f, String failReason) { + public void fail(final DigestURI f, final String failReason) { System.out.println("not indexed " + f.toString() + ": " + failReason); } }; try { if (args[1].equals("add")) { - DigestURI f = new DigestURI(args[2]); - DocumentIndex di = new DocumentIndex(segmentPath, callback, 100000); + final DigestURI f = new DigestURI(args[2]); + final DocumentIndex di = new DocumentIndex(segmentPath, callback, 100000); di.addConcurrent(f); di.close(); } else { String query = ""; for (int i = 2; i < args.length; i++) query += args[i]; query.trim(); - DocumentIndex di = new DocumentIndex(segmentPath, callback, 100000); - ArrayList results = di.find(query, 100); - for (DigestURI f: results) { + final DocumentIndex di = new DocumentIndex(segmentPath, callback, 100000); + final ArrayList results = di.find(query, 100); + for (final DigestURI f: results) { if (f != null) System.out.println(f.toString()); } di.close(); } - } catch (IOException e) { + } catch (final IOException e) { Log.logException(e); } //System.exit(0); } - + } diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index 48f63e0be..e5ea8f545 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -1921,7 +1921,7 @@ public final class Switchboard extends serverSwitch { assert response.getContent() != null; try { // parse the document - documents = TextParser.parseSource(response.url(), response.getMimeType(), response.getCharacterEncoding(), response.getContent()); + documents = TextParser.parseSource(response.url(), response.getMimeType(), response.getCharacterEncoding(), response.getContent(), 
getConfigBool("crawler.embedLinksAsDocuments", false)); if (documents == null) { throw new Parser.Failure("Parser returned null.", response.url()); } diff --git a/source/net/yacy/document/AbstractParser.java b/source/net/yacy/document/AbstractParser.java index e168ff07a..3c82f2e01 100644 --- a/source/net/yacy/document/AbstractParser.java +++ b/source/net/yacy/document/AbstractParser.java @@ -7,12 +7,12 @@ * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. - * + * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program in the file lgpl21.txt * If not, see . @@ -29,27 +29,27 @@ import java.util.Set; import net.yacy.kelondro.logging.Log; public abstract class AbstractParser implements Parser { - + protected final Log log = new Log("PARSER"); protected final Set SUPPORTED_MIME_TYPES = new HashSet(); protected final Set SUPPORTED_EXTENSIONS = new HashSet(); private final String name; - + /** * initialize a parser with a name * @param name */ - public AbstractParser(String name) { + public AbstractParser(final String name) { this.name = name; } - + /** * return the name of the parser */ public String getName() { return this.name; } - + /** * each parser must define a set of supported mime types * @return a set of mime type strings that are supported @@ -57,7 +57,7 @@ public abstract class AbstractParser implements Parser { public Set supportedMimeTypes() { return this.SUPPORTED_MIME_TYPES; } - + /** * each parser must define a set of supported file extensions * @return a set of file name extensions that are supported @@ -65,22 +65,22 @@ public abstract class AbstractParser implements Parser { public Set supportedExtensions() { return this.SUPPORTED_EXTENSIONS; } - + /** * check equivalence of parsers; this simply tests equality of parser names * @param o * @return */ - public boolean equals(Object o) { - return this.getName().equals(((Parser) o).getName()); + public boolean equals(final Object o) { + return getName().equals(((Parser) o).getName()); } - + /** * the hash code of a parser * @return the hash code of the parser name string */ public int hashCode() { - return this.getName().hashCode(); + return getName().hashCode(); } } diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index d334c5a12..666d64358 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -131,6 +131,10 @@ public class Document { return this.parserObject; } + public Set getContentLanguages() { + return this.languages; + } + /** * compute a set of languages that this document contains * the language is not computed using a statistical analysis of the content, only from given metadata that came with the document diff --git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java index 569293d6a..018982820 100644 --- a/source/net/yacy/document/TextParser.java +++ b/source/net/yacy/document/TextParser.java @@ -31,6 +31,7 @@ import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Map.Entry; import java.util.Set; import 
java.util.concurrent.ConcurrentHashMap; @@ -58,6 +59,7 @@ import net.yacy.document.parser.vcfParser; import net.yacy.document.parser.vsdParser; import net.yacy.document.parser.xlsParser; import net.yacy.document.parser.zipParser; +import net.yacy.document.parser.html.ImageEntry; import net.yacy.document.parser.images.genericImageParser; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.FileUtils; @@ -141,7 +143,8 @@ public final class TextParser { final MultiProtocolURI location, final String mimeType, final String charset, - final File sourceFile + final File sourceFile, + final boolean multipleVirtualDocs ) throws InterruptedException, Parser.Failure { BufferedInputStream sourceStream = null; @@ -154,7 +157,7 @@ public final class TextParser { throw new Parser.Failure(errorMsg, location); } sourceStream = new BufferedInputStream(new FileInputStream(sourceFile)); - docs = parseSource(location, mimeType, charset, sourceFile.length(), sourceStream); + docs = parseSource(location, mimeType, charset, sourceFile.length(), sourceStream, multipleVirtualDocs); } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e; @@ -164,6 +167,7 @@ public final class TextParser { if (sourceStream != null) try { sourceStream.close(); } catch (final Exception ex) {} } for (final Document d: docs) { assert d.getText() != null; } // verify docs + return docs; } @@ -171,7 +175,8 @@ public final class TextParser { final MultiProtocolURI location, String mimeType, final String charset, - final byte[] content + final byte[] content, + final boolean multipleVirtualDocs ) throws Parser.Failure { if (log.isFine()) log.logFine("Parsing '" + location + "' from byte-array"); mimeType = normalizeMimeType(mimeType); @@ -185,7 +190,12 @@ public final class TextParser { } assert !idioms.isEmpty() : "no parsers applied for url " + location.toNormalform(true, false); - return parseSource(location, mimeType, idioms, charset, content); + Document[] docs = parseSource(location, mimeType, idioms, charset, content); + + // finally enrich the docs set with virtual docs from the enclosed documents + if (multipleVirtualDocs && docs.length == 1) docs = virtualDocs(docs[0]); + + return docs; } public static Document[] parseSource( @@ -193,7 +203,8 @@ public final class TextParser { String mimeType, final String charset, final long contentLength, - final InputStream sourceStream + final InputStream sourceStream, + final boolean multipleVirtualDocs ) throws Parser.Failure { if (log.isFine()) log.logFine("Parsing '" + location + "' from stream"); mimeType = normalizeMimeType(mimeType); @@ -222,7 +233,12 @@ public final class TextParser { } catch (final IOException e) { throw new Parser.Failure(e.getMessage(), location); } - return parseSource(location, mimeType, idioms, charset, b); + Document[] docs = parseSource(location, mimeType, idioms, charset, b); + + // finally enrich the docs set with virtual docs from the enclosed documents + if (multipleVirtualDocs && docs.length == 1) docs = virtualDocs(docs[0]); + + return docs; } private static Document[] parseSource( @@ -292,6 +308,7 @@ public final class TextParser { } } for (final Document d: docs) { assert d.getText() != null : "mimeType = " + mimeType; } // verify docs + return docs; } @@ -429,4 +446,73 @@ public final class TextParser { if (grant) denyExtensionx.remove(ext); else denyExtensionx.put(ext, v); } + /** + * produce virtual documents for each of the link that 
is contained in the document + * @param document + * @return + */ + public static Document[] virtualDocs(final Document document) { + + final ArrayList docs = new ArrayList(); + docs.add(document); + for (final Map.Entry link: document.getApplinks().entrySet()) { + docs.add(genLinkDocs(docs, "application", link.getKey(), link.getValue(), document.getContentLanguages())); + } + for (final Map.Entry link: document.getAudiolinks().entrySet()) { + docs.add(genLinkDocs(docs, "audio", link.getKey(), link.getValue(), document.getContentLanguages())); + } + for (final Map.Entry link: document.getVideolinks().entrySet()) { + docs.add(genLinkDocs(docs, "video", link.getKey(), link.getValue(), document.getContentLanguages())); + } + for (final Entry link: document.getImages().entrySet()) { + docs.add(genImageDocs(docs, link.getValue())); + } + + // finally return the list of documents + return docs.toArray(new Document[docs.size()]); + } + + private final static Document genLinkDocs(final ArrayList docs, final String type, final MultiProtocolURI uri, final String descr, final Set contentLanguages) { + //System.out.println("HTMLPARSER-LINK " + type + ": " + uri.toNormalform(true, false) + " / " + descr); + return new Document( + uri, + Classification.ext2mime(uri.getFileExtension()), + "UTF-8", + null, + contentLanguages, + null, + descr, + "", + "", + new String[]{descr}, + type, + 0.0f, 0.0f, + uri.toNormalform(false, false), + null, + null, + null, + false); + } + + private final static Document genImageDocs(final ArrayList docs, final ImageEntry img) { + //System.out.println("HTMLPARSER-LINK image: " + img.url().toNormalform(true, false) + " / " + img.alt()); + return new Document( + img.url(), + Classification.ext2mime(img.url().getFileExtension()), + "UTF-8", + null, + null, + null, + img.alt(), + "", + "", + new String[]{img.alt()}, + "image", + 0.0f, 0.0f, + img.url().toNormalform(false, false), + null, + null, + null, + false); + } } diff --git a/source/net/yacy/document/importer/MediawikiImporter.java b/source/net/yacy/document/importer/MediawikiImporter.java index 5ade909da..ef12b49e4 100644 --- a/source/net/yacy/document/importer/MediawikiImporter.java +++ b/source/net/yacy/document/importer/MediawikiImporter.java @@ -2,19 +2,19 @@ * MediawikiImporter * Copyright 2008 by Michael Peter Christen * First released 20.11.2008 at http://yacy.net - * + * * This is a part of YaCy, a peer-to-peer based web search engine * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. - * + * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program in the file lgpl21.txt * If not, see . 
@@ -22,17 +22,6 @@ package net.yacy.document.importer; -import net.yacy.cora.document.UTF8; -import net.yacy.document.Document; -import net.yacy.document.Parser; -import net.yacy.document.TextParser; -import net.yacy.document.content.SurrogateReader; -import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.logging.Log; -import net.yacy.kelondro.util.ByteBuffer; - -import org.apache.tools.bzip2.CBZip2InputStream; - import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.BufferedReader; @@ -61,6 +50,17 @@ import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import java.util.zip.GZIPInputStream; +import net.yacy.cora.document.UTF8; +import net.yacy.document.Document; +import net.yacy.document.Parser; +import net.yacy.document.TextParser; +import net.yacy.document.content.SurrogateReader; +import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.logging.Log; +import net.yacy.kelondro.util.ByteBuffer; + +import org.apache.tools.bzip2.CBZip2InputStream; + import de.anomic.data.wiki.WikiCode; import de.anomic.data.wiki.WikiParser; @@ -78,9 +78,9 @@ public class MediawikiImporter extends Thread implements Importer { private static final byte[] pagestartb = UTF8.getBytes(pagestart); private static final byte[] pageendb = UTF8.getBytes(pageend); private static final int docspermbinxmlbz2 = 800; // documents per megabyte in a xml.bz2 mediawiki dump - + public static Importer job; // if started from a servlet, this object is used to store the thread - + public File sourcefile; public File targetdir; public int count; @@ -88,100 +88,100 @@ public class MediawikiImporter extends Thread implements Importer { private final long docsize; private final int approxdocs; private String hostport, urlStub; - - - public MediawikiImporter(File sourcefile, File targetdir) { + + + public MediawikiImporter(final File sourcefile, final File targetdir) { this.sourcefile = sourcefile; this.docsize = sourcefile.length(); - this.approxdocs = (int) (this.docsize * (long) docspermbinxmlbz2 / 1024L / 1024L); + this.approxdocs = (int) (this.docsize * docspermbinxmlbz2 / 1024L / 1024L); this.targetdir = targetdir; this.count = 0; this.start = 0; this.hostport = null; this.urlStub = null; } - + public int count() { return this.count; } - + public String source() { return this.sourcefile.getAbsolutePath(); } - + public String status() { return ""; } - + /** * return the number of articles per second * @return */ public int speed() { - if (count == 0) return 0; - return (int) ((long) count / Math.max(1L, runningTime() )); + if (this.count == 0) return 0; + return (int) (this.count / Math.max(1L, runningTime() )); } - + /** * return the remaining seconds for the completion of all records in milliseconds * @return */ public long remainingTime() { - return Math.max(0, this.approxdocs - count) / Math.max(1, speed() ); + return Math.max(0, this.approxdocs - this.count) / Math.max(1, speed() ); } - + public long runningTime() { - return (System.currentTimeMillis() - start) / 1000L; + return (System.currentTimeMillis() - this.start) / 1000L; } - + public void run() { this.start = System.currentTimeMillis(); try { - String targetstub = sourcefile.getName(); + String targetstub = this.sourcefile.getName(); int p = targetstub.lastIndexOf("\\."); if (p > 0) targetstub = targetstub.substring(0, p); - InputStream is = new BufferedInputStream(new FileInputStream(sourcefile), 1024 * 1024); - if (sourcefile.getName().endsWith(".bz2")) { + InputStream is 
= new BufferedInputStream(new FileInputStream(this.sourcefile), 1024 * 1024); + if (this.sourcefile.getName().endsWith(".bz2")) { int b = is.read(); if (b != 'B') throw new IOException("Invalid bz2 content."); b = is.read(); if (b != 'Z') throw new IOException("Invalid bz2 content."); is = new CBZip2InputStream(is); - } else if (sourcefile.getName().endsWith(".gz")) { + } else if (this.sourcefile.getName().endsWith(".gz")) { is = new GZIPInputStream(is); } - BufferedReader r = new BufferedReader(new java.io.InputStreamReader(is, "UTF-8"), 4 * 1024 * 1024); + final BufferedReader r = new BufferedReader(new java.io.InputStreamReader(is, "UTF-8"), 4 * 1024 * 1024); String t; StringBuilder sb = new StringBuilder(); boolean page = false, text = false; String title = null; - wikiparserrecord poison = newRecord(); - int threads = Math.max(2, Runtime.getRuntime().availableProcessors() - 1); - BlockingQueue in = new ArrayBlockingQueue(threads * 10); - BlockingQueue out = new ArrayBlockingQueue(threads * 10); - ExecutorService service = Executors.newFixedThreadPool(threads + 1); - convertConsumer[] consumers = new convertConsumer[threads]; - Future[] consumerResults = new Future[threads]; + final wikiparserrecord poison = newRecord(); + final int threads = Math.max(2, Runtime.getRuntime().availableProcessors() - 1); + final BlockingQueue in = new ArrayBlockingQueue(threads * 10); + final BlockingQueue out = new ArrayBlockingQueue(threads * 10); + final ExecutorService service = Executors.newFixedThreadPool(threads + 1); + final convertConsumer[] consumers = new convertConsumer[threads]; + final Future[] consumerResults = new Future[threads]; for (int i = 0; i < threads; i++) { consumers[i] = new convertConsumer(in, out, poison); consumerResults[i] = service.submit(consumers[i]); } - convertWriter writer = new convertWriter(out, poison, targetdir, targetstub); - Future writerResult = service.submit(writer); - + final convertWriter writer = new convertWriter(out, poison, this.targetdir, targetstub); + final Future writerResult = service.submit(writer); + wikiparserrecord record; int q; while ((t = r.readLine()) != null) { if ((p = t.indexOf("")) >= 0 && (q = t.indexOf("", p)) > 0) { //urlStub = "http://" + lang + ".wikipedia.org/wiki/"; - urlStub = t.substring(p + 6, q); - if (!urlStub.endsWith("/")) { - q = urlStub.lastIndexOf('/'); - if (q > 0) urlStub = urlStub.substring(0, q + 1); + this.urlStub = t.substring(p + 6, q); + if (!this.urlStub.endsWith("/")) { + q = this.urlStub.lastIndexOf('/'); + if (q > 0) this.urlStub = this.urlStub.substring(0, q + 1); } - DigestURI uri = new DigestURI(urlStub); - hostport = uri.getHost(); - if (uri.getPort() != 80) hostport += ":" + uri.getPort(); + final DigestURI uri = new DigestURI(this.urlStub); + this.hostport = uri.getHost(); + if (uri.getPort() != 80) this.hostport += ":" + uri.getPort(); continue; } if (t.indexOf(pagestart) >= 0) { @@ -192,7 +192,7 @@ public class MediawikiImporter extends Thread implements Importer { text = page; q = t.indexOf('>', p + textstart.length()); if (q > 0) { - int u = t.indexOf(textend, q + 1); + final int u = t.indexOf(textend, q + 1); if (u > q) { sb.append(t.substring(q + 1, u)); Log.logInfo("WIKITRANSLATION", "[INJECT] Title: " + title); @@ -200,11 +200,11 @@ public class MediawikiImporter extends Thread implements Importer { Log.logInfo("WIKITRANSLATION", "ERROR: " + title + " has empty content"); continue; } - record = newRecord(hostport, urlStub, title, sb); + record = newRecord(this.hostport, this.urlStub, title, sb); 
try { in.put(record); this.count++; - } catch (InterruptedException e1) { + } catch (final InterruptedException e1) { Log.logException(e1); } sb = new StringBuilder(200); @@ -222,11 +222,11 @@ public class MediawikiImporter extends Thread implements Importer { Log.logInfo("WIKITRANSLATION", "ERROR: " + title + " has empty content"); continue; } - record = newRecord(hostport, urlStub, title, sb); + record = newRecord(this.hostport, this.urlStub, title, sb); try { in.put(record); this.count++; - } catch (InterruptedException e1) { + } catch (final InterruptedException e1) { Log.logException(e1); } sb = new StringBuilder(200); @@ -248,7 +248,7 @@ public class MediawikiImporter extends Thread implements Importer { } } r.close(); - + try { for (int i = 0; i < threads; i++) { in.put(poison); @@ -258,35 +258,35 @@ public class MediawikiImporter extends Thread implements Importer { } out.put(poison); writerResult.get(10000, TimeUnit.MILLISECONDS); - } catch (InterruptedException e) { + } catch (final InterruptedException e) { Log.logException(e); - } catch (ExecutionException e) { + } catch (final ExecutionException e) { Log.logException(e); - } catch (TimeoutException e) { + } catch (final TimeoutException e) { Log.logException(e); - } catch (Exception e) { + } catch (final Exception e) { Log.logException(e); } - } catch (IOException e) { + } catch (final IOException e) { Log.logException(e); - } catch (Exception e) { + } catch (final Exception e) { Log.logException(e); } } - - public static void checkIndex(File mediawikixml) { - File idx = idxFromMediawikiXML(mediawikixml); + + public static void checkIndex(final File mediawikixml) { + final File idx = idxFromMediawikiXML(mediawikixml); if (idx.exists()) return; new indexMaker(mediawikixml).start(); } - + public static class indexMaker extends Thread { - + File mediawikixml; - public indexMaker(File mediawikixml) { + public indexMaker(final File mediawikixml) { this.mediawikixml = mediawikixml; } - + public void run() { try { createIndex(this.mediawikixml); @@ -296,24 +296,24 @@ public class MediawikiImporter extends Thread implements Importer { } } } - - public static File idxFromMediawikiXML(File mediawikixml) { + + public static File idxFromMediawikiXML(final File mediawikixml) { return new File(mediawikixml.getAbsolutePath() + ".idx.xml"); } - - public static void createIndex(File dumpFile) throws IOException { + + public static void createIndex(final File dumpFile) throws IOException { // calculate md5 //String md5 = serverCodings.encodeMD5Hex(dumpFile); - + // init reader, producer and consumer - PositionAwareReader in = new PositionAwareReader(dumpFile); - indexProducer producer = new indexProducer(100, idxFromMediawikiXML(dumpFile)); - wikiConsumer consumer = new wikiConsumer(100, producer); - ExecutorService service = Executors.newFixedThreadPool(2); - Future producerResult = service.submit(consumer); - Future consumerResult = service.submit(producer); + final PositionAwareReader in = new PositionAwareReader(dumpFile); + final indexProducer producer = new indexProducer(100, idxFromMediawikiXML(dumpFile)); + final wikiConsumer consumer = new wikiConsumer(100, producer); + final ExecutorService service = Executors.newFixedThreadPool(2); + final Future producerResult = service.submit(consumer); + final Future consumerResult = service.submit(producer); service.shutdown(); - + // read the wiki dump long start, stop; while (in.seek(pagestartb)) { @@ -324,18 +324,18 @@ public class MediawikiImporter extends Thread implements Importer { 
consumer.consume(new wikiraw(in.bytes(), start, stop)); in.resetBuffer(); } - + // shut down the services try { consumer.consume(wikiConsumer.poison); - try {consumerResult.get(5000, TimeUnit.MILLISECONDS);} catch (TimeoutException e) {} + try {consumerResult.get(5000, TimeUnit.MILLISECONDS);} catch (final TimeoutException e) {} producer.consume(indexProducer.poison); if (!consumerResult.isDone()) consumerResult.get(); producerResult.get(); - } catch (InterruptedException e) { + } catch (final InterruptedException e) { Log.logException(e); return; - } catch (ExecutionException e) { + } catch (final ExecutionException e) { Log.logException(e); return; } @@ -348,120 +348,120 @@ public class MediawikiImporter extends Thread implements Importer { PrintWriter out; protected static wikisourcerecord poison = new wikisourcerecord("", 0, 0); int count; - - public indexProducer(int bufferCount, File indexFile) throws IOException { - entries = new ArrayBlockingQueue(bufferCount); - out = new PrintWriter(new BufferedWriter(new FileWriter(indexFile))); - count = 0; - out.println(""); - - } - - public void consume(wikisourcerecord b) { + + public indexProducer(final int bufferCount, final File indexFile) throws IOException { + this.entries = new ArrayBlockingQueue(bufferCount); + this.out = new PrintWriter(new BufferedWriter(new FileWriter(indexFile))); + this.count = 0; + this.out.println(""); + + } + + public void consume(final wikisourcerecord b) { try { - entries.put(b); - } catch (InterruptedException e) { + this.entries.put(b); + } catch (final InterruptedException e) { Log.logException(e); } } - + public Integer call() { wikisourcerecord r; try { while(true) { - r = entries.take(); + r = this.entries.take(); if (r == poison) { Log.logInfo("WIKITRANSLATION", "producer / got poison"); break; } - out.println(" "); - out.println(" " + r.title + ""); - out.println(" "); + this.out.println(" "); + this.out.println(" " + r.title + ""); + this.out.println(" "); Log.logInfo("WIKITRANSLATION", "producer / record start: " + r.start + ", title : " + r.title); - count++; + this.count++; } - } catch (InterruptedException e) { + } catch (final InterruptedException e) { Log.logException(e); } - entries.clear(); - out.println(""); - out.close(); - return Integer.valueOf(count); + this.entries.clear(); + this.out.println(""); + this.out.close(); + return Integer.valueOf(this.count); } - + } - + private static class wikiConsumer implements Callable { private final BlockingQueue entries; protected static wikiraw poison = new wikiraw(new byte[0], 0, 0); private final indexProducer producer; private int count; - - public wikiConsumer(int bufferCount, indexProducer producer) { - entries = new ArrayBlockingQueue(bufferCount); + + public wikiConsumer(final int bufferCount, final indexProducer producer) { + this.entries = new ArrayBlockingQueue(bufferCount); this.producer = producer; - count = 0; + this.count = 0; } - - public void consume(wikiraw b) { + + public void consume(final wikiraw b) { try { - entries.put(b); - } catch (InterruptedException e) { + this.entries.put(b); + } catch (final InterruptedException e) { Log.logException(e); } } - + public Integer call() { wikisourcerecord r; wikiraw c; try { while(true) { - c = entries.take(); + c = this.entries.take(); if (c == poison) { Log.logInfo("WIKITRANSLATION", "consumer / got poison"); break; } try { r = new wikisourcerecord(c.b, c.start, c.end); - producer.consume(r); + this.producer.consume(r); Log.logInfo("WIKITRANSLATION", "consumer / record start: " + r.start 
+ ", title : " + r.title); - count++; - } catch (RuntimeException e) {} + this.count++; + } catch (final RuntimeException e) {} } - } catch (InterruptedException e) { + } catch (final InterruptedException e) { Log.logException(e); } - entries.clear(); - return Integer.valueOf(count); + this.entries.clear(); + return Integer.valueOf(this.count); } - + } private static class wikiraw { public long start, end; public byte[] b; - public wikiraw(byte[] b, long start, long end) { + public wikiraw(final byte[] b, final long start, final long end) { this.b = b; this.start = start; this.end = end; } } - + public static class wikisourcerecord { public long start, end; public String title; - public wikisourcerecord(String title, long start, long end) { + public wikisourcerecord(final String title, final long start, final long end) { this.title = title; this.start = start; this.end = end; } - public wikisourcerecord(byte[] chunk, long start, long end) { + public wikisourcerecord(final byte[] chunk, final long start, final long end) { String s; s = UTF8.String(chunk); - int t0 = s.indexOf(""); + final int t0 = s.indexOf("<title>"); if (t0 >= 0) { - int t1 = s.indexOf("", t0); + final int t1 = s.indexOf("", t0); if (t1 >= 0) { this.title = s.substring(t0 + 7, t1); } else { @@ -470,7 +470,7 @@ public class MediawikiImporter extends Thread implements Importer { } else { throw new RuntimeException("no title start in record"); } - + this.start = start; this.end = end; } @@ -478,16 +478,16 @@ public class MediawikiImporter extends Thread implements Importer { public wikiparserrecord newRecord() { return new wikiparserrecord(null, null, null, null); } - public wikiparserrecord newRecord(String hostport, String urlStub, String title, StringBuilder sb) { + public wikiparserrecord newRecord(final String hostport, final String urlStub, final String title, final StringBuilder sb) { return new wikiparserrecord(hostport, urlStub, title, sb); } - + public class wikiparserrecord { public String title; String source, html, hostport, urlStub; DigestURI url; Document document; - public wikiparserrecord(String hostport, String urlStub, String title, StringBuilder sb) { + public wikiparserrecord(final String hostport, final String urlStub, final String title, final StringBuilder sb) { this.title = title; this.hostport = hostport; this.urlStub = urlStub; @@ -495,97 +495,97 @@ public class MediawikiImporter extends Thread implements Importer { } public void genHTML() throws IOException { try { - WikiParser wparser = new WikiCode(); - html = wparser.transform(hostport, source); - } catch (Exception e) { + final WikiParser wparser = new WikiCode(); + this.html = wparser.transform(this.hostport, this.source); + } catch (final Exception e) { Log.logException(e); throw new IOException(e.getMessage()); } } public void genDocument() throws Parser.Failure { try { - url = new DigestURI(urlStub + title); - Document[] parsed = TextParser.parseSource(url, "text/html", "UTF-8", UTF8.getBytes(html)); - document = Document.mergeDocuments(url, "text/html", parsed); + this.url = new DigestURI(this.urlStub + this.title); + final Document[] parsed = TextParser.parseSource(this.url, "text/html", "UTF-8", UTF8.getBytes(this.html), false); + this.document = Document.mergeDocuments(this.url, "text/html", parsed); // the wiki parser is not able to find the proper title in the source text, so it must be set here - document.setTitle(title); - } catch (MalformedURLException e1) { + this.document.setTitle(this.title); + } catch (final 
                 Log.logException(e1);
             }
         }
-        public void writeXML(OutputStreamWriter os) throws IOException {
-            document.writeXML(os, new Date());
+        public void writeXML(final OutputStreamWriter os) throws IOException {
+            this.document.writeXML(os, new Date());
         }
     }
-
+
     private static class PositionAwareReader {
-
         private final InputStream is;
         private long seekpos;
         private ByteBuffer bb;
-
-        public PositionAwareReader(File dumpFile) throws FileNotFoundException {
+
+        public PositionAwareReader(final File dumpFile) throws FileNotFoundException {
             this.is = new BufferedInputStream(new FileInputStream(dumpFile), 64 *1024);
             this.seekpos = 0;
             this.bb = new ByteBuffer();
         }
-
         public void resetBuffer() {
-            if (bb.length() > 10 * 1024) bb = new ByteBuffer(); else bb.clear();
+            if (this.bb.length() > 10 * 1024) this.bb = new ByteBuffer(); else this.bb.clear();
         }
-
-        public boolean seek(byte[] pattern) throws IOException {
+
+        public boolean seek(final byte[] pattern) throws IOException {
             int pp = 0;
             int c;
-            while ((c = is.read()) >= 0) {
-                seekpos++;
-                bb.append(c);
+            while ((c = this.is.read()) >= 0) {
+                this.seekpos++;
+                this.bb.append(c);
                 if (pattern[pp] == c) pp++; else pp = 0;
                 if (pp == pattern.length) return true;
             }
             return false;
         }
-
         public long pos() {
-            return seekpos;
+            return this.seekpos;
         }
-
         public byte[] bytes() {
-            return bb.getBytes();
+            return this.bb.getBytes();
         }
-
         public void close() {
             try {
-                is.close();
-            } catch (IOException e) {
+                this.is.close();
+            } catch (final IOException e) {
                 Log.logException(e);
             }
         }
     }

-    public static byte[] read(File f, long start, int len) {
-        byte[] b = new byte[len];
+    public static byte[] read(final File f, final long start, final int len) {
+        final byte[] b = new byte[len];
         RandomAccessFile raf = null;
         try {
             raf = new RandomAccessFile(f, "r");
             raf.seek(start);
             raf.read(b);
-        } catch (IOException e) {
+        } catch (final IOException e) {
             Log.logException(e);
             return null;
         } finally {
             if (raf != null) try {
                 raf.close();
-                try{raf.getChannel().close();} catch (IOException e) {}
-            } catch (IOException e) { }
+                try{raf.getChannel().close();} catch (final IOException e) {}
+            } catch (final IOException e) { }
         }
         return b;
     }
-
-    public static wikisourcerecord find(String title, File f) throws IOException {
-        PositionAwareReader in = new PositionAwareReader(f);
+
+    public static wikisourcerecord find(final String title, final File f) throws IOException {
+        final PositionAwareReader in = new PositionAwareReader(f);
         long start;
-        String m = "<title>" + title + "</title>";
+        final String m = "<title>" + title + "</title>";
         String s;
         while (in.seek(UTF8.getBytes("<page "))) {

     private static class convertWriter implements Callable<Integer> {

         private final BlockingQueue<wikiparserrecord> in;
@@ -666,12 +666,12 @@ public class MediawikiImporter extends Thread implements Importer {
         private final File targetdir;
         private int fc, rc;
         private String outputfilename;
-
+
         public convertWriter(
-                BlockingQueue<wikiparserrecord> in,
-                wikiparserrecord poison,
-                File targetdir,
-                String targetstub) {
+                final BlockingQueue<wikiparserrecord> in,
+                final wikiparserrecord poison,
+                final File targetdir,
+                final String targetstub) {
             this.poison = poison;
             this.in = in;
             this.osw = null;
@@ -681,63 +681,63 @@ public class MediawikiImporter extends Thread implements Importer {
             this.rc = 0;
             this.outputfilename = null;
         }
-
+
         public Integer call() {
             wikiparserrecord record;
             try {
                 while(true) {
-                    record = in.take();
-                    if (record == poison) {
+                    record = this.in.take();
+                    if (record == this.poison) {
                         Log.logInfo("WIKITRANSLATION", "convertConsumer / got poison");
                         break;
                     }
-
-                    if (osw == null) {
+
+                    if (this.osw == null) {
                         // start writing a new file
-                        this.outputfilename = targetstub + "." + fc + ".xml.prt";
-                        this.osw = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(new File(targetdir, outputfilename))), "UTF-8");
-                        osw.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" + SurrogateReader.SURROGATES_MAIN_ELEMENT_OPEN + "\n");
+                        this.outputfilename = this.targetstub + "." + this.fc + ".xml.prt";
+                        this.osw = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(new File(this.targetdir, this.outputfilename))), "UTF-8");
+                        this.osw.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" + SurrogateReader.SURROGATES_MAIN_ELEMENT_OPEN + "\n");
                     }
                     Log.logInfo("WIKITRANSLATION", "[CONSUME] Title: " + record.title);
-                    record.document.writeXML(osw, new Date());
-                    rc++;
-                    if (rc >= 10000) {
-                        osw.write("</surrogates>\n");
-                        osw.close();
-                        String finalfilename = targetstub + "." + fc + ".xml";
-                        new File(targetdir, outputfilename).renameTo(new File(targetdir, finalfilename));
-                        rc = 0;
-                        fc++;
-                        outputfilename = targetstub + "." + fc + ".xml.prt";
-                        osw = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(new File(targetdir, outputfilename))), "UTF-8");
-                        osw.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" + SurrogateReader.SURROGATES_MAIN_ELEMENT_OPEN + "\n");
+                    record.document.writeXML(this.osw, new Date());
+                    this.rc++;
+                    if (this.rc >= 10000) {
+                        this.osw.write("</surrogates>\n");
+                        this.osw.close();
+                        final String finalfilename = this.targetstub + "." + this.fc + ".xml";
+                        new File(this.targetdir, this.outputfilename).renameTo(new File(this.targetdir, finalfilename));
+                        this.rc = 0;
+                        this.fc++;
+                        this.outputfilename = this.targetstub + "." + this.fc + ".xml.prt";
+                        this.osw = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(new File(this.targetdir, this.outputfilename))), "UTF-8");
+                        this.osw.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" + SurrogateReader.SURROGATES_MAIN_ELEMENT_OPEN + "\n");
                     }
                 }
-            } catch (InterruptedException e) {
+            } catch (final InterruptedException e) {
                 Log.logException(e);
-            } catch (UnsupportedEncodingException e) {
+            } catch (final UnsupportedEncodingException e) {
                 Log.logException(e);
-            } catch (FileNotFoundException e) {
+            } catch (final FileNotFoundException e) {
                 Log.logException(e);
-            } catch (IOException e) {
+            } catch (final IOException e) {
                 Log.logException(e);
             } finally {
                 try {
-                    osw.write(SurrogateReader.SURROGATES_MAIN_ELEMENT_CLOSE + "\n");
-                    osw.close();
-                    String finalfilename = targetstub + "." + fc + ".xml";
-                    new File(targetdir, outputfilename).renameTo(new File(targetdir, finalfilename));
-                } catch (IOException e) {
+                    this.osw.write(SurrogateReader.SURROGATES_MAIN_ELEMENT_CLOSE + "\n");
+                    this.osw.close();
+                    final String finalfilename = this.targetstub + "." + this.fc + ".xml";
+ this.fc + ".xml"; + new File(this.targetdir, this.outputfilename).renameTo(new File(this.targetdir, finalfilename)); + } catch (final IOException e) { Log.logException(e); } } Log.logInfo("WIKITRANSLATION", "*** convertWriter has terminated"); return Integer.valueOf(0); } - + } - - public static void main(String[] s) { + + public static void main(final String[] s) { if (s.length == 0) { Log.logInfo("WIKITRANSLATION", "usage:"); Log.logInfo("WIKITRANSLATION", " -index "); @@ -751,47 +751,47 @@ public class MediawikiImporter extends Thread implements Importer { // java -Xmx2000m -cp classes:lib/bzip2.jar de.anomic.tools.mediawikiIndex -convert DATA/HTCACHE/dewiki-20090311-pages-articles.xml.bz2 DATA/SURROGATES/in/ http://de.wikipedia.org/wiki/ if (s[0].equals("-convert") && s.length > 2) { - File sourcefile = new File(s[1]); - File targetdir = new File(s[2]); + final File sourcefile = new File(s[1]); + final File targetdir = new File(s[2]); //String urlStub = s[3]; // i.e. http://de.wikipedia.org/wiki/ //String language = urlStub.substring(7,9); try { - MediawikiImporter mi = new MediawikiImporter(sourcefile, targetdir); + final MediawikiImporter mi = new MediawikiImporter(sourcefile, targetdir); mi.start(); mi.join(); - } catch (InterruptedException e) { + } catch (final InterruptedException e) { Log.logException(e); } } - - if (s[0].equals("-index")) { + + if (s[0].equals("-index")) { try { createIndex(new File(s[1])); - } catch (IOException e) { + } catch (final IOException e) { Log.logException(e); } } - + if (s[0].equals("-read")) { - long start = Integer.parseInt(s[1]); - int len = Integer.parseInt(s[2]); + final long start = Integer.parseInt(s[1]); + final int len = Integer.parseInt(s[2]); System.out.println(UTF8.String(read(new File(s[3]), start, len))); } - + if (s[0].equals("-find")) { try { - wikisourcerecord w = find(s[1], new File(s[2] + ".idx.xml")); + final wikisourcerecord w = find(s[1], new File(s[2] + ".idx.xml")); if (w == null) { Log.logInfo("WIKITRANSLATION", "not found"); } else { System.out.println(UTF8.String(read(new File(s[2]), w.start, (int) (w.end - w.start)))); } - } catch (IOException e) { + } catch (final IOException e) { Log.logException(e); } - + } System.exit(0); } - + } diff --git a/source/net/yacy/document/parser/bzipParser.java b/source/net/yacy/document/parser/bzipParser.java index 7e010ff33..e2dba1b6c 100644 --- a/source/net/yacy/document/parser/bzipParser.java +++ b/source/net/yacy/document/parser/bzipParser.java @@ -1,4 +1,4 @@ -//bzipParser.java +//bzipParser.java //------------------------ //part of YaCy //(C) by Michael Peter Christen; mc@yacy.net @@ -42,26 +42,26 @@ import org.apache.tools.bzip2.CBZip2InputStream; public class bzipParser extends AbstractParser implements Parser { - - public bzipParser() { + + public bzipParser() { super("Bzip 2 UNIX Compressed File Parser"); - SUPPORTED_EXTENSIONS.add("bz2"); - SUPPORTED_EXTENSIONS.add("tbz"); - SUPPORTED_EXTENSIONS.add("tbz2"); - SUPPORTED_MIME_TYPES.add("application/x-bzip2"); - SUPPORTED_MIME_TYPES.add("application/bzip2"); - SUPPORTED_MIME_TYPES.add("application/x-bz2"); - SUPPORTED_MIME_TYPES.add("application/x-bzip"); - SUPPORTED_MIME_TYPES.add("application/x-stuffit"); + this.SUPPORTED_EXTENSIONS.add("bz2"); + this.SUPPORTED_EXTENSIONS.add("tbz"); + this.SUPPORTED_EXTENSIONS.add("tbz2"); + this.SUPPORTED_MIME_TYPES.add("application/x-bzip2"); + this.SUPPORTED_MIME_TYPES.add("application/bzip2"); + this.SUPPORTED_MIME_TYPES.add("application/x-bz2"); + 
+        this.SUPPORTED_MIME_TYPES.add("application/x-bzip");
+        this.SUPPORTED_MIME_TYPES.add("application/x-stuffit");
     }
-
+
     public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
-
+
         File tempFile = null;
         Document[] docs;
-        try {
+        try {
             /*
              * First we have to consume the first two char from the stream. Otherwise
              * the bzip decompression will fail with a nullpointerException!
@@ -73,31 +73,31 @@ public class bzipParser extends AbstractParser implements Parser {
             b = source.read();
             if (b != 'Z') {
                 throw new Exception("Invalid bz2 content.");
-            }
-
+            }
+
             int read = 0;
             final byte[] data = new byte[1024];
-            final CBZip2InputStream zippedContent = new CBZip2InputStream(source);
-
+            final CBZip2InputStream zippedContent = new CBZip2InputStream(source);
+
             tempFile = File.createTempFile("bunzip","tmp");
             tempFile.deleteOnExit();
-
+
             // creating a temp file to store the uncompressed data
             final FileOutputStream out = new FileOutputStream(tempFile);
-
+
             // reading gzip file and store it uncompressed
             while((read = zippedContent.read(data, 0, 1024)) != -1) {
                 out.write(data, 0, read);
             }
             zippedContent.close();
             out.close();
-
+
             // creating a new parser class to parse the unzipped content
-            docs = TextParser.parseSource(location, null, null, tempFile);
-        } catch (final Exception e) {
+            docs = TextParser.parseSource(location, null, null, tempFile, false);
+        } catch (final Exception e) {
             if (e instanceof InterruptedException) throw (InterruptedException) e;
             if (e instanceof Parser.Failure) throw (Parser.Failure) e;
-
+
             throw new Parser.Failure("Unexpected error while parsing bzip file. " + e.getMessage(),location);
         } finally {
             if (tempFile != null) FileUtils.deletedelete(tempFile);
diff --git a/source/net/yacy/document/parser/gzipParser.java b/source/net/yacy/document/parser/gzipParser.java
index f3452b6c3..0680b9e22 100644
--- a/source/net/yacy/document/parser/gzipParser.java
+++ b/source/net/yacy/document/parser/gzipParser.java
@@ -1,4 +1,4 @@
-//gzipParser.java
+//gzipParser.java
 //------------------------
 //part of YaCy
 //(C) by Michael Peter Christen; mc@yacy.net
@@ -42,52 +42,52 @@ import net.yacy.kelondro.util.FileUtils;

 public class gzipParser extends AbstractParser implements Parser {

-    public gzipParser() {
+    public gzipParser() {
         super("GNU Zip Compressed Archive Parser");
-        SUPPORTED_EXTENSIONS.add("gz");
-        SUPPORTED_EXTENSIONS.add("tgz");
-        SUPPORTED_MIME_TYPES.add("application/x-gzip");
-        SUPPORTED_MIME_TYPES.add("application/gzip");
-        SUPPORTED_MIME_TYPES.add("application/x-gunzip");
-        SUPPORTED_MIME_TYPES.add("application/gzipped");
-        SUPPORTED_MIME_TYPES.add("application/gzip-compressed");
-        SUPPORTED_MIME_TYPES.add("gzip/document");
+        this.SUPPORTED_EXTENSIONS.add("gz");
+        this.SUPPORTED_EXTENSIONS.add("tgz");
+        this.SUPPORTED_MIME_TYPES.add("application/x-gzip");
+        this.SUPPORTED_MIME_TYPES.add("application/gzip");
+        this.SUPPORTED_MIME_TYPES.add("application/x-gunzip");
+        this.SUPPORTED_MIME_TYPES.add("application/gzipped");
+        this.SUPPORTED_MIME_TYPES.add("application/gzip-compressed");
+        this.SUPPORTED_MIME_TYPES.add("gzip/document");
     }
-
+
     public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
-
+
         File tempFile = null;
         Document[] docs = null;
-        try {
+        try {
             int read = 0;
             final byte[] data = new byte[1024];
-
             final GZIPInputStream zippedContent = new GZIPInputStream(source);
-
+
             tempFile = File.createTempFile("gunzip","tmp");
             tempFile.deleteOnExit();
-
+
             // creating a temp file to store the uncompressed data
             final FileOutputStream out = new FileOutputStream(tempFile);
-
+
             // reading gzip file and store it uncompressed
             while ((read = zippedContent.read(data, 0, 1024)) != -1) {
                 out.write(data, 0, read);
             }
             zippedContent.close();
             out.close();
-
+
             // creating a new parser class to parse the unzipped content
-            docs = TextParser.parseSource(location,null,null,tempFile);
-        } catch (final Exception e) {
+            docs = TextParser.parseSource(location,null,null,tempFile, false);
+        } catch (final Exception e) {
             if (e instanceof InterruptedException) throw (InterruptedException) e;
             if (e instanceof Parser.Failure) throw (Parser.Failure) e;
-
-            throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(),location);
+
+            throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(),location);
         } finally {
             if (tempFile != null) FileUtils.deletedelete(tempFile);
         }
         return docs;
     }
-
+
 }
diff --git a/source/net/yacy/document/parser/htmlParser.java b/source/net/yacy/document/parser/htmlParser.java
index f248ad99d..69c9f078d 100644
--- a/source/net/yacy/document/parser/htmlParser.java
+++ b/source/net/yacy/document/parser/htmlParser.java
@@ -32,20 +32,15 @@ import java.net.MalformedURLException;
 import java.nio.charset.Charset;
 import java.nio.charset.IllegalCharsetNameException;
 import java.nio.charset.UnsupportedCharsetException;
-import java.util.ArrayList;
-import java.util.Map;
-import java.util.Map.Entry;
 import java.util.regex.Pattern;

 import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.document.AbstractParser;
-import net.yacy.document.Classification;
 import net.yacy.document.Document;
 import net.yacy.document.Parser;
 import net.yacy.document.parser.html.CharacterCoding;
 import net.yacy.document.parser.html.ContentScraper;
-import net.yacy.document.parser.html.ImageEntry;
 import net.yacy.document.parser.html.ScraperInputStream;
 import net.yacy.document.parser.html.TransformerWriter;
 import net.yacy.kelondro.util.FileUtils;
@@ -96,78 +91,14 @@ public class htmlParser extends AbstractParser implements Parser {

         try {
             // first get a document from the parsed html
-            ContentScraper scraper = parseToScraper(location, documentCharset, sourceStream);
-            Document document = transformScraper(location, mimeType, documentCharset, scraper);
-
-            // then produce virtual documents for each of the link that is contained in the document!
-            ArrayList<Document> docs = new ArrayList<Document>();
-            docs.add(document);
-            for (Map.Entry<MultiProtocolURI, String> link: document.getApplinks().entrySet()) {
-                addLinkDocs(docs, "application", link.getKey(), link.getValue(), scraper);
-            }
-            for (Map.Entry<MultiProtocolURI, String> link: document.getAudiolinks().entrySet()) {
-                addLinkDocs(docs, "audio", link.getKey(), link.getValue(), scraper);
-            }
-            for (Map.Entry<MultiProtocolURI, String> link: document.getVideolinks().entrySet()) {
-                addLinkDocs(docs, "video", link.getKey(), link.getValue(), scraper);
-            }
-            for (Entry<MultiProtocolURI, ImageEntry> link: document.getImages().entrySet()) {
-                addImageDocs(docs, link.getValue());
-            }
-
-
-            // finally return the list of documents
-            return docs.toArray(new Document[docs.size()]);
+            final ContentScraper scraper = parseToScraper(location, documentCharset, sourceStream);
+            final Document document = transformScraper(location, mimeType, documentCharset, scraper);
+
+            return new Document[]{document};
         } catch (final IOException e) {
             throw new Parser.Failure("IOException in htmlParser: " + e.getMessage(), location);
         }
     }
-
-    private final static void addLinkDocs(ArrayList<Document> docs, String type, MultiProtocolURI uri, String descr, ContentScraper scraper) {
-        //System.out.println("HTMLPARSER-LINK " + type + ": " + uri.toNormalform(true, false) + " / " + descr);
-        final Document doc = new Document(
-                uri,
-                Classification.ext2mime(uri.getFileExtension()),
-                "UTF-8",
-                null,
-                scraper.getContentLanguages(),
-                null,
-                descr,
-                "",
-                "",
-                new String[]{descr},
-                type,
-                0.0f, 0.0f,
-                uri.toNormalform(false, false),
-                null,
-                null,
-                null,
-                false);
-        docs.add(doc);
-    }
-
-    private final static void addImageDocs(ArrayList<Document> docs, ImageEntry img) {
-        //System.out.println("HTMLPARSER-LINK image: " + img.url().toNormalform(true, false) + " / " + img.alt());
-        final Document doc = new Document(
-                img.url(),
-                Classification.ext2mime(img.url().getFileExtension()),
-                "UTF-8",
-                null,
-                null,
-                null,
-                img.alt(),
-                "",
-                "",
-                new String[]{img.alt()},
-                "image",
-                0.0f, 0.0f,
-                img.url().toNormalform(false, false),
-                null,
-                null,
-                null,
-                false);
-        docs.add(doc);
-    }

     /**
      * the transformScraper method transforms a scraper object into a document object
@@ -211,7 +142,7 @@ public class htmlParser extends AbstractParser implements Parser {
             scraper.indexingDenied());
         //scraper.close();
         ppd.setFavicon(scraper.getFavicon());
-
+
         return ppd;
     }
diff --git a/source/net/yacy/document/parser/sevenzipParser.java b/source/net/yacy/document/parser/sevenzipParser.java
index e4d0d1d2c..d42c625c9 100644
--- a/source/net/yacy/document/parser/sevenzipParser.java
+++ b/source/net/yacy/document/parser/sevenzipParser.java
@@ -1,10 +1,10 @@
-// sevenzipParser.java
+// sevenzipParser.java
 // -------------------------------------
 // part of YACY
 // (C) by Michael Peter Christen; mc@yacy.net
 // first published on http://www.anomic.de
 // Frankfurt, Germany, 2004
-//
+//
 // This file ist contributed by Franz Brausze
 //
 // $LastChangedDate$
@@ -15,12 +15,12 @@
 // it under the terms of the GNU General Public License as published by
 // the Free Software Foundation; either version 2 of the License, or
 // (at your option) any later version.
-//
+//
 // This program is distributed in the hope that it will be useful,
 // but WITHOUT ANY WARRANTY; without even the implied warranty of
 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 // GNU General Public License for more details.
-//
+//
 // You should have received a copy of the GNU General Public License
 // along with this program; if not, write to the Free Software
 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
@@ -40,7 +40,6 @@ import net.yacy.document.Parser;
 import net.yacy.document.TextParser;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.util.FileUtils;
-
 import SevenZip.ArchiveExtractCallback;
 import SevenZip.IInStream;
 import SevenZip.Archive.IInArchive;
@@ -48,13 +47,13 @@ import SevenZip.Archive.SevenZipEntry;
 import SevenZip.Archive.SevenZip.Handler;

 public class sevenzipParser extends AbstractParser implements Parser {
-
+
     public sevenzipParser() {
         super("7zip Archive Parser");
-        SUPPORTED_EXTENSIONS.add("7z");
-        SUPPORTED_MIME_TYPES.add("application/x-7z-compressed");
+        this.SUPPORTED_EXTENSIONS.add("7z");
+        this.SUPPORTED_MIME_TYPES.add("application/x-7z-compressed");
     }
-
+
     public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final IInStream source) throws Parser.Failure, InterruptedException {
         final Document doc = new Document(
                 location,
@@ -68,7 +67,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
                 null,
                 null,
                 null,
-                0.0f, 0.0f,
+                0.0f, 0.0f,
                 (Object)null,
                 null,
                 null,
@@ -86,7 +85,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
         super.log.logFine("processing archive contents...");
         try {
             archive.Extract(null, -1, 0, aec);
-            return doc;
+            return doc;
         } catch (final IOException e) {
             if (e.getCause() instanceof InterruptedException)
                 throw (InterruptedException)e.getCause();
@@ -99,7 +98,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
             try { archive.close(); } catch (final IOException e) { }
         }
     }
-
+
     public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
         try {
@@ -114,12 +113,12 @@ public class sevenzipParser extends AbstractParser implements Parser {
     // wrapper class to redirect output of standard ArchiveExtractCallback to serverLog
     // and parse the extracted content
     private static class SZParserExtractCallback extends ArchiveExtractCallback {
-
+
         private final Log log;
         private ByteArrayOutputStream cfos = null;
         private final Document doc;
         private final String prefix;
-
+
         public SZParserExtractCallback(final Log logger, final IInArchive handler,
                 final Document doc, final String prefix) {
             super.Init(handler);
@@ -127,7 +126,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
             this.doc = doc;
             this.prefix = prefix;
         }
-
+
         @Override
         public void PrepareOperation(final int arg0) {
             this.extractMode = (arg0 == IInArchive.NExtract_NAskMode_kExtract);
@@ -143,7 +142,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
                 break;
             }
         }
-
+
         @Override
         public void SetOperationResult(final int arg0) throws IOException {
             if (arg0 != IInArchive.NExtract_NOperationResult_kOK) {
@@ -159,16 +158,16 @@ public class sevenzipParser extends AbstractParser implements Parser {
 //                    throw new IOException("Unknown Error");
                 }
             } else try {
-
+
                 if (this.cfos != null) {
                     // parse the file
                     Document[] theDocs;
                     // workaround for relative links in file, normally '#' shall be used behind the location, see
                     // below for reversion of the effects
-                    final MultiProtocolURI url = MultiProtocolURI.newURL(doc.dc_source(), this.prefix + "/" + super.filePath);
+                    final MultiProtocolURI url = MultiProtocolURI.newURL(this.doc.dc_source(), this.prefix + "/" + super.filePath);
                     final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1));
-                    theDocs = TextParser.parseSource(url, mime, null, this.cfos.toByteArray());
-
+                    theDocs = TextParser.parseSource(url, mime, null, this.cfos.toByteArray(), false);
+
                     this.doc.addSubDocuments(theDocs);
                 }
             } catch (final Exception e) {
@@ -177,7 +176,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
                 throw ex;
             }
         }
-
+
         @Override
         public OutputStream GetStream(final int index, final int askExtractMode) throws IOException {
             final SevenZipEntry item = super.archiveHandler.getEntry(index);
@@ -185,10 +184,10 @@ public class sevenzipParser extends AbstractParser implements Parser {
             this.cfos = (item.isDirectory()) ? null : new ByteArrayOutputStream();
             return this.cfos;
         }
-
+
         public String getCurrentFilePath() {
             return super.filePath;
         }
     }
-
+
 }
diff --git a/source/net/yacy/document/parser/tarParser.java b/source/net/yacy/document/parser/tarParser.java
index 40d3cdd29..de59b926c 100644
--- a/source/net/yacy/document/parser/tarParser.java
+++ b/source/net/yacy/document/parser/tarParser.java
@@ -11,12 +11,12 @@
  *  modify it under the terms of the GNU Lesser General Public
  *  License as published by the Free Software Foundation; either
  *  version 2.1 of the License, or (at your option) any later version.
- *
+ *
  *  This library is distributed in the hope that it will be useful,
  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  *  Lesser General Public License for more details.
- *
+ *
  *  You should have received a copy of the GNU Lesser General Public License
  *  along with this program in the file lgpl21.txt
  *  If not, see <http://www.gnu.org/licenses/>.
@@ -45,31 +45,31 @@ import org.apache.tools.tar.TarInputStream;

 public class tarParser extends AbstractParser implements Parser {

-    public tarParser() {
-        super("Tape Archive File Parser");
-        SUPPORTED_EXTENSIONS.add("tar");
-        SUPPORTED_MIME_TYPES.add("application/x-tar");
-        SUPPORTED_MIME_TYPES.add("application/tar");
-        SUPPORTED_MIME_TYPES.add("applicaton/x-gtar");
-        SUPPORTED_MIME_TYPES.add("multipart/x-tar");
+    public tarParser() {
+        super("Tape Archive File Parser");
+        this.SUPPORTED_EXTENSIONS.add("tar");
+        this.SUPPORTED_MIME_TYPES.add("application/x-tar");
+        this.SUPPORTED_MIME_TYPES.add("application/tar");
+        this.SUPPORTED_MIME_TYPES.add("applicaton/x-gtar");
+        this.SUPPORTED_MIME_TYPES.add("multipart/x-tar");
     }
-
+
     public Document[] parse(final MultiProtocolURI url, final String mimeType, final String charset, InputStream source) throws Parser.Failure, InterruptedException {
-
+
         final List<Document> docacc = new ArrayList<Document>();
         Document[] subDocs = null;
         final String ext = url.getFileExtension().toLowerCase();
         if (ext.equals("gz") || ext.equals("tgz")) {
             try {
                 source = new GZIPInputStream(source);
-            } catch (IOException e) {
+            } catch (final IOException e) {
                 throw new Parser.Failure("tar parser: " + e.getMessage(), url);
             }
         }
         TarEntry entry;
-        final TarInputStream tis = new TarInputStream(source);
+        final TarInputStream tis = new TarInputStream(source);
         File tmp = null;
-
+
         // loop through the elements in the tar file and parse every single file inside
         while (true) {
             try {
@@ -83,16 +83,16 @@ public class tarParser extends AbstractParser implements Parser {
                 try {
                     tmp = FileUtils.createTempFile(this.getClass(), name);
                     FileUtils.copy(tis, tmp, entry.getSize());
-                    subDocs = TextParser.parseSource(MultiProtocolURI.newURL(url,"#" + name), mime, null, tmp);
+                    subDocs = TextParser.parseSource(MultiProtocolURI.newURL(url,"#" + name), mime, null, tmp, false);
                     if (subDocs == null) continue;
                     for (final Document d: subDocs) docacc.add(d);
                 } catch (final Parser.Failure e) {
-                    log.logWarning("tar parser entry " + name + ": " + e.getMessage());
+                    this.log.logWarning("tar parser entry " + name + ": " + e.getMessage());
                 } finally {
                     if (tmp != null) FileUtils.deletedelete(tmp);
                 }
-            } catch (IOException e) {
-                log.logWarning("tar parser:" + e.getMessage());
+            } catch (final IOException e) {
+                this.log.logWarning("tar parser:" + e.getMessage());
                 break;
             }
         }
diff --git a/source/net/yacy/document/parser/zipParser.java b/source/net/yacy/document/parser/zipParser.java
index df1d4875c..b216fe099 100644
--- a/source/net/yacy/document/parser/zipParser.java
+++ b/source/net/yacy/document/parser/zipParser.java
@@ -87,7 +87,7 @@ public class zipParser extends AbstractParser implements Parser {
                 FileUtils.copy(zis, tmp, entry.getSize());
                 final MultiProtocolURI virtualURL = MultiProtocolURI.newURL(url, "#" + name);
                 //this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false));
-                docs = TextParser.parseSource(virtualURL, mime, null, tmp);
+                docs = TextParser.parseSource(virtualURL, mime, null, tmp, false);
                 if (docs == null) continue;
                 for (final Document d: docs) docacc.add(d);
             } catch (final Parser.Failure e) {
diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java
index 330579aad..8bcff3335 100644
--- a/source/net/yacy/repository/LoaderDispatcher.java
+++ b/source/net/yacy/repository/LoaderDispatcher.java
@@ -367,7 +367,7 @@ public final class LoaderDispatcher {
         final String supportError = TextParser.supports(url, responseHeader.mime());
         if (supportError != null) throw new IOException("no parser support: " + supportError);
         try {
-            documents = TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), response.getContent());
+            documents = TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), response.getContent(), false);
             if (documents == null) throw new IOException("document == null");
         } catch (final Exception e) {
             throw new IOException("parser error: " + e.getMessage());
diff --git a/test/de/anomic/document/ParserTest.java b/test/de/anomic/document/ParserTest.java
index 61901fcb6..b26529b53 100644
--- a/test/de/anomic/document/ParserTest.java
+++ b/test/de/anomic/document/ParserTest.java
@@ -1,29 +1,30 @@
 package de.anomic.document;

-import static org.junit.Assert.*;
-import static org.junit.matchers.JUnitMatchers.*;
-import net.yacy.document.Document;
-import net.yacy.document.Parser;
-import net.yacy.document.TextParser;
-import net.yacy.kelondro.data.meta.DigestURI;
-
-import org.junit.Test;
+import static org.junit.Assert.assertThat;
+import static org.junit.matchers.JUnitMatchers.containsString;

 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileNotFoundException;
 import java.io.IOException;
-import java.io.Reader;
 import java.io.InputStreamReader;
+import java.io.Reader;
 import java.io.UnsupportedEncodingException;
 import java.net.MalformedURLException;

+import net.yacy.document.Document;
+import net.yacy.document.Parser;
+import net.yacy.document.TextParser;
+import net.yacy.kelondro.data.meta.DigestURI;
+
+import org.junit.Test;
+
 public class ParserTest {

 	@Test public void testParsers() throws FileNotFoundException, Parser.Failure, MalformedURLException, UnsupportedEncodingException, IOException {
-		String[][] testFiles = new String[][] {
-			// meaning:  filename in test/parsertest, mimetype, title, creator, description,
+		final String[][] testFiles = new String[][] {
+			// meaning:  filename in test/parsertest, mimetype, title, creator, description,
 			new String[]{"umlaute_windows.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen", "", ""},
 			new String[]{"umlaute_windows.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation", "Folie 1", "", ""},
 			new String[]{"umlaute_linux.odt", "application/vnd.oasis.opendocument.text", "Münchner Hofbräuhaus", "", "Kommentar zum Hofbräuhaus"},
@@ -34,26 +35,26 @@ public class ParserTest {
 		};

-		for (int i=0; i < testFiles.length; i++) {
-			String filename = "test/parsertest/" + testFiles[i][0];
-			File file = new File(filename);
-			String mimetype = testFiles[i][1];
-			DigestURI url = new DigestURI("http://localhost/"+filename);
+		for (final String[] testFile : testFiles) {
+			final String filename = "test/parsertest/" + testFile[0];
+			final File file = new File(filename);
+			final String mimetype = testFile[1];
+			final DigestURI url = new DigestURI("http://localhost/"+filename);

-			Document[] docs = TextParser.parseSource(url, mimetype, null, file.length(), new FileInputStream(file));
-			for (Document doc: docs) {
-				Reader content = new InputStreamReader(doc.getText(), doc.getCharset());
-				StringBuilder str = new StringBuilder();
+			final Document[] docs = TextParser.parseSource(url, mimetype, null, file.length(), new FileInputStream(file), true);
+			for (final Document doc: docs) {
+				final Reader content = new InputStreamReader(doc.getText(), doc.getCharset());
+				final StringBuilder str = new StringBuilder();
 				int c;
 				while( (c = content.read()) != -1 )
 					str.append((char)c);
-
+
 				System.out.println("Parsed " + filename + ": " + str);
 				assertThat(str.toString(), containsString("In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen"));
-				assertThat(doc.dc_title(), containsString(testFiles[i][2]));
-				assertThat(doc.dc_creator(), containsString(testFiles[i][3]));
-				assertThat(doc.dc_description(), containsString(testFiles[i][4]));
-			}
+				assertThat(doc.dc_title(), containsString(testFile[2]));
+				assertThat(doc.dc_creator(), containsString(testFile[3]));
+				assertThat(doc.dc_description(), containsString(testFile[4]));
+			}
 		}
 	}
 }
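
Note on the API change that runs through all hunks above: every parser now forwards a trailing boolean to TextParser.parseSource() that decides whether embedded image, audio and video links are expanded into virtual documents. The archive parsers (bzip2, gzip, tar, zip, 7z) and the MediawikiImporter pass "false" so embedded links are expanded at most once per crawled resource; the unit test passes "true". The sketch below shows how a caller could wire the new yacy.init switch to that parameter. It is an illustration only: the helper class and method are hypothetical, and the exact boolean parameter name is defined in the TextParser.java hunk that is not reproduced above.

    // Hypothetical glue code, not part of this patch.
    import net.yacy.document.Document;
    import net.yacy.document.Parser;
    import net.yacy.document.TextParser;
    import net.yacy.kelondro.data.meta.DigestURI;
    import de.anomic.search.Switchboard;

    public class EmbedLinksExample {

        /**
         * Parse a top-level crawled resource, expanding embedded media links
         * into virtual documents only if the crawler switch is enabled.
         */
        public static Document[] parseCrawled(final Switchboard sb,
                final DigestURI url, final String mime, final String charset,
                final byte[] content) throws Parser.Failure {
            // read the flag added to defaults/yacy.init (default: true)
            final boolean embedLinks =
                    sb.getConfigBool("crawler.embedLinksAsDocuments", true);
            // only the top-level call passes the flag; nested archive content
            // is parsed with "false" by the parsers patched above
            return TextParser.parseSource(url, mime, charset, content, embedLinks);
        }
    }

Judging from the file list, the expansion logic itself moved out of htmlParser (the removed addLinkDocs/addImageDocs helpers) into the shared Document/TextParser code, so with the flag off the html parser again returns a single Document, and with it on the virtual documents are produced once, centrally, for every parser.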