diff --git a/htroot/Bookmarks.java b/htroot/Bookmarks.java index 0497334e6..b0de4efda 100644 --- a/htroot/Bookmarks.java +++ b/htroot/Bookmarks.java @@ -36,6 +36,7 @@ import java.util.HashMap; import java.util.Iterator; import java.util.Set; +import net.yacy.document.Document; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.logging.Log; @@ -46,7 +47,6 @@ import de.anomic.data.bookmarksDB; import de.anomic.data.listManager; import de.anomic.data.userDB; import de.anomic.data.bookmarksDB.Tag; -import de.anomic.document.Document; import de.anomic.http.server.RequestHeader; import de.anomic.search.Segments; import de.anomic.search.Switchboard; diff --git a/htroot/ConfigParser.java b/htroot/ConfigParser.java index fe6d7d774..e412fb301 100644 --- a/htroot/ConfigParser.java +++ b/htroot/ConfigParser.java @@ -25,8 +25,8 @@ // javac -classpath .:../Classes Settings_p.java // if the shell's current path is HTROOT -import de.anomic.document.Idiom; -import de.anomic.document.Parser; +import net.yacy.document.Idiom; +import net.yacy.document.Parser; import de.anomic.http.server.RequestHeader; import de.anomic.search.Switchboard; import de.anomic.search.SwitchboardConstants; diff --git a/htroot/ContentIntegrationPHPBB3_p.java b/htroot/ContentIntegrationPHPBB3_p.java index 1b11afdf0..104f7ab58 100644 --- a/htroot/ContentIntegrationPHPBB3_p.java +++ b/htroot/ContentIntegrationPHPBB3_p.java @@ -24,11 +24,11 @@ import java.io.File; +import net.yacy.document.content.dao.Dao; +import net.yacy.document.content.dao.ImportDump; +import net.yacy.document.content.dao.PhpBB3Dao; import net.yacy.kelondro.util.DateFormatter; -import de.anomic.content.dao.Dao; -import de.anomic.content.dao.ImportDump; -import de.anomic.content.dao.PhpBB3Dao; import de.anomic.http.server.RequestHeader; import de.anomic.search.Switchboard; import de.anomic.server.serverObjects; diff --git a/htroot/FeedReader_p.java 
b/htroot/FeedReader_p.java index ff2255fff..0ba12e9ce 100644 --- a/htroot/FeedReader_p.java +++ b/htroot/FeedReader_p.java @@ -25,11 +25,11 @@ import java.io.IOException; import java.net.MalformedURLException; +import net.yacy.document.content.RSSMessage; +import net.yacy.document.parser.xml.RSSFeed; +import net.yacy.document.parser.xml.RSSReader; import net.yacy.kelondro.data.meta.DigestURI; -import de.anomic.content.RSSMessage; -import de.anomic.document.parser.xml.RSSFeed; -import de.anomic.document.parser.xml.RSSReader; import de.anomic.http.server.RequestHeader; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java index f62541ceb..3a5c7f4d8 100644 --- a/htroot/IndexControlRWIs_p.java +++ b/htroot/IndexControlRWIs_p.java @@ -36,6 +36,7 @@ import java.util.Iterator; import java.util.List; import java.util.Set; +import net.yacy.document.Condenser; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; @@ -50,7 +51,6 @@ import net.yacy.kelondro.util.DateFormatter; import de.anomic.data.AbstractBlacklist; import de.anomic.data.Blacklist; import de.anomic.data.listManager; -import de.anomic.document.Condenser; import de.anomic.http.server.RequestHeader; import de.anomic.search.QueryParams; import de.anomic.search.RankingProcess; diff --git a/htroot/Threaddump_p.java b/htroot/Threaddump_p.java index 92cc853c9..a515a6bd2 100644 --- a/htroot/Threaddump_p.java +++ b/htroot/Threaddump_p.java @@ -37,10 +37,10 @@ import java.util.Map.Entry; import java.util.ArrayList; import java.util.HashMap; +import net.yacy.document.parser.html.CharacterCoding; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.MemoryControl; -import de.anomic.document.parser.html.CharacterCoding; import de.anomic.http.server.RequestHeader; import de.anomic.search.Switchboard; import 
de.anomic.server.serverObjects; diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index 6d12c5294..1eb0381b6 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -35,16 +35,17 @@ import java.util.HashMap; import java.util.Iterator; import java.util.Map; +import net.yacy.document.Condenser; +import net.yacy.document.Document; +import net.yacy.document.ParserException; +import net.yacy.document.parser.html.CharacterCoding; +import net.yacy.document.parser.html.ImageEntry; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.util.FileUtils; +import de.anomic.crawler.retrieval.LoaderDispatcher; import de.anomic.crawler.retrieval.Response; -import de.anomic.document.Condenser; -import de.anomic.document.ParserException; -import de.anomic.document.Document; -import de.anomic.document.parser.html.CharacterCoding; -import de.anomic.document.parser.html.ImageEntry; import de.anomic.http.client.Client; import de.anomic.http.client.Cache; import de.anomic.http.server.RequestHeader; @@ -266,7 +267,7 @@ public class ViewFile { // parsing the resource content Document document = null; try { - document = Document.parseDocument(url, resourceLength, resource); + document = LoaderDispatcher.parseDocument(url, resourceLength, resource); if (document == null) { prop.put("error", "5"); prop.put("error_errorText", "Unknown error"); diff --git a/htroot/WatchCrawler_p.java b/htroot/WatchCrawler_p.java index a60a71dcf..d0c2663f5 100644 --- a/htroot/WatchCrawler_p.java +++ b/htroot/WatchCrawler_p.java @@ -36,6 +36,8 @@ import java.util.Set; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; +import net.yacy.document.parser.html.ContentScraper; +import net.yacy.document.parser.html.TransformerWriter; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.FileUtils; @@ -45,8 +47,6 @@ import de.anomic.crawler.ZURL; import de.anomic.crawler.retrieval.Request; 
import de.anomic.data.bookmarksDB; import de.anomic.data.listManager; -import de.anomic.document.parser.html.ContentScraper; -import de.anomic.document.parser.html.TransformerWriter; import de.anomic.http.server.RequestHeader; import de.anomic.search.Segment; import de.anomic.search.Segments; diff --git a/htroot/api/bookmarks/get_bookmarks.java b/htroot/api/bookmarks/get_bookmarks.java index 9c03f5590..0aba3d688 100644 --- a/htroot/api/bookmarks/get_bookmarks.java +++ b/htroot/api/bookmarks/get_bookmarks.java @@ -3,11 +3,11 @@ import java.util.Date; import java.util.Iterator; +import net.yacy.document.parser.html.CharacterCoding; import net.yacy.kelondro.util.DateFormatter; import de.anomic.data.bookmarksDB; import de.anomic.data.userDB; -import de.anomic.document.parser.html.CharacterCoding; import de.anomic.http.server.RequestHeader; import de.anomic.search.Switchboard; import de.anomic.server.serverObjects; diff --git a/htroot/api/bookmarks/xbel/xbel.java b/htroot/api/bookmarks/xbel/xbel.java index 80dfd902f..4dea78c69 100755 --- a/htroot/api/bookmarks/xbel/xbel.java +++ b/htroot/api/bookmarks/xbel/xbel.java @@ -3,10 +3,10 @@ import java.util.Date; import java.util.Iterator; +import net.yacy.document.parser.html.CharacterCoding; import net.yacy.kelondro.util.DateFormatter; import de.anomic.data.bookmarksDB; -import de.anomic.document.parser.html.CharacterCoding; import de.anomic.http.server.RequestHeader; import de.anomic.search.Switchboard; import de.anomic.server.serverObjects; diff --git a/htroot/api/feed.java b/htroot/api/feed.java index aec4b3fa5..76dc1f20b 100755 --- a/htroot/api/feed.java +++ b/htroot/api/feed.java @@ -2,8 +2,9 @@ import java.util.Date; -import de.anomic.content.RSSMessage; -import de.anomic.document.parser.xml.RSSFeed; +import net.yacy.document.content.RSSMessage; +import net.yacy.document.parser.xml.RSSFeed; + import de.anomic.http.server.RequestHeader; import de.anomic.search.Switchboard; import de.anomic.server.serverObjects; diff 
--git a/htroot/api/util/getpageinfo_p.java b/htroot/api/util/getpageinfo_p.java index f4cfffbf8..ceadbf74f 100755 --- a/htroot/api/util/getpageinfo_p.java +++ b/htroot/api/util/getpageinfo_p.java @@ -3,10 +3,11 @@ import java.io.IOException; import java.net.MalformedURLException; import java.util.Set; +import net.yacy.document.parser.html.ContentScraper; import net.yacy.kelondro.data.meta.DigestURI; import de.anomic.crawler.CrawlProfile; -import de.anomic.document.parser.html.ContentScraper; +import de.anomic.crawler.retrieval.LoaderDispatcher; import de.anomic.http.server.RequestHeader; import de.anomic.search.Switchboard; import de.anomic.server.serverObjects; @@ -49,11 +50,11 @@ public class getpageinfo_p { } ContentScraper scraper = null; if (u != null) try { - scraper = ContentScraper.parseResource(sb.loader, u, CrawlProfile.CACHE_STRATEGY_IFFRESH); + scraper = LoaderDispatcher.parseResource(sb.loader, u, CrawlProfile.CACHE_STRATEGY_IFFRESH); } catch (final IOException e) { // try again, try harder try { - scraper = ContentScraper.parseResource(sb.loader, u, CrawlProfile.CACHE_STRATEGY_IFEXIST); + scraper = LoaderDispatcher.parseResource(sb.loader, u, CrawlProfile.CACHE_STRATEGY_IFEXIST); } catch (final IOException ee) { // now thats a fail, do nothing } diff --git a/htroot/cytag.java b/htroot/cytag.java index 2e50eaeae..3ffec68fa 100644 --- a/htroot/cytag.java +++ b/htroot/cytag.java @@ -30,10 +30,10 @@ import java.io.File; import java.io.IOException; import java.util.Date; +import net.yacy.document.parser.html.CharacterCoding; import net.yacy.kelondro.util.DateFormatter; import net.yacy.kelondro.util.FileUtils; -import de.anomic.document.parser.html.CharacterCoding; import de.anomic.http.server.RequestHeader; import de.anomic.search.Switchboard; import de.anomic.server.serverObjects; diff --git a/htroot/rct_p.java b/htroot/rct_p.java index d65f481a4..7701bd63d 100644 --- a/htroot/rct_p.java +++ b/htroot/rct_p.java @@ -30,12 +30,12 @@ import 
java.text.ParseException; import java.util.Date; import java.util.Iterator; +import net.yacy.document.content.RSSMessage; +import net.yacy.document.parser.xml.RSSFeed; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.DateFormatter; -import de.anomic.content.RSSMessage; import de.anomic.crawler.retrieval.Request; -import de.anomic.document.parser.xml.RSSFeed; import de.anomic.http.server.RequestHeader; import de.anomic.search.Switchboard; import de.anomic.server.serverObjects; diff --git a/htroot/sharedBlacklist_p.java b/htroot/sharedBlacklist_p.java index 581a8739a..1d8535aec 100644 --- a/htroot/sharedBlacklist_p.java +++ b/htroot/sharedBlacklist_p.java @@ -44,7 +44,6 @@ import de.anomic.data.AbstractBlacklist; import de.anomic.data.listManager; import de.anomic.data.list.ListAccumulator; import de.anomic.data.list.XMLBlacklistImporter; -import de.anomic.document.parser.html.CharacterCoding; import de.anomic.http.client.Client; import de.anomic.http.server.HeaderFramework; import de.anomic.http.server.RequestHeader; @@ -52,6 +51,7 @@ import de.anomic.search.Switchboard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.yacy.yacySeed; +import net.yacy.document.parser.html.CharacterCoding; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.FileUtils; diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index 1684a62eb..9227d6f4c 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -34,6 +34,8 @@ import java.util.Iterator; import java.util.Map; import java.util.TreeSet; +import net.yacy.document.content.RSSMessage; +import net.yacy.document.parser.xml.RSSFeed; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.order.Base64Order; @@ -42,8 +44,6 @@ import net.yacy.kelondro.rwi.ReferenceContainer; import net.yacy.kelondro.util.SortStack; import net.yacy.kelondro.util.ISO639; 
-import de.anomic.content.RSSMessage; -import de.anomic.document.parser.xml.RSSFeed; import de.anomic.http.server.HeaderFramework; import de.anomic.http.server.RequestHeader; import de.anomic.net.natLib; diff --git a/htroot/yacy/transferRWI.java b/htroot/yacy/transferRWI.java index 69c23bd7f..482283504 100644 --- a/htroot/yacy/transferRWI.java +++ b/htroot/yacy/transferRWI.java @@ -32,13 +32,13 @@ import java.util.ArrayList; import java.util.HashSet; import java.util.Iterator; +import net.yacy.document.content.RSSMessage; +import net.yacy.document.parser.xml.RSSFeed; import net.yacy.kelondro.data.word.WordReferenceRow; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.FileUtils; -import de.anomic.content.RSSMessage; import de.anomic.data.Blacklist; -import de.anomic.document.parser.xml.RSSFeed; import de.anomic.http.server.RequestHeader; import de.anomic.search.Segments; import de.anomic.search.Switchboard; diff --git a/htroot/yacy/transferURL.java b/htroot/yacy/transferURL.java index a27846b58..455643a5f 100644 --- a/htroot/yacy/transferURL.java +++ b/htroot/yacy/transferURL.java @@ -29,12 +29,12 @@ import java.io.IOException; import java.text.ParseException; +import net.yacy.document.content.RSSMessage; +import net.yacy.document.parser.xml.RSSFeed; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.util.DateFormatter; -import de.anomic.content.RSSMessage; import de.anomic.data.Blacklist; -import de.anomic.document.parser.xml.RSSFeed; import de.anomic.http.server.RequestHeader; import de.anomic.search.Segments; import de.anomic.search.Switchboard; diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 128ad9ba8..dd77671a3 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -33,6 +33,10 @@ import java.util.Iterator; import java.util.Set; import java.util.TreeSet; +import net.yacy.document.Condenser; +import net.yacy.document.Document; +import net.yacy.document.content.RSSMessage; +import 
net.yacy.document.parser.xml.RSSFeed; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; @@ -44,14 +48,10 @@ import net.yacy.kelondro.util.MemoryControl; import net.yacy.kelondro.util.SetTools; import net.yacy.kelondro.util.ISO639; -import de.anomic.content.RSSMessage; import de.anomic.crawler.retrieval.LoaderDispatcher; import de.anomic.data.DidYouMean; import de.anomic.data.LibraryProvider; import de.anomic.data.Location; -import de.anomic.document.Condenser; -import de.anomic.document.Document; -import de.anomic.document.parser.xml.RSSFeed; import de.anomic.http.server.HeaderFramework; import de.anomic.http.server.RequestHeader; import de.anomic.search.QueryParams; diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java index 471275da4..7109b52d3 100644 --- a/source/de/anomic/crawler/CrawlQueues.java +++ b/source/de/anomic/crawler/CrawlQueues.java @@ -36,6 +36,8 @@ import java.util.Iterator; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; +import net.yacy.document.content.RSSMessage; +import net.yacy.document.parser.xml.RSSFeed; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.table.SplitTable; @@ -43,10 +45,8 @@ import net.yacy.kelondro.util.DateFormatter; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.workflow.WorkflowJob; -import de.anomic.content.RSSMessage; import de.anomic.crawler.retrieval.Request; import de.anomic.crawler.retrieval.Response; -import de.anomic.document.parser.xml.RSSFeed; import de.anomic.http.client.Client; import de.anomic.search.Switchboard; import de.anomic.search.SwitchboardConstants; diff --git a/source/de/anomic/content/oai/PMHReader.java b/source/de/anomic/crawler/PMHReader.java similarity index 98% rename from source/de/anomic/content/oai/PMHReader.java rename to source/de/anomic/crawler/PMHReader.java 
index e79636871..ead620653 100644 --- a/source/de/anomic/content/oai/PMHReader.java +++ b/source/de/anomic/crawler/PMHReader.java @@ -24,17 +24,16 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.content.oai; +package de.anomic.crawler; import java.io.ByteArrayInputStream; import java.io.IOException; import java.net.MalformedURLException; +import net.yacy.document.content.DCEntry; +import net.yacy.document.content.file.SurrogateReader; import net.yacy.kelondro.data.meta.DigestURI; -import de.anomic.content.DCEntry; -import de.anomic.content.file.SurrogateReader; -import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.crawler.retrieval.LoaderDispatcher; import de.anomic.crawler.retrieval.Request; diff --git a/source/de/anomic/crawler/ResultImages.java b/source/de/anomic/crawler/ResultImages.java index 9017ffc0c..f12540f8e 100755 --- a/source/de/anomic/crawler/ResultImages.java +++ b/source/de/anomic/crawler/ResultImages.java @@ -30,10 +30,10 @@ import java.util.HashMap; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentLinkedQueue; +import net.yacy.document.Document; +import net.yacy.document.parser.html.ImageEntry; import net.yacy.kelondro.data.meta.DigestURI; -import de.anomic.document.Document; -import de.anomic.document.parser.html.ImageEntry; public class ResultImages { diff --git a/source/de/anomic/crawler/retrieval/FTPLoader.java b/source/de/anomic/crawler/retrieval/FTPLoader.java index 0ebfea3fc..7b75bedc2 100644 --- a/source/de/anomic/crawler/retrieval/FTPLoader.java +++ b/source/de/anomic/crawler/retrieval/FTPLoader.java @@ -32,12 +32,12 @@ import java.io.IOException; import java.io.PrintStream; import java.util.Date; +import net.yacy.document.Parser; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; import 
net.yacy.kelondro.util.DateFormatter; import de.anomic.crawler.Latency; -import de.anomic.document.Parser; import de.anomic.http.server.HeaderFramework; import de.anomic.http.server.RequestHeader; import de.anomic.http.server.ResponseHeader; diff --git a/source/de/anomic/crawler/retrieval/HTTPLoader.java b/source/de/anomic/crawler/retrieval/HTTPLoader.java index 027bbb38b..5915ef8d7 100644 --- a/source/de/anomic/crawler/retrieval/HTTPLoader.java +++ b/source/de/anomic/crawler/retrieval/HTTPLoader.java @@ -28,12 +28,12 @@ package de.anomic.crawler.retrieval; import java.io.IOException; import java.util.Date; +import net.yacy.document.Parser; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; import de.anomic.crawler.Latency; import de.anomic.data.Blacklist; -import de.anomic.document.Parser; import de.anomic.http.client.Client; import de.anomic.http.server.HeaderFramework; import de.anomic.http.server.RequestHeader; diff --git a/source/de/anomic/crawler/retrieval/LoaderDispatcher.java b/source/de/anomic/crawler/retrieval/LoaderDispatcher.java index 93bc154ae..3618be3a9 100644 --- a/source/de/anomic/crawler/retrieval/LoaderDispatcher.java +++ b/source/de/anomic/crawler/retrieval/LoaderDispatcher.java @@ -29,6 +29,7 @@ package de.anomic.crawler.retrieval; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; +import java.io.Writer; import java.util.Arrays; import java.util.Date; import java.util.HashSet; @@ -36,13 +37,17 @@ import java.util.Iterator; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; +import net.yacy.document.Document; +import net.yacy.document.Parser; +import net.yacy.document.ParserException; +import net.yacy.document.parser.html.ContentScraper; +import net.yacy.document.parser.html.TransformerWriter; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; import de.anomic.crawler.CrawlProfile; -import de.anomic.document.Document; 
-import de.anomic.document.ParserException; import de.anomic.http.client.Cache; +import de.anomic.http.client.Client; import de.anomic.http.server.HeaderFramework; import de.anomic.http.server.RequestHeader; import de.anomic.http.server.ResponseHeader; @@ -352,7 +357,7 @@ public final class LoaderDispatcher { // parse resource Document document = null; try { - document = Document.parseDocument(url, resContentLength, resContent, responseHeader); + document = parseDocument(url, resContentLength, resContent, responseHeader); } catch (final ParserException e) { Log.logFine("snippet fetch", "parser error " + e.getMessage() + " for url " + url); return null; @@ -362,6 +367,75 @@ public final class LoaderDispatcher { return document; } + /** + * Parse the resource + * @param url the URL of the resource + * @param contentLength the contentLength of the resource + * @param resourceStream the resource body as stream + * @param responseHeader metadata about the resource + * @return the extracted data + * @throws ParserException + */ + public static Document parseDocument(final DigestURI url, final long contentLength, final InputStream resourceStream, ResponseHeader responseHeader) throws ParserException { + try { + if (resourceStream == null) return null; + + // STEP 1: if no resource metadata is available, try to load it from cache + if (responseHeader == null) { + // try to get the header from the htcache directory + try { + responseHeader = Cache.getResponseHeader(url); + } catch (final Exception e) { + // ignore this. resource info loading failed + } + } + + // STEP 2: if the metadata is still null try to download it from web + if ((responseHeader == null) && (url.getProtocol().startsWith("http"))) { + // TODO: we need a better solution here + // e.g. encapsulate this in the crawlLoader class + + // getting URL mimeType + try { + responseHeader = Client.whead(url.toString()); + } catch (final Exception e) { + // ignore this. 
http header download failed + } + } + + // STEP 3: if the metadata is still null try to guess the mimeType of the resource + String supportError = Parser.supports(url, responseHeader == null ? null : responseHeader.mime()); + if (supportError != null) { + return null; + } + if (responseHeader == null) { + return Parser.parseSource(url, null, null, contentLength, resourceStream); + } + return Parser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), contentLength, resourceStream); + } catch (final InterruptedException e) { + // interruption of thread detected + return null; + } + } + + public static Document parseDocument(final DigestURI url, final long contentLength, final InputStream resourceStream) throws ParserException { + return parseDocument(url, contentLength, resourceStream, null); + } + + + public static ContentScraper parseResource(final LoaderDispatcher loader, final DigestURI location, int cachePolicy) throws IOException { + // load page + Response r = loader.load(location, true, false, cachePolicy); + byte[] page = (r == null) ? 
null : r.getContent(); + if (page == null) throw new IOException("no response from url " + location.toString()); + + // scrape content + final ContentScraper scraper = new ContentScraper(location); + final Writer writer = new TransformerWriter(null, null, scraper, null, false); + writer.write(new String(page, "UTF-8")); + + return scraper; + } public synchronized void cleanupAccessTimeTable(long timeout) { final Iterator> i = accessTime.entrySet().iterator(); diff --git a/source/de/anomic/crawler/retrieval/Response.java b/source/de/anomic/crawler/retrieval/Response.java index 27f4c5ef9..cdcc44329 100755 --- a/source/de/anomic/crawler/retrieval/Response.java +++ b/source/de/anomic/crawler/retrieval/Response.java @@ -28,12 +28,12 @@ package de.anomic.crawler.retrieval; import java.util.Date; +import net.yacy.document.Classification; +import net.yacy.document.Parser; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.DateFormatter; import de.anomic.crawler.CrawlProfile; -import de.anomic.document.Classification; -import de.anomic.document.Parser; import de.anomic.http.server.HeaderFramework; import de.anomic.http.server.RequestHeader; import de.anomic.http.server.ResponseHeader; diff --git a/source/de/anomic/data/bookmarksDB.java b/source/de/anomic/data/bookmarksDB.java index 7afa9539e..4b13e8fef 100644 --- a/source/de/anomic/data/bookmarksDB.java +++ b/source/de/anomic/data/bookmarksDB.java @@ -55,6 +55,8 @@ import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; +import net.yacy.document.parser.html.ContentScraper; +import net.yacy.document.parser.html.TransformerWriter; import net.yacy.kelondro.blob.Heap; import net.yacy.kelondro.blob.MapView; import net.yacy.kelondro.data.meta.DigestURI; @@ -76,8 +78,6 @@ import org.xml.sax.SAXException; import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.retrieval.Request; -import 
de.anomic.document.parser.html.ContentScraper; -import de.anomic.document.parser.html.TransformerWriter; import de.anomic.search.Segments; import de.anomic.search.Switchboard; import de.anomic.yacy.yacyNewsPool; diff --git a/source/de/anomic/data/diff.java b/source/de/anomic/data/diff.java index c7687b537..95309e58f 100644 --- a/source/de/anomic/data/diff.java +++ b/source/de/anomic/data/diff.java @@ -30,7 +30,8 @@ package de.anomic.data; import java.util.ArrayList; -import de.anomic.document.parser.html.CharacterCoding; +import net.yacy.document.parser.html.CharacterCoding; + /** * This class provides a diff-functionality. diff --git a/source/de/anomic/data/wiki/wikiCode.java b/source/de/anomic/data/wiki/wikiCode.java index 951ef98a8..a597d5745 100644 --- a/source/de/anomic/data/wiki/wikiCode.java +++ b/source/de/anomic/data/wiki/wikiCode.java @@ -32,7 +32,8 @@ import java.util.HashMap; import java.util.List; import java.util.Map; -import de.anomic.document.parser.html.CharacterCoding; +import net.yacy.document.parser.html.CharacterCoding; + import de.anomic.server.serverCore; /** This class provides methods to handle texts that have been posted in the yacyWiki or other diff --git a/source/de/anomic/http/server/HTTPDFileHandler.java b/source/de/anomic/http/server/HTTPDFileHandler.java index dc28c0d8d..1ea21706b 100644 --- a/source/de/anomic/http/server/HTTPDFileHandler.java +++ b/source/de/anomic/http/server/HTTPDFileHandler.java @@ -81,6 +81,10 @@ import java.util.Properties; import java.util.concurrent.ConcurrentHashMap; import java.util.zip.GZIPOutputStream; +import net.yacy.document.Classification; +import net.yacy.document.parser.htmlParser; +import net.yacy.document.parser.html.ContentScraper; +import net.yacy.document.parser.html.ScraperInputStream; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.ByteBuffer; @@ -88,10 +92,6 @@ import net.yacy.kelondro.util.DateFormatter; import 
net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.MemoryControl; -import de.anomic.document.Classification; -import de.anomic.document.parser.htmlParser; -import de.anomic.document.parser.html.ContentScraper; -import de.anomic.document.parser.html.ScraperInputStream; import de.anomic.http.server.servlets.transferURL; import de.anomic.search.Switchboard; import de.anomic.search.SwitchboardConstants; diff --git a/source/de/anomic/http/server/HTTPDProxyHandler.java b/source/de/anomic/http/server/HTTPDProxyHandler.java index 2b31485c0..1e5b6c71c 100644 --- a/source/de/anomic/http/server/HTTPDProxyHandler.java +++ b/source/de/anomic/http/server/HTTPDProxyHandler.java @@ -71,6 +71,9 @@ import java.util.logging.LogManager; import java.util.logging.Logger; import java.util.zip.GZIPOutputStream; +import net.yacy.document.Parser; +import net.yacy.document.parser.html.ContentTransformer; +import net.yacy.document.parser.html.Transformer; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.io.ByteCountOutputStream; import net.yacy.kelondro.logging.Log; @@ -82,9 +85,6 @@ import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.crawler.retrieval.Request; import de.anomic.crawler.retrieval.Response; import de.anomic.data.Blacklist; -import de.anomic.document.Parser; -import de.anomic.document.parser.html.ContentTransformer; -import de.anomic.document.parser.html.Transformer; import de.anomic.http.client.MultiOutputStream; import de.anomic.http.client.Client; import de.anomic.http.client.RemoteProxyConfig; diff --git a/source/de/anomic/http/server/HTTPDemon.java b/source/de/anomic/http/server/HTTPDemon.java index ba8f8843b..06e4b9c2f 100644 --- a/source/de/anomic/http/server/HTTPDemon.java +++ b/source/de/anomic/http/server/HTTPDemon.java @@ -50,6 +50,7 @@ import java.util.StringTokenizer; import java.util.Map.Entry; import java.util.zip.GZIPInputStream; +import net.yacy.document.parser.html.CharacterCoding; import 
net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.Base64Order; @@ -72,7 +73,6 @@ import org.apache.commons.httpclient.ChunkedInputStream; import org.apache.commons.httpclient.ContentLengthInputStream; import de.anomic.data.userDB; -import de.anomic.document.parser.html.CharacterCoding; import de.anomic.search.Switchboard; import de.anomic.server.serverCore; import de.anomic.server.serverHandler; diff --git a/source/de/anomic/http/server/TemplateEngine.java b/source/de/anomic/http/server/TemplateEngine.java index 2c32ac4d6..2b1181f7c 100644 --- a/source/de/anomic/http/server/TemplateEngine.java +++ b/source/de/anomic/http/server/TemplateEngine.java @@ -56,10 +56,10 @@ import java.io.InputStreamReader; import java.io.OutputStream; import java.io.PushbackInputStream; import java.io.UnsupportedEncodingException; -import java.util.ArrayList; import java.util.HashMap; -import java.util.List; + +import net.yacy.document.parser.html.ContentTransformer; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.ByteBuffer; import net.yacy.kelondro.util.FileUtils; @@ -122,83 +122,6 @@ import net.yacy.kelondro.util.FileUtils; */ public final class TemplateEngine { - public final static byte hash = (byte)'#'; - - public final static byte[] dpdpa = "::".getBytes(); - - private final static byte lbr = (byte)'['; - private final static byte rbr = (byte)']'; - private final static byte[] pOpen = {hash, lbr}; - private final static byte[] pClose = {rbr, hash}; - - private final static byte lcbr = (byte)'{'; - private final static byte rcbr = (byte)'}'; - private final static byte[] mOpen = {hash, lcbr}; - private final static byte[] mClose = {rcbr, hash}; - - private final static byte lrbr = (byte)'('; - private final static byte rrbr = (byte)')'; - private final static byte[] aOpen = {hash, lrbr}; - private final static byte[] aClose = {rrbr, hash}; - - private final static byte ps = (byte)'%'; - private final 
static byte[] iOpen = {hash, ps}; - private final static byte[] iClose = {ps, hash}; - - private final static byte[] slash = {(byte)'/'}; - - private final static Object[] meta_quotation = new Object[] { - new Object[] {pOpen, pClose}, - new Object[] {mOpen, mClose}, - new Object[] {aOpen, aClose}, - new Object[] {iOpen, iClose} - }; - - public final static ByteBuffer[] splitQuotations(final ByteBuffer text) { - final List l = splitQuotation(text, 0); - final ByteBuffer[] sbbs = new ByteBuffer[l.size()]; - for (int i = 0; i < l.size(); i++) sbbs[i] = l.get(i); - return sbbs; - } - - private final static List splitQuotation(ByteBuffer text, int qoff) { - final ArrayList l = new ArrayList(); - if (qoff >= meta_quotation.length) { - if (text.length() > 0) l.add(text); - return l; - } - int p = -1, q; - final byte[] left = (byte[]) ((Object[]) meta_quotation[qoff])[0]; - final byte[] right = (byte[]) ((Object[]) meta_quotation[qoff])[1]; - qoff++; - while ((text.length() > 0) && ((p = text.indexOf(left)) >= 0)) { - q = text.indexOf(right, p + 1); - if (q >= 0) { - // found a pattern - l.addAll(splitQuotation(new ByteBuffer(text.getBytes(0, p)), qoff)); - l.add(new ByteBuffer(text.getBytes(p, q + right.length - p))); - text = new ByteBuffer(text.getBytes(q + right.length)); - } else { - // found only pattern start, no closing parantesis (a syntax error that is silently accepted here) - l.addAll(splitQuotation(new ByteBuffer(text.getBytes(0, p)), qoff)); - l.addAll(splitQuotation(new ByteBuffer(text.getBytes(p)), qoff)); - text.clear(); - } - } - - // find double-points - while ((text.length() > 0) && ((p = text.indexOf(dpdpa)) >= 0)) { - l.addAll(splitQuotation(new ByteBuffer(text.getBytes(0, p)), qoff)); - l.add(new ByteBuffer(dpdpa)); - l.addAll(splitQuotation(new ByteBuffer(text.getBytes(p + 2)), qoff)); - text.clear(); - } - - // add remaining - if (text.length() > 0) l.addAll(splitQuotation(text, qoff)); - return l; - } - /** * transfer until a specified pattern is 
found; everything but the pattern is transfered so far * the function returns true, if the pattern is found @@ -254,13 +177,13 @@ public final class TemplateEngine { byte[] replacement; int bb; final ByteBuffer structure = new ByteBuffer(); - while (transferUntil(pis, out, hash)) { + while (transferUntil(pis, out, ContentTransformer.hashChar)) { bb = pis.read(); keyStream.reset(); // #{ - if ((bb & 0xFF) == lcbr) { //multi - if (transferUntil(pis, keyStream, mClose)) { //close tag + if ((bb & 0xFF) == ContentTransformer.lcbr) { //multi + if (transferUntil(pis, keyStream, ContentTransformer.mClose)) { //close tag //multi_key = "_" + keyStream.toString(); //for _Key bb = pis.read(); if ((bb & 0xFF) != 10){ //kill newline @@ -270,7 +193,7 @@ public final class TemplateEngine { keyStream.reset(); //reset stream //this needs multi_key without prefix - if (transferUntil(pis, keyStream, appendBytes(mOpen,slash,multi_key,mClose))){ + if (transferUntil(pis, keyStream, appendBytes(ContentTransformer.mOpen, ContentTransformer.slashChar, multi_key, ContentTransformer.mClose))){ bb = pis.read(); if((bb & 0xFF) != 10){ //kill newline pis.unread(bb); @@ -305,11 +228,11 @@ public final class TemplateEngine { } // #( - } else if ((bb & 0xFF) == lrbr) { //alternative + } else if ((bb & 0xFF) == ContentTransformer.lrbr) { //alternative int others=0; final ByteBuffer text= new ByteBuffer(); - transferUntil(pis, keyStream, aClose); + transferUntil(pis, keyStream, ContentTransformer.aClose); key = keyStream.toByteArray(); //Caution: Key does not contain prefix keyStream.reset(); //clear @@ -341,7 +264,7 @@ public final class TemplateEngine { return structure.getBytes(); } keyStream.reset(); - transferUntil(pis, keyStream, dpdpa); + transferUntil(pis, keyStream, ContentTransformer.dpdpa); pis2 = new PushbackInputStream(new ByteArrayInputStream(keyStream.toByteArray())); structure.append(writeTemplate(pis2, out, pattern, dflt, newPrefix(prefix,key))); transferUntil(pis, keyStream, 
appendBytes("#(/".getBytes(),key,")#".getBytes("UTF-8"),null)); @@ -351,13 +274,13 @@ public final class TemplateEngine { } else { while(!found){ bb=pis.read(); // performance problem? trace always points to this line - if ((bb & 0xFF) == hash){ + if ((bb & 0xFF) == ContentTransformer.hashChar){ bb=pis.read(); - if ((bb & 0xFF) == lrbr){ - transferUntil(pis, keyStream, aClose); + if ((bb & 0xFF) == ContentTransformer.lrbr){ + transferUntil(pis, keyStream, ContentTransformer.aClose); //reached the end. output last string. - if (java.util.Arrays.equals(keyStream.toByteArray(),appendBytes(slash, key, null,null))) { + if (java.util.Arrays.equals(keyStream.toByteArray(),appendBytes(ContentTransformer.slashChar, key, null,null))) { pis2 = new PushbackInputStream(new ByteArrayInputStream(text.getBytes())); //this maybe the wrong, but its the last structure.append('<').append(key).append(" type=\"alternative\" which=\"".getBytes()).append(Integer.toString(whichPattern).getBytes()).append("\" found=\"0\">\n".getBytes()); @@ -366,16 +289,16 @@ public final class TemplateEngine { found=true; }else if(others >0 && keyStream.toString().startsWith("/")){ //close nested others--; - text.append(aOpen).append(keyStream.toByteArray()).append(")#".getBytes()); + text.append(ContentTransformer.aOpen).append(keyStream.toByteArray()).append(")#".getBytes()); } else { //nested others++; - text.append(aOpen).append(keyStream.toByteArray()).append(")#".getBytes()); + text.append(ContentTransformer.aOpen).append(keyStream.toByteArray()).append(")#".getBytes()); } keyStream.reset(); //reset stream continue; } //is not #( pis.unread(bb);//is processed in next loop - bb = (hash);//will be added to text this loop + bb = (ContentTransformer.hashChar);//will be added to text this loop //text += "#"; }else if ((bb & 0xFF) == ':' && others==0){//ignore :: in nested Expressions bb=pis.read(); @@ -407,8 +330,8 @@ public final class TemplateEngine { }//if(byName) (else branch) // #[ - } else if ((bb & 
0xFF) == lbr) { //normal - if (transferUntil(pis, keyStream, pClose)) { + } else if ((bb & 0xFF) == ContentTransformer.lbr) { //normal + if (transferUntil(pis, keyStream, ContentTransformer.pClose)) { // pattern detected, write replacement key = keyStream.toByteArray(); final String patternKey = getPatternKey(prefix, key); @@ -425,13 +348,13 @@ public final class TemplateEngine { } // #% - } else if ((bb & 0xFF) == ps) { //include + } else if ((bb & 0xFF) == ContentTransformer.pcChar) { //include final ByteBuffer include = new ByteBuffer(); keyStream.reset(); //reset stream - if(transferUntil(pis, keyStream, iClose)){ + if(transferUntil(pis, keyStream, ContentTransformer.iClose)){ byte[] filename = keyStream.toByteArray(); //if(filename.startsWith( Character.toString((char)lbr) ) && filename.endsWith( Character.toString((char)rbr) )){ //simple pattern for filename - if((filename[0] == lbr) && (filename[filename.length-1] == rbr)){ //simple pattern for filename + if((filename[0] == ContentTransformer.lbr) && (filename[filename.length-1] == ContentTransformer.rbr)){ //simple pattern for filename final byte[] newFilename = new byte[filename.length-2]; System.arraycopy(filename, 1, newFilename, 0, newFilename.length); final String patternkey = getPatternKey(prefix, newFilename); @@ -462,7 +385,7 @@ public final class TemplateEngine { // # - no special character. 
This is simply a '#' without meaning } else { //no match, but a single hash (output # + bb) - out.write(hash); + out.write(ContentTransformer.hashChar); out.write(bb); } } diff --git a/source/de/anomic/http/server/servlets/transferURL.java b/source/de/anomic/http/server/servlets/transferURL.java index 63e0d57cf..353730dea 100644 --- a/source/de/anomic/http/server/servlets/transferURL.java +++ b/source/de/anomic/http/server/servlets/transferURL.java @@ -5,12 +5,12 @@ package de.anomic.http.server.servlets; import java.io.IOException; import java.text.ParseException; +import net.yacy.document.content.RSSMessage; +import net.yacy.document.parser.xml.RSSFeed; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.util.DateFormatter; -import de.anomic.content.RSSMessage; import de.anomic.data.Blacklist; -import de.anomic.document.parser.xml.RSSFeed; import de.anomic.http.server.RequestHeader; import de.anomic.search.Segments; import de.anomic.search.Switchboard; diff --git a/source/de/anomic/search/DocumentIndex.java b/source/de/anomic/search/DocumentIndex.java index 727c406e4..239a0040a 100644 --- a/source/de/anomic/search/DocumentIndex.java +++ b/source/de/anomic/search/DocumentIndex.java @@ -34,14 +34,14 @@ import java.util.Date; import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; +import net.yacy.document.Condenser; +import net.yacy.document.Document; +import net.yacy.document.Parser; +import net.yacy.document.ParserException; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.logging.Log; -import de.anomic.document.Condenser; -import de.anomic.document.Document; -import de.anomic.document.Parser; -import de.anomic.document.ParserException; /** * convenience class to access the yacycore library from outside of yacy to put files into the index diff --git a/source/de/anomic/search/MediaSnippet.java 
b/source/de/anomic/search/MediaSnippet.java index c1ec63b05..13fcc4c15 100644 --- a/source/de/anomic/search/MediaSnippet.java +++ b/source/de/anomic/search/MediaSnippet.java @@ -29,12 +29,12 @@ import java.util.Iterator; import java.util.Map; import java.util.TreeSet; +import net.yacy.document.Document; +import net.yacy.document.parser.html.ImageEntry; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; import de.anomic.crawler.retrieval.LoaderDispatcher; -import de.anomic.document.Document; -import de.anomic.document.parser.html.ImageEntry; public class MediaSnippet { public int type; diff --git a/source/de/anomic/search/MetadataRepository.java b/source/de/anomic/search/MetadataRepository.java index b1190e525..199294894 100644 --- a/source/de/anomic/search/MetadataRepository.java +++ b/source/de/anomic/search/MetadataRepository.java @@ -38,6 +38,7 @@ import java.util.Iterator; import java.util.Map; import java.util.TreeSet; +import net.yacy.document.parser.html.CharacterCoding; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.WordReference; @@ -51,7 +52,6 @@ import net.yacy.kelondro.table.SplitTable; import net.yacy.kelondro.util.ScoreCluster; import de.anomic.data.Blacklist; -import de.anomic.document.parser.html.CharacterCoding; import de.anomic.http.client.Client; import de.anomic.http.client.RemoteProxyConfig; import de.anomic.http.server.ResponseContainer; diff --git a/source/de/anomic/search/QueryParams.java b/source/de/anomic/search/QueryParams.java index 0419bd19c..1715e20d4 100644 --- a/source/de/anomic/search/QueryParams.java +++ b/source/de/anomic/search/QueryParams.java @@ -30,6 +30,9 @@ import java.util.HashSet; import java.util.Iterator; import java.util.TreeSet; +import net.yacy.document.Condenser; +import net.yacy.document.parser.html.AbstractScraper; +import net.yacy.document.parser.html.CharacterCoding; import 
net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.order.Base64Order; @@ -37,9 +40,6 @@ import net.yacy.kelondro.order.Bitfield; import net.yacy.kelondro.order.NaturalOrder; import net.yacy.kelondro.util.SetTools; -import de.anomic.document.Condenser; -import de.anomic.document.parser.html.AbstractScraper; -import de.anomic.document.parser.html.CharacterCoding; import de.anomic.yacy.yacySeed; public final class QueryParams { diff --git a/source/de/anomic/search/RankingProcess.java b/source/de/anomic/search/RankingProcess.java index 11268e689..558502d49 100644 --- a/source/de/anomic/search/RankingProcess.java +++ b/source/de/anomic/search/RankingProcess.java @@ -41,6 +41,7 @@ import java.util.concurrent.BlockingQueue; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.TimeUnit; +import net.yacy.document.Condenser; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; @@ -54,7 +55,6 @@ import net.yacy.kelondro.rwi.TermSearch; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.SortStack; -import de.anomic.document.Condenser; import de.anomic.server.serverProfiling; import de.anomic.ymage.ProfilingGraph; diff --git a/source/de/anomic/search/ReferenceOrder.java b/source/de/anomic/search/ReferenceOrder.java index 04d899301..dc549ed24 100644 --- a/source/de/anomic/search/ReferenceOrder.java +++ b/source/de/anomic/search/ReferenceOrder.java @@ -32,6 +32,7 @@ import java.util.Map; import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; +import net.yacy.document.Condenser; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.data.word.WordReferenceRow; @@ -40,7 +41,6 @@ import net.yacy.kelondro.order.Bitfield; import net.yacy.kelondro.rwi.ReferenceContainer; import 
net.yacy.kelondro.util.ScoreCluster; -import de.anomic.document.Condenser; public class ReferenceOrder { diff --git a/source/de/anomic/search/ResultEntry.java b/source/de/anomic/search/ResultEntry.java index b37ba5181..37bddc327 100644 --- a/source/de/anomic/search/ResultEntry.java +++ b/source/de/anomic/search/ResultEntry.java @@ -30,6 +30,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Date; +import net.yacy.document.Condenser; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; @@ -37,7 +38,6 @@ import net.yacy.kelondro.data.word.WordReferenceVars; import net.yacy.kelondro.order.Bitfield; import net.yacy.kelondro.rwi.Reference; -import de.anomic.document.Condenser; import de.anomic.yacy.yacySeed; import de.anomic.yacy.yacySeedDB; diff --git a/source/de/anomic/search/ResultFetcher.java b/source/de/anomic/search/ResultFetcher.java index 931571377..2069cc8af 100644 --- a/source/de/anomic/search/ResultFetcher.java +++ b/source/de/anomic/search/ResultFetcher.java @@ -31,13 +31,13 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.TreeSet; +import net.yacy.document.Condenser; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.SetTools; import net.yacy.kelondro.util.SortStack; import net.yacy.kelondro.util.SortStore; -import de.anomic.document.Condenser; import de.anomic.search.RankingProcess.NavigatorEntry; import de.anomic.search.MediaSnippet; import de.anomic.server.serverProfiling; diff --git a/source/de/anomic/search/Segment.java b/source/de/anomic/search/Segment.java index f3c342e1c..32642fbeb 100644 --- a/source/de/anomic/search/Segment.java +++ b/source/de/anomic/search/Segment.java @@ -34,6 +34,8 @@ import java.util.Iterator; import java.util.Map; import java.util.TreeSet; +import net.yacy.document.Condenser; +import net.yacy.document.Document; import 
net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.navigation.NavigationReference; @@ -53,8 +55,6 @@ import net.yacy.kelondro.util.ISO639; import de.anomic.crawler.retrieval.Response; import de.anomic.data.Blacklist; -import de.anomic.document.Condenser; -import de.anomic.document.Document; public class Segment { diff --git a/source/de/anomic/search/Segments.java b/source/de/anomic/search/Segments.java index 4cc84727b..14db36224 100644 --- a/source/de/anomic/search/Segments.java +++ b/source/de/anomic/search/Segments.java @@ -32,14 +32,14 @@ import java.util.Date; import java.util.HashMap; import java.util.Iterator; +import net.yacy.document.Condenser; +import net.yacy.document.Document; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.rwi.IndexCell; -import de.anomic.document.Condenser; -import de.anomic.document.Document; public class Segments implements Iterable { diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index b4acc4712..ef9f2b683 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -111,6 +111,14 @@ import java.util.TreeSet; import java.util.concurrent.ConcurrentHashMap; import java.util.regex.Pattern; +import net.yacy.document.Condenser; +import net.yacy.document.Document; +import net.yacy.document.Parser; +import net.yacy.document.ParserException; +import net.yacy.document.content.DCEntry; +import net.yacy.document.content.RSSMessage; +import net.yacy.document.content.file.SurrogateReader; +import net.yacy.document.parser.xml.RSSFeed; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; @@ -129,9 +137,6 @@ import 
net.yacy.kelondro.workflow.WorkflowJob; import net.yacy.kelondro.workflow.WorkflowProcessor; import net.yacy.kelondro.workflow.WorkflowThread; -import de.anomic.content.DCEntry; -import de.anomic.content.RSSMessage; -import de.anomic.content.file.SurrogateReader; import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.CrawlQueues; import de.anomic.crawler.CrawlStacker; @@ -161,11 +166,6 @@ import de.anomic.data.userDB; import de.anomic.data.wiki.wikiBoard; import de.anomic.data.wiki.wikiCode; import de.anomic.data.wiki.wikiParser; -import de.anomic.document.Condenser; -import de.anomic.document.Parser; -import de.anomic.document.ParserException; -import de.anomic.document.Document; -import de.anomic.document.parser.xml.RSSFeed; import de.anomic.http.client.Client; import de.anomic.http.client.RemoteProxyConfig; import de.anomic.http.client.Cache; @@ -1696,12 +1696,6 @@ public final class Switchboard extends serverSwitch { } } - public indexingQueueEntry webStructureAnalysis(final indexingQueueEntry in) { - in.queueEntry.updateStatus(Response.QUEUE_STATE_STRUCTUREANALYSIS); - in.document.notifyWebStructure(webStructure, in.condenser, in.queueEntry.lastModified()); - return in; - } - public void storeDocumentIndex(final indexingQueueEntry in) { in.queueEntry.updateStatus(Response.QUEUE_STATE_INDEXSTORAGE); storeDocumentIndex(in.process, in.queueEntry, in.document, in.condenser); @@ -1839,7 +1833,7 @@ public final class Switchboard extends serverSwitch { final Long resourceContentLength = (Long) resource[1]; // parse the resource - final Document document = Document.parseDocument(metadata.url(), resourceContentLength.longValue(), resourceContent); + final Document document = LoaderDispatcher.parseDocument(metadata.url(), resourceContentLength.longValue(), resourceContent); // get the word set Set words = null; diff --git a/source/de/anomic/search/TextSnippet.java b/source/de/anomic/search/TextSnippet.java index 88cadf65d..da5678838 100644 --- 
a/source/de/anomic/search/TextSnippet.java +++ b/source/de/anomic/search/TextSnippet.java @@ -35,6 +35,10 @@ import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; +import net.yacy.document.Condenser; +import net.yacy.document.Document; +import net.yacy.document.ParserException; +import net.yacy.document.parser.html.CharacterCoding; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; @@ -44,11 +48,8 @@ import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.Base64Order; import net.yacy.kelondro.util.SetTools; +import de.anomic.crawler.retrieval.LoaderDispatcher; import de.anomic.crawler.retrieval.Response; -import de.anomic.document.Condenser; -import de.anomic.document.Document; -import de.anomic.document.ParserException; -import de.anomic.document.parser.html.CharacterCoding; import de.anomic.http.client.Cache; import de.anomic.http.server.ResponseHeader; import de.anomic.yacy.yacySearch; @@ -387,7 +388,7 @@ public class TextSnippet { * =========================================================================== */ Document document = null; try { - document = Document.parseDocument(url, resContentLength, resContent, responseHeader); + document = LoaderDispatcher.parseDocument(url, resContentLength, resContent, responseHeader); } catch (final ParserException e) { return new TextSnippet(url, null, ERROR_PARSER_FAILED, queryhashes, e.getMessage()); // cannot be parsed } finally { diff --git a/source/de/anomic/server/serverObjects.java b/source/de/anomic/server/serverObjects.java index cba260223..98180fd14 100644 --- a/source/de/anomic/server/serverObjects.java +++ b/source/de/anomic/server/serverObjects.java @@ -52,10 +52,10 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.Map; +import net.yacy.document.parser.html.CharacterCoding; import net.yacy.kelondro.util.DateFormatter; import 
net.yacy.kelondro.util.Formatter; -import de.anomic.document.parser.html.CharacterCoding; import de.anomic.search.Switchboard; public class serverObjects extends HashMap implements Cloneable { diff --git a/source/de/anomic/tools/CryptoLib.java b/source/de/anomic/tools/CryptoLib.java index 172dd5bcd..fad015881 100644 --- a/source/de/anomic/tools/CryptoLib.java +++ b/source/de/anomic/tools/CryptoLib.java @@ -46,9 +46,9 @@ import java.security.spec.InvalidKeySpecException; import java.security.spec.PKCS8EncodedKeySpec; import java.security.spec.X509EncodedKeySpec; +import net.yacy.kelondro.io.CharBuffer; import net.yacy.kelondro.order.Base64Order; -import de.anomic.server.serverCharBuffer; /** * Tool functions to sign and verify files and generate keys @@ -141,7 +141,7 @@ public class CryptoLib { } else if(args[0].equals("--sign") && args.length==3) { CryptoLib cl = new CryptoLib(); - serverCharBuffer privKeyBuffer = new serverCharBuffer(new File(args[1])); + CharBuffer privKeyBuffer = new CharBuffer(new File(args[1])); byte[] privKeyByteBuffer = Base64Order.standardCoder.decode(privKeyBuffer.toString()); PrivateKey privKey = cl.getPrivateKeyFromBytes(privKeyByteBuffer); @@ -153,13 +153,13 @@ public class CryptoLib { signFile.close(); } else if(args[0].equals("--verify") && args.length==3) { CryptoLib cl = new CryptoLib(); - serverCharBuffer pubKeyBuffer = new serverCharBuffer(new File(args[1])); + CharBuffer pubKeyBuffer = new CharBuffer(new File(args[1])); byte[] pubKeyByteBuffer = Base64Order.standardCoder.decode(pubKeyBuffer.toString().trim()); PublicKey pubKey = cl.getPublicKeyFromBytes(pubKeyByteBuffer); FileInputStream dataStream = new FileInputStream(args[2]); - serverCharBuffer signBuffer = new serverCharBuffer(new File(args[2] + ".sig")); + CharBuffer signBuffer = new CharBuffer(new File(args[2] + ".sig")); byte[] signByteBuffer = Base64Order.standardCoder.decode(signBuffer.toString().trim()); if(cl.verifySignature(pubKey, dataStream, signByteBuffer)) { 
System.out.println("Signature OK!"); diff --git a/source/de/anomic/tools/mediawikiIndex.java b/source/de/anomic/tools/mediawikiIndex.java index 9ee2bb1a0..bf74c95ed 100644 --- a/source/de/anomic/tools/mediawikiIndex.java +++ b/source/de/anomic/tools/mediawikiIndex.java @@ -26,6 +26,9 @@ package de.anomic.tools; +import net.yacy.document.Document; +import net.yacy.document.Parser; +import net.yacy.document.ParserException; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.ByteBuffer; @@ -62,9 +65,6 @@ import java.util.concurrent.TimeoutException; import de.anomic.data.wiki.wikiCode; import de.anomic.data.wiki.wikiParser; -import de.anomic.document.Parser; -import de.anomic.document.ParserException; -import de.anomic.document.Document; /* * this class provides data structures to read a mediawiki dump file in xml format diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index 44b90d47d..399cf2cce 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -55,6 +55,8 @@ import java.util.List; import java.util.Map; import java.util.TreeMap; +import net.yacy.document.parser.xml.RSSFeed; +import net.yacy.document.parser.xml.RSSReader; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReference; @@ -73,8 +75,6 @@ import org.apache.commons.httpclient.methods.multipart.Part; import de.anomic.crawler.ResultURLs; import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.data.Blacklist; -import de.anomic.document.parser.xml.RSSFeed; -import de.anomic.document.parser.xml.RSSReader; import de.anomic.http.client.DefaultCharsetFilePart; import de.anomic.http.client.DefaultCharsetStringPart; import de.anomic.http.client.Client; diff --git a/source/de/anomic/yacy/yacyCore.java b/source/de/anomic/yacy/yacyCore.java index f75a44b55..adcf852a5 100644 --- 
a/source/de/anomic/yacy/yacyCore.java +++ b/source/de/anomic/yacy/yacyCore.java @@ -48,12 +48,12 @@ import java.util.LinkedList; import java.util.List; import java.util.Map; +import net.yacy.document.content.RSSMessage; +import net.yacy.document.parser.xml.RSSFeed; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.DateFormatter; -import de.anomic.content.RSSMessage; -import de.anomic.document.parser.xml.RSSFeed; import de.anomic.search.Switchboard; import de.anomic.server.serverCore; import de.anomic.server.serverSemaphore; diff --git a/source/de/anomic/yacy/yacyPeerActions.java b/source/de/anomic/yacy/yacyPeerActions.java index ed7b60793..1a995f05e 100644 --- a/source/de/anomic/yacy/yacyPeerActions.java +++ b/source/de/anomic/yacy/yacyPeerActions.java @@ -27,12 +27,12 @@ package de.anomic.yacy; import java.io.IOException; import java.util.HashMap; +import net.yacy.document.content.RSSMessage; +import net.yacy.document.parser.xml.RSSFeed; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.DateFormatter; import net.yacy.kelondro.util.MapTools; -import de.anomic.content.RSSMessage; -import de.anomic.document.parser.xml.RSSFeed; public class yacyPeerActions { diff --git a/source/de/anomic/yacy/yacyRelease.java b/source/de/anomic/yacy/yacyRelease.java index 6d8806ac9..879404dfa 100644 --- a/source/de/anomic/yacy/yacyRelease.java +++ b/source/de/anomic/yacy/yacyRelease.java @@ -44,20 +44,21 @@ import java.util.Map; import java.util.SortedSet; import java.util.TreeSet; +import net.yacy.document.parser.html.ContentScraper; import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.io.CharBuffer; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.Base64Order; import net.yacy.kelondro.util.FileUtils; import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.retrieval.HTTPLoader; -import de.anomic.document.parser.html.ContentScraper; +import 
de.anomic.crawler.retrieval.LoaderDispatcher; import de.anomic.http.client.Client; import de.anomic.http.server.HeaderFramework; import de.anomic.http.server.RequestHeader; import de.anomic.http.server.ResponseContainer; import de.anomic.search.Switchboard; -import de.anomic.server.serverCharBuffer; import de.anomic.server.serverCore; import de.anomic.server.serverSystem; import de.anomic.tools.CryptoLib; @@ -234,7 +235,7 @@ public final class yacyRelease extends yacyVersion { // returns the version info if successful, null otherwise ContentScraper scraper; try { - scraper = ContentScraper.parseResource(Switchboard.getSwitchboard().loader, location.getLocationURL(), CrawlProfile.CACHE_STRATEGY_NOCACHE); + scraper = LoaderDispatcher.parseResource(Switchboard.getSwitchboard().loader, location.getLocationURL(), CrawlProfile.CACHE_STRATEGY_NOCACHE); } catch (final IOException e) { return null; } @@ -359,8 +360,8 @@ public final class yacyRelease extends yacyVersion { public boolean checkSignature() { if(releaseFile != null) { try { - serverCharBuffer signBuffer; - signBuffer = new serverCharBuffer(getSignatureFile()); + CharBuffer signBuffer; + signBuffer = new CharBuffer(getSignatureFile()); byte[] signByteBuffer = Base64Order.standardCoder.decode(signBuffer.toString().trim()); CryptoLib cl = new CryptoLib(); for(yacyUpdateLocation updateLocation : latestReleaseLocations) { diff --git a/source/de/anomic/ymage/WebStructureGraph.java b/source/de/anomic/ymage/WebStructureGraph.java index fb8d1362e..cbe607773 100644 --- a/source/de/anomic/ymage/WebStructureGraph.java +++ b/source/de/anomic/ymage/WebStructureGraph.java @@ -37,6 +37,8 @@ import java.util.SortedMap; import java.util.TreeMap; import java.util.TreeSet; +import net.yacy.document.Condenser; +import net.yacy.document.Document; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.Base64Order; @@ -44,8 +46,6 @@ import net.yacy.kelondro.order.MicroDate; 
import net.yacy.kelondro.util.DateFormatter; import net.yacy.kelondro.util.FileUtils; -import de.anomic.document.Condenser; -import de.anomic.document.Document; public class WebStructureGraph { @@ -58,7 +58,7 @@ public class WebStructureGraph { private final Log log; private final File rankingPath, structureFile; private final String crlFile, crgFile; - TreeMap structure_old, structure_new; // ',' to {}* + private TreeMap structure_old, structure_new; // ',' to {}* public WebStructureGraph(final Log log, final File rankingPath, final String crlFile, final String crgFile, final File structureFile) { this.log = log; diff --git a/source/de/anomic/ymage/ymageGraph.java b/source/de/anomic/ymage/ymageGraph.java index b70c16569..e9bd57836 100644 --- a/source/de/anomic/ymage/ymageGraph.java +++ b/source/de/anomic/ymage/ymageGraph.java @@ -43,6 +43,13 @@ public class ymageGraph { // a ymageGraph is a set of points and borders between the points // to reference the points, they must all have a nickname + public static final long color_back = 0xFFFFFF; + public static final long color_text = 0x888888; + private static final long color_dot = 0x11BB11; + private static final long color_line = 0x222222; + private static final long color_lineend = 0x333333; + + HashMap points; HashSet borders; double leftmost, rightmost, topmost, bottommost; @@ -124,12 +131,6 @@ public class ymageGraph { } } - public static final long color_back = 0xFFFFFF; - public static final long color_text = 0xAAAAAA; - private static final long color_dot = 0x11CC11; - private static final long color_line = 0x333333; - private static final long color_lineend = 0x666666; - public ymageMatrix draw(final int width, final int height, final int leftborder, final int rightborder, final int topborder, final int bottomborder) { final ymageMatrix image = new ymageMatrix(width, height, ymageMatrix.MODE_SUB, color_back); final double xfactor = ((rightmost - leftmost) == 0.0) ? 
0.0 : (width - leftborder - rightborder) / (rightmost - leftmost); diff --git a/source/de/anomic/document/AbstractParser.java b/source/net/yacy/document/AbstractParser.java similarity index 93% rename from source/de/anomic/document/AbstractParser.java rename to source/net/yacy/document/AbstractParser.java index 0bdce9513..53e57db9b 100644 --- a/source/de/anomic/document/AbstractParser.java +++ b/source/net/yacy/document/AbstractParser.java @@ -23,7 +23,7 @@ //along with this program; if not, write to the Free Software //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document; +package net.yacy.document; import java.io.BufferedInputStream; import java.io.ByteArrayInputStream; @@ -39,7 +39,7 @@ import net.yacy.kelondro.workflow.WorkflowThread; /** - * New classes implementing the {@link de.anomic.document.Idiom} interface + * New classes implementing the {@link net.yacy.document.Idiom} interface * can extend this class to inherit all functions already implemented in this class. * @author Martin Thelian * @version $LastChangedRevision$ / $LastChangedDate$ @@ -148,7 +148,7 @@ public abstract class AbstractParser implements Idiom { * and some additional metadata. * @throws ParserException if the content could not be parsed properly * - * @see de.anomic.document.Idiom#parse(de.anomic.net.URL, java.lang.String, byte[]) + * @see net.yacy.document.Idiom#parse(de.anomic.net.URL, java.lang.String, byte[]) */ public Document parse( final DigestURI location, @@ -183,7 +183,7 @@ public abstract class AbstractParser implements Idiom { * and some additional metadata. 
* @throws ParserException if the content could not be parsed properly * - * @see de.anomic.document.Idiom#parse(de.anomic.net.URL, java.lang.String, java.io.File) + * @see net.yacy.document.Idiom#parse(de.anomic.net.URL, java.lang.String, java.io.File) */ public Document parse( final DigestURI location, @@ -218,7 +218,7 @@ public abstract class AbstractParser implements Idiom { * and some additional metadata. * @throws ParserException if the content could not be parsed properly * - * @see de.anomic.document.Idiom#parse(de.anomic.net.URL, java.lang.String, java.io.InputStream) + * @see net.yacy.document.Idiom#parse(de.anomic.net.URL, java.lang.String, java.io.InputStream) */ public abstract Document parse(DigestURI location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException; diff --git a/source/de/anomic/document/Classification.java b/source/net/yacy/document/Classification.java similarity index 99% rename from source/de/anomic/document/Classification.java rename to source/net/yacy/document/Classification.java index 5a319e5dd..7fe19d21e 100644 --- a/source/de/anomic/document/Classification.java +++ b/source/net/yacy/document/Classification.java @@ -22,7 +22,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document; +package net.yacy.document; import java.io.BufferedInputStream; import java.io.File; diff --git a/source/de/anomic/document/Condenser.java b/source/net/yacy/document/Condenser.java similarity index 97% rename from source/de/anomic/document/Condenser.java rename to source/net/yacy/document/Condenser.java index 0c395f4b6..d05eb9433 100644 --- a/source/de/anomic/document/Condenser.java +++ b/source/net/yacy/document/Condenser.java @@ -23,7 +23,7 @@ // compile with javac -sourcepath source source/de/anomic/plasma/plasmaCondenser.java // execute with java -cp source de.anomic.plasma.plasmaCondenser 
-package de.anomic.document; +package net.yacy.document; import java.io.BufferedReader; import java.io.ByteArrayInputStream; @@ -46,6 +46,9 @@ import java.util.Properties; import java.util.TreeMap; import java.util.TreeSet; +import net.yacy.document.language.Identificator; +import net.yacy.document.parser.html.ContentScraper; +import net.yacy.document.parser.html.ImageEntry; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReferenceRow; @@ -53,9 +56,6 @@ import net.yacy.kelondro.order.Base64Order; import net.yacy.kelondro.order.Bitfield; import net.yacy.kelondro.util.SetTools; -import de.anomic.document.language.Identificator; -import de.anomic.document.parser.html.ContentScraper; -import de.anomic.document.parser.html.ImageEntry; public final class Condenser { diff --git a/source/de/anomic/document/Document.java b/source/net/yacy/document/Document.java similarity index 84% rename from source/de/anomic/document/Document.java rename to source/net/yacy/document/Document.java index f81c3e457..6901031c5 100644 --- a/source/de/anomic/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -21,7 +21,7 @@ //along with this program; if not, write to the Free Software //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document; +package net.yacy.document; import java.io.BufferedInputStream; import java.io.ByteArrayInputStream; @@ -43,17 +43,13 @@ import java.util.Map; import java.util.Set; import java.util.TreeSet; +import net.yacy.document.parser.html.ContentScraper; +import net.yacy.document.parser.html.ImageEntry; import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.io.CachedFileOutputStream; import net.yacy.kelondro.util.DateFormatter; import net.yacy.kelondro.util.FileUtils; -import de.anomic.document.parser.html.ContentScraper; -import de.anomic.document.parser.html.ImageEntry; -import de.anomic.http.client.Cache; 
-import de.anomic.http.client.Client; -import de.anomic.http.server.ResponseHeader; -import de.anomic.server.serverCachedFileOutputStream; -import de.anomic.ymage.WebStructureGraph; public class Document { @@ -104,7 +100,7 @@ public class Document { this.languages = languages; if (text == null) try { - this.text = new serverCachedFileOutputStream(Idiom.MAX_KEEP_IN_MEMORY_SIZE); + this.text = new CachedFileOutputStream(Idiom.MAX_KEEP_IN_MEMORY_SIZE); } catch (final IOException e) { e.printStackTrace(); this.text = new StringBuilder(); @@ -134,7 +130,7 @@ public class Document { public Document(final DigestURI location, final String mimeType, final String charset, final Set languages, final String[] keywords, final String title, final String author, final String[] sections, final String abstrct, - final serverCachedFileOutputStream text, final Map anchors, final HashMap images) { + final CachedFileOutputStream text, final Map anchors, final HashMap images) { this(location, mimeType, charset, languages, keywords, title, author, sections, abstrct, (Object)text, anchors, images); } @@ -251,8 +247,8 @@ dc_rights this.textStream = new BufferedInputStream(new FileInputStream((File)this.text)); } else if (this.text instanceof byte[]) { this.textStream = new ByteArrayInputStream((byte[])this.text); - } else if (this.text instanceof serverCachedFileOutputStream) { - return ((serverCachedFileOutputStream)this.text).getContent(); + } else if (this.text instanceof CachedFileOutputStream) { + return ((CachedFileOutputStream)this.text).getContent(); } return this.textStream; } catch (final Exception e) { @@ -269,8 +265,8 @@ dc_rights return FileUtils.read((File)this.text); } else if (this.text instanceof byte[]) { return (byte[])this.text; - } else if (this.text instanceof serverCachedFileOutputStream) { - final serverCachedFileOutputStream ffbaos = (serverCachedFileOutputStream)this.text; + } else if (this.text instanceof CachedFileOutputStream) { + final CachedFileOutputStream 
ffbaos = (CachedFileOutputStream)this.text; if (ffbaos.isFallback()) { return FileUtils.read(ffbaos.getContent()); } @@ -286,8 +282,8 @@ dc_rights if (this.text == null) return 0; if (this.text instanceof File) return ((File)this.text).length(); else if (this.text instanceof byte[]) return ((byte[])this.text).length; - else if (this.text instanceof serverCachedFileOutputStream) { - return ((serverCachedFileOutputStream)this.text).getLength(); + else if (this.text instanceof CachedFileOutputStream) { + return ((CachedFileOutputStream)this.text).getLength(); } return -1; @@ -525,11 +521,11 @@ dc_rights if (this.description.length() > 0) this.description.append('\n'); this.description.append(doc.dc_description()); - if (!(this.text instanceof serverCachedFileOutputStream)) { - this.text = new serverCachedFileOutputStream(Idiom.MAX_KEEP_IN_MEMORY_SIZE); - FileUtils.copy(getText(), (serverCachedFileOutputStream)this.text); + if (!(this.text instanceof CachedFileOutputStream)) { + this.text = new CachedFileOutputStream(Idiom.MAX_KEEP_IN_MEMORY_SIZE); + FileUtils.copy(getText(), (CachedFileOutputStream)this.text); } - FileUtils.copy(doc.getText(), (serverCachedFileOutputStream)this.text); + FileUtils.copy(doc.getText(), (CachedFileOutputStream)this.text); anchors.putAll(doc.getAnchors()); ContentScraper.addAllImages(images, doc.getImages()); @@ -549,12 +545,6 @@ dc_rights this.favicon = faviconURL; } - public void notifyWebStructure(final WebStructureGraph webStructure, final Condenser condenser, final Date docDate) { - final Integer[] ioLinks = webStructure.generateCitationReference(this, condenser, docDate); // [outlinksSame, outlinksOther] - this.inboundLinks = ioLinks[0].intValue(); - this.outboundLinks = ioLinks[1].intValue(); - } - public int inboundLinks() { return (this.inboundLinks < 0) ? 
0 : this.inboundLinks; } @@ -608,61 +598,5 @@ dc_rights this.close(); super.finalize(); } - - /** - * Parse the resource - * @param url the URL of the resource - * @param contentLength the contentLength of the resource - * @param resourceStream the resource body as stream - * @param docInfo metadata about the resource - * @return the extracted data - * @throws ParserException - */ - public static Document parseDocument(final DigestURI url, final long contentLength, final InputStream resourceStream, ResponseHeader responseHeader) throws ParserException { - try { - if (resourceStream == null) return null; - - // STEP 1: if no resource metadata is available, try to load it from cache - if (responseHeader == null) { - // try to get the header from the htcache directory - try { - responseHeader = Cache.getResponseHeader(url); - } catch (final Exception e) { - // ignore this. resource info loading failed - } - } - - // STEP 2: if the metadata is still null try to download it from web - if ((responseHeader == null) && (url.getProtocol().startsWith("http"))) { - // TODO: we need a better solution here - // e.g. encapsulate this in the crawlLoader class - - // getting URL mimeType - try { - responseHeader = Client.whead(url.toString()); - } catch (final Exception e) { - // ingore this. http header download failed - } - } - - // STEP 3: if the metadata is still null try to guess the mimeType of the resource - String supportError = Parser.supports(url, responseHeader == null ? 
null : responseHeader.mime()); - if (supportError != null) { - return null; - } - if (responseHeader == null) { - return Parser.parseSource(url, null, null, contentLength, resourceStream); - } - return Parser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), contentLength, resourceStream); - } catch (final InterruptedException e) { - // interruption of thread detected - return null; - } - } - - public static Document parseDocument(final DigestURI url, final long contentLength, final InputStream resourceStream) throws ParserException { - return parseDocument(url, contentLength, resourceStream, null); - } - } diff --git a/source/de/anomic/document/Idiom.java b/source/net/yacy/document/Idiom.java similarity index 96% rename from source/de/anomic/document/Idiom.java rename to source/net/yacy/document/Idiom.java index aa5d61c5f..37f99d6a6 100644 --- a/source/de/anomic/document/Idiom.java +++ b/source/net/yacy/document/Idiom.java @@ -23,7 +23,7 @@ //along with this program; if not, write to the Free Software //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document; +package net.yacy.document; import java.io.File; import java.io.InputStream; diff --git a/source/de/anomic/document/Parser.java b/source/net/yacy/document/Parser.java similarity index 94% rename from source/de/anomic/document/Parser.java rename to source/net/yacy/document/Parser.java index c86937528..5ea36e841 100644 --- a/source/de/anomic/document/Parser.java +++ b/source/net/yacy/document/Parser.java @@ -24,7 +24,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document; +package net.yacy.document; import java.io.BufferedInputStream; import java.io.ByteArrayInputStream; @@ -39,27 +39,27 @@ import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; +import net.yacy.document.parser.bzipParser; +import 
net.yacy.document.parser.docParser; +import net.yacy.document.parser.gzipParser; +import net.yacy.document.parser.htmlParser; +import net.yacy.document.parser.odtParser; +import net.yacy.document.parser.ooxmlParser; +import net.yacy.document.parser.pdfParser; +import net.yacy.document.parser.pptParser; +import net.yacy.document.parser.psParser; +import net.yacy.document.parser.rssParser; +import net.yacy.document.parser.rtfParser; +import net.yacy.document.parser.sevenzipParser; +import net.yacy.document.parser.swfParser; +import net.yacy.document.parser.tarParser; +import net.yacy.document.parser.vcfParser; +import net.yacy.document.parser.vsdParser; +import net.yacy.document.parser.xlsParser; +import net.yacy.document.parser.zipParser; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; -import de.anomic.document.parser.bzipParser; -import de.anomic.document.parser.docParser; -import de.anomic.document.parser.gzipParser; -import de.anomic.document.parser.htmlParser; -import de.anomic.document.parser.odtParser; -import de.anomic.document.parser.ooxmlParser; -import de.anomic.document.parser.pdfParser; -import de.anomic.document.parser.pptParser; -import de.anomic.document.parser.psParser; -import de.anomic.document.parser.rssParser; -import de.anomic.document.parser.rtfParser; -import de.anomic.document.parser.sevenzipParser; -import de.anomic.document.parser.swfParser; -import de.anomic.document.parser.tarParser; -import de.anomic.document.parser.vcfParser; -import de.anomic.document.parser.vsdParser; -import de.anomic.document.parser.xlsParser; -import de.anomic.document.parser.zipParser; public final class Parser { diff --git a/source/de/anomic/document/ParserException.java b/source/net/yacy/document/ParserException.java similarity index 95% rename from source/de/anomic/document/ParserException.java rename to source/net/yacy/document/ParserException.java index 9bbfa60ee..9a53f54e4 100644 --- 
a/source/de/anomic/document/ParserException.java +++ b/source/net/yacy/document/ParserException.java @@ -22,7 +22,7 @@ //along with this program; if not, write to the Free Software //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document; +package net.yacy.document; import net.yacy.kelondro.data.meta.DigestURI; diff --git a/source/de/anomic/document/Phrase.java b/source/net/yacy/document/Phrase.java similarity index 95% rename from source/de/anomic/document/Phrase.java rename to source/net/yacy/document/Phrase.java index 863adf1de..87d1d4ae5 100644 --- a/source/de/anomic/document/Phrase.java +++ b/source/net/yacy/document/Phrase.java @@ -24,7 +24,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document; +package net.yacy.document; import java.util.HashSet; diff --git a/source/de/anomic/content/DCEntry.java b/source/net/yacy/document/content/DCEntry.java similarity index 95% rename from source/de/anomic/content/DCEntry.java rename to source/net/yacy/document/content/DCEntry.java index 98bfcb200..bb07bcf01 100644 --- a/source/de/anomic/content/DCEntry.java +++ b/source/net/yacy/document/content/DCEntry.java @@ -23,7 +23,7 @@ // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.content; +package net.yacy.document.content; import java.io.IOException; import java.io.OutputStreamWriter; @@ -36,10 +36,10 @@ import java.util.HashSet; import java.util.Locale; import java.util.TreeMap; +import net.yacy.document.Document; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.DateFormatter; -import de.anomic.document.Document; public class DCEntry extends TreeMap { diff --git a/source/de/anomic/content/RSSMessage.java b/source/net/yacy/document/content/RSSMessage.java similarity index 99% rename from source/de/anomic/content/RSSMessage.java rename to 
source/net/yacy/document/content/RSSMessage.java index 9b45dcd1b..8a777d640 100644 --- a/source/de/anomic/content/RSSMessage.java +++ b/source/net/yacy/document/content/RSSMessage.java @@ -25,7 +25,7 @@ // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.content; +package net.yacy.document.content; import java.util.Date; import java.util.HashMap; diff --git a/source/de/anomic/content/dao/Dao.java b/source/net/yacy/document/content/dao/Dao.java similarity index 94% rename from source/de/anomic/content/dao/Dao.java rename to source/net/yacy/document/content/dao/Dao.java index 7c5305f2f..1455f14fc 100644 --- a/source/de/anomic/content/dao/Dao.java +++ b/source/net/yacy/document/content/dao/Dao.java @@ -22,14 +22,15 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.content.dao; +package net.yacy.document.content.dao; import java.io.File; import java.sql.SQLException; import java.util.Date; import java.util.concurrent.BlockingQueue; -import de.anomic.content.DCEntry; +import net.yacy.document.content.DCEntry; + /* * Database Access Objects are used to get a normalized view on database objects with java objects diff --git a/source/de/anomic/content/dao/DatabaseConnection.java b/source/net/yacy/document/content/dao/DatabaseConnection.java similarity index 98% rename from source/de/anomic/content/dao/DatabaseConnection.java rename to source/net/yacy/document/content/dao/DatabaseConnection.java index 8b6f8bf91..ea5ef4240 100644 --- a/source/de/anomic/content/dao/DatabaseConnection.java +++ b/source/net/yacy/document/content/dao/DatabaseConnection.java @@ -22,7 +22,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.content.dao; +package net.yacy.document.content.dao; import java.sql.Connection; import 
java.sql.DriverManager; diff --git a/source/de/anomic/content/dao/ImportDump.java b/source/net/yacy/document/content/dao/ImportDump.java similarity index 98% rename from source/de/anomic/content/dao/ImportDump.java rename to source/net/yacy/document/content/dao/ImportDump.java index 4e3e3d994..b0111a79b 100644 --- a/source/de/anomic/content/dao/ImportDump.java +++ b/source/net/yacy/document/content/dao/ImportDump.java @@ -22,7 +22,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.content.dao; +package net.yacy.document.content.dao; import java.io.ByteArrayOutputStream; import java.io.File; diff --git a/source/de/anomic/content/dao/PhpBB3Dao.java b/source/net/yacy/document/content/dao/PhpBB3Dao.java similarity index 96% rename from source/de/anomic/content/dao/PhpBB3Dao.java rename to source/net/yacy/document/content/dao/PhpBB3Dao.java index 5a822d3a0..44bc6db62 100644 --- a/source/de/anomic/content/dao/PhpBB3Dao.java +++ b/source/net/yacy/document/content/dao/PhpBB3Dao.java @@ -22,7 +22,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.content.dao; +package net.yacy.document.content.dao; import java.io.BufferedOutputStream; import java.io.File; @@ -39,9 +39,9 @@ import java.util.HashMap; import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.BlockingQueue; +import net.yacy.document.content.DCEntry; import net.yacy.kelondro.data.meta.DigestURI; -import de.anomic.content.DCEntry; public class PhpBB3Dao implements Dao { diff --git a/source/de/anomic/content/file/SurrogateReader.java b/source/net/yacy/document/content/file/SurrogateReader.java similarity index 96% rename from source/de/anomic/content/file/SurrogateReader.java rename to source/net/yacy/document/content/file/SurrogateReader.java index 8bb0f15dd..0a9eda16e 100644 
--- a/source/de/anomic/content/file/SurrogateReader.java +++ b/source/net/yacy/document/content/file/SurrogateReader.java @@ -22,7 +22,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.content.file; +package net.yacy.document.content.file; import java.io.BufferedInputStream; import java.io.File; @@ -36,11 +36,12 @@ import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; +import net.yacy.document.content.DCEntry; + import org.xml.sax.Attributes; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; -import de.anomic.content.DCEntry; public class SurrogateReader extends DefaultHandler implements Runnable { diff --git a/source/de/anomic/document/detector/odtDetector.java b/source/net/yacy/document/detector/odtDetector.java similarity index 96% rename from source/de/anomic/document/detector/odtDetector.java rename to source/net/yacy/document/detector/odtDetector.java index ac42554b3..6ac9893e7 100644 --- a/source/de/anomic/document/detector/odtDetector.java +++ b/source/net/yacy/document/detector/odtDetector.java @@ -22,7 +22,7 @@ //along with this program; if not, write to the Free Software //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document.detector; +package net.yacy.document.detector; import java.io.File; import java.io.IOException; diff --git a/source/de/anomic/document/detector/rssDetector.java b/source/net/yacy/document/detector/rssDetector.java similarity index 95% rename from source/de/anomic/document/detector/rssDetector.java rename to source/net/yacy/document/detector/rssDetector.java index 79063f20e..d67b10dfc 100644 --- a/source/de/anomic/document/detector/rssDetector.java +++ b/source/net/yacy/document/detector/rssDetector.java @@ -22,7 +22,7 @@ //along with this program; if not, write to the Free 
Software //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document.detector; +package net.yacy.document.detector; import java.io.ByteArrayInputStream; import java.io.File; diff --git a/source/de/anomic/document/language/Identificator.java b/source/net/yacy/document/language/Identificator.java similarity index 99% rename from source/de/anomic/document/language/Identificator.java rename to source/net/yacy/document/language/Identificator.java index 80b0b3089..dddb6517f 100644 --- a/source/de/anomic/document/language/Identificator.java +++ b/source/net/yacy/document/language/Identificator.java @@ -22,7 +22,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document.language; +package net.yacy.document.language; import java.util.HashMap; import java.util.Iterator; diff --git a/source/de/anomic/document/language/LanguageFilenameFilter.java b/source/net/yacy/document/language/LanguageFilenameFilter.java similarity index 97% rename from source/de/anomic/document/language/LanguageFilenameFilter.java rename to source/net/yacy/document/language/LanguageFilenameFilter.java index 8fb06cdd0..6ded9040b 100644 --- a/source/de/anomic/document/language/LanguageFilenameFilter.java +++ b/source/net/yacy/document/language/LanguageFilenameFilter.java @@ -22,7 +22,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document.language; +package net.yacy.document.language; import java.io.File; import java.io.FilenameFilter; diff --git a/source/de/anomic/document/language/LanguageStatistics.java b/source/net/yacy/document/language/LanguageStatistics.java similarity index 99% rename from source/de/anomic/document/language/LanguageStatistics.java rename to source/net/yacy/document/language/LanguageStatistics.java index 
b5348c525..76ba979d4 100644 --- a/source/de/anomic/document/language/LanguageStatistics.java +++ b/source/net/yacy/document/language/LanguageStatistics.java @@ -22,7 +22,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document.language; +package net.yacy.document.language; import java.io.BufferedReader; import java.io.File; diff --git a/source/de/anomic/document/language/LanguageStatisticsHolder.java b/source/net/yacy/document/language/LanguageStatisticsHolder.java similarity index 98% rename from source/de/anomic/document/language/LanguageStatisticsHolder.java rename to source/net/yacy/document/language/LanguageStatisticsHolder.java index ccdda4bf4..3de650a0d 100644 --- a/source/de/anomic/document/language/LanguageStatisticsHolder.java +++ b/source/net/yacy/document/language/LanguageStatisticsHolder.java @@ -22,7 +22,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document.language; +package net.yacy.document.language; import java.io.File; import java.io.FilenameFilter; diff --git a/source/de/anomic/document/parser/bzipParser.java b/source/net/yacy/document/parser/bzipParser.java similarity index 92% rename from source/de/anomic/document/parser/bzipParser.java rename to source/net/yacy/document/parser/bzipParser.java index 2419cf971..c85b45ee1 100644 --- a/source/de/anomic/document/parser/bzipParser.java +++ b/source/net/yacy/document/parser/bzipParser.java @@ -25,7 +25,7 @@ //along with this program; if not, write to the Free Software //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document.parser; +package net.yacy.document.parser; import java.io.File; import java.io.FileOutputStream; @@ -33,16 +33,16 @@ import java.io.InputStream; import java.util.HashSet; import java.util.Set; +import 
net.yacy.document.AbstractParser; +import net.yacy.document.Document; +import net.yacy.document.Idiom; +import net.yacy.document.Parser; +import net.yacy.document.ParserException; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.FileUtils; import org.apache.tools.bzip2.CBZip2InputStream; -import de.anomic.document.AbstractParser; -import de.anomic.document.Idiom; -import de.anomic.document.Parser; -import de.anomic.document.ParserException; -import de.anomic.document.Document; public class bzipParser extends AbstractParser implements Idiom { diff --git a/source/de/anomic/document/parser/docParser.java b/source/net/yacy/document/parser/docParser.java similarity index 93% rename from source/de/anomic/document/parser/docParser.java rename to source/net/yacy/document/parser/docParser.java index 6f73d5005..c5e71aa48 100644 --- a/source/de/anomic/document/parser/docParser.java +++ b/source/net/yacy/document/parser/docParser.java @@ -25,17 +25,17 @@ //along with this program; if not, write to the Free Software //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document.parser; +package net.yacy.document.parser; import java.io.InputStream; import java.io.UnsupportedEncodingException; import java.util.HashSet; import java.util.Set; -import de.anomic.document.AbstractParser; -import de.anomic.document.Idiom; -import de.anomic.document.ParserException; -import de.anomic.document.Document; +import net.yacy.document.AbstractParser; +import net.yacy.document.Document; +import net.yacy.document.Idiom; +import net.yacy.document.ParserException; import net.yacy.kelondro.data.meta.DigestURI; import org.apache.poi.hwpf.extractor.WordExtractor; diff --git a/source/de/anomic/document/parser/gzipParser.java b/source/net/yacy/document/parser/gzipParser.java similarity index 92% rename from source/de/anomic/document/parser/gzipParser.java rename to source/net/yacy/document/parser/gzipParser.java index 
d99cae6d6..10b528187 100644 --- a/source/de/anomic/document/parser/gzipParser.java +++ b/source/net/yacy/document/parser/gzipParser.java @@ -25,7 +25,7 @@ //along with this program; if not, write to the Free Software //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document.parser; +package net.yacy.document.parser; import java.io.File; import java.io.FileOutputStream; @@ -34,14 +34,14 @@ import java.util.HashSet; import java.util.Set; import java.util.zip.GZIPInputStream; +import net.yacy.document.AbstractParser; +import net.yacy.document.Document; +import net.yacy.document.Idiom; +import net.yacy.document.Parser; +import net.yacy.document.ParserException; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.FileUtils; -import de.anomic.document.AbstractParser; -import de.anomic.document.Idiom; -import de.anomic.document.Parser; -import de.anomic.document.ParserException; -import de.anomic.document.Document; public class gzipParser extends AbstractParser implements Idiom { diff --git a/source/de/anomic/document/parser/html/AbstractScraper.java b/source/net/yacy/document/parser/html/AbstractScraper.java similarity index 95% rename from source/de/anomic/document/parser/html/AbstractScraper.java rename to source/net/yacy/document/parser/html/AbstractScraper.java index b9ccfe244..bd8a515c0 100644 --- a/source/de/anomic/document/parser/html/AbstractScraper.java +++ b/source/net/yacy/document/parser/html/AbstractScraper.java @@ -27,7 +27,7 @@ // MUST CONTAIN THE INTACT AND UNCHANGED COPYRIGHT NOTICE. // CONTRIBUTIONS AND CHANGES TO THE PROGRAM CODE SHOULD BE MARKED AS SUCH. 
-package de.anomic.document.parser.html; +package net.yacy.document.parser.html; import java.util.HashSet; import java.util.Properties; diff --git a/source/de/anomic/document/parser/html/AbstractTransformer.java b/source/net/yacy/document/parser/html/AbstractTransformer.java similarity index 95% rename from source/de/anomic/document/parser/html/AbstractTransformer.java rename to source/net/yacy/document/parser/html/AbstractTransformer.java index ade4d8000..16e39ab52 100644 --- a/source/de/anomic/document/parser/html/AbstractTransformer.java +++ b/source/net/yacy/document/parser/html/AbstractTransformer.java @@ -22,7 +22,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document.parser.html; +package net.yacy.document.parser.html; import java.util.Properties; import java.util.TreeSet; diff --git a/source/de/anomic/document/parser/html/CharacterCoding.java b/source/net/yacy/document/parser/html/CharacterCoding.java similarity index 99% rename from source/de/anomic/document/parser/html/CharacterCoding.java rename to source/net/yacy/document/parser/html/CharacterCoding.java index 82225679e..ac98ef929 100644 --- a/source/de/anomic/document/parser/html/CharacterCoding.java +++ b/source/net/yacy/document/parser/html/CharacterCoding.java @@ -22,7 +22,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document.parser.html; +package net.yacy.document.parser.html; import java.util.HashMap; diff --git a/source/de/anomic/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java similarity index 92% rename from source/de/anomic/document/parser/html/ContentScraper.java rename to source/net/yacy/document/parser/html/ContentScraper.java index 42c3d2f25..394011a4f 100644 --- 
a/source/de/anomic/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -24,7 +24,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document.parser.html; +package net.yacy.document.parser.html; import java.io.ByteArrayInputStream; import java.io.File; @@ -43,14 +43,12 @@ import java.util.Properties; import javax.swing.event.EventListenerList; +import net.yacy.document.parser.htmlParser; import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.io.CharBuffer; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.ISO639; -import de.anomic.crawler.retrieval.LoaderDispatcher; -import de.anomic.crawler.retrieval.Response; -import de.anomic.document.parser.htmlParser; -import de.anomic.server.serverCharBuffer; public class ContentScraper extends AbstractScraper implements Scraper { @@ -84,7 +82,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { private String title; //private String headline; private List[] headlines; - private serverCharBuffer content; + private CharBuffer content; private final EventListenerList htmlFilterEventListeners; /** @@ -109,7 +107,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { this.title = ""; this.headlines = new ArrayList[4]; for (int i = 0; i < 4; i++) headlines[i] = new ArrayList(); - this.content = new serverCharBuffer(1024); + this.content = new CharBuffer(1024); this.htmlFilterEventListeners = new EventListenerList(); } @@ -504,20 +502,6 @@ public class ContentScraper extends AbstractScraper implements Scraper { return scraper; } - public static ContentScraper parseResource(final LoaderDispatcher loader, final DigestURI location, int cachePolicy) throws IOException { - // load page - Response r = loader.load(location, true, false, cachePolicy); - byte[] page = (r == null) ? 
null : r.getContent(); - if (page == null) throw new IOException("no response from url " + location.toString()); - - // scrape content - final ContentScraper scraper = new ContentScraper(location); - final Writer writer = new TransformerWriter(null, null, scraper, null, false); - writer.write(new String(page, "UTF-8")); - - return scraper; - } - public static void addAllImages(final HashMap a, final HashMap b) { final Iterator> i = b.entrySet().iterator(); Map.Entry ie; diff --git a/source/de/anomic/document/parser/html/ContentTransformer.java b/source/net/yacy/document/parser/html/ContentTransformer.java similarity index 61% rename from source/de/anomic/document/parser/html/ContentTransformer.java rename to source/net/yacy/document/parser/html/ContentTransformer.java index dcbc894a2..2fd53cf1a 100644 --- a/source/de/anomic/document/parser/html/ContentTransformer.java +++ b/source/net/yacy/document/parser/html/ContentTransformer.java @@ -22,7 +22,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document.parser.html; +package net.yacy.document.parser.html; import java.io.BufferedReader; import java.io.File; @@ -30,18 +30,48 @@ import java.io.FileReader; import java.io.IOException; import java.text.Collator; import java.util.ArrayList; +import java.util.List; import java.util.Locale; import java.util.Properties; import java.util.TreeSet; +import net.yacy.kelondro.io.CharBuffer; import net.yacy.kelondro.util.ByteBuffer; -import de.anomic.http.server.TemplateEngine; -import de.anomic.server.serverCharBuffer; - public class ContentTransformer extends AbstractTransformer implements Transformer { - // statics: for initialisation of the HTMLFilterAbstractTransformer + public final static byte hashChar = (byte)'#'; + public final static byte[] slashChar = {(byte)'/'}; + public final static byte pcChar = (byte)'%'; + public final static byte[] dpdpa = 
"::".getBytes(); + + public final static byte lbr = (byte)'['; + public final static byte rbr = (byte)']'; + public final static byte[] pOpen = {hashChar, lbr}; + public final static byte[] pClose = {rbr, hashChar}; + + public final static byte lcbr = (byte)'{'; + public final static byte rcbr = (byte)'}'; + public final static byte[] mOpen = {hashChar, lcbr}; + public final static byte[] mClose = {rcbr, hashChar}; + + public final static byte lrbr = (byte)'('; + public final static byte rrbr = (byte)')'; + public final static byte[] aOpen = {hashChar, lrbr}; + public final static byte[] aClose = {rrbr, hashChar}; + + public final static byte[] iOpen = {hashChar, pcChar}; + public final static byte[] iClose = {pcChar, hashChar}; + + + private final static Object[] meta_quotation = new Object[] { + new Object[] {pOpen, pClose}, + new Object[] {mOpen, mClose}, + new Object[] {aOpen, aClose}, + new Object[] {iOpen, iClose} + }; + + // statics: for initialization of the HTMLFilterAbstractTransformer private static final Collator insensitiveCollator = Collator.getInstance(Locale.US); private static final TreeSet linkTags0 = new TreeSet(insensitiveCollator);; private static final TreeSet linkTags1 = new TreeSet(insensitiveCollator);; @@ -89,7 +119,7 @@ public class ContentTransformer extends AbstractTransformer implements Transform } private static char[] genBlueLetters(int length) { - final serverCharBuffer bb = new serverCharBuffer(" ".toCharArray()); + final CharBuffer bb = new CharBuffer(" ".toCharArray()); length = length / 2; if (length > 10) length = 7; while (length-- > 0) { @@ -118,13 +148,13 @@ public class ContentTransformer extends AbstractTransformer implements Transform final ArrayList result = new ArrayList(); final ByteBuffer sbb = new ByteBuffer(text); - final ByteBuffer[] sbbs = TemplateEngine.splitQuotations(sbb); + final ByteBuffer[] sbbs = splitQuotations(sbb); for (int i = 0; i < sbbs.length; i++) { // TODO: avoid empty if statements if 
(sbbs[i].isWhitespace(true)) { //sbb.append(sbbs[i]); - } else if ((sbbs[i].byteAt(0) == TemplateEngine.hash) || - (sbbs[i].startsWith(TemplateEngine.dpdpa))) { + } else if ((sbbs[i].byteAt(0) == hashChar) || + (sbbs[i].startsWith(dpdpa))) { // this is a template or a part of a template //sbb.append(sbbs[i]); } else { @@ -136,6 +166,52 @@ public class ContentTransformer extends AbstractTransformer implements Transform } return result; } + + public final static ByteBuffer[] splitQuotations(final ByteBuffer text) { + final List l = splitQuotation(text, 0); + final ByteBuffer[] sbbs = new ByteBuffer[l.size()]; + for (int i = 0; i < l.size(); i++) sbbs[i] = l.get(i); + return sbbs; + } + + private final static List splitQuotation(ByteBuffer text, int qoff) { + final ArrayList l = new ArrayList(); + if (qoff >= meta_quotation.length) { + if (text.length() > 0) l.add(text); + return l; + } + int p = -1, q; + final byte[] left = (byte[]) ((Object[]) meta_quotation[qoff])[0]; + final byte[] right = (byte[]) ((Object[]) meta_quotation[qoff])[1]; + qoff++; + while ((text.length() > 0) && ((p = text.indexOf(left)) >= 0)) { + q = text.indexOf(right, p + 1); + if (q >= 0) { + // found a pattern + l.addAll(splitQuotation(new ByteBuffer(text.getBytes(0, p)), qoff)); + l.add(new ByteBuffer(text.getBytes(p, q + right.length - p))); + text = new ByteBuffer(text.getBytes(q + right.length)); + } else { + // found only pattern start, no closing parantesis (a syntax error that is silently accepted here) + l.addAll(splitQuotation(new ByteBuffer(text.getBytes(0, p)), qoff)); + l.addAll(splitQuotation(new ByteBuffer(text.getBytes(p)), qoff)); + text.clear(); + } + } + + // find double-points + while ((text.length() > 0) && ((p = text.indexOf(dpdpa)) >= 0)) { + l.addAll(splitQuotation(new ByteBuffer(text.getBytes(0, p)), qoff)); + l.add(new ByteBuffer(dpdpa)); + l.addAll(splitQuotation(new ByteBuffer(text.getBytes(p + 2)), qoff)); + text.clear(); + } + + // add remaining + if (text.length() 
> 0) l.addAll(splitQuotation(text, qoff)); + return l; + } + public char[] transformText(final char[] text) { if (bluelist != null) { if (bluelistHit(text)) { diff --git a/source/de/anomic/document/parser/html/ImageEntry.java b/source/net/yacy/document/parser/html/ImageEntry.java similarity index 96% rename from source/de/anomic/document/parser/html/ImageEntry.java rename to source/net/yacy/document/parser/html/ImageEntry.java index 76d242cea..b7bf1cc20 100644 --- a/source/de/anomic/document/parser/html/ImageEntry.java +++ b/source/net/yacy/document/parser/html/ImageEntry.java @@ -22,7 +22,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document.parser.html; +package net.yacy.document.parser.html; import net.yacy.kelondro.data.meta.DigestURI; diff --git a/source/de/anomic/document/parser/html/Scraper.java b/source/net/yacy/document/parser/html/Scraper.java similarity index 94% rename from source/de/anomic/document/parser/html/Scraper.java rename to source/net/yacy/document/parser/html/Scraper.java index 82820ce69..730c56478 100644 --- a/source/de/anomic/document/parser/html/Scraper.java +++ b/source/net/yacy/document/parser/html/Scraper.java @@ -22,7 +22,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document.parser.html; +package net.yacy.document.parser.html; import java.util.Properties; diff --git a/source/de/anomic/document/parser/html/ScraperInputStream.java b/source/net/yacy/document/parser/html/ScraperInputStream.java similarity index 96% rename from source/de/anomic/document/parser/html/ScraperInputStream.java rename to source/net/yacy/document/parser/html/ScraperInputStream.java index 335953da1..0316abffb 100644 --- a/source/de/anomic/document/parser/html/ScraperInputStream.java +++ 
b/source/net/yacy/document/parser/html/ScraperInputStream.java @@ -24,7 +24,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document.parser.html; +package net.yacy.document.parser.html; import java.io.BufferedInputStream; import java.io.IOException; diff --git a/source/de/anomic/document/parser/html/ScraperListener.java b/source/net/yacy/document/parser/html/ScraperListener.java similarity index 94% rename from source/de/anomic/document/parser/html/ScraperListener.java rename to source/net/yacy/document/parser/html/ScraperListener.java index 3c8bf0b0d..8ac079797 100644 --- a/source/de/anomic/document/parser/html/ScraperListener.java +++ b/source/net/yacy/document/parser/html/ScraperListener.java @@ -22,7 +22,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document.parser.html; +package net.yacy.document.parser.html; import java.util.Properties; diff --git a/source/de/anomic/document/parser/html/Transformer.java b/source/net/yacy/document/parser/html/Transformer.java similarity index 95% rename from source/de/anomic/document/parser/html/Transformer.java rename to source/net/yacy/document/parser/html/Transformer.java index ef40c91df..2aedfa120 100644 --- a/source/de/anomic/document/parser/html/Transformer.java +++ b/source/net/yacy/document/parser/html/Transformer.java @@ -22,7 +22,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document.parser.html; +package net.yacy.document.parser.html; import java.util.Properties; diff --git a/source/de/anomic/document/parser/html/TransformerWriter.java b/source/net/yacy/document/parser/html/TransformerWriter.java similarity index 93% rename from 
source/de/anomic/document/parser/html/TransformerWriter.java rename to source/net/yacy/document/parser/html/TransformerWriter.java index f91b6c12f..81079d73b 100644 --- a/source/de/anomic/document/parser/html/TransformerWriter.java +++ b/source/net/yacy/document/parser/html/TransformerWriter.java @@ -29,7 +29,7 @@ */ -package de.anomic.document.parser.html; +package net.yacy.document.parser.html; import java.io.File; import java.io.FileOutputStream; @@ -45,8 +45,8 @@ import java.util.Enumeration; import java.util.Properties; import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.io.CharBuffer; -import de.anomic.server.serverCharBuffer; public final class TransformerWriter extends Writer { @@ -59,10 +59,10 @@ public final class TransformerWriter extends Writer { private final OutputStream outStream; private OutputStreamWriter out; - private serverCharBuffer buffer; + private CharBuffer buffer; private String filterTag; private Properties filterOpts; - private serverCharBuffer filterCont; + private CharBuffer filterCont; private final Scraper scraper; private final Transformer transformer; private boolean inSingleQuote; @@ -83,7 +83,7 @@ public final class TransformerWriter extends Writer { this.outStream = outStream; this.scraper = scraper; this.transformer = transformer; - this.buffer = new serverCharBuffer(1024); + this.buffer = new CharBuffer(1024); this.filterTag = null; this.filterOpts = null; this.filterCont = null; @@ -101,7 +101,7 @@ public final class TransformerWriter extends Writer { } public static char[] genTag0raw(final String tagname, final boolean opening, final char[] tagopts) { - final serverCharBuffer bb = new serverCharBuffer(tagname.length() + tagopts.length + 3); + final CharBuffer bb = new CharBuffer(tagname.length() + tagopts.length + 3); bb.append((int)'<'); if (!opening) { bb.append((int)'/'); @@ -123,7 +123,7 @@ public final class TransformerWriter extends Writer { } public static char[] genTag1raw(final String tagname, 
final char[] tagopts, final char[] text) { - final serverCharBuffer bb = new serverCharBuffer(2 * tagname.length() + tagopts.length + text.length + 5); + final CharBuffer bb = new CharBuffer(2 * tagname.length() + tagopts.length + text.length + 5); bb.append((int)'<').append(tagname); if (tagopts.length > 0) { // if (tagopts[0] == (byte) 32) @@ -144,7 +144,7 @@ public final class TransformerWriter extends Writer { public static char[] genTag0(final String tagname, final Properties tagopts, final char quotechar) { final char[] tagoptsx = (tagopts.size() == 0) ? null : genOpts(tagopts, quotechar); - final serverCharBuffer bb = new serverCharBuffer(tagname.length() + ((tagoptsx == null) ? 0 : (tagoptsx.length + 1)) + tagname.length() + 2); + final CharBuffer bb = new CharBuffer(tagname.length() + ((tagoptsx == null) ? 0 : (tagoptsx.length + 1)) + tagname.length() + 2); bb.append((int)'<').append(tagname); if (tagoptsx != null) { bb.append(32); @@ -162,7 +162,7 @@ public final class TransformerWriter extends Writer { public static char[] genTag1(final String tagname, final Properties tagopts, final char[] text, final char quotechar) { final char[] gt0 = genTag0(tagname, tagopts, quotechar); - final serverCharBuffer cb = new serverCharBuffer(gt0, gt0.length + text.length + tagname.length() + 3); + final CharBuffer cb = new CharBuffer(gt0, gt0.length + text.length + tagname.length() + 3); cb.append(text).append((int)'<').append((int)'/').append(tagname).append((int)'>'); final char[] result = cb.getChars(); try { @@ -176,7 +176,7 @@ public final class TransformerWriter extends Writer { // a helper method for pretty-printing of properties for html tags public static char[] genOpts(final Properties prop, final char quotechar) { final Enumeration e = prop.propertyNames(); - final serverCharBuffer bb = new serverCharBuffer(prop.size() * 40); + final CharBuffer bb = new CharBuffer(prop.size() * 40); String key; while (e.hasMoreElements()) { key = (String) e.nextElement(); @@ 
-212,7 +212,7 @@ public final class TransformerWriter extends Writer { if (opening) { if ((scraper != null) && (scraper.isTag0(tag))) { // this single tag is collected at once here - final serverCharBuffer charBuffer = new serverCharBuffer(content); + final CharBuffer charBuffer = new CharBuffer(content); scraper.scrapeTag0(tag, charBuffer.propParser()); try { charBuffer.close(); @@ -223,7 +223,7 @@ public final class TransformerWriter extends Writer { } if ((transformer != null) && (transformer.isTag0(tag))) { // this single tag is collected at once here - final serverCharBuffer scb = new serverCharBuffer(content); + final CharBuffer scb = new CharBuffer(content); try { return transformer.transformTag0(tag, scb.propParser(), quotechar); } finally { @@ -237,14 +237,14 @@ public final class TransformerWriter extends Writer { ((transformer != null) && (transformer.isTag1(tag)))) { // ok, start collecting filterTag = tag; - final serverCharBuffer scb = new serverCharBuffer(content); + final CharBuffer scb = new CharBuffer(content); filterOpts = scb.propParser(); try { scb.close(); } catch (IOException e) { e.printStackTrace(); } - filterCont = new serverCharBuffer(); + filterCont = new CharBuffer(); return new char[0]; } else { // we ignore that thing and return it again diff --git a/source/de/anomic/document/parser/htmlParser.java b/source/net/yacy/document/parser/htmlParser.java similarity index 96% rename from source/de/anomic/document/parser/htmlParser.java rename to source/net/yacy/document/parser/htmlParser.java index fce0aa61a..572d17908 100644 --- a/source/de/anomic/document/parser/htmlParser.java +++ b/source/net/yacy/document/parser/htmlParser.java @@ -24,7 +24,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document.parser; +package net.yacy.document.parser; import java.io.IOException; import java.io.InputStream; @@ -34,16 +34,16 @@ import 
java.nio.charset.UnsupportedCharsetException; import java.util.HashSet; import java.util.Set; +import net.yacy.document.AbstractParser; +import net.yacy.document.Document; +import net.yacy.document.Idiom; +import net.yacy.document.ParserException; +import net.yacy.document.parser.html.ContentScraper; +import net.yacy.document.parser.html.ScraperInputStream; +import net.yacy.document.parser.html.TransformerWriter; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.FileUtils; -import de.anomic.document.AbstractParser; -import de.anomic.document.Document; -import de.anomic.document.Idiom; -import de.anomic.document.ParserException; -import de.anomic.document.parser.html.ContentScraper; -import de.anomic.document.parser.html.ScraperInputStream; -import de.anomic.document.parser.html.TransformerWriter; public class htmlParser extends AbstractParser implements Idiom { diff --git a/source/de/anomic/document/parser/odtParser.java b/source/net/yacy/document/parser/odtParser.java similarity index 83% rename from source/de/anomic/document/parser/odtParser.java rename to source/net/yacy/document/parser/odtParser.java index db635392a..ca2daba0a 100644 --- a/source/de/anomic/document/parser/odtParser.java +++ b/source/net/yacy/document/parser/odtParser.java @@ -25,9 +25,8 @@ //along with this program; if not, write to the Free Software //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document.parser; +package net.yacy.document.parser; -import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileOutputStream; import java.io.InputStream; @@ -42,21 +41,17 @@ import java.util.zip.ZipFile; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; +import net.yacy.document.AbstractParser; +import net.yacy.document.Document; +import net.yacy.document.Idiom; +import net.yacy.document.ParserException; +import net.yacy.document.parser.xml.ODContentHandler; +import 
net.yacy.document.parser.xml.ODMetaHandler; import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.io.CharBuffer; import net.yacy.kelondro.util.FileUtils; -import de.anomic.crawler.retrieval.HTTPLoader; -import de.anomic.document.AbstractParser; -import de.anomic.document.Idiom; -import de.anomic.document.ParserException; -import de.anomic.document.Document; -import de.anomic.document.parser.xml.ODContentHandler; -import de.anomic.document.parser.xml.ODMetaHandler; -import de.anomic.http.client.Client; -import de.anomic.http.server.HeaderFramework; -import de.anomic.http.server.RequestHeader; -import de.anomic.server.serverCharBuffer; public class odtParser extends AbstractParser implements Idiom { @@ -148,7 +143,7 @@ public class odtParser extends AbstractParser implements Idiom { writerFile = File.createTempFile("odtParser",".prt"); writer = new OutputStreamWriter(new FileOutputStream(writerFile),"UTF-8"); } else { - writer = new serverCharBuffer(); + writer = new CharBuffer(); } // extract data @@ -192,8 +187,8 @@ public class odtParser extends AbstractParser implements Idiom { // create the parser document Document theDoc = null; - if (writer instanceof serverCharBuffer) { - final byte[] contentBytes = ((serverCharBuffer)writer).toString().getBytes("UTF-8"); + if (writer instanceof CharBuffer) { + final byte[] contentBytes = ((CharBuffer)writer).toString().getBytes("UTF-8"); theDoc = new Document( location, mimeType, @@ -264,27 +259,4 @@ public class odtParser extends AbstractParser implements Idiom { // Nothing todo here at the moment super.reset(); } - - public static void main(final String[] args) { - try { - if (args.length != 1) return; - - // getting the content URL - final DigestURI contentUrl = new DigestURI(args[0], null); - - // creating a new parser - final odtParser testParser = new odtParser(); - - // downloading the document content - final RequestHeader reqHeader = new RequestHeader(); - reqHeader.put(HeaderFramework.USER_AGENT, 
HTTPLoader.crawlerUserAgent); - final byte[] content = Client.wget(contentUrl.toString(), reqHeader, 10000); - final ByteArrayInputStream input = new ByteArrayInputStream(content); - - // parsing the document - testParser.parse(contentUrl, "application/vnd.oasis.opendocument.text", null, input); - } catch (final Exception e) { - e.printStackTrace(); - } - } } diff --git a/source/de/anomic/document/parser/ooxmlParser.java b/source/net/yacy/document/parser/ooxmlParser.java similarity index 85% rename from source/de/anomic/document/parser/ooxmlParser.java rename to source/net/yacy/document/parser/ooxmlParser.java index 77a7901d9..228c18856 100644 --- a/source/de/anomic/document/parser/ooxmlParser.java +++ b/source/net/yacy/document/parser/ooxmlParser.java @@ -25,9 +25,8 @@ //along with this program; if not, write to the Free Software //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document.parser; +package net.yacy.document.parser; -import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileOutputStream; import java.io.InputStream; @@ -42,21 +41,17 @@ import java.util.zip.ZipFile; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; +import net.yacy.document.AbstractParser; +import net.yacy.document.Document; +import net.yacy.document.Idiom; +import net.yacy.document.ParserException; +import net.yacy.document.parser.xml.ODContentHandler; +import net.yacy.document.parser.xml.ODMetaHandler; import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.io.CharBuffer; import net.yacy.kelondro.util.FileUtils; -import de.anomic.crawler.retrieval.HTTPLoader; -import de.anomic.document.AbstractParser; -import de.anomic.document.Idiom; -import de.anomic.document.ParserException; -import de.anomic.document.Document; -import de.anomic.document.parser.xml.ODContentHandler; -import de.anomic.document.parser.xml.ODMetaHandler; -import de.anomic.http.client.Client; -import 
de.anomic.http.server.HeaderFramework; -import de.anomic.http.server.RequestHeader; -import de.anomic.server.serverCharBuffer; public class ooxmlParser extends AbstractParser implements Idiom { @@ -133,7 +128,7 @@ public class ooxmlParser extends AbstractParser implements Idiom { writerFile = File.createTempFile("ooxmlParser",".prt"); writer = new OutputStreamWriter(new FileOutputStream(writerFile),"UTF-8"); } else { - writer = new serverCharBuffer(); + writer = new CharBuffer(); } // extract data @@ -178,8 +173,8 @@ public class ooxmlParser extends AbstractParser implements Idiom { // create the parser document Document theDoc = null; - if (writer instanceof serverCharBuffer) { - final byte[] contentBytes = ((serverCharBuffer)writer).toString().getBytes("UTF-8"); + if (writer instanceof CharBuffer) { + final byte[] contentBytes = ((CharBuffer)writer).toString().getBytes("UTF-8"); theDoc = new Document( location, mimeType, @@ -250,27 +245,4 @@ public class ooxmlParser extends AbstractParser implements Idiom { // Nothing todo here at the moment super.reset(); } - - public static void main(final String[] args) { - try { - if (args.length != 1) return; - - // getting the content URL - final DigestURI contentUrl = new DigestURI(args[0], null); - - // creating a new parser - final odtParser testParser = new odtParser(); - - // downloading the document content - final RequestHeader reqHeader = new RequestHeader(); - reqHeader.put(HeaderFramework.USER_AGENT, HTTPLoader.crawlerUserAgent); - final byte[] content = Client.wget(contentUrl.toString(), reqHeader, 10000); - final ByteArrayInputStream input = new ByteArrayInputStream(content); - - // parsing the document - testParser.parse(contentUrl, "application/vnd.oasis.opendocument.text", null, input); - } catch (final Exception e) { - e.printStackTrace(); - } - } } diff --git a/source/de/anomic/document/parser/pdfParser.java b/source/net/yacy/document/parser/pdfParser.java similarity index 93% rename from 
source/de/anomic/document/parser/pdfParser.java rename to source/net/yacy/document/parser/pdfParser.java index bb446a4da..054e0eece 100644 --- a/source/de/anomic/document/parser/pdfParser.java +++ b/source/net/yacy/document/parser/pdfParser.java @@ -25,7 +25,7 @@ //along with this program; if not, write to the Free Software //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document.parser; +package net.yacy.document.parser; import java.io.File; import java.io.FileOutputStream; @@ -43,15 +43,15 @@ import org.apache.pdfbox.pdmodel.encryption.AccessPermission; import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial; import org.apache.pdfbox.util.PDFTextStripper; +import net.yacy.document.AbstractParser; +import net.yacy.document.Document; +import net.yacy.document.Idiom; +import net.yacy.document.ParserException; import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.io.CharBuffer; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.FileUtils; -import de.anomic.document.AbstractParser; -import de.anomic.document.Idiom; -import de.anomic.document.ParserException; -import de.anomic.document.Document; -import de.anomic.server.serverCharBuffer; public class pdfParser extends AbstractParser implements Idiom { @@ -133,7 +133,7 @@ public class pdfParser extends AbstractParser implements Idiom { writerFile = File.createTempFile("pdfParser",".prt"); writer = new OutputStreamWriter(new FileOutputStream(writerFile),"UTF-8"); } else { - writer = new serverCharBuffer(); + writer = new CharBuffer(); } try { stripper.writeText(theDocument, writer ); // may throw a NPE @@ -149,8 +149,8 @@ public class pdfParser extends AbstractParser implements Idiom { Document theDoc = null; - if (writer instanceof serverCharBuffer) { - final byte[] contentBytes = ((serverCharBuffer)writer).toString().getBytes("UTF-8"); + if (writer instanceof CharBuffer) { + final byte[] contentBytes = 
((CharBuffer)writer).toString().getBytes("UTF-8"); theDoc = new Document( location, mimeType, diff --git a/source/de/anomic/document/parser/pptParser.java b/source/net/yacy/document/parser/pptParser.java similarity index 93% rename from source/de/anomic/document/parser/pptParser.java rename to source/net/yacy/document/parser/pptParser.java index 9b88f6f30..52e956b77 100644 --- a/source/de/anomic/document/parser/pptParser.java +++ b/source/net/yacy/document/parser/pptParser.java @@ -25,21 +25,21 @@ //along with this program; if not, write to the Free Software //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document.parser; +package net.yacy.document.parser; import java.io.BufferedInputStream; import java.io.InputStream; import java.util.HashSet; import java.util.Set; +import net.yacy.document.AbstractParser; +import net.yacy.document.Document; +import net.yacy.document.Idiom; +import net.yacy.document.ParserException; import net.yacy.kelondro.data.meta.DigestURI; import org.apache.poi.hslf.extractor.PowerPointExtractor; -import de.anomic.document.AbstractParser; -import de.anomic.document.Idiom; -import de.anomic.document.ParserException; -import de.anomic.document.Document; public class pptParser extends AbstractParser implements Idiom { diff --git a/source/de/anomic/document/parser/psParser.java b/source/net/yacy/document/parser/psParser.java similarity index 95% rename from source/de/anomic/document/parser/psParser.java rename to source/net/yacy/document/parser/psParser.java index 47264820f..fb704a865 100644 --- a/source/de/anomic/document/parser/psParser.java +++ b/source/net/yacy/document/parser/psParser.java @@ -25,7 +25,7 @@ //along with this program; if not, write to the Free Software //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document.parser; +package net.yacy.document.parser; import java.io.BufferedReader; import java.io.BufferedWriter; @@ -37,13 +37,13 @@ import 
java.io.InputStreamReader; import java.util.HashSet; import java.util.Set; +import net.yacy.document.AbstractParser; +import net.yacy.document.Document; +import net.yacy.document.Idiom; +import net.yacy.document.ParserException; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.FileUtils; -import de.anomic.document.AbstractParser; -import de.anomic.document.Idiom; -import de.anomic.document.ParserException; -import de.anomic.document.Document; public class psParser extends AbstractParser implements Idiom { diff --git a/source/de/anomic/document/parser/rssParser.java b/source/net/yacy/document/parser/rssParser.java similarity index 89% rename from source/de/anomic/document/parser/rssParser.java rename to source/net/yacy/document/parser/rssParser.java index fad11eef5..2344f973c 100644 --- a/source/de/anomic/document/parser/rssParser.java +++ b/source/net/yacy/document/parser/rssParser.java @@ -25,7 +25,7 @@ //along with this program; if not, write to the Free Software //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document.parser; +package net.yacy.document.parser; import java.io.ByteArrayInputStream; import java.io.IOException; @@ -38,22 +38,22 @@ import java.util.LinkedList; import java.util.Map; import java.util.Set; +import net.yacy.document.AbstractParser; +import net.yacy.document.Document; +import net.yacy.document.Idiom; +import net.yacy.document.ParserException; +import net.yacy.document.content.RSSMessage; +import net.yacy.document.parser.html.AbstractScraper; +import net.yacy.document.parser.html.ContentScraper; +import net.yacy.document.parser.html.ImageEntry; +import net.yacy.document.parser.html.TransformerWriter; +import net.yacy.document.parser.xml.RSSFeed; +import net.yacy.document.parser.xml.RSSReader; import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.io.CharBuffer; import net.yacy.kelondro.util.ByteBuffer; import net.yacy.kelondro.util.FileUtils; -import 
de.anomic.content.RSSMessage; -import de.anomic.document.AbstractParser; -import de.anomic.document.Idiom; -import de.anomic.document.ParserException; -import de.anomic.document.Document; -import de.anomic.document.parser.html.AbstractScraper; -import de.anomic.document.parser.html.ContentScraper; -import de.anomic.document.parser.html.ImageEntry; -import de.anomic.document.parser.html.TransformerWriter; -import de.anomic.document.parser.xml.RSSFeed; -import de.anomic.document.parser.xml.RSSReader; -import de.anomic.server.serverCharBuffer; public class rssParser extends AbstractParser implements Idiom { @@ -84,7 +84,7 @@ public class rssParser extends AbstractParser implements Idiom { final HashMap anchors = new HashMap(); final HashMap images = new HashMap(); final ByteBuffer text = new ByteBuffer(); - final serverCharBuffer authors = new serverCharBuffer(); + final CharBuffer authors = new CharBuffer(); final RSSFeed feed = new RSSReader(source).getFeed(); if (feed == null) throw new ParserException("no feed in document",location); diff --git a/source/de/anomic/document/parser/rtfParser.java b/source/net/yacy/document/parser/rtfParser.java similarity index 92% rename from source/de/anomic/document/parser/rtfParser.java rename to source/net/yacy/document/parser/rtfParser.java index 838b6a845..2271a1100 100644 --- a/source/de/anomic/document/parser/rtfParser.java +++ b/source/net/yacy/document/parser/rtfParser.java @@ -25,7 +25,7 @@ //along with this program; if not, write to the Free Software //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document.parser; +package net.yacy.document.parser; import java.io.InputStream; import java.util.HashSet; @@ -34,12 +34,12 @@ import java.util.Set; import javax.swing.text.DefaultStyledDocument; import javax.swing.text.rtf.RTFEditorKit; +import net.yacy.document.AbstractParser; +import net.yacy.document.Document; +import net.yacy.document.Idiom; +import 
net.yacy.document.ParserException; import net.yacy.kelondro.data.meta.DigestURI; -import de.anomic.document.AbstractParser; -import de.anomic.document.Idiom; -import de.anomic.document.ParserException; -import de.anomic.document.Document; public class rtfParser extends AbstractParser implements Idiom { diff --git a/source/de/anomic/document/parser/sevenzipParser.java b/source/net/yacy/document/parser/sevenzipParser.java similarity index 95% rename from source/de/anomic/document/parser/sevenzipParser.java rename to source/net/yacy/document/parser/sevenzipParser.java index 707d5be5d..6ef535666 100644 --- a/source/de/anomic/document/parser/sevenzipParser.java +++ b/source/net/yacy/document/parser/sevenzipParser.java @@ -25,7 +25,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document.parser; +package net.yacy.document.parser; import java.io.ByteArrayInputStream; import java.io.File; @@ -35,7 +35,13 @@ import java.io.OutputStream; import java.util.HashSet; import java.util.Set; +import net.yacy.document.AbstractParser; +import net.yacy.document.Document; +import net.yacy.document.Idiom; +import net.yacy.document.Parser; +import net.yacy.document.ParserException; import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.io.CachedFileOutputStream; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.FileUtils; @@ -45,12 +51,6 @@ import SevenZip.MyRandomAccessFile; import SevenZip.Archive.IInArchive; import SevenZip.Archive.SevenZipEntry; import SevenZip.Archive.SevenZip.Handler; -import de.anomic.document.AbstractParser; -import de.anomic.document.Idiom; -import de.anomic.document.Parser; -import de.anomic.document.ParserException; -import de.anomic.document.Document; -import de.anomic.server.serverCachedFileOutputStream; public class sevenzipParser extends AbstractParser implements Idiom { @@ -118,7 +118,7 @@ public class 
sevenzipParser extends AbstractParser implements Idiom { public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { try { - final serverCachedFileOutputStream cfos = new serverCachedFileOutputStream(Idiom.MAX_KEEP_IN_MEMORY_SIZE); + final CachedFileOutputStream cfos = new CachedFileOutputStream(Idiom.MAX_KEEP_IN_MEMORY_SIZE); FileUtils.copy(source, cfos); if (cfos.isFallback()) { return parse(location, mimeType, charset, cfos.getContentFile()); @@ -144,7 +144,7 @@ public class sevenzipParser extends AbstractParser implements Idiom { private final Log log; private final long maxRamSize; - private serverCachedFileOutputStream cfos = null; + private CachedFileOutputStream cfos = null; private final Document doc; private final String prefix; @@ -228,7 +228,7 @@ public class sevenzipParser extends AbstractParser implements Idiom { throw ex; } this.cfos = (item.isDirectory()) ? null - : new serverCachedFileOutputStream(this.maxRamSize, null, true, item.getSize()); + : new CachedFileOutputStream(this.maxRamSize, null, true, item.getSize()); return this.cfos; } diff --git a/source/de/anomic/document/parser/swfParser.java b/source/net/yacy/document/parser/swfParser.java similarity index 96% rename from source/de/anomic/document/parser/swfParser.java rename to source/net/yacy/document/parser/swfParser.java index 361ad344a..d18f2a0b2 100644 --- a/source/de/anomic/document/parser/swfParser.java +++ b/source/net/yacy/document/parser/swfParser.java @@ -25,7 +25,7 @@ //along with this program; if not, write to the Free Software //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document.parser; +package net.yacy.document.parser; import java.io.IOException; import java.io.InputStream; @@ -33,13 +33,13 @@ import java.util.HashMap; import java.util.HashSet; import java.util.Set; +import net.yacy.document.AbstractParser; +import 
net.yacy.document.Document; +import net.yacy.document.Idiom; +import net.yacy.document.ParserException; import net.yacy.kelondro.data.meta.DigestURI; import pt.tumba.parser.swf.SWF2HTML; -import de.anomic.document.AbstractParser; -import de.anomic.document.Idiom; -import de.anomic.document.ParserException; -import de.anomic.document.Document; public class swfParser extends AbstractParser implements Idiom { diff --git a/source/de/anomic/document/parser/tarParser.java b/source/net/yacy/document/parser/tarParser.java similarity index 94% rename from source/de/anomic/document/parser/tarParser.java rename to source/net/yacy/document/parser/tarParser.java index 1e39683e2..202292f80 100644 --- a/source/de/anomic/document/parser/tarParser.java +++ b/source/net/yacy/document/parser/tarParser.java @@ -25,7 +25,7 @@ //along with this program; if not, write to the Free Software //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document.parser; +package net.yacy.document.parser; import java.io.BufferedOutputStream; import java.io.File; @@ -40,6 +40,13 @@ import java.util.Map; import java.util.Set; import java.util.zip.GZIPInputStream; +import net.yacy.document.AbstractParser; +import net.yacy.document.Document; +import net.yacy.document.Idiom; +import net.yacy.document.Parser; +import net.yacy.document.ParserException; +import net.yacy.document.parser.html.ContentScraper; +import net.yacy.document.parser.html.ImageEntry; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.ByteBuffer; import net.yacy.kelondro.util.FileUtils; @@ -47,13 +54,6 @@ import net.yacy.kelondro.util.FileUtils; import org.apache.tools.tar.TarEntry; import org.apache.tools.tar.TarInputStream; -import de.anomic.document.AbstractParser; -import de.anomic.document.Idiom; -import de.anomic.document.Parser; -import de.anomic.document.ParserException; -import de.anomic.document.Document; -import de.anomic.document.parser.html.ContentScraper; 
-import de.anomic.document.parser.html.ImageEntry; public class tarParser extends AbstractParser implements Idiom { diff --git a/source/de/anomic/document/parser/vcfParser.java b/source/net/yacy/document/parser/vcfParser.java similarity index 90% rename from source/de/anomic/document/parser/vcfParser.java rename to source/net/yacy/document/parser/vcfParser.java index 1234dec90..cb5cd1be1 100644 --- a/source/de/anomic/document/parser/vcfParser.java +++ b/source/net/yacy/document/parser/vcfParser.java @@ -25,10 +25,9 @@ //along with this program; if not, write to the Free Software //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document.parser; +package net.yacy.document.parser; import java.io.BufferedReader; -import java.io.ByteArrayInputStream; import java.io.InputStream; import java.io.InputStreamReader; import java.net.MalformedURLException; @@ -38,18 +37,13 @@ import java.util.Iterator; import java.util.LinkedList; import java.util.Set; +import net.yacy.document.AbstractParser; +import net.yacy.document.Document; +import net.yacy.document.Idiom; +import net.yacy.document.ParserException; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.order.Base64Order; -import de.anomic.crawler.retrieval.HTTPLoader; -import de.anomic.document.AbstractParser; -import de.anomic.document.Idiom; -import de.anomic.document.ParserException; -import de.anomic.document.Document; -import de.anomic.http.client.Client; -import de.anomic.http.server.HeaderFramework; -import de.anomic.http.server.RequestHeader; - /** * Vcard specification: http://www.imc.org/pdi/vcard-21.txt * @author theli @@ -279,18 +273,4 @@ public class vcfParser extends AbstractParser implements Idiom { return sb.toString(); } - public static void main(final String[] args) { - try { - final DigestURI contentUrl = new DigestURI(args[0], null); - - final vcfParser testParser = new vcfParser(); - final RequestHeader reqHeader = new RequestHeader(); - 
reqHeader.put(HeaderFramework.USER_AGENT, HTTPLoader.crawlerUserAgent); - final byte[] content = Client.wget(contentUrl.toString(), reqHeader, 10000); - final ByteArrayInputStream input = new ByteArrayInputStream(content); - testParser.parse(contentUrl, "text/x-vcard", "UTF-8",input); - } catch (final Exception e) { - e.printStackTrace(); - } - } } diff --git a/source/de/anomic/document/parser/vsdParser.java b/source/net/yacy/document/parser/vsdParser.java similarity index 96% rename from source/de/anomic/document/parser/vsdParser.java rename to source/net/yacy/document/parser/vsdParser.java index 6e6832ae6..e3fdea5da 100644 --- a/source/de/anomic/document/parser/vsdParser.java +++ b/source/net/yacy/document/parser/vsdParser.java @@ -25,16 +25,16 @@ //along with this program; if not, write to the Free Software //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document.parser; +package net.yacy.document.parser; import java.io.InputStream; import java.util.HashSet; import java.util.Set; -import de.anomic.document.AbstractParser; -import de.anomic.document.Idiom; -import de.anomic.document.ParserException; -import de.anomic.document.Document; +import net.yacy.document.AbstractParser; +import net.yacy.document.Document; +import net.yacy.document.Idiom; +import net.yacy.document.ParserException; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; diff --git a/source/de/anomic/document/parser/xlsParser.java b/source/net/yacy/document/parser/xlsParser.java similarity index 94% rename from source/de/anomic/document/parser/xlsParser.java rename to source/net/yacy/document/parser/xlsParser.java index 61373e422..ba4cb82d8 100644 --- a/source/de/anomic/document/parser/xlsParser.java +++ b/source/net/yacy/document/parser/xlsParser.java @@ -25,12 +25,16 @@ //along with this program; if not, write to the Free Software //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package 
de.anomic.document.parser; +package net.yacy.document.parser; import java.io.InputStream; import java.util.HashSet; import java.util.Set; +import net.yacy.document.AbstractParser; +import net.yacy.document.Document; +import net.yacy.document.Idiom; +import net.yacy.document.ParserException; import net.yacy.kelondro.data.meta.DigestURI; import org.apache.poi.hssf.eventusermodel.HSSFEventFactory; @@ -42,10 +46,6 @@ import org.apache.poi.hssf.record.Record; import org.apache.poi.hssf.record.SSTRecord; import org.apache.poi.poifs.filesystem.POIFSFileSystem; -import de.anomic.document.AbstractParser; -import de.anomic.document.Idiom; -import de.anomic.document.ParserException; -import de.anomic.document.Document; public class xlsParser extends AbstractParser implements Idiom, HSSFListener { diff --git a/source/de/anomic/document/parser/xml/ODContentHandler.java b/source/net/yacy/document/parser/xml/ODContentHandler.java similarity index 97% rename from source/de/anomic/document/parser/xml/ODContentHandler.java rename to source/net/yacy/document/parser/xml/ODContentHandler.java index e22cc9e72..1977a094c 100644 --- a/source/de/anomic/document/parser/xml/ODContentHandler.java +++ b/source/net/yacy/document/parser/xml/ODContentHandler.java @@ -24,7 +24,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document.parser.xml; +package net.yacy.document.parser.xml; import java.io.IOException; import java.io.Writer; diff --git a/source/de/anomic/document/parser/xml/ODMetaHandler.java b/source/net/yacy/document/parser/xml/ODMetaHandler.java similarity index 98% rename from source/de/anomic/document/parser/xml/ODMetaHandler.java rename to source/net/yacy/document/parser/xml/ODMetaHandler.java index b1fc5b333..5210cb68f 100644 --- a/source/de/anomic/document/parser/xml/ODMetaHandler.java +++ b/source/net/yacy/document/parser/xml/ODMetaHandler.java @@ -24,7 +24,7 @@ // 
along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document.parser.xml; +package net.yacy.document.parser.xml; import org.xml.sax.Attributes; import org.xml.sax.SAXException; diff --git a/source/de/anomic/document/parser/xml/RSSFeed.java b/source/net/yacy/document/parser/xml/RSSFeed.java similarity index 98% rename from source/de/anomic/document/parser/xml/RSSFeed.java rename to source/net/yacy/document/parser/xml/RSSFeed.java index 94111658e..341e0c0c1 100644 --- a/source/de/anomic/document/parser/xml/RSSFeed.java +++ b/source/net/yacy/document/parser/xml/RSSFeed.java @@ -24,14 +24,15 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document.parser.xml; +package net.yacy.document.parser.xml; import java.util.HashSet; import java.util.Iterator; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentLinkedQueue; -import de.anomic.content.RSSMessage; +import net.yacy.document.content.RSSMessage; + public class RSSFeed implements Iterable { diff --git a/source/de/anomic/document/parser/xml/RSSReader.java b/source/net/yacy/document/parser/xml/RSSReader.java similarity index 98% rename from source/de/anomic/document/parser/xml/RSSReader.java rename to source/net/yacy/document/parser/xml/RSSReader.java index 8a69d464b..b2fb1a3fd 100644 --- a/source/de/anomic/document/parser/xml/RSSReader.java +++ b/source/net/yacy/document/parser/xml/RSSReader.java @@ -24,7 +24,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document.parser.xml; +package net.yacy.document.parser.xml; import java.io.ByteArrayInputStream; import java.io.IOException; @@ -34,6 +34,7 @@ import javax.xml.parsers.ParserConfigurationException; import 
javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; +import net.yacy.document.content.RSSMessage; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.ByteBuffer; @@ -41,7 +42,6 @@ import org.xml.sax.Attributes; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; -import de.anomic.content.RSSMessage; public class RSSReader extends DefaultHandler { diff --git a/source/de/anomic/document/parser/xml/opensearchdescriptionReader.java b/source/net/yacy/document/parser/xml/opensearchdescriptionReader.java similarity index 99% rename from source/de/anomic/document/parser/xml/opensearchdescriptionReader.java rename to source/net/yacy/document/parser/xml/opensearchdescriptionReader.java index 49f8d5491..36e613ffc 100644 --- a/source/de/anomic/document/parser/xml/opensearchdescriptionReader.java +++ b/source/net/yacy/document/parser/xml/opensearchdescriptionReader.java @@ -24,7 +24,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document.parser.xml; +package net.yacy.document.parser.xml; import java.io.ByteArrayInputStream; import java.io.IOException; diff --git a/source/de/anomic/document/parser/zipParser.java b/source/net/yacy/document/parser/zipParser.java similarity index 94% rename from source/de/anomic/document/parser/zipParser.java rename to source/net/yacy/document/parser/zipParser.java index d0686d89c..1c4ab79e0 100644 --- a/source/de/anomic/document/parser/zipParser.java +++ b/source/net/yacy/document/parser/zipParser.java @@ -25,7 +25,7 @@ //along with this program; if not, write to the Free Software //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.document.parser; +package net.yacy.document.parser; import java.io.BufferedOutputStream; import java.io.File; @@ -41,17 +41,17 @@ import java.util.Set; import java.util.zip.ZipEntry; import 
java.util.zip.ZipInputStream; +import net.yacy.document.AbstractParser; +import net.yacy.document.Document; +import net.yacy.document.Idiom; +import net.yacy.document.Parser; +import net.yacy.document.ParserException; +import net.yacy.document.parser.html.ContentScraper; +import net.yacy.document.parser.html.ImageEntry; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.ByteBuffer; import net.yacy.kelondro.util.FileUtils; -import de.anomic.document.AbstractParser; -import de.anomic.document.Idiom; -import de.anomic.document.Parser; -import de.anomic.document.ParserException; -import de.anomic.document.Document; -import de.anomic.document.parser.html.ContentScraper; -import de.anomic.document.parser.html.ImageEntry; public class zipParser extends AbstractParser implements Idiom { diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java index e85dfc93a..942f0310a 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java @@ -36,6 +36,7 @@ import java.util.Properties; import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.data.word.WordReferenceRow; import net.yacy.kelondro.index.Row; +import net.yacy.kelondro.io.CharBuffer; import net.yacy.kelondro.order.Base64Order; import net.yacy.kelondro.order.Bitfield; import net.yacy.kelondro.order.Digest; @@ -47,7 +48,6 @@ import net.yacy.kelondro.util.MapTools; import de.anomic.crawler.retrieval.Request; import de.anomic.search.QueryParams; -import de.anomic.server.serverCharBuffer; import de.anomic.tools.crypt; public class URIMetadataRow implements URIMetadata { @@ -204,7 +204,7 @@ public class URIMetadataRow implements URIMetadata { } public static byte[] encodeComp(final DigestURI url, final String dc_title, final String dc_creator, final String dc_subject, final String ETag) { - final serverCharBuffer s = new serverCharBuffer(200); + final 
CharBuffer s = new CharBuffer(200); s.append(url.toNormalform(false, true)).append(10); s.append(dc_title).append(10); s.append(dc_creator).append(10); diff --git a/source/de/anomic/server/serverCachedFileOutputStream.java b/source/net/yacy/kelondro/io/CachedFileOutputStream.java similarity index 89% rename from source/de/anomic/server/serverCachedFileOutputStream.java rename to source/net/yacy/kelondro/io/CachedFileOutputStream.java index 6eda23582..c61f93f8c 100644 --- a/source/de/anomic/server/serverCachedFileOutputStream.java +++ b/source/net/yacy/kelondro/io/CachedFileOutputStream.java @@ -21,7 +21,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.server; +package net.yacy.kelondro.io; import java.io.BufferedInputStream; import java.io.BufferedOutputStream; @@ -37,7 +37,7 @@ import java.io.OutputStream; import net.yacy.kelondro.util.FileUtils; -public class serverCachedFileOutputStream extends ByteArrayOutputStream { +public class CachedFileOutputStream extends ByteArrayOutputStream { protected File fallbackFile; protected long fallbackSize; @@ -47,26 +47,26 @@ public class serverCachedFileOutputStream extends ByteArrayOutputStream { protected boolean isFallback = false; protected OutputStream fallback = null; - public serverCachedFileOutputStream(final long fallbackSize) throws IOException { + public CachedFileOutputStream(final long fallbackSize) throws IOException { this(fallbackSize, null, true, 32); } - public serverCachedFileOutputStream(final long fallbackSize, final File fallback, final boolean buffered) + public CachedFileOutputStream(final long fallbackSize, final File fallback, final boolean buffered) throws IOException { this(fallbackSize, fallback, buffered, 32); } - public serverCachedFileOutputStream(final long fallbackSize, final File fallback, final boolean buffered, + public CachedFileOutputStream(final long fallbackSize, final File 
fallback, final boolean buffered, final long size) throws IOException { this.fallbackSize = fallbackSize; this.fallbackFile = (fallback == null) ? File.createTempFile( - serverCachedFileOutputStream.class.getName(), + CachedFileOutputStream.class.getName(), Long.toString(System.currentTimeMillis())) : fallback; this.buffered = buffered; checkFallback(size); } - public serverCachedFileOutputStream(final long fallbackSize, final File fallback, final boolean buffered, + public CachedFileOutputStream(final long fallbackSize, final File fallback, final boolean buffered, final byte[] data) throws IOException { this(fallbackSize, fallback, buffered, 0); super.buf = data; diff --git a/source/de/anomic/server/serverCharBuffer.java b/source/net/yacy/kelondro/io/CharBuffer.java similarity index 90% rename from source/de/anomic/server/serverCharBuffer.java rename to source/net/yacy/kelondro/io/CharBuffer.java index 390daa061..fa833bf94 100644 --- a/source/de/anomic/server/serverCharBuffer.java +++ b/source/net/yacy/kelondro/io/CharBuffer.java @@ -22,7 +22,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.server; +package net.yacy.kelondro.io; import java.io.File; import java.io.FileNotFoundException; @@ -32,7 +32,7 @@ import java.io.UnsupportedEncodingException; import java.io.Writer; import java.util.Properties; -public final class serverCharBuffer extends Writer { +public final class CharBuffer extends Writer { public static final char singlequote = '\''; public static final char doublequote = '"'; @@ -43,32 +43,32 @@ public final class serverCharBuffer extends Writer { private int length; - public serverCharBuffer() { + public CharBuffer() { buffer = new char[10]; length = 0; offset = 0; } - public serverCharBuffer(final int initLength) { + public CharBuffer(final int initLength) { this.buffer = new char[initLength]; this.length = 0; this.offset = 0; } - public 
serverCharBuffer(final char[] bb) { + public CharBuffer(final char[] bb) { buffer = bb; length = bb.length; offset = 0; } - public serverCharBuffer(final char[] bb, final int initLength) { + public CharBuffer(final char[] bb, final int initLength) { this.buffer = new char[initLength]; System.arraycopy(bb, 0, buffer, 0, bb.length); length = bb.length; offset = 0; } - public serverCharBuffer(final char[] bb, final int of, final int le) { + public CharBuffer(final char[] bb, final int of, final int le) { if (of * 2 > bb.length) { buffer = new char[le]; System.arraycopy(bb, of, buffer, 0, le); @@ -81,13 +81,13 @@ public final class serverCharBuffer extends Writer { } } - public serverCharBuffer(final serverCharBuffer bb) { + public CharBuffer(final CharBuffer bb) { buffer = bb.buffer; length = bb.length; offset = bb.offset; } - public serverCharBuffer(final File f) throws IOException { + public CharBuffer(final File f) throws IOException { // initially fill the buffer with the content of a file if (f.length() > Integer.MAX_VALUE) throw new IOException("file is too large for buffering"); @@ -157,32 +157,32 @@ public final class serverCharBuffer extends Writer { // return this; // } - public serverCharBuffer append(final int i) { + public CharBuffer append(final int i) { write((char) (i)); return this; } - public serverCharBuffer append(final char[] bb) { + public CharBuffer append(final char[] bb) { write(bb); return this; } - public serverCharBuffer append(final char[] bb, final int of, final int le) { + public CharBuffer append(final char[] bb, final int of, final int le) { write(bb, of, le); return this; } - public serverCharBuffer append(final String s) { + public CharBuffer append(final String s) { return append(s,0,s.length()); } - public serverCharBuffer append(final String s, final int off, final int len) { + public CharBuffer append(final String s, final int off, final int len) { final char[] temp = new char[len]; s.getChars(off, (off + len), temp, 0); return 
append(temp); } - public serverCharBuffer append(final serverCharBuffer bb) { + public CharBuffer append(final CharBuffer bb) { return append(bb.buffer, bb.offset, bb.length); } @@ -281,7 +281,7 @@ public final class serverCharBuffer extends Writer { } } - public serverCharBuffer trim(final int start) { + public CharBuffer trim(final int start) { // the end value is outside (+1) of the wanted target array if (start > length) throw new IndexOutOfBoundsException("trim: start > length"); offset = offset + start; @@ -289,7 +289,7 @@ public final class serverCharBuffer extends Writer { return this; } - public serverCharBuffer trim(final int start, final int end) { + public CharBuffer trim(final int start, final int end) { // the end value is outside (+1) of the wanted target array if (start > length) throw new IndexOutOfBoundsException("trim: start > length"); if (end > length) throw new IndexOutOfBoundsException("trim: end > length"); @@ -299,7 +299,7 @@ public final class serverCharBuffer extends Writer { return this; } - public serverCharBuffer trim() { + public CharBuffer trim() { int l = 0; while ((l < length) && (buffer[offset + l] <= ' ')) l++; int r = length; diff --git a/source/net/yacy/kelondro/order/MergeIterator.java b/source/net/yacy/kelondro/order/MergeIterator.java index bfb980c0c..0b84b84ec 100644 --- a/source/net/yacy/kelondro/order/MergeIterator.java +++ b/source/net/yacy/kelondro/order/MergeIterator.java @@ -59,6 +59,7 @@ public class MergeIterator implements CloneableIterator { public MergeIterator clone(final Object modifier) { assert a != null; assert b != null; + assert merger != null; return new MergeIterator(a.clone(modifier), b.clone(modifier), comp, merger, up); } @@ -142,6 +143,7 @@ public class MergeIterator implements CloneableIterator { if (!(iiterators.hasNext())) return null; final CloneableIterator one = iiterators.next(); if (!(iiterators.hasNext())) return one; + assert merger != null; return new MergeIterator(one, cascade(iiterators, 
c, merger, up), c, merger, up); } @@ -158,6 +160,7 @@ public class MergeIterator implements CloneableIterator { System.out.println("Error while initializing simpleMerge (3): " + e.getMessage()); meth = null; } + assert meth != null; simpleMerge = meth; } diff --git a/source/net/yacy/kelondro/rwi/ReferenceContainer.java b/source/net/yacy/kelondro/rwi/ReferenceContainer.java index 41c15147e..20a6374fc 100644 --- a/source/net/yacy/kelondro/rwi/ReferenceContainer.java +++ b/source/net/yacy/kelondro/rwi/ReferenceContainer.java @@ -224,6 +224,7 @@ public class ReferenceContainer extends RowSet System.out.println("Error while initializing containerMerge.NoSuchMethodException: " + e.getMessage()); meth = null; } + assert meth != null; containerMergeMethod = meth; } diff --git a/test/de/anomic/document/ParserTest.java b/test/de/anomic/document/ParserTest.java index 9555d9358..c61554ee4 100644 --- a/test/de/anomic/document/ParserTest.java +++ b/test/de/anomic/document/ParserTest.java @@ -2,6 +2,8 @@ package de.anomic.document; import static org.junit.Assert.*; import static org.junit.matchers.JUnitMatchers.*; +import net.yacy.document.Document; +import net.yacy.document.Parser; import net.yacy.kelondro.data.meta.DigestURI; import org.junit.Test; @@ -11,13 +13,11 @@ import java.io.FileInputStream; import java.io.Reader; import java.io.InputStreamReader; -import de.anomic.document.Document; -import de.anomic.document.Parser; public class ParserTest { @Test public void testParsers() throws java.io.FileNotFoundException, java.lang.InterruptedException, - de.anomic.document.ParserException, java.net.MalformedURLException, + net.yacy.document.ParserException, java.net.MalformedURLException, java.io.UnsupportedEncodingException, java.io.IOException { String[][] testFiles = new String[][] { // meaning: filename in test/parsertest, mimetype, title, creator, description, diff --git a/test/de/anomic/document/parser/htmlParserTest.java 
b/test/de/anomic/document/parser/htmlParserTest.java index 31c092951..ee7905265 100644 --- a/test/de/anomic/document/parser/htmlParserTest.java +++ b/test/de/anomic/document/parser/htmlParserTest.java @@ -2,6 +2,8 @@ package de.anomic.document.parser; import java.nio.charset.Charset; +import net.yacy.document.parser.htmlParser; + import junit.framework.TestCase; public class htmlParserTest extends TestCase {