From 5e31bad711f2751ceb8cdc90c7d25ba84e788827 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Sun, 15 Sep 2013 00:30:23 +0200 Subject: [PATCH] - the webgraph shall store all links which appear on a web page, not only the unique links! This made it necessary to adapt a large portion of the parser and link processing classes to carry a different type of link collection, one which carries the property attributes that are attached to web anchors. - introduction of a new URL class, AnchorURL - the other URL classes, DigestURI and MultiProtocolURI, have been renamed and refactored to fit into a new document package schema, document.id - cleanup of net.yacy.cora.document package and refactoring --- .gitignore | 1 + htroot/BlacklistTest_p.java | 6 +- htroot/Blacklist_p.java | 6 +- htroot/Blog.java | 2 +- htroot/BlogComments.java | 2 +- htroot/Bookmarks.java | 6 +- htroot/CacheResource_p.java | 6 +- htroot/Collage.java | 6 +- htroot/ConfigAppearance_p.java | 4 +- htroot/ConfigHeuristics_p.java | 2 +- htroot/ConfigLanguage_p.java | 4 +- htroot/ConfigNetwork_p.java | 2 +- htroot/ConfigPortal.java | 5 +- htroot/ConfigUpdate_p.java | 4 +- htroot/CrawlCheck_p.java | 8 +- htroot/CrawlResults.java | 4 +- htroot/CrawlStartScanner_p.java | 10 +- htroot/Crawler_p.java | 40 +-- htroot/DictionaryLoader_p.java | 14 +- htroot/HostBrowser.java | 26 +- htroot/IndexControlRWIs_p.java | 14 +- htroot/IndexControlURLs_p.java | 12 +- htroot/IndexCreateLoaderQueue_p.java | 2 +- htroot/IndexCreateParserErrors_p.java | 6 +- htroot/IndexCreateQueues_p.java | 2 +- htroot/IndexDeletion_p.java | 4 +- htroot/IndexFederated_p.java | 2 +- htroot/IndexImportOAIPMH_p.java | 16 +- htroot/Load_RSS_p.java | 36 +-- htroot/MessageSend_p.java | 2 +- htroot/Messages_p.java | 2 +- htroot/QuickCrawlLink_p.java | 10 +- htroot/ServerScannerList.java | 6 +- htroot/SettingsAck_p.java | 4 +- htroot/Supporter.java | 12 +- htroot/Surftips.java | 10 +- htroot/Table_API_p.java | 2 +- htroot/Table_YMark_p.java | 
2 +- htroot/Tables_p.java | 2 +- htroot/Triple_p.java | 4 +- htroot/ViewFile.java | 33 ++- htroot/ViewImage.java | 8 +- htroot/Vocabulary_p.java | 14 +- htroot/WatchWebStructure_p.java | 4 +- htroot/WebStructurePicture_p.java | 6 +- htroot/Wiki.java | 2 +- htroot/YMarks.java | 2 +- htroot/api/bookmarks/posts/delete_p.java | 6 +- htroot/api/citation.java | 28 +-- htroot/api/feed.java | 4 +- htroot/api/getpageinfo.java | 20 +- htroot/api/getpageinfo_p.java | 20 +- htroot/api/table_p.java | 2 +- htroot/api/termlist_p.java | 2 +- htroot/api/webstructure.java | 18 +- htroot/api/yacydoc.java | 6 +- htroot/api/ymarks/add_ymark.java | 4 +- htroot/api/ymarks/get_metadata.java | 4 +- htroot/api/ymarks/get_treeview.java | 6 +- htroot/api/ymarks/get_xbel.java | 2 +- htroot/api/ymarks/get_ymark.java | 2 +- htroot/api/ymarks/import_ymark.java | 2 +- htroot/cytag.java | 5 +- htroot/gsa/searchresult.java | 2 +- htroot/interaction/GetRDF.java | 2 +- htroot/mediawiki_p.java | 2 +- htroot/rct_p.java | 14 +- htroot/sharedBlacklist_p.java | 8 +- htroot/solr/select.java | 2 +- htroot/yacy/crawlReceipt.java | 2 +- htroot/yacy/idx.java | 2 +- htroot/yacy/message.java | 2 +- htroot/yacy/search.java | 10 +- htroot/yacy/transferRWI.java | 6 +- htroot/yacy/transferURL.java | 4 +- htroot/yacy/urls.java | 8 +- htroot/yacysearch.java | 13 +- htroot/yacysearch_location.java | 2 +- htroot/yacysearchitem.java | 20 +- htroot/yacysearchtrailer.java | 4 +- .../contentcontrol/SMWListSyncThread.java | 2 +- .../document/analysis/Classification.java | 10 +- .../cora/document/{ => encoding}/ASCII.java | 2 +- .../cora/document/{ => encoding}/UTF8.java | 2 +- .../cora/document/{ => feed}/Channel.java | 3 +- .../cora/document/{ => feed}/Channels.java | 2 +- .../yacy/cora/document/{ => feed}/Hit.java | 2 +- .../cora/document/{ => feed}/RSSFeed.java | 13 +- .../cora/document/{ => feed}/RSSMessage.java | 5 +- .../cora/document/{ => feed}/RSSReader.java | 5 +- .../net/yacy/cora/document/id/AnchorURL.java | 68 
++++++ .../document/id/DigestURL.java} | 115 ++++----- .../MultiProtocolURL.java} | 45 ++-- .../yacy/cora/document/{ => id}/Punycode.java | 2 +- .../federate/opensearch/SRURSSConnector.java | 16 +- .../federate/solr/SchemaConfiguration.java | 10 +- .../yacy/cora/federate/solr/SolrServlet.java | 5 +- .../solr/connector/AbstractSolrConnector.java | 2 +- .../ConcurrentUpdateSolrConnector.java | 2 +- .../solr/connector/ShardSelection.java | 2 +- .../solr/instance/RemoteInstance.java | 10 +- .../OpensearchResponseWriter.java | 6 +- .../responsewriter/YJsonResponseWriter.java | 8 +- .../yacy/cora/federate/yacy/Distribution.java | 4 +- source/net/yacy/cora/federate/yacy/Peer.java | 2 +- source/net/yacy/cora/federate/yacy/Peers.java | 2 +- source/net/yacy/cora/geo/GeoLocation.java | 2 +- source/net/yacy/cora/lod/JenaTripleStore.java | 2 +- source/net/yacy/cora/lod/Literal.java | 4 +- source/net/yacy/cora/lod/Node.java | 2 +- .../cora/lod/vocabulary/CreativeCommons.java | 20 +- .../cora/lod/vocabulary/YaCyMetadata.java | 2 +- source/net/yacy/cora/order/Base64Order.java | 2 +- source/net/yacy/cora/order/Digest.java | 2 +- source/net/yacy/cora/order/StringOrder.java | 2 +- .../yacy/cora/protocol/HeaderFramework.java | 10 +- .../net/yacy/cora/protocol/RequestHeader.java | 10 +- source/net/yacy/cora/protocol/Scanner.java | 14 +- .../net/yacy/cora/protocol/ftp/FTPClient.java | 2 +- .../yacy/cora/protocol/http/HTTPClient.java | 22 +- .../cora/protocol/http/LinkExtractor.java | 14 +- .../yacy/cora/storage/AbstractMapStore.java | 2 +- source/net/yacy/cora/storage/KeyList.java | 2 +- source/net/yacy/cora/util/ByteArray.java | 2 +- source/net/yacy/cora/util/ByteBuffer.java | 2 +- .../cora/{document => util}/JSONArray.java | 2 +- .../{document => util}/JSONException.java | 2 +- .../cora/{document => util}/JSONObject.java | 2 +- .../cora/{document => util}/JSONTokener.java | 2 +- source/net/yacy/crawler/Balancer.java | 10 +- source/net/yacy/crawler/CrawlQueue.java | 10 +- 
source/net/yacy/crawler/CrawlStacker.java | 38 ++- source/net/yacy/crawler/CrawlSwitchboard.java | 4 +- source/net/yacy/crawler/data/Cache.java | 6 +- .../net/yacy/crawler/data/CrawlProfile.java | 16 +- source/net/yacy/crawler/data/CrawlQueues.java | 22 +- source/net/yacy/crawler/data/Latency.java | 22 +- .../net/yacy/crawler/data/ResultImages.java | 14 +- source/net/yacy/crawler/data/ZURL.java | 8 +- .../net/yacy/crawler/retrieval/FTPLoader.java | 22 +- .../yacy/crawler/retrieval/FileLoader.java | 14 +- .../yacy/crawler/retrieval/HTTPLoader.java | 14 +- .../net/yacy/crawler/retrieval/RSSLoader.java | 28 +-- .../net/yacy/crawler/retrieval/Request.java | 18 +- .../net/yacy/crawler/retrieval/Response.java | 30 +-- .../net/yacy/crawler/retrieval/SMBLoader.java | 16 +- .../crawler/retrieval/SitemapImporter.java | 12 +- source/net/yacy/crawler/robots/RobotsTxt.java | 22 +- .../yacy/crawler/robots/RobotsTxtEntry.java | 14 +- .../yacy/crawler/robots/RobotsTxtParser.java | 2 +- source/net/yacy/data/BlogBoard.java | 2 +- source/net/yacy/data/BlogBoardComments.java | 2 +- source/net/yacy/data/BookmarkDate.java | 3 +- source/net/yacy/data/BookmarkHelper.java | 25 +- source/net/yacy/data/BookmarksDB.java | 12 +- source/net/yacy/data/MessageBoard.java | 2 +- source/net/yacy/data/URLLicense.java | 6 +- source/net/yacy/data/UserDB.java | 2 +- source/net/yacy/data/WorkTables.java | 8 +- source/net/yacy/data/wiki/WikiBoard.java | 4 +- .../yacy/data/ymark/TablesRowComparator.java | 2 +- .../net/yacy/data/ymark/YMarkAutoTagger.java | 10 +- .../net/yacy/data/ymark/YMarkCrawlStart.java | 6 +- source/net/yacy/data/ymark/YMarkDate.java | 2 +- source/net/yacy/data/ymark/YMarkEntry.java | 4 +- source/net/yacy/data/ymark/YMarkMetadata.java | 12 +- source/net/yacy/data/ymark/YMarkRDF.java | 2 +- source/net/yacy/data/ymark/YMarkTables.java | 8 +- source/net/yacy/data/ymark/YMarkUtil.java | 6 +- source/net/yacy/dbtest.java | 2 +- source/net/yacy/document/Condenser.java | 12 +- 
source/net/yacy/document/Document.java | 175 +++++++------- source/net/yacy/document/LibraryProvider.java | 4 +- source/net/yacy/document/Parser.java | 14 +- source/net/yacy/document/TextParser.java | 38 +-- source/net/yacy/document/content/DCEntry.java | 14 +- .../yacy/document/content/dao/ImportDump.java | 2 +- .../yacy/document/content/dao/PhpBB3Dao.java | 8 +- .../document/importer/MediawikiImporter.java | 10 +- .../importer/OAIListFriendsLoader.java | 8 +- .../document/importer/OAIPMHImporter.java | 12 +- .../yacy/document/importer/OAIPMHLoader.java | 6 +- .../document/importer/ResumptionToken.java | 16 +- .../yacy/document/parser/audioTagParser.java | 10 +- .../parser/augment/AugmentParser.java | 6 +- .../net/yacy/document/parser/bzipParser.java | 4 +- .../net/yacy/document/parser/csvParser.java | 4 +- .../net/yacy/document/parser/docParser.java | 4 +- .../net/yacy/document/parser/dwgParser.java | 4 +- .../yacy/document/parser/genericParser.java | 8 +- .../net/yacy/document/parser/gzipParser.java | 4 +- .../document/parser/html/ContentScraper.java | 228 ++++++++---------- .../parser/html/ContentTransformer.java | 2 +- .../yacy/document/parser/html/EmbedEntry.java | 8 +- .../yacy/document/parser/html/ImageEntry.java | 63 ++++- .../parser/html/ScraperInputStream.java | 4 +- .../parser/html/TransformerWriter.java | 6 +- .../net/yacy/document/parser/htmlParser.java | 18 +- .../parser/images/genericImageParser.java | 31 +-- source/net/yacy/document/parser/mmParser.java | 6 +- .../net/yacy/document/parser/odtParser.java | 8 +- .../net/yacy/document/parser/ooxmlParser.java | 8 +- .../net/yacy/document/parser/pdfParser.java | 8 +- .../net/yacy/document/parser/pptParser.java | 4 +- source/net/yacy/document/parser/psParser.java | 6 +- .../net/yacy/document/parser/rdfParser.java | 4 +- .../document/parser/rdfa/impl/RDFaParser.java | 12 +- .../net/yacy/document/parser/rssParser.java | 30 ++- .../net/yacy/document/parser/rtfParser.java | 4 +- 
.../yacy/document/parser/sevenzipParser.java | 10 +- .../yacy/document/parser/sidAudioParser.java | 4 +- .../yacy/document/parser/sitemapParser.java | 16 +- .../net/yacy/document/parser/swfParser.java | 16 +- .../net/yacy/document/parser/tarParser.java | 12 +- .../yacy/document/parser/torrentParser.java | 12 +- .../net/yacy/document/parser/vcfParser.java | 17 +- .../net/yacy/document/parser/vsdParser.java | 4 +- .../net/yacy/document/parser/xlsParser.java | 6 +- .../net/yacy/document/parser/zipParser.java | 6 +- .../yacy/interaction/AugmentHtmlStream.java | 6 +- source/net/yacy/interaction/Interaction.java | 10 +- source/net/yacy/kelondro/blob/ArrayStack.java | 4 +- .../net/yacy/kelondro/blob/BEncodedHeap.java | 4 +- .../yacy/kelondro/blob/BEncodedHeapBag.java | 2 +- .../yacy/kelondro/blob/BEncodedHeapShard.java | 4 +- source/net/yacy/kelondro/blob/Heap.java | 2 +- .../net/yacy/kelondro/blob/HeapModifier.java | 2 +- source/net/yacy/kelondro/blob/HeapReader.java | 4 +- source/net/yacy/kelondro/blob/HeapWriter.java | 2 +- .../yacy/kelondro/blob/MapColumnIndex.java | 2 +- .../net/yacy/kelondro/blob/MapDataMining.java | 2 +- source/net/yacy/kelondro/blob/MapHeap.java | 2 +- source/net/yacy/kelondro/blob/Tables.java | 4 +- .../yacy/kelondro/blob/TablesColumnIndex.java | 2 +- .../data/citation/CitationReference.java | 2 +- .../kelondro/data/meta/URIMetadataNode.java | 15 +- .../kelondro/data/meta/URIMetadataRow.java | 19 +- .../navigation/NavigationReferenceRow.java | 2 +- .../navigation/NavigationReferenceVars.java | 2 +- .../data/word/WordReferenceFactory.java | 2 +- .../kelondro/data/word/WordReferenceRow.java | 2 +- .../kelondro/data/word/WordReferenceVars.java | 8 +- source/net/yacy/kelondro/index/IndexTest.java | 4 +- source/net/yacy/kelondro/index/Row.java | 4 +- .../yacy/kelondro/index/RowCollection.java | 4 +- .../net/yacy/kelondro/index/RowHandleSet.java | 2 +- source/net/yacy/kelondro/index/RowSet.java | 4 +- .../net/yacy/kelondro/io/AbstractWriter.java | 2 +- 
.../net/yacy/kelondro/io/BufferedRecords.java | 2 +- .../net/yacy/kelondro/io/CachedRecords.java | 2 +- source/net/yacy/kelondro/io/CharBuffer.java | 2 +- source/net/yacy/kelondro/io/Records.java | 2 +- .../yacy/kelondro/rwi/ReferenceContainer.java | 2 +- .../yacy/kelondro/rwi/ReferenceIterator.java | 2 +- source/net/yacy/kelondro/table/Relations.java | 2 +- source/net/yacy/kelondro/table/SQLTable.java | 4 +- source/net/yacy/kelondro/table/Table.java | 2 +- source/net/yacy/kelondro/util/BDecoder.java | 4 +- source/net/yacy/kelondro/util/BEncoder.java | 2 +- source/net/yacy/kelondro/util/FileUtils.java | 2 +- source/net/yacy/kelondro/util/OS.java | 2 +- source/net/yacy/kelondro/util/SetTools.java | 2 +- source/net/yacy/peers/Accessible.java | 2 +- source/net/yacy/peers/DHTSelection.java | 2 +- source/net/yacy/peers/Dispatcher.java | 2 +- source/net/yacy/peers/EventChannel.java | 4 +- source/net/yacy/peers/Network.java | 12 +- source/net/yacy/peers/NewsDB.java | 2 +- source/net/yacy/peers/NewsPool.java | 6 +- source/net/yacy/peers/NewsQueue.java | 2 +- source/net/yacy/peers/PeerActions.java | 4 +- source/net/yacy/peers/Protocol.java | 36 +-- source/net/yacy/peers/RemoteSearch.java | 2 +- source/net/yacy/peers/Seed.java | 4 +- source/net/yacy/peers/SeedDB.java | 10 +- source/net/yacy/peers/Transmission.java | 2 +- .../net/yacy/peers/graphics/NetworkGraph.java | 6 +- source/net/yacy/peers/graphics/OSMTile.java | 6 +- .../peers/graphics/WebStructureGraph.java | 30 +-- .../net/yacy/peers/operation/yacyRelease.java | 23 +- .../peers/operation/yacySeedUploadScp.java | 2 +- .../peers/operation/yacyUpdateLocation.java | 8 +- source/net/yacy/repository/Blacklist.java | 4 +- source/net/yacy/repository/FilterEngine.java | 8 +- .../net/yacy/repository/LoaderDispatcher.java | 36 +-- source/net/yacy/search/Switchboard.java | 118 ++++----- .../net/yacy/search/index/DocumentIndex.java | 24 +- source/net/yacy/search/index/Fulltext.java | 20 +- 
source/net/yacy/search/index/Segment.java | 53 ++-- .../net/yacy/search/query/AccessTracker.java | 2 +- .../net/yacy/search/query/QueryModifier.java | 4 +- source/net/yacy/search/query/QueryParams.java | 16 +- source/net/yacy/search/query/SearchEvent.java | 29 +-- source/net/yacy/search/ranking/BlockRank.java | 2 +- .../yacy/search/ranking/ReferenceOrder.java | 6 +- .../schema/CollectionConfiguration.java | 83 +++---- .../search/schema/WebgraphConfiguration.java | 32 +-- .../net/yacy/search/snippet/MediaSnippet.java | 28 +-- .../net/yacy/search/snippet/ResultEntry.java | 8 +- .../net/yacy/search/snippet/TextSnippet.java | 6 +- .../yacy/server/http/AugmentedHtmlStream.java | 6 +- .../yacy/server/http/ChunkedOutputStream.java | 4 +- .../yacy/server/http/HTTPDFileHandler.java | 18 +- .../yacy/server/http/HTTPDProxyHandler.java | 22 +- source/net/yacy/server/http/HTTPDemon.java | 8 +- .../yacy/server/http/ServerSideIncludes.java | 2 +- .../net/yacy/server/http/TemplateEngine.java | 4 +- source/net/yacy/server/serverCore.java | 2 +- source/net/yacy/server/serverObjects.java | 8 +- source/net/yacy/utils/bitfield.java | 2 +- source/net/yacy/utils/cryptbig.java | 2 +- source/net/yacy/utils/gzip.java | 2 +- source/net/yacy/utils/loaderThreads.java | 8 +- 313 files changed, 1716 insertions(+), 1632 deletions(-) rename source/net/yacy/cora/document/{ => encoding}/ASCII.java (99%) rename source/net/yacy/cora/document/{ => encoding}/UTF8.java (99%) rename source/net/yacy/cora/document/{ => feed}/Channel.java (97%) rename source/net/yacy/cora/document/{ => feed}/Channels.java (96%) rename source/net/yacy/cora/document/{ => feed}/Hit.java (98%) rename source/net/yacy/cora/document/{ => feed}/RSSFeed.java (92%) rename source/net/yacy/cora/document/{ => feed}/RSSMessage.java (98%) rename source/net/yacy/cora/document/{ => feed}/RSSReader.java (98%) create mode 100644 source/net/yacy/cora/document/id/AnchorURL.java rename source/net/yacy/{kelondro/data/meta/DigestURI.java => 
cora/document/id/DigestURL.java} (77%) rename source/net/yacy/cora/document/{MultiProtocolURI.java => id/MultiProtocolURL.java} (98%) rename source/net/yacy/cora/document/{ => id}/Punycode.java (99%) rename source/net/yacy/cora/{document => util}/JSONArray.java (99%) rename source/net/yacy/cora/{document => util}/JSONException.java (91%) rename source/net/yacy/cora/{document => util}/JSONObject.java (99%) rename source/net/yacy/cora/{document => util}/JSONTokener.java (99%) diff --git a/.gitignore b/.gitignore index f04e65817..277ddceff 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,4 @@ RELEASE/ lib/yacy-cora.jar /DATA.bkp /DATA.1 +/gen diff --git a/htroot/BlacklistTest_p.java b/htroot/BlacklistTest_p.java index 3fe6a88b7..429a7de40 100644 --- a/htroot/BlacklistTest_p.java +++ b/htroot/BlacklistTest_p.java @@ -31,8 +31,8 @@ import java.net.MalformedURLException; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.protocol.RequestHeader; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.repository.Blacklist; import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.search.Switchboard; @@ -55,9 +55,9 @@ public class BlacklistTest_p { !urlstring.startsWith("ftp://") && !urlstring.startsWith("smb://") && !urlstring.startsWith("file://")) urlstring = "http://" + urlstring; - DigestURI testurl = null; + DigestURL testurl = null; try { - testurl = new DigestURI(urlstring); + testurl = new DigestURL(urlstring); } catch (final MalformedURLException e) { testurl = null; } diff --git a/htroot/Blacklist_p.java b/htroot/Blacklist_p.java index ed86da3a0..163225971 100644 --- a/htroot/Blacklist_p.java +++ b/htroot/Blacklist_p.java @@ -35,12 +35,12 @@ import java.net.MalformedURLException; import java.util.Arrays; import java.util.List; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.ConcurrentLog; import 
net.yacy.data.ListManager; import net.yacy.data.WorkTables; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.FileUtils; import net.yacy.repository.Blacklist; import net.yacy.repository.Blacklist.BlacklistType; @@ -81,9 +81,9 @@ public class Blacklist_p { !urlstring.startsWith("file://")) { urlstring = "http://"+urlstring; } - DigestURI testurl; + DigestURL testurl; try { - testurl = new DigestURI(urlstring); + testurl = new DigestURL(urlstring); } catch (final MalformedURLException e) { testurl = null; } diff --git a/htroot/Blog.java b/htroot/Blog.java index 41d495a57..9ddbef9ee 100644 --- a/htroot/Blog.java +++ b/htroot/Blog.java @@ -38,7 +38,7 @@ import java.util.List; import java.util.Locale; import java.util.Map; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; diff --git a/htroot/BlogComments.java b/htroot/BlogComments.java index e8650df5b..26406cffc 100644 --- a/htroot/BlogComments.java +++ b/htroot/BlogComments.java @@ -35,7 +35,7 @@ import java.io.PrintWriter; import java.util.Date; import java.util.Iterator; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; diff --git a/htroot/Bookmarks.java b/htroot/Bookmarks.java index e9cb45f06..1c5b9595c 100644 --- a/htroot/Bookmarks.java +++ b/htroot/Bookmarks.java @@ -39,7 +39,8 @@ import java.util.Map; import java.util.Set; import net.yacy.cora.date.ISO8601Formatter; -import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.HeaderFramework; @@ -53,7 +54,6 @@ import 
net.yacy.data.BookmarksDB.Bookmark; import net.yacy.data.BookmarksDB.Tag; import net.yacy.document.Document; import net.yacy.document.Parser; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.peers.NewsPool; import net.yacy.search.Switchboard; @@ -247,7 +247,7 @@ public class Bookmarks { try { final File file = new File(post.get("htmlfile")); - BookmarkHelper.importFromBookmarks(sb.bookmarksDB, new DigestURI(file), post.get("htmlfile$file"), tags, isPublic); + BookmarkHelper.importFromBookmarks(sb.bookmarksDB, new DigestURL(file), post.get("htmlfile$file"), tags, isPublic); } catch (final MalformedURLException e) {} ConcurrentLog.info("BOOKMARKS", "success!!"); diff --git a/htroot/CacheResource_p.java b/htroot/CacheResource_p.java index 3cb2ec835..caa95d1ba 100644 --- a/htroot/CacheResource_p.java +++ b/htroot/CacheResource_p.java @@ -24,13 +24,13 @@ import java.net.MalformedURLException; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.data.Cache; import net.yacy.document.ImageParser; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.server.serverObjects; import net.yacy.server.serverSwitch; import net.yacy.server.servletProperties; @@ -44,9 +44,9 @@ public class CacheResource_p { if (post == null) return prop; final String u = post.get("url", ""); - DigestURI url; + DigestURL url; try { - url = new DigestURI(u); + url = new DigestURL(u); } catch (final MalformedURLException e) { ConcurrentLog.logException(e); return prop; diff --git a/htroot/Collage.java b/htroot/Collage.java index b32fc1b9c..1ac10e9aa 100644 --- a/htroot/Collage.java +++ b/htroot/Collage.java @@ -24,7 +24,7 @@ import java.util.Random; -import net.yacy.cora.document.MultiProtocolURI; +import 
net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.RequestHeader; import net.yacy.crawler.data.ResultImages; @@ -89,8 +89,8 @@ public class Collage { final int yOffset = embed ? 0 : 70; for (int i = 0; i < fifoSize; i++) { - final MultiProtocolURI baseURL = origins[i].baseURL; - final MultiProtocolURI imageURL = origins[i].imageEntry.url(); + final MultiProtocolURL baseURL = origins[i].baseURL; + final MultiProtocolURL imageURL = origins[i].imageEntry.url(); // check if this loads a page from localhost, which must be prevented to protect the server // against attacks to the administration interface when localhost access is granted diff --git a/htroot/ConfigAppearance_p.java b/htroot/ConfigAppearance_p.java index 83b37ee48..3791f9e8b 100644 --- a/htroot/ConfigAppearance_p.java +++ b/htroot/ConfigAppearance_p.java @@ -39,9 +39,9 @@ import java.util.Iterator; import java.util.List; import java.util.Map; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.RequestHeader; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.FileUtils; import net.yacy.search.Switchboard; import net.yacy.search.SwitchboardConstants; @@ -101,7 +101,7 @@ public class ConfigAppearance_p { final Iterator it; try { - final DigestURI u = new DigestURI(url); + final DigestURL u = new DigestURL(url); it = FileUtils.strings(u.get(ClientIdentification.yacyInternetCrawlerAgent)); } catch (final IOException e) { prop.put("status", "1");// unable to get URL diff --git a/htroot/ConfigHeuristics_p.java b/htroot/ConfigHeuristics_p.java index 8ea27b15c..0c7fe7062 100644 --- a/htroot/ConfigHeuristics_p.java +++ b/htroot/ConfigHeuristics_p.java @@ -239,7 +239,7 @@ public class ConfigHeuristics_p { // re-read config (and create/update work table) if (sb.getConfigBool(SwitchboardConstants.HEURISTIC_OPENSEARCH, true)) { - OpenSearchConnector os = new 
OpenSearchConnector(sb, true); + new OpenSearchConnector(sb, true); } } } diff --git a/htroot/ConfigLanguage_p.java b/htroot/ConfigLanguage_p.java index f1103e794..b307d2c9a 100644 --- a/htroot/ConfigLanguage_p.java +++ b/htroot/ConfigLanguage_p.java @@ -39,11 +39,11 @@ import java.util.Iterator; import java.util.List; import java.util.Map; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.RequestHeader; import net.yacy.data.Translator; import net.yacy.data.WorkTables; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.FileUtils; import net.yacy.search.Switchboard; import net.yacy.server.serverObjects; @@ -100,7 +100,7 @@ public class ConfigLanguage_p { final String url = post.get("url"); Iterator it; try { - final DigestURI u = new DigestURI(url); + final DigestURL u = new DigestURL(url); it = FileUtils.strings(u.get(ClientIdentification.yacyInternetCrawlerAgent)); } catch(final IOException e) { prop.put("status", "1");//unable to get url diff --git a/htroot/ConfigNetwork_p.java b/htroot/ConfigNetwork_p.java index 731c19457..75ae2f39e 100644 --- a/htroot/ConfigNetwork_p.java +++ b/htroot/ConfigNetwork_p.java @@ -30,7 +30,7 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.util.Set; -import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.protocol.RequestHeader; import net.yacy.data.WorkTables; import net.yacy.kelondro.util.FileUtils; diff --git a/htroot/ConfigPortal.java b/htroot/ConfigPortal.java index 561ada482..9c83ec03e 100644 --- a/htroot/ConfigPortal.java +++ b/htroot/ConfigPortal.java @@ -30,10 +30,11 @@ import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.util.Properties; + +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.ConcurrentLog; import 
net.yacy.data.WorkTables; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.search.Switchboard; import net.yacy.search.SwitchboardConstants; import net.yacy.server.serverObjects; @@ -98,7 +99,7 @@ public class ConfigPortal { String excludehosts = post.get("search.excludehosts", ""); sb.setConfig("search.excludehosts", excludehosts); - sb.setConfig("search.excludehosth", DigestURI.hosthashes(excludehosts)); + sb.setConfig("search.excludehosth", DigestURL.hosthashes(excludehosts)); } if (post.containsKey("searchpage_default")) { // load defaults from defaults/yacy.init file diff --git a/htroot/ConfigUpdate_p.java b/htroot/ConfigUpdate_p.java index 71bcdd7d5..c3f6a58d0 100644 --- a/htroot/ConfigUpdate_p.java +++ b/htroot/ConfigUpdate_p.java @@ -32,9 +32,9 @@ import java.util.NavigableSet; import java.util.Set; import java.util.TreeSet; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.ConcurrentLog; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.OS; import net.yacy.peers.operation.yacyBuildProperties; @@ -86,7 +86,7 @@ public class ConfigUpdate_p { final String release = post.get("releasedownload", ""); if (!release.isEmpty()) { try { - yacyRelease versionToDownload = new yacyRelease(new DigestURI(release)); + yacyRelease versionToDownload = new yacyRelease(new DigestURL(release)); // replace this version with version which contains public key final yacyRelease.DevAndMainVersions allReleases = yacyRelease.allReleases(false, false); diff --git a/htroot/CrawlCheck_p.java b/htroot/CrawlCheck_p.java index c6c660dd2..09f342b94 100644 --- a/htroot/CrawlCheck_p.java +++ b/htroot/CrawlCheck_p.java @@ -24,6 +24,7 @@ import java.util.HashSet; import java.util.Set; import java.util.regex.Pattern; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.federate.yacy.CacheStrategy; import 
net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.RequestHeader; @@ -31,7 +32,6 @@ import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.retrieval.Request; import net.yacy.crawler.retrieval.Response; import net.yacy.crawler.robots.RobotsTxtEntry; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.search.Switchboard; import net.yacy.server.serverObjects; @@ -49,7 +49,7 @@ public class CrawlCheck_p { if (post.containsKey("crawlcheck")) { // get the list of rootURls for this crawl start - Set rootURLs = new HashSet(); + Set rootURLs = new HashSet(); String crawlingStart0 = post.get("crawlingURLs","").trim(); String[] rootURLs0 = crawlingStart0.indexOf('\n') > 0 || crawlingStart0.indexOf('\r') > 0 ? crawlingStart0.split("[\\r\\n]+") : crawlingStart0.split(Pattern.quote("|")); for (String crawlingStart: rootURLs0) { @@ -61,7 +61,7 @@ public class CrawlCheck_p { if (crawlingStart.startsWith("ftp")) crawlingStart = "ftp://" + crawlingStart; } try { - DigestURI crawlingStartURL = new DigestURI(crawlingStart); + DigestURL crawlingStartURL = new DigestURL(crawlingStart); rootURLs.add(crawlingStartURL); } catch (final MalformedURLException e) { ConcurrentLog.logException(e); @@ -78,7 +78,7 @@ public class CrawlCheck_p { // and analyze the urls to make the table rows StringBuilder s = new StringBuilder(300); int row = 0; - for (DigestURI u: rootURLs) { + for (DigestURL u: rootURLs) { s.append(u.toNormalform(true)).append('\n'); prop.put("table_list_" + row + "_url", u.toNormalform(true)); diff --git a/htroot/CrawlResults.java b/htroot/CrawlResults.java index c8fb7f258..0ee465dbf 100644 --- a/htroot/CrawlResults.java +++ b/htroot/CrawlResults.java @@ -31,8 +31,8 @@ import java.util.Iterator; import java.util.Locale; import java.util.Map; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.ASCII; +import 
net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.data.ResultURLs; diff --git a/htroot/CrawlStartScanner_p.java b/htroot/CrawlStartScanner_p.java index f3e2942c2..400824c19 100644 --- a/htroot/CrawlStartScanner_p.java +++ b/htroot/CrawlStartScanner_p.java @@ -31,6 +31,7 @@ import java.util.Set; import java.util.TreeMap; import java.util.regex.Pattern; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.federate.solr.connector.AbstractSolrConnector; import net.yacy.cora.order.Base64Order; import net.yacy.cora.protocol.Domains; @@ -40,7 +41,6 @@ import net.yacy.cora.protocol.Scanner.Access; import net.yacy.cora.sorting.ReversibleScoreMap; import net.yacy.cora.util.ConcurrentLog; import net.yacy.data.WorkTables; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.search.Switchboard; import net.yacy.search.SwitchboardConstants; import net.yacy.search.query.SearchEventCache; @@ -193,10 +193,10 @@ public class CrawlStartScanner_p if ( post.containsKey("crawl") ) { // make a pk/url mapping final Iterator> se = Scanner.scancacheEntries(); - final Map pkmap = new TreeMap(Base64Order.enhancedCoder); + final Map pkmap = new TreeMap(Base64Order.enhancedCoder); while (se.hasNext()) { final Scanner.Service u = se.next().getKey(); - DigestURI uu; + DigestURL uu; try { uu = u.url(); pkmap.put(uu.hash(), uu); @@ -208,7 +208,7 @@ public class CrawlStartScanner_p for ( final Map.Entry entry : post.entrySet() ) { if ( entry.getValue().startsWith("mark_") ) { final byte[] pk = entry.getValue().substring(5).getBytes(); - final DigestURI url = pkmap.get(pk); + final DigestURL url = pkmap.get(pk); if ( url != null ) { String path = 
"/Crawler_p.html?createBookmark=off&xsstopw=off&crawlingDomMaxPages=10000&intention=&range=domain&indexMedia=on&recrawl=nodoubles&xdstopw=off&storeHTCache=on&sitemapURL=&repeat_time=7&crawlingQ=on&cachePolicy=iffresh&indexText=on&crawlingMode=url&mustnotmatch=&crawlingDomFilterDepth=1&crawlingDomFilterCheck=off&crawlingstart=Start%20New%20Crawl&xpstopw=off&repeat_unit=seldays&crawlingDepth=99&directDocByURL=off"; path += "&crawlingURL=" + url.toNormalform(true); @@ -244,7 +244,7 @@ public class CrawlStartScanner_p final Map apiCommentCache = WorkTables.commentCache(sb); String urlString; - DigestURI u; + DigestURL u; try { final Iterator> se = Scanner.scancacheEntries(); Map.Entry host; diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index 0bcde88ad..e972b5fa0 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -30,12 +30,13 @@ import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; -import java.util.Properties; import java.util.Set; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; -import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.id.AnchorURL; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.RequestHeader; @@ -50,7 +51,6 @@ import net.yacy.data.WorkTables; import net.yacy.document.Document; import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.TransformerWriter; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.index.RowHandleSet; import net.yacy.kelondro.util.FileUtils; import net.yacy.peers.NewsPool; @@ -175,7 +175,7 @@ public class Crawler_p { String crawlingStart0 = post.get("crawlingURL","").trim(); // the crawljob start url String[] rootURLs0 = crawlingStart0.indexOf('\n') > 0 || crawlingStart0.indexOf('\r') > 0 ? 
crawlingStart0.split("[\\r\\n]+") : crawlingStart0.split(Pattern.quote("|")); - Set rootURLs = new HashSet(); + Set rootURLs = new HashSet(); String crawlName = ""; if (crawlingFile == null) for (String crawlingStart: rootURLs0) { if (crawlingStart == null || crawlingStart.length() == 0) continue; @@ -185,7 +185,7 @@ public class Crawler_p { if (crawlingStart.startsWith("ftp")) crawlingStart = "ftp://" + crawlingStart; else crawlingStart = "http://" + crawlingStart; } try { - DigestURI crawlingStartURL = new DigestURI(crawlingStart); + DigestURL crawlingStartURL = new DigestURL(crawlingStart); rootURLs.add(crawlingStartURL); crawlName += ((crawlingStartURL.getHost() == null) ? crawlingStartURL.toNormalform(true) : crawlingStartURL.getHost()) + ','; if (crawlingStartURL != null && (crawlingStartURL.isFile() || crawlingStartURL.isSMB())) storeHTCache = false; @@ -288,14 +288,14 @@ public class Crawler_p { if ("sitelist".equals(crawlingMode)) { newcrawlingMustNotMatch = CrawlProfile.MATCH_NEVER_STRING; - Set newRootURLs = new HashSet(); - for (DigestURI sitelistURL: rootURLs) { + Set newRootURLs = new HashSet(); + for (DigestURL sitelistURL: rootURLs) { // download document Document scraper; try { scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, agent); // get links and generate filter - for (DigestURI u: scraper.getAnchors().keySet()) { + for (DigestURL u: scraper.getAnchors()) { newRootURLs.add(u); } } catch (final IOException e) { @@ -313,14 +313,14 @@ public class Crawler_p { if (fullDomain) { siteFilter = CrawlProfile.siteFilter(rootURLs); if (deleteold) { - for (DigestURI u: rootURLs) { + for (DigestURL u: rootURLs) { sb.index.fulltext().deleteDomainHashpart(u.hosthash(), deleteageDate); } } } else if (subPath) { siteFilter = CrawlProfile.subpathFilter(rootURLs); if (deleteold) { - for (DigestURI u: rootURLs) { + for (DigestURL u: rootURLs) { String basepath = u.toNormalform(true); if (!basepath.endsWith("/")) {int p = 
basepath.lastIndexOf("/"); if (p > 0) basepath = basepath.substring(0, p + 1);} int count = sb.index.fulltext().remove(basepath, deleteageDate); @@ -339,7 +339,7 @@ public class Crawler_p { // check if the crawl filter works correctly try { Pattern mmp = Pattern.compile(newcrawlingMustMatch); - for (DigestURI u: rootURLs) { + for (DigestURL u: rootURLs) { assert mmp.matcher(u.toNormalform(true)).matches() : "pattern " + mmp.toString() + " does not match url " + u.toNormalform(true); } } catch (final PatternSyntaxException e) { @@ -389,7 +389,7 @@ public class Crawler_p { // delete all error urls for that domain List hosthashes = new ArrayList(); - for (DigestURI u: rootURLs) { + for (DigestURL u: rootURLs) { hosthashes.add(ASCII.getBytes(u.hosthash())); } sb.crawlQueues.errorURL.removeHosts(hosthashes, false); @@ -411,8 +411,8 @@ public class Crawler_p { // stack requests sb.crawler.putActive(handle, profile); - final Set successurls = new HashSet(); - final Map failurls = new HashMap(); + final Set successurls = new HashSet(); + final Map failurls = new HashMap(); sb.stackURLs(rootURLs, profile, successurls, failurls); if (failurls.size() == 0) { @@ -439,7 +439,7 @@ public class Crawler_p { } } else { StringBuilder fr = new StringBuilder(); - for (Map.Entry failure: failurls.entrySet()) { + for (Map.Entry failure: failurls.entrySet()) { sb.crawlQueues.errorURL.push( new Request( sb.peers.mySeed().hash.getBytes(), @@ -470,7 +470,7 @@ public class Crawler_p { } else if ("sitemap".equals(crawlingMode)) { final String sitemapURLStr = post.get("sitemapURL",""); try { - final DigestURI sitemapURL = new DigestURI(sitemapURLStr); + final DigestURL sitemapURL = new DigestURL(sitemapURLStr); sb.crawler.putActive(handle, profile); final SitemapImporter importer = new SitemapImporter(sb, sitemapURL, profile); importer.start(); @@ -488,7 +488,7 @@ public class Crawler_p { try { // check if the crawl filter works correctly Pattern.compile(newcrawlingMustMatch); - final 
ContentScraper scraper = new ContentScraper(new DigestURI(crawlingFile), 10000000); + final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000); final Writer writer = new TransformerWriter(null, null, scraper, null, false); if (crawlingFile != null && crawlingFile.exists()) { FileUtils.copy(new FileInputStream(crawlingFile), writer); @@ -498,12 +498,12 @@ public class Crawler_p { writer.close(); // get links and generate filter - final Map hyperlinks = scraper.getAnchors(); + final List hyperlinks = scraper.getAnchors(); if (newcrawlingdepth > 0) { if (fullDomain) { - newcrawlingMustMatch = CrawlProfile.siteFilter(hyperlinks.keySet()); + newcrawlingMustMatch = CrawlProfile.siteFilter(hyperlinks); } else if (subPath) { - newcrawlingMustMatch = CrawlProfile.subpathFilter(hyperlinks.keySet()); + newcrawlingMustMatch = CrawlProfile.subpathFilter(hyperlinks); } } diff --git a/htroot/DictionaryLoader_p.java b/htroot/DictionaryLoader_p.java index 2aab57113..03c79d4a5 100644 --- a/htroot/DictionaryLoader_p.java +++ b/htroot/DictionaryLoader_p.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.net.MalformedURLException; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.geo.GeonamesLocation; import net.yacy.cora.geo.OpenGeoDBLocation; @@ -29,7 +30,6 @@ import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.retrieval.Response; import net.yacy.document.LibraryProvider; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.FileUtils; import net.yacy.search.Switchboard; import net.yacy.server.serverObjects; @@ -66,7 +66,7 @@ public class DictionaryLoader_p { if (post.containsKey("geon0Load")) { // load from the net try { - final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, 
ClientIdentification.yacyInternetCrawlerAgent); + final Response response = sb.loader.load(sb.loader.request(new DigestURL(LibraryProvider.Dictionary.GEON0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.yacyInternetCrawlerAgent); final byte[] b = response.getContent(); FileUtils.copy(b, LibraryProvider.Dictionary.GEON0.file()); LibraryProvider.geoLoc.activateLocation(LibraryProvider.Dictionary.GEON0.nickname, new GeonamesLocation(LibraryProvider.Dictionary.GEON0.file(), null, -1)); @@ -108,7 +108,7 @@ public class DictionaryLoader_p { if (post.containsKey("geon1Load")) { // load from the net try { - final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.yacyInternetCrawlerAgent); + final Response response = sb.loader.load(sb.loader.request(new DigestURL(LibraryProvider.Dictionary.GEON1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.yacyInternetCrawlerAgent); final byte[] b = response.getContent(); FileUtils.copy(b, LibraryProvider.Dictionary.GEON1.file()); LibraryProvider.geoLoc.activateLocation(LibraryProvider.Dictionary.GEON1.nickname, new GeonamesLocation(LibraryProvider.Dictionary.GEON1.file(), null, -1)); @@ -150,7 +150,7 @@ public class DictionaryLoader_p { if (post.containsKey("geon2Load")) { // load from the net try { - final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON2.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.yacyInternetCrawlerAgent); + final Response response = sb.loader.load(sb.loader.request(new DigestURL(LibraryProvider.Dictionary.GEON2.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.yacyInternetCrawlerAgent); final byte[] b = response.getContent(); FileUtils.copy(b, 
LibraryProvider.Dictionary.GEON2.file()); LibraryProvider.geoLoc.activateLocation(LibraryProvider.Dictionary.GEON2.nickname, new GeonamesLocation(LibraryProvider.Dictionary.GEON2.file(), null, 100000)); @@ -192,7 +192,7 @@ public class DictionaryLoader_p { if (post.containsKey("geo1Load")) { // load from the net try { - final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEODB1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.yacyInternetCrawlerAgent); + final Response response = sb.loader.load(sb.loader.request(new DigestURL(LibraryProvider.Dictionary.GEODB1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.yacyInternetCrawlerAgent); final byte[] b = response.getContent(); FileUtils.copy(b, LibraryProvider.Dictionary.GEODB1.file()); LibraryProvider.geoLoc.deactivateLocalization(LibraryProvider.Dictionary.GEODB1.nickname); @@ -235,7 +235,7 @@ public class DictionaryLoader_p { if (post.containsKey("drw0Load")) { // load from the net try { - final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.DRW0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.yacyInternetCrawlerAgent); + final Response response = sb.loader.load(sb.loader.request(new DigestURL(LibraryProvider.Dictionary.DRW0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.yacyInternetCrawlerAgent); final byte[] b = response.getContent(); FileUtils.copy(b, LibraryProvider.Dictionary.DRW0.file()); LibraryProvider.activateDeReWo(); @@ -279,7 +279,7 @@ public class DictionaryLoader_p { if (post.containsKey("pnd0Load")) { // load from the net try { - final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.PND0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.yacyInternetCrawlerAgent); + 
final Response response = sb.loader.load(sb.loader.request(new DigestURL(LibraryProvider.Dictionary.PND0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.yacyInternetCrawlerAgent); final byte[] b = response.getContent(); FileUtils.copy(b, LibraryProvider.Dictionary.PND0.file()); LibraryProvider.activatePND(); diff --git a/htroot/HostBrowser.java b/htroot/HostBrowser.java index dd2f347cd..4b4df0c22 100644 --- a/htroot/HostBrowser.java +++ b/htroot/HostBrowser.java @@ -33,9 +33,10 @@ import java.util.concurrent.BlockingQueue; import org.apache.solr.common.SolrDocument; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.MultiProtocolURI; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.federate.solr.FailType; import net.yacy.cora.federate.solr.connector.AbstractSolrConnector; import net.yacy.cora.protocol.RequestHeader; @@ -46,7 +47,6 @@ import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.HarvestProcess; import net.yacy.crawler.data.NoticedURL.StackType; import net.yacy.crawler.retrieval.Request; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.peers.graphics.WebStructureGraph.StructureEntry; import net.yacy.search.Switchboard; @@ -115,8 +115,8 @@ public class HostBrowser { prop.putHTML("path", path); prop.put("delete", admin && path.length() > 0 ? 
1 : 0); - DigestURI pathURI = null; - try {pathURI = new DigestURI(path);} catch (final MalformedURLException e) {} + DigestURL pathURI = null; + try {pathURI = new DigestURL(path);} catch (final MalformedURLException e) {} String load = post.get("load", ""); boolean wait = false; @@ -127,10 +127,10 @@ public class HostBrowser { } if (load.length() > 0 && loadRight) { // stack URL - DigestURI url; + DigestURL url; if (sb.crawlStacker.size() > 2) wait = false; try { - url = new DigestURI(load); + url = new DigestURL(load); String reasonString = sb.crawlStacker.stackCrawl(new Request( sb.peers.mySeed().hash.getBytes(), url, null, load, new Date(), @@ -244,7 +244,7 @@ public class HostBrowser { } try { // generate file list from path - DigestURI uri = new DigestURI(path); + DigestURL uri = new DigestURL(path); String host = uri.getHost(); prop.putHTML("outbound_host", host); if (admin) prop.putHTML("outbound_admin_host", host); //used for WebStructurePicture_p link @@ -322,7 +322,7 @@ public class HostBrowser { while (links.hasNext()) { u = links.next(); try { - MultiProtocolURI mu = new MultiProtocolURI(u); + MultiProtocolURL mu = new MultiProtocolURL(u); if (mu.getHost() != null) { ReversibleScoreMap lks = outboundHosts.get(mu.getHost()); if (lks == null) { @@ -422,7 +422,7 @@ public class HostBrowser { prop.put("files_list_" + c + "_type", 0); prop.put("files_list_" + c + "_type_url", entry.getKey()); StoreType type = (StoreType) entry.getValue(); - try {uri = new DigestURI(entry.getKey());} catch (final MalformedURLException e) {uri = null;} + try {uri = new DigestURL(entry.getKey());} catch (final MalformedURLException e) {uri = null;} HarvestProcess process = uri == null ? 
null : sb.crawlQueues.exists(uri.hash()); boolean loading = load.equals(entry.getKey()) || (process != null && process != HarvestProcess.ERRORS); boolean error = process == HarvestProcess.ERRORS || type == StoreType.EXCLUDED || type == StoreType.FAILED; @@ -541,12 +541,12 @@ public class HostBrowser { // get all urls from the index and store them here for (String id: internalIDs) { if (id.equals(urlhash)) continue; // no self-references - DigestURI u = fulltext.getURL(ASCII.getBytes(id)); + DigestURL u = fulltext.getURL(ASCII.getBytes(id)); if (u != null) references_internal_urls.add(u.toNormalform(true)); } for (String id: externalIDs) { if (id.equals(urlhash)) continue; // no self-references - DigestURI u = fulltext.getURL(ASCII.getBytes(id)); + DigestURL u = fulltext.getURL(ASCII.getBytes(id)); if (u != null) references_external_urls.add(u.toNormalform(true)); } } catch (final IOException e) { diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java index b04aa5917..b95d284d7 100644 --- a/htroot/IndexControlRWIs_p.java +++ b/htroot/IndexControlRWIs_p.java @@ -31,8 +31,9 @@ import java.util.Iterator; import java.util.List; import net.yacy.cora.date.GenericFormatter; -import net.yacy.cora.document.ASCII; import net.yacy.cora.document.analysis.Classification.ContentDomain; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.RequestHeader; @@ -42,7 +43,6 @@ import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.SpaceExceededException; import net.yacy.data.ListManager; import net.yacy.document.Condenser; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; @@ -368,7 +368,7 @@ public class IndexControlRWIs_p { if ( 
post.containsKey("blacklisturls") ) { final String[] supportedBlacklistTypes = env.getConfig("BlackLists.types", "").split(","); - DigestURI url; + DigestURL url; for ( final byte[] b : urlb ) { try { urlHashes.put(b); @@ -395,7 +395,7 @@ public class IndexControlRWIs_p { } if ( post.containsKey("blacklistdomains") ) { - DigestURI url; + DigestURL url; for ( final byte[] b : urlb ) { try { urlHashes.put(b); @@ -461,7 +461,7 @@ public class IndexControlRWIs_p { prop.put("genUrlList_flags", (flags == null) ? "" : flags.exportB64()); prop.put("genUrlList_lines", maxlines); int i = 0; - DigestURI url; + DigestURL url; URIMetadataNode entry; String us; long rn = -1; @@ -483,7 +483,7 @@ public class IndexControlRWIs_p { prop.put("genUrlList_urlList_" + i + "_urlExists_urlStringShort", (us.length() > 40) ? (us.substring(0, 20) + "
" + us.substring(20, 40) + "...") : ((us.length() > 30) ? (us.substring(0, 20) + "
" + us.substring(20)) : us)); prop.putNum("genUrlList_urlList_" + i + "_urlExists_ranking", (entry.ranking() - rn)); - prop.putNum("genUrlList_urlList_" + i + "_urlExists_domlength", DigestURI.domLengthEstimation(entry.hash())); + prop.putNum("genUrlList_urlList_" + i + "_urlExists_domlength", DigestURL.domLengthEstimation(entry.hash())); prop.putNum("genUrlList_urlList_" + i + "_urlExists_tf", 1000.0 * entry.word().termFrequency()); prop.putNum("genUrlList_urlList_" + i + "_urlExists_authority", (theSearch.getOrder() == null) ? -1 : theSearch.getOrder().authority(ASCII.String(entry.hash(), 6, 6))); prop.put("genUrlList_urlList_" + i + "_urlExists_date", GenericFormatter.SHORT_DAY_FORMATTER.format(new Date(entry.word().lastModified()))); @@ -627,7 +627,7 @@ public class IndexControlRWIs_p { filter, false, null, - DigestURI.TLD_any_zone_filter, + DigestURL.TLD_any_zone_filter, "", false, sb.index, diff --git a/htroot/IndexControlURLs_p.java b/htroot/IndexControlURLs_p.java index 9f202d270..1fc8c91d8 100644 --- a/htroot/IndexControlURLs_p.java +++ b/htroot/IndexControlURLs_p.java @@ -33,7 +33,8 @@ import java.util.List; import java.util.Map; import net.yacy.cora.date.GenericFormatter; -import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.lod.JenaTripleStore; import net.yacy.cora.protocol.ClientIdentification; @@ -43,7 +44,6 @@ import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.data.Cache; import net.yacy.crawler.data.ResultURLs; import net.yacy.data.WorkTables; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.word.Word; import net.yacy.search.Switchboard; @@ -127,7 +127,7 @@ public class IndexControlURLs_p { String urlhash = post.get("urlhash", "").trim(); if (urlhash.isEmpty() && urlstring.length() > 0) { try { - urlhash = 
ASCII.String(new DigestURI(urlstring).hash()); + urlhash = ASCII.String(new DigestURL(urlstring).hash()); } catch (final MalformedURLException e) { } } @@ -184,7 +184,7 @@ public class IndexControlURLs_p { } if (post.containsKey("urlhashdelete")) { - final DigestURI url = segment.fulltext().getURL(ASCII.getBytes(urlhash)); + final DigestURL url = segment.fulltext().getURL(ASCII.getBytes(urlhash)); if (url == null) { prop.putHTML("result", "No Entry for URL hash " + urlhash + "; nothing deleted."); } else { @@ -197,7 +197,7 @@ public class IndexControlURLs_p { if (post.containsKey("urldelete")) { try { - urlhash = ASCII.String((new DigestURI(urlstring)).hash()); + urlhash = ASCII.String((new DigestURL(urlstring)).hash()); } catch (final MalformedURLException e) { urlhash = null; } @@ -211,7 +211,7 @@ public class IndexControlURLs_p { if (post.containsKey("urlstringsearch")) { try { - final DigestURI url = new DigestURI(urlstring); + final DigestURL url = new DigestURL(urlstring); urlhash = ASCII.String(url.hash()); prop.put("urlhash", urlhash); final URIMetadataNode entry = segment.fulltext().getMetadata(ASCII.getBytes(urlhash)); diff --git a/htroot/IndexCreateLoaderQueue_p.java b/htroot/IndexCreateLoaderQueue_p.java index 85b275da9..0d89f8319 100644 --- a/htroot/IndexCreateLoaderQueue_p.java +++ b/htroot/IndexCreateLoaderQueue_p.java @@ -27,7 +27,7 @@ // javac -classpath .:../classes IndexCreate_p.java // if the shell's current path is HTROOT -import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.protocol.RequestHeader; import net.yacy.crawler.retrieval.Request; import net.yacy.peers.Seed; diff --git a/htroot/IndexCreateParserErrors_p.java b/htroot/IndexCreateParserErrors_p.java index 3133e112c..6a10f44de 100644 --- a/htroot/IndexCreateParserErrors_p.java +++ b/htroot/IndexCreateParserErrors_p.java @@ -26,11 +26,11 @@ import java.util.ArrayList; import net.yacy.cora.date.GenericFormatter; -import 
net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.protocol.RequestHeader; import net.yacy.crawler.CrawlStacker; import net.yacy.crawler.data.ZURL; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.peers.Seed; import net.yacy.search.Switchboard; import net.yacy.server.serverObjects; @@ -72,7 +72,7 @@ public class IndexCreateParserErrors_p { prop.put("rejected_only-latest", "0"); } dark = true; - DigestURI url; + DigestURL url; byte[] initiatorHash, executorHash; Seed initiatorSeed, executorSeed; int j=0; diff --git a/htroot/IndexCreateQueues_p.java b/htroot/IndexCreateQueues_p.java index ba54886fd..75d169d12 100644 --- a/htroot/IndexCreateQueues_p.java +++ b/htroot/IndexCreateQueues_p.java @@ -9,7 +9,7 @@ import java.util.Map; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; -import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.CrawlSwitchboard; diff --git a/htroot/IndexDeletion_p.java b/htroot/IndexDeletion_p.java index 966bf664e..ad933298c 100644 --- a/htroot/IndexDeletion_p.java +++ b/htroot/IndexDeletion_p.java @@ -30,12 +30,12 @@ import java.util.regex.Pattern; import org.apache.solr.common.SolrDocument; import net.yacy.cora.date.ISO8601Formatter; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.federate.solr.connector.AbstractSolrConnector; import net.yacy.cora.federate.solr.connector.SolrConnector; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.sorting.ScoreMap; import net.yacy.data.WorkTables; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.search.Switchboard; import net.yacy.search.query.QueryModifier; import net.yacy.search.schema.CollectionSchema; @@ -129,7 +129,7 @@ public class IndexDeletion_p { if (urlStub.startsWith("ftp")) 
urlStub = "ftp://" + urlStub; else urlStub = "http://" + urlStub; } try { - DigestURI u = new DigestURI(urlStub); + DigestURL u = new DigestURL(urlStub); BlockingQueue dq = defaultConnector.concurrentDocumentsByQuery(CollectionSchema.host_s.getSolrFieldName() + ":\"" + u.getHost() + "\"", 0, 100000000, Long.MAX_VALUE, 100, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName()); SolrDocument doc; try { diff --git a/htroot/IndexFederated_p.java b/htroot/IndexFederated_p.java index 86f2928d3..a478183ec 100644 --- a/htroot/IndexFederated_p.java +++ b/htroot/IndexFederated_p.java @@ -26,7 +26,7 @@ import java.util.ArrayList; import org.apache.solr.common.SolrException; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.federate.solr.connector.RemoteSolrConnector; import net.yacy.cora.federate.solr.connector.SolrConnector; import net.yacy.cora.federate.solr.instance.RemoteInstance; diff --git a/htroot/IndexImportOAIPMH_p.java b/htroot/IndexImportOAIPMH_p.java index 77d3dfb3b..6db80a0c5 100644 --- a/htroot/IndexImportOAIPMH_p.java +++ b/htroot/IndexImportOAIPMH_p.java @@ -31,6 +31,7 @@ import java.util.Random; import java.util.Set; import java.util.TreeSet; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.ConcurrentLog; @@ -38,7 +39,6 @@ import net.yacy.data.WorkTables; import net.yacy.document.importer.OAIPMHImporter; import net.yacy.document.importer.OAIPMHLoader; import net.yacy.document.importer.ResumptionToken; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.search.Switchboard; import net.yacy.server.serverObjects; import net.yacy.server.serverSwitch; @@ -59,9 +59,9 @@ public class IndexImportOAIPMH_p { if (post.containsKey("urlstartone")) { String oaipmhurl = post.get("urlstartone"); if (oaipmhurl.indexOf('?',0) < 0) oaipmhurl = oaipmhurl + 
"?verb=ListRecords&metadataPrefix=oai_dc"; - DigestURI url = null; + DigestURL url = null; try { - url = new DigestURI(oaipmhurl); + url = new DigestURL(oaipmhurl); ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName)); final OAIPMHLoader r = new OAIPMHLoader(sb.loader, url, sb.surrogatesInPath, agent); final ResumptionToken rt = r.getResumptionToken(); @@ -72,7 +72,7 @@ public class IndexImportOAIPMH_p { // set next default url try { - final DigestURI nexturl = (rt == null) ? null : rt.resumptionURL(); + final DigestURL nexturl = (rt == null) ? null : rt.resumptionURL(); if (rt != null) prop.put("defaulturl", (nexturl == null) ? "" : nexturl.toNormalform(true)); } catch (final MalformedURLException e) { prop.put("defaulturl", e.getMessage()); @@ -94,9 +94,9 @@ public class IndexImportOAIPMH_p { if (post.get("urlstart", "").length() > 0) { final String oaipmhurl = post.get("urlstart", ""); sb.tables.recordAPICall(post, "IndexImportOAIPMH_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "OAI-PMH import for " + oaipmhurl); - DigestURI url = null; + DigestURL url = null; try { - url = new DigestURI(oaipmhurl); + url = new DigestURL(oaipmhurl); ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName)); final OAIPMHImporter job = new OAIPMHImporter(sb.loader, agent, url); job.start(); @@ -129,12 +129,12 @@ public class IndexImportOAIPMH_p { final Random r = new Random(System.currentTimeMillis()); // start jobs for the sources - DigestURI url = null; + DigestURL url = null; ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName)); while (!sourceList.isEmpty()) { final String oaipmhurl = sourceList.remove(r.nextInt(sourceList.size())); try { - url = new DigestURI(oaipmhurl); + url = new DigestURL(oaipmhurl); final OAIPMHImporter job 
= new OAIPMHImporter(sb.loader, agent, url); job.start(); } catch (final MalformedURLException e) { diff --git a/htroot/Load_RSS_p.java b/htroot/Load_RSS_p.java index bab9f7922..dae006c19 100644 --- a/htroot/Load_RSS_p.java +++ b/htroot/Load_RSS_p.java @@ -29,12 +29,13 @@ import java.util.List; import java.util.Map; import java.util.regex.Pattern; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.Hit; -import net.yacy.cora.document.RSSFeed; -import net.yacy.cora.document.RSSMessage; -import net.yacy.cora.document.RSSReader; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.feed.Hit; +import net.yacy.cora.document.feed.RSSFeed; +import net.yacy.cora.document.feed.RSSMessage; +import net.yacy.cora.document.feed.RSSReader; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.RequestHeader; @@ -48,7 +49,6 @@ import net.yacy.crawler.retrieval.Response; import net.yacy.data.WorkTables; import net.yacy.kelondro.blob.Tables; import net.yacy.kelondro.blob.Tables.Row; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.search.Switchboard; import net.yacy.search.schema.CollectionSchema; @@ -169,9 +169,9 @@ public class Load_RSS_p { ConcurrentLog.logException(e); continue; } - DigestURI url = null; + DigestURL url = null; try { - url = new DigestURI(row.get("url", "")); + url = new DigestURL(row.get("url", "")); } catch (final MalformedURLException e) { ConcurrentLog.warn("Load_RSS", "malformed url '" + row.get("url", "") + "': " + e.getMessage()); continue; @@ -203,7 +203,7 @@ public class Load_RSS_p { messageurl = row.get("url", ""); if (messageurl.isEmpty()) continue; // get referrer - final DigestURI referrer = sb.getURL(row.get("referrer", "").getBytes()); + 
final DigestURL referrer = sb.getURL(row.get("referrer", "").getBytes()); // check if feed is registered in scheduler final byte[] api_pk = row.get("api_pk"); final Row r = api_pk == null ? null : sb.tables.select("api", api_pk); @@ -257,9 +257,9 @@ public class Load_RSS_p { boolean record_api = false; - DigestURI url = null; + DigestURL url = null; try { - url = post.containsKey("url") ? new DigestURI(post.get("url", "")) : null; + url = post.containsKey("url") ? new DigestURL(post.get("url", "")) : null; } catch (final MalformedURLException e) { ConcurrentLog.warn("Load_RSS_p", "url not well-formed: '" + post.get("url", "") + "'"); } @@ -280,12 +280,12 @@ public class Load_RSS_p { // index all selected items: description only if (rss != null && post.containsKey("indexSelectedItemContent")) { final RSSFeed feed = rss.getFeed(); - List list = new ArrayList(); + List list = new ArrayList(); Map messages = new HashMap(); loop: for (final Map.Entry entry: post.entrySet()) { if (entry.getValue().startsWith("mark_")) try { final RSSMessage message = feed.getMessage(entry.getValue().substring(5)); - final DigestURI messageurl = new DigestURI(message.getLink()); + final DigestURL messageurl = new DigestURL(message.getLink()); if (RSSLoader.indexTriggered.containsKey(messageurl.hash())) continue loop; messages.put(ASCII.String(messageurl.hash()), message); } catch (final IOException e) { @@ -296,7 +296,7 @@ public class Load_RSS_p { loop: for (final Map.Entry entry: messages.entrySet()) { try { final RSSMessage message = entry.getValue(); - final DigestURI messageurl = new DigestURI(message.getLink()); + final DigestURL messageurl = new DigestURL(message.getLink()); if (existingurls.get(ASCII.String(messageurl.hash())) != null) continue loop; list.add(messageurl); RSSLoader.indexTriggered.insertIfAbsent(messageurl.hash(), new Date()); @@ -334,10 +334,10 @@ public class Load_RSS_p { prop.putHTML("showitems_ttl", channel == null ? 
"" : channel.getTTL()); prop.putHTML("showitems_docs", channel == null ? "" : channel.getDocs()); - Map urls = new HashMap(); + Map urls = new HashMap(); for (final Hit item: feed) { try { - final DigestURI messageurl = new DigestURI(item.getLink()); + final DigestURL messageurl = new DigestURL(item.getLink()); urls.put(ASCII.String(messageurl.hash()), messageurl); } catch (final MalformedURLException e) { ConcurrentLog.logException(e); @@ -349,7 +349,7 @@ public class Load_RSS_p { int i = 0; for (final Hit item: feed) { try { - final DigestURI messageurl = new DigestURI(item.getLink()); + final DigestURL messageurl = new DigestURL(item.getLink()); author = item.getAuthor(); if (author == null) author = item.getCopyright(); pubDate = item.getPubDate(); diff --git a/htroot/MessageSend_p.java b/htroot/MessageSend_p.java index bf4dff509..f192dd59e 100644 --- a/htroot/MessageSend_p.java +++ b/htroot/MessageSend_p.java @@ -30,7 +30,7 @@ import java.util.Date; import java.util.Locale; import java.util.Map; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.protocol.RequestHeader; import net.yacy.peers.Protocol; import net.yacy.peers.Seed; diff --git a/htroot/Messages_p.java b/htroot/Messages_p.java index cc5f6d55e..80aeb46ef 100644 --- a/htroot/Messages_p.java +++ b/htroot/Messages_p.java @@ -32,7 +32,7 @@ import java.util.Iterator; import java.util.Locale; import java.util.TreeMap; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; import net.yacy.data.MessageBoard; diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java index 85d47c1ee..159b23cdf 100644 --- a/htroot/QuickCrawlLink_p.java +++ b/htroot/QuickCrawlLink_p.java @@ -32,7 +32,8 @@ import java.net.MalformedURLException; import java.util.Date; -import net.yacy.cora.document.UTF8; +import 
net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.Domains; @@ -41,7 +42,6 @@ import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.NumberTools; import net.yacy.crawler.data.CrawlProfile; import net.yacy.crawler.retrieval.Request; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.search.Switchboard; import net.yacy.search.index.Segment; import net.yacy.server.serverObjects; @@ -112,12 +112,12 @@ public class QuickCrawlLink_p { if (crawlingStart != null) { crawlingStart = crawlingStart.trim(); - try {crawlingStart = new DigestURI(crawlingStart).toNormalform(true);} catch (final MalformedURLException e1) {} + try {crawlingStart = new DigestURL(crawlingStart).toNormalform(true);} catch (final MalformedURLException e1) {} // check if url is proper - DigestURI crawlingStartURL = null; + DigestURL crawlingStartURL = null; try { - crawlingStartURL = new DigestURI(crawlingStart); + crawlingStartURL = new DigestURL(crawlingStart); } catch (final MalformedURLException e) { prop.put("mode_status", "1"); prop.put("mode_code", "1"); diff --git a/htroot/ServerScannerList.java b/htroot/ServerScannerList.java index 919643964..e5dd62a26 100644 --- a/htroot/ServerScannerList.java +++ b/htroot/ServerScannerList.java @@ -23,13 +23,13 @@ import java.util.ConcurrentModificationException; import java.util.Iterator; import java.util.Map; -import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.Scanner; import net.yacy.cora.protocol.Scanner.Access; import net.yacy.cora.util.ConcurrentLog; import net.yacy.data.WorkTables; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.search.Switchboard; import net.yacy.server.serverObjects; import 
net.yacy.server.serverSwitch; @@ -54,7 +54,7 @@ public class ServerScannerList { // show scancache table prop.put("servertable", 1); String urlString; - DigestURI u; + DigestURL u; table: while (true) { try { int i = 0; diff --git a/htroot/SettingsAck_p.java b/htroot/SettingsAck_p.java index 46ba26bb2..a916a2a2e 100644 --- a/htroot/SettingsAck_p.java +++ b/htroot/SettingsAck_p.java @@ -35,7 +35,7 @@ import java.util.StringTokenizer; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; -import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.Digest; import net.yacy.cora.protocol.Domains; @@ -62,7 +62,7 @@ public class SettingsAck_p { final Switchboard sb = (Switchboard) env; // get referer for backlink - final MultiProtocolURI referer = header.referer(); + final MultiProtocolURL referer = header.referer(); prop.put("referer", (referer == null) ? "Settings_p.html" : referer.toNormalform(true)); //if (post == null) System.out.println("POST: NULL"); else System.out.println("POST: " + post.toString()); diff --git a/htroot/Supporter.java b/htroot/Supporter.java index c56a3f62d..391a0c9a5 100644 --- a/htroot/Supporter.java +++ b/htroot/Supporter.java @@ -31,13 +31,13 @@ import java.util.HashMap; import java.util.Iterator; import net.yacy.cora.date.GenericFormatter; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.order.NaturalOrder; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.sorting.ConcurrentScoreMap; import net.yacy.cora.sorting.ScoreMap; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.index.Row; import net.yacy.kelondro.index.Row.Entry; import net.yacy.peers.NewsDB; @@ -129,7 +129,7 @@ public class 
Supporter { url = row.getPrimaryKeyUTF8().trim(); try { - if (Switchboard.urlBlacklist.isListed(BlacklistType.SURFTIPS, new DigestURI(url, urlhash.getBytes()))) continue; + if (Switchboard.urlBlacklist.isListed(BlacklistType.SURFTIPS, new DigestURL(url, urlhash.getBytes()))) continue; } catch (final MalformedURLException e) { continue; } @@ -247,13 +247,13 @@ public class Supporter { // add/subtract votes and write record if (entry != null) { try { - urlhash = ASCII.String((new DigestURI(url)).hash()); + urlhash = ASCII.String((new DigestURL(url)).hash()); } catch (final MalformedURLException e) { urlhash = null; } if (urlhash == null) try { - urlhash = ASCII.String((new DigestURI("http://" + url)).hash()); + urlhash = ASCII.String((new DigestURL("http://" + url)).hash()); } catch (final MalformedURLException e) { urlhash = null; } diff --git a/htroot/Surftips.java b/htroot/Surftips.java index 48eb86f84..0926a8728 100644 --- a/htroot/Surftips.java +++ b/htroot/Surftips.java @@ -31,12 +31,12 @@ import java.util.HashMap; import java.util.Iterator; import net.yacy.cora.date.GenericFormatter; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.order.NaturalOrder; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.sorting.ConcurrentScoreMap; import net.yacy.cora.sorting.ScoreMap; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.index.Row; import net.yacy.kelondro.index.Row.Entry; import net.yacy.peers.NewsDB; @@ -136,7 +136,7 @@ public class Surftips { url = row.getPrimaryKeyUTF8().trim(); try{ - if(Switchboard.urlBlacklist.isListed(BlacklistType.SURFTIPS ,new DigestURI(url))) + if(Switchboard.urlBlacklist.isListed(BlacklistType.SURFTIPS ,new DigestURL(url))) continue; }catch(final MalformedURLException e){continue;} title = row.getColUTF8(1); @@ -306,13 +306,13 @@ public class Surftips { // add/subtract votes and write record if (entry 
!= null) { try { - urlhash = UTF8.String((new DigestURI(url)).hash()); + urlhash = UTF8.String((new DigestURL(url)).hash()); } catch (final MalformedURLException e) { urlhash = null; } if (urlhash == null) try { - urlhash = UTF8.String((new DigestURI("http://"+url)).hash()); + urlhash = UTF8.String((new DigestURL("http://"+url)).hash()); } catch (final MalformedURLException e) { urlhash = null; } diff --git a/htroot/Table_API_p.java b/htroot/Table_API_p.java index 6b3689343..4533d9bc7 100644 --- a/htroot/Table_API_p.java +++ b/htroot/Table_API_p.java @@ -29,7 +29,7 @@ import java.util.Set; import java.util.TreeSet; import java.util.regex.Pattern; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.ConcurrentLog; diff --git a/htroot/Table_YMark_p.java b/htroot/Table_YMark_p.java index d02d935cd..482d6ca13 100644 --- a/htroot/Table_YMark_p.java +++ b/htroot/Table_YMark_p.java @@ -5,7 +5,7 @@ import java.util.List; import java.util.Map; import java.util.regex.Pattern; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.SpaceExceededException; diff --git a/htroot/Tables_p.java b/htroot/Tables_p.java index ec59a8ad7..3e3aa5c42 100644 --- a/htroot/Tables_p.java +++ b/htroot/Tables_p.java @@ -25,7 +25,7 @@ import java.util.List; import java.util.Map; import java.util.regex.Pattern; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.SpaceExceededException; diff --git a/htroot/Triple_p.java b/htroot/Triple_p.java index 11cee0daf..aea54eb25 100644 --- a/htroot/Triple_p.java +++ b/htroot/Triple_p.java @@ -22,10 +22,10 @@ import 
java.io.ByteArrayOutputStream; import java.io.IOException; import java.net.MalformedURLException; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.lod.JenaTripleStore; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.ConcurrentLog; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.server.serverObjects; import net.yacy.server.serverSwitch; import net.yacy.server.http.HTTPDemon; @@ -70,7 +70,7 @@ public class Triple_p { for (String s: list.split("\n")) { String newurl = s; try { - DigestURI d = new DigestURI (s); + DigestURL d = new DigestURL (s); if (d.getHost().endsWith(".yacy")) { newurl = d.getProtocol()+"://"+HTTPDemon.getAlternativeResolver().resolve(d.getHost())+d.getPath(); diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index a25bf64fd..ef4137111 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -36,9 +36,10 @@ import java.util.Iterator; import java.util.Map; import java.util.Properties; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.MultiProtocolURI; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.lod.JenaTripleStore; import net.yacy.cora.lod.vocabulary.YaCyMetadata; @@ -54,7 +55,6 @@ import net.yacy.document.SentenceReader; import net.yacy.document.WordTokenizer; import net.yacy.document.parser.html.CharacterCoding; import net.yacy.document.parser.html.ImageEntry; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.search.Switchboard; import net.yacy.search.index.Segment; @@ -107,7 +107,7 @@ public class ViewFile { final String viewMode = post.get("viewMode","parsed"); prop.put("error_vMode-" + viewMode, "1"); - DigestURI url = null; + DigestURL url = 
null; String descr = ""; final int wordCount = 0; int size = 0; @@ -127,7 +127,7 @@ public class ViewFile { } // define an url by post parameter - url = new DigestURI(MultiProtocolURI.unescape(urlString)); + url = new DigestURL(MultiProtocolURL.unescape(urlString)); urlHash = ASCII.String(url.hash()); pre = post.getBoolean("pre"); } catch (final MalformedURLException e) {} @@ -185,7 +185,7 @@ public class ViewFile { } final String[] wordArray = wordArray(post.get("words", null)); - final String ext = MultiProtocolURI.getFileExtension(url.getFileName()); + final String ext = MultiProtocolURL.getFileExtension(url.getFileName()); if (viewMode.equals("plain")) { // TODO: how to handle very large files here ? @@ -311,11 +311,11 @@ public class ViewFile { prop.put("viewMode", VIEW_MODE_AS_LINKLIST); boolean dark = true; int i = 0; - i += putMediaInfo(prop, wordArray, i, document.getVideolinks(), "video", (i % 2 == 0), document.getAnchors()); - i += putMediaInfo(prop, wordArray, i, document.getAudiolinks(), "audio", (i % 2 == 0), document.getAnchors()); + i += putMediaInfo(prop, wordArray, i, document.getVideolinks(), "video", (i % 2 == 0)); + i += putMediaInfo(prop, wordArray, i, document.getAudiolinks(), "audio", (i % 2 == 0)); dark = (i % 2 == 0); - final Map ts = document.getImages(); + final Map ts = document.getImages(); final Iterator tsi = ts.values().iterator(); ImageEntry entry; while (tsi.hasNext()) { @@ -335,8 +335,8 @@ public class ViewFile { dark = !dark; i++; } - i += putMediaInfo(prop, wordArray, i, document.getApplinks(), "app", (i % 2 == 0), document.getAnchors()); - i += putMediaInfo(prop, wordArray, i, document.getHyperlinks(), "link", (i % 2 == 0), document.getAnchors()); + i += putMediaInfo(prop, wordArray, i, document.getApplinks(), "app", (i % 2 == 0)); + i += putMediaInfo(prop, wordArray, i, document.getHyperlinks(), "link", (i % 2 == 0)); prop.put("viewMode_links", i); } @@ -439,13 +439,12 @@ public class ViewFile { final serverObjects prop, 
final String[] wordArray, int c, - final Map media, + final Map media, final String type, - boolean dark, - final Map alllinks) { + boolean dark) { int i = 0; - for (final Map.Entry entry : media.entrySet()) { - final Properties p = alllinks.get(entry.getKey()); + for (final Map.Entry entry : media.entrySet()) { + final Properties p = entry.getKey().getProperties(); final String name = p.getProperty("name", ""); // the name attribute final String rel = p.getProperty("rel", ""); // the rel-attribute final String text = p.getProperty("text", ""); // the text between the tag diff --git a/htroot/ViewImage.java b/htroot/ViewImage.java index f343a01d2..d0e9b1271 100644 --- a/htroot/ViewImage.java +++ b/htroot/ViewImage.java @@ -32,6 +32,7 @@ import java.io.InputStream; import java.net.MalformedURLException; import java.util.Map; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.Domains; @@ -41,7 +42,6 @@ import net.yacy.cora.storage.ConcurrentARC; import net.yacy.cora.util.ConcurrentLog; import net.yacy.data.URLLicense; import net.yacy.document.ImageParser; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.MemoryControl; import net.yacy.kelondro.workflow.WorkflowProcessor; @@ -74,9 +74,9 @@ public class ViewImage { final String urlLicense = post.get("code", ""); final boolean auth = Domains.isLocalhost(header.get(HeaderFramework.CONNECTION_PROP_CLIENTIP, "")) || sb.verifyAuthentication(header); // handle access rights - DigestURI url = null; + DigestURL url = null; if ((urlString.length() > 0) && (auth)) try { - url = new DigestURI(urlString); + url = new DigestURL(urlString); } catch (final MalformedURLException e1) { url = null; } @@ -84,7 +84,7 @@ public class ViewImage { if ((url == null) && (urlLicense.length() > 0)) { urlString = URLLicense.releaseLicense(urlLicense); try { 
- url = new DigestURI(urlString); + url = new DigestURL(urlString); } catch (final MalformedURLException e1) { url = null; urlString = null; diff --git a/htroot/Vocabulary_p.java b/htroot/Vocabulary_p.java index baaeddf87..d660fa10b 100644 --- a/htroot/Vocabulary_p.java +++ b/htroot/Vocabulary_p.java @@ -26,7 +26,8 @@ import java.util.Iterator; import java.util.Map; import java.util.TreeMap; -import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.lod.vocabulary.DCTerms; import net.yacy.cora.lod.vocabulary.Owl; import net.yacy.cora.lod.vocabulary.Tagging; @@ -35,7 +36,6 @@ import net.yacy.cora.lod.vocabulary.YaCyMetadata; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.LibraryProvider; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.search.Switchboard; import net.yacy.search.index.Segment; @@ -59,8 +59,8 @@ public class Vocabulary_p { // create a vocabulary if (discovername != null && discovername.length() > 0) { String discoverobjectspace = post.get("discoverobjectspace", ""); - MultiProtocolURI discoveruri = null; - if (discoverobjectspace.length() > 0) try {discoveruri = new MultiProtocolURI(discoverobjectspace);} catch (final MalformedURLException e) {} + MultiProtocolURL discoveruri = null; + if (discoverobjectspace.length() > 0) try {discoveruri = new MultiProtocolURL(discoverobjectspace);} catch (final MalformedURLException e) {} if (discoveruri == null) discoverobjectspace = ""; Map table = new TreeMap(); File propFile = LibraryProvider.autotagging.getVocabularyFile(discovername); @@ -72,9 +72,9 @@ public class Vocabulary_p { Segment segment = sb.index; String t; if (!discoverNot) { - Iterator ui = segment.urlSelector(discoveruri, 600000L, 100000); + Iterator ui = segment.urlSelector(discoveruri, 600000L, 100000); while 
(ui.hasNext()) { - DigestURI u = ui.next(); + DigestURL u = ui.next(); String u0 = u.toNormalform(true); t = ""; if (discoverFromPath) { @@ -131,7 +131,7 @@ public class Vocabulary_p { if (post.get("add_new", "").equals("checked") && post.get("newterm", "").length() > 0) { String objectlink = post.get("newobjectlink", ""); if (objectlink.length() > 0) try { - objectlink = new MultiProtocolURI(objectlink).toNormalform(true); + objectlink = new MultiProtocolURL(objectlink).toNormalform(true); } catch (final MalformedURLException e) {} vocabulary.put(post.get("newterm", ""), post.get("newsynonyms", ""), objectlink); } diff --git a/htroot/WatchWebStructure_p.java b/htroot/WatchWebStructure_p.java index a376c7b09..9a7fa087a 100644 --- a/htroot/WatchWebStructure_p.java +++ b/htroot/WatchWebStructure_p.java @@ -6,11 +6,11 @@ import java.util.Iterator; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.sorting.ReversibleScoreMap; import net.yacy.crawler.CrawlSwitchboard; import net.yacy.crawler.data.CrawlProfile; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.search.Switchboard; import net.yacy.server.serverObjects; import net.yacy.server.serverSwitch; @@ -64,7 +64,7 @@ public class WatchWebStructure_p { // fix start point if a "www."-prefix would be better if (host != null && !host.startsWith("www")) { - if (sb.webStructure.referencesCount(DigestURI.hosthash6("www." + host)) > sb.webStructure.referencesCount(DigestURI.hosthash6(host))) { + if (sb.webStructure.referencesCount(DigestURL.hosthash6("www." + host)) > sb.webStructure.referencesCount(DigestURL.hosthash6(host))) { host = "www." 
+ host; } } diff --git a/htroot/WebStructurePicture_p.java b/htroot/WebStructurePicture_p.java index 4be6f047f..ac1f372ac 100644 --- a/htroot/WebStructurePicture_p.java +++ b/htroot/WebStructurePicture_p.java @@ -33,12 +33,12 @@ import java.util.Iterator; import java.util.List; import java.util.Map; -import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.order.Base64Order; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.sorting.ClusteredScoreMap; import net.yacy.cora.util.ConcurrentLog; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.peers.graphics.WebStructureGraph; import net.yacy.search.Switchboard; import net.yacy.server.serverObjects; @@ -116,7 +116,7 @@ public class WebStructurePicture_p { for (int i = 0; i < hostlist.length; i++) { String host = hostlist[i]; String hash = null; - try {hash = ASCII.String((new DigestURI("http://" + host)).hash(), 6, 6);} catch (final MalformedURLException e) {ConcurrentLog.logException(e);} + try {hash = ASCII.String((new DigestURL("http://" + host)).hash(), 6, 6);} catch (final MalformedURLException e) {ConcurrentLog.logException(e);} Map.Entry centernode = new AbstractMap.SimpleEntry(hash, host); double angle = 2.0d * i * Math.PI / hostlist.length; if (hostlist.length == 3) angle -= Math.PI / 2; diff --git a/htroot/Wiki.java b/htroot/Wiki.java index 15b52e6bd..a3aa6b1f2 100644 --- a/htroot/Wiki.java +++ b/htroot/Wiki.java @@ -38,7 +38,7 @@ import java.util.Iterator; import java.util.Locale; import java.util.Map; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; diff --git a/htroot/YMarks.java b/htroot/YMarks.java index 7b3b99d1a..1e4efa5c1 100644 --- a/htroot/YMarks.java +++ b/htroot/YMarks.java @@ -1,7 +1,7 @@ import 
java.io.IOException; import java.util.Iterator; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.ConcurrentLog; diff --git a/htroot/api/bookmarks/posts/delete_p.java b/htroot/api/bookmarks/posts/delete_p.java index 6cb85617d..1e65bf111 100644 --- a/htroot/api/bookmarks/posts/delete_p.java +++ b/htroot/api/bookmarks/posts/delete_p.java @@ -1,9 +1,9 @@ import java.net.MalformedURLException; -import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.protocol.RequestHeader; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.search.Switchboard; import net.yacy.server.serverObjects; import net.yacy.server.serverSwitch; @@ -23,7 +23,7 @@ public class delete_p { return prop; } try { - if (post.containsKey("url") && switchboard.bookmarksDB.removeBookmark(ASCII.String((new DigestURI(post.get("url", "nourl"))).hash()))) { + if (post.containsKey("url") && switchboard.bookmarksDB.removeBookmark(ASCII.String((new DigestURL(post.get("url", "nourl"))).hash()))) { prop.put("result", "1"); } else if (post.containsKey("urlhash") && switchboard.bookmarksDB.removeBookmark(post.get("urlhash", "nohash"))) { prop.put("result", "1"); diff --git a/htroot/api/citation.java b/htroot/api/citation.java index e77e44746..2d76f3fc9 100644 --- a/htroot/api/citation.java +++ b/htroot/api/citation.java @@ -30,12 +30,12 @@ import java.util.TreeSet; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocumentList; -import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.federate.solr.connector.SolrConnector; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.sorting.OrderedScoreMap; import 
net.yacy.document.SentenceReader; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.search.Switchboard; import net.yacy.search.index.Segment; import net.yacy.search.schema.CollectionSchema; @@ -56,7 +56,7 @@ public class citation { prop.put("citations", 0); prop.put("sentences", 0); - DigestURI uri = null; + DigestURL uri = null; String url = ""; String hash = ""; int ch = 10; @@ -81,7 +81,7 @@ public class citation { if (url.length() > 0) { try { - uri = new DigestURI(url, null); + uri = new DigestURL(url, null); hash = ASCII.String(uri.hash()); } catch (final MalformedURLException e) {} } @@ -118,7 +118,7 @@ public class citation { // for each line make a statistic about the number of occurrences somewhere else OrderedScoreMap scores = new OrderedScoreMap(null); // accumulates scores for citating urls - LinkedHashMap> sentenceOcc = new LinkedHashMap>(); + LinkedHashMap> sentenceOcc = new LinkedHashMap>(); for (String sentence: sentences) { if (sentence == null || sentence.length() < 40) { // do not count the very short sentences @@ -130,12 +130,12 @@ public class citation { SolrDocumentList doclist = connector.getDocumentListByQuery("text_t:\"" + sentence + "\"", 0, 100, CollectionSchema.sku.getSolrFieldName()); int count = (int) doclist.getNumFound(); if (count > 0) { - Set list = new TreeSet(); + Set list = new TreeSet(); for (SolrDocument d: doclist) { String u = (String) d.getFieldValue(CollectionSchema.sku.getSolrFieldName()); if (u == null || u.equals(url)) continue; scores.inc(u); - try {list.add(new DigestURI(u, null));} catch (final MalformedURLException e) {} + try {list.add(new DigestURL(u, null));} catch (final MalformedURLException e) {} } sentenceOcc.put(sentence, list); } @@ -147,13 +147,13 @@ public class citation { // iterate the sentences int i = 0; - for (Map.Entry> se: sentenceOcc.entrySet()) { + for (Map.Entry> se: sentenceOcc.entrySet()) { prop.put("sentences_" + i + "_dt", i); StringBuilder dd = new StringBuilder(se.getKey()); 
- Set app = se.getValue(); + Set app = se.getValue(); if (app != null && app.size() > 0) { dd.append("
appears in:"); - for (DigestURI u: app) { + for (DigestURL u: app) { if (u != null) { dd.append(" ").append(u.getHost()).append(""); } @@ -168,12 +168,12 @@ public class citation { i = 0; for (String u: scores.keyList(false)) { try { - DigestURI uu = new DigestURI(u, null); + DigestURL uu = new DigestURL(u, null); prop.put("citations_" + i + "_dt", "" + u + ""); StringBuilder dd = new StringBuilder(); dd.append("makes ").append(Integer.toString(scores.get(u))).append(" citations: of ").append(url); - for (Map.Entry> se: sentenceOcc.entrySet()) { - Set occurls = se.getValue(); + for (Map.Entry> se: sentenceOcc.entrySet()) { + Set occurls = se.getValue(); if (occurls != null && occurls.contains(uu)) dd.append("
").append(se.getKey()).append(""); } prop.put("citations_" + i + "_dd", dd.toString()); @@ -187,7 +187,7 @@ public class citation { for (String u: scores.keyList(false)) { if (scores.get(u) < ch) continue; try { - DigestURI uu = new DigestURI(u, null); + DigestURL uu = new DigestURL(u, null); if (uu.getOrganization().equals(uri.getOrganization())) continue; prop.put("similar_links_" + i + "_url", u); i++; diff --git a/htroot/api/feed.java b/htroot/api/feed.java index 84bc5e49a..e4b5b7f9c 100644 --- a/htroot/api/feed.java +++ b/htroot/api/feed.java @@ -3,8 +3,8 @@ import java.util.Date; import java.util.List; -import net.yacy.cora.document.RSSFeed; -import net.yacy.cora.document.RSSMessage; +import net.yacy.cora.document.feed.RSSFeed; +import net.yacy.cora.document.feed.RSSMessage; import net.yacy.cora.protocol.RequestHeader; import net.yacy.peers.EventChannel; import net.yacy.search.Switchboard; diff --git a/htroot/api/getpageinfo.java b/htroot/api/getpageinfo.java index e2d41313d..eb6cd052b 100644 --- a/htroot/api/getpageinfo.java +++ b/htroot/api/getpageinfo.java @@ -26,19 +26,21 @@ import java.io.IOException; import java.net.MalformedURLException; +import java.util.Collection; import java.util.Set; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; -import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.document.id.AnchorURL; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.robots.RobotsTxtEntry; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.search.Switchboard; import net.yacy.server.serverObjects; @@ -90,9 +92,9 @@ public class 
getpageinfo { url = "http://" + url; } if (actions.indexOf("title",0) >= 0) { - DigestURI u = null; + DigestURL u = null; try { - u = new DigestURI(url); + u = new DigestURL(url); } catch (final MalformedURLException e) { ConcurrentLog.logException(e); } @@ -129,11 +131,11 @@ public class getpageinfo { prop.putXML("lang", (languages == null || languages.size() == 0) ? "unknown" : languages.iterator().next()); // get links and put them into a semicolon-separated list - final Set uris = scraper.getAnchors().keySet(); + final Collection uris = scraper.getAnchors(); final StringBuilder links = new StringBuilder(uris.size() * 80); final StringBuilder filter = new StringBuilder(uris.size() * 40); count = 0; - for (final DigestURI uri: uris) { + for (final DigestURL uri: uris) { if (uri == null) continue; links.append(';').append(uri.toNormalform(true)); filter.append('|').append(uri.getProtocol()).append("://").append(uri.getHost()).append(".*"); @@ -147,7 +149,7 @@ public class getpageinfo { } if (actions.indexOf("robots",0) >= 0) { try { - final DigestURI theURL = new DigestURI(url); + final DigestURL theURL = new DigestURL(url); // determine if crawling of the current URL is allowed RobotsTxtEntry robotsEntry = sb.robots.getEntry(theURL, agent); @@ -155,7 +157,7 @@ public class getpageinfo { prop.putHTML("robotsInfo", robotsEntry == null ? "" : robotsEntry.getInfo()); // get the sitemap URL of the domain - final MultiProtocolURI sitemapURL = robotsEntry == null ? null : robotsEntry.getSitemap(); + final MultiProtocolURL sitemapURL = robotsEntry == null ? null : robotsEntry.getSitemap(); prop.putXML("sitemap", sitemapURL == null ? 
"" : sitemapURL.toString()); } catch (final MalformedURLException e) { ConcurrentLog.logException(e); @@ -163,7 +165,7 @@ public class getpageinfo { } if (actions.indexOf("oai",0) >= 0) { try { - final DigestURI theURL = new DigestURI(url + final DigestURL theURL = new DigestURL(url + "?verb=Identify"); final String oairesult = checkOAI(theURL.toString()); diff --git a/htroot/api/getpageinfo_p.java b/htroot/api/getpageinfo_p.java index 7b35f7e43..6981d9397 100644 --- a/htroot/api/getpageinfo_p.java +++ b/htroot/api/getpageinfo_p.java @@ -26,19 +26,21 @@ import java.io.IOException; import java.net.MalformedURLException; +import java.util.Collection; import java.util.Set; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; -import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.document.id.AnchorURL; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.robots.RobotsTxtEntry; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.search.Switchboard; import net.yacy.server.serverObjects; @@ -88,9 +90,9 @@ public class getpageinfo_p { url = "http://" + url; } if (actions.indexOf("title",0) >= 0) { - DigestURI u = null; + DigestURL u = null; try { - u = new DigestURI(url); + u = new DigestURL(url); } catch (final MalformedURLException e) { ConcurrentLog.logException(e); } @@ -128,11 +130,11 @@ public class getpageinfo_p { prop.putXML("lang", (languages == null || languages.size() == 0) ? 
"unknown" : languages.iterator().next()); // get links and put them into a semicolon-separated list - final Set uris = scraper.getAnchors().keySet(); + final Collection uris = scraper.getAnchors(); final StringBuilder links = new StringBuilder(uris.size() * 80); final StringBuilder filter = new StringBuilder(uris.size() * 40); count = 0; - for (final DigestURI uri: uris) { + for (final DigestURL uri: uris) { if (uri == null) continue; links.append(';').append(uri.toNormalform(true)); filter.append('|').append(uri.getProtocol()).append("://").append(uri.getHost()).append(".*"); @@ -146,7 +148,7 @@ public class getpageinfo_p { } if (actions.indexOf("robots",0) >= 0) { try { - final DigestURI theURL = new DigestURI(url); + final DigestURL theURL = new DigestURL(url); // determine if crawling of the current URL is allowed ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName)); @@ -156,7 +158,7 @@ public class getpageinfo_p { prop.putHTML("robotsInfo", robotsEntry == null ? "" : robotsEntry.getInfo()); // get the sitemap URL of the domain - final MultiProtocolURI sitemapURL = robotsEntry == null ? null : robotsEntry.getSitemap(); + final MultiProtocolURL sitemapURL = robotsEntry == null ? null : robotsEntry.getSitemap(); prop.putXML("sitemap", sitemapURL == null ? 
"" : sitemapURL.toString()); } catch (final MalformedURLException e) { ConcurrentLog.logException(e); @@ -164,7 +166,7 @@ public class getpageinfo_p { } if (actions.indexOf("oai",0) >= 0) { try { - final DigestURI theURL = new DigestURI(url + final DigestURL theURL = new DigestURL(url + "?verb=Identify"); final String oairesult = checkOAI(theURL.toString()); diff --git a/htroot/api/table_p.java b/htroot/api/table_p.java index 819172fd4..2df19a36d 100644 --- a/htroot/api/table_p.java +++ b/htroot/api/table_p.java @@ -24,7 +24,7 @@ import java.util.Iterator; import java.util.Map; import java.util.regex.Pattern; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.SpaceExceededException; diff --git a/htroot/api/termlist_p.java b/htroot/api/termlist_p.java index 6a342c8fb..4d420a650 100644 --- a/htroot/api/termlist_p.java +++ b/htroot/api/termlist_p.java @@ -25,7 +25,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; -import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.sorting.Rating; import net.yacy.cora.util.ConcurrentLog; diff --git a/htroot/api/webstructure.java b/htroot/api/webstructure.java index 94731fade..e8a9efde4 100644 --- a/htroot/api/webstructure.java +++ b/htroot/api/webstructure.java @@ -29,14 +29,14 @@ import java.util.Iterator; import java.util.Map; import net.yacy.cora.date.GenericFormatter; -import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.order.Base64Order; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.ConcurrentLog; import 
net.yacy.kelondro.data.citation.CitationReference; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.rwi.IndexCell; import net.yacy.kelondro.rwi.ReferenceContainer; import net.yacy.peers.graphics.WebStructureGraph; @@ -56,7 +56,7 @@ public class webstructure { prop.put("citations", 0); boolean authenticated = sb.adminAuthenticated(header) >= 2; if (about != null) { - DigestURI url = null; + DigestURL url = null; byte[] urlhash = null; String hosthash = null; if (about.length() == 6 && Base64Order.enhancedCoder.wellformed(ASCII.getBytes(about))) { @@ -68,7 +68,7 @@ public class webstructure { } else if (authenticated && about.length() > 0) { // consider "about" as url or hostname try { - url = new DigestURI(about.indexOf("://") >= 0 ? about : "http://" + about); // accept also domains + url = new DigestURL(about.indexOf("://") >= 0 ? about : "http://" + about); // accept also domains urlhash = url.hash(); hosthash = ASCII.String(urlhash, 6, 6); } catch (final MalformedURLException e) { @@ -111,18 +111,18 @@ public class webstructure { prop.put("references_documents_0_urle", url == null ? 
0 : 1); if (url != null) prop.putXML("references_documents_0_urle_url", url.toNormalform(true)); int d = 0; - Iterator i = scraper.inboundLinks().iterator(); + Iterator i = scraper.inboundLinks().keySet().iterator(); while (i.hasNext()) { - DigestURI refurl = i.next(); + DigestURL refurl = i.next(); byte[] refhash = refurl.hash(); prop.putXML("references_documents_0_anchors_" + d + "_url", refurl.toNormalform(true)); prop.put("references_documents_0_anchors_" + d + "_hash", refhash); prop.put("references_documents_0_anchors_" + d + "_outbound", 0); d++; } - i = scraper.outboundLinks().iterator(); + i = scraper.outboundLinks().keySet().iterator(); while (i.hasNext()) { - DigestURI refurl = i.next(); + DigestURL refurl = i.next(); byte[] refhash = refurl.hash(); prop.putXML("references_documents_0_anchors_" + d + "_url", refurl.toNormalform(true)); prop.put("references_documents_0_anchors_" + d + "_hash", refhash); @@ -158,7 +158,7 @@ public class webstructure { while (i.hasNext()) { CitationReference cr = i.next(); byte[] refhash = cr.urlhash(); - DigestURI refurl = authenticated ? sb.getURL(refhash) : null; + DigestURL refurl = authenticated ? sb.getURL(refhash) : null; prop.put("citations_documents_0_anchors_" + d + "_urle", refurl == null ? 
0 : 1); if (refurl != null) prop.putXML("citations_documents_0_anchors_" + d + "_urle_url", refurl.toNormalform(true)); prop.put("citations_documents_0_anchors_" + d + "_urle_hash", refhash); diff --git a/htroot/api/yacydoc.java b/htroot/api/yacydoc.java index 2470049fb..e394fe1a1 100644 --- a/htroot/api/yacydoc.java +++ b/htroot/api/yacydoc.java @@ -30,13 +30,13 @@ import java.util.Arrays; import java.util.Iterator; import net.yacy.cora.date.ISO8601Formatter; -import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.lod.JenaTripleStore; import net.yacy.cora.lod.vocabulary.YaCyMetadata; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader.FileType; import net.yacy.cora.util.ConcurrentLog; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.word.Word; import net.yacy.search.Switchboard; @@ -90,7 +90,7 @@ public class yacydoc { if (urlstring.length() > 0 && urlhash.isEmpty()) { try { - final DigestURI url = new DigestURI(urlstring); + final DigestURL url = new DigestURL(urlstring); urlhash = ASCII.String(url.hash()); } catch (final MalformedURLException e) { ConcurrentLog.logException(e); diff --git a/htroot/api/ymarks/add_ymark.java b/htroot/api/ymarks/add_ymark.java index c6ce581c7..b001d0a59 100644 --- a/htroot/api/ymarks/add_ymark.java +++ b/htroot/api/ymarks/add_ymark.java @@ -1,5 +1,6 @@ import java.io.IOException; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.ConcurrentLog; @@ -8,7 +9,6 @@ import net.yacy.data.ymark.YMarkEntry; import net.yacy.data.ymark.YMarkTables; import net.yacy.data.ymark.YMarkUtil; import net.yacy.document.Parser.Failure; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.search.Switchboard; import 
net.yacy.server.serverObjects; import net.yacy.server.serverSwitch; @@ -34,7 +34,7 @@ public class add_ymark { if (post.containsKey("urlHash")) { final String urlHash = post.get("urlHash",YMarkUtil.EMPTY_STRING); - final DigestURI url = sb.index.fulltext().getURL(urlHash.getBytes()); + final DigestURL url = sb.index.fulltext().getURL(urlHash.getBytes()); final String folders = post.get(YMarkEntry.BOOKMARK.FOLDERS.key(),YMarkEntry.BOOKMARK.FOLDERS.deflt()); final String tags = post.get(YMarkEntry.BOOKMARK.TAGS.key(),YMarkUtil.EMPTY_STRING); try { diff --git a/htroot/api/ymarks/get_metadata.java b/htroot/api/ymarks/get_metadata.java index aaae81ecc..c161a1e1e 100644 --- a/htroot/api/ymarks/get_metadata.java +++ b/htroot/api/ymarks/get_metadata.java @@ -4,6 +4,7 @@ import java.util.EnumMap; import java.util.Iterator; import java.util.regex.Pattern; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.ConcurrentLog; @@ -16,7 +17,6 @@ import net.yacy.data.ymark.YMarkTables; import net.yacy.data.ymark.YMarkUtil; import net.yacy.document.Document; import net.yacy.document.Parser.Failure; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.search.Switchboard; import net.yacy.server.serverObjects; import net.yacy.server.serverSwitch; @@ -49,7 +49,7 @@ public class get_metadata { } try { - final YMarkMetadata meta = new YMarkMetadata(new DigestURI(url), sb.index); + final YMarkMetadata meta = new YMarkMetadata(new DigestURL(url), sb.index); ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName)); final Document document = meta.loadDocument(sb.loader, agent); final EnumMap metadata = meta.loadMetadata(); diff --git a/htroot/api/ymarks/get_treeview.java b/htroot/api/ymarks/get_treeview.java index 5ef79a4e6..300cc3a15 100644 --- a/htroot/api/ymarks/get_treeview.java +++ 
b/htroot/api/ymarks/get_treeview.java @@ -7,7 +7,8 @@ import java.util.TreeMap; import java.util.regex.Pattern; import net.yacy.cora.date.ISO8601Formatter; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.ConcurrentLog; @@ -23,7 +24,6 @@ import net.yacy.data.ymark.YMarkUtil; import net.yacy.document.Document; import net.yacy.document.Parser.Failure; import net.yacy.kelondro.blob.Tables; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.search.Switchboard; import net.yacy.server.serverObjects; import net.yacy.server.serverSwitch; @@ -212,7 +212,7 @@ public class get_treeview { } } else if (isAutoTagger || isMetadata || isURLdb || isCrawlStart) { try { - final YMarkMetadata meta = new YMarkMetadata(new DigestURI(post.get(ROOT).substring(2)), sb.index); + final YMarkMetadata meta = new YMarkMetadata(new DigestURL(post.get(ROOT).substring(2)), sb.index); ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName)); final Document document = meta.loadDocument(sb.loader, agent); final TreeMap tags = sb.tables.bookmarks.getTags(bmk_user); diff --git a/htroot/api/ymarks/get_xbel.java b/htroot/api/ymarks/get_xbel.java index 3ed5c6e05..c47809546 100644 --- a/htroot/api/ymarks/get_xbel.java +++ b/htroot/api/ymarks/get_xbel.java @@ -2,7 +2,7 @@ import java.io.IOException; import java.util.HashSet; import java.util.Iterator; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.ConcurrentLog; import net.yacy.data.UserDB; diff --git a/htroot/api/ymarks/get_ymark.java b/htroot/api/ymarks/get_ymark.java index 5560f2010..e2d2956cb 100644 --- a/htroot/api/ymarks/get_ymark.java +++ 
b/htroot/api/ymarks/get_ymark.java @@ -3,7 +3,7 @@ import java.util.Collection; import java.util.Iterator; import java.util.regex.Pattern; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.ConcurrentLog; import net.yacy.data.UserDB; diff --git a/htroot/api/ymarks/import_ymark.java b/htroot/api/ymarks/import_ymark.java index 8a21f8070..d10e698c3 100644 --- a/htroot/api/ymarks/import_ymark.java +++ b/htroot/api/ymarks/import_ymark.java @@ -12,7 +12,7 @@ import java.util.concurrent.ArrayBlockingQueue; import java.util.regex.Pattern; import java.util.zip.GZIPInputStream; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.ConcurrentLog; diff --git a/htroot/cytag.java b/htroot/cytag.java index 5d2954413..d3b6eaded 100644 --- a/htroot/cytag.java +++ b/htroot/cytag.java @@ -28,8 +28,9 @@ import java.awt.Image; import java.io.File; import java.io.IOException; + import net.yacy.cora.date.GenericFormatter; -import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; import net.yacy.document.ImageParser; @@ -45,7 +46,7 @@ public class cytag { public static Image respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { final Switchboard sb = (Switchboard)env; - final MultiProtocolURI referer = header.referer(); + final MultiProtocolURL referer = header.referer(); // harvest request information StringBuilder connect = new StringBuilder(); diff --git a/htroot/gsa/searchresult.java b/htroot/gsa/searchresult.java index 6ccf143ee..1a145cf94 100644 --- a/htroot/gsa/searchresult.java +++ b/htroot/gsa/searchresult.java @@ -25,7 +25,7 @@ import java.io.Writer; import 
java.util.Date; import net.yacy.cora.date.ISO8601Formatter; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.federate.solr.Ranking; import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector; import net.yacy.cora.federate.solr.responsewriter.GSAResponseWriter; diff --git a/htroot/interaction/GetRDF.java b/htroot/interaction/GetRDF.java index 4c704c7df..679770db4 100644 --- a/htroot/interaction/GetRDF.java +++ b/htroot/interaction/GetRDF.java @@ -32,7 +32,7 @@ package interaction; import java.io.ByteArrayOutputStream; import java.io.UnsupportedEncodingException; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.lod.JenaTripleStore; import net.yacy.cora.protocol.RequestHeader; import net.yacy.interaction.Interaction; diff --git a/htroot/mediawiki_p.java b/htroot/mediawiki_p.java index e2d82500b..3a6bf3864 100644 --- a/htroot/mediawiki_p.java +++ b/htroot/mediawiki_p.java @@ -27,7 +27,7 @@ import java.io.File; import java.io.IOException; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.protocol.RequestHeader; import net.yacy.document.importer.MediawikiImporter; import net.yacy.search.Switchboard; diff --git a/htroot/rct_p.java b/htroot/rct_p.java index b8c7efb66..8c05077d5 100644 --- a/htroot/rct_p.java +++ b/htroot/rct_p.java @@ -29,11 +29,11 @@ import java.net.MalformedURLException; import java.util.Date; import java.util.Iterator; -import net.yacy.cora.document.Hit; -import net.yacy.cora.document.RSSFeed; +import net.yacy.cora.document.feed.Hit; +import net.yacy.cora.document.feed.RSSFeed; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.protocol.RequestHeader; import net.yacy.crawler.retrieval.Request; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.peers.DHTSelection; import net.yacy.peers.Protocol; import net.yacy.peers.Seed; @@ -58,15 +58,15 @@ public class 
rct_p { //System.out.println("URL=" + item.getLink() + ", desc=" + item.getDescription() + ", pubDate=" + item.getPubDate()); // put url on remote crawl stack - DigestURI url; + DigestURL url; try { - url = new DigestURI(item.getLink()); + url = new DigestURL(item.getLink()); } catch (final MalformedURLException e) { url = null; } Date loaddate; loaddate = item.getPubDate(); - final DigestURI referrer = null; // referrer needed! + final DigestURL referrer = null; // referrer needed! final String urlRejectReason = sb.crawlStacker.urlInAcceptedDomain(url); if (urlRejectReason == null) { // stack url @@ -101,7 +101,7 @@ public class rct_p { * @param url * @return */ - private static String urlToString(final DigestURI url) { + private static String urlToString(final DigestURL url) { return (url == null ? "null" : url.toNormalform(true)); } diff --git a/htroot/sharedBlacklist_p.java b/htroot/sharedBlacklist_p.java index 86430b980..43c56b6b0 100644 --- a/htroot/sharedBlacklist_p.java +++ b/htroot/sharedBlacklist_p.java @@ -38,14 +38,14 @@ import java.util.Iterator; import java.util.List; import java.util.Set; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.RequestHeader; import net.yacy.data.ListManager; import net.yacy.data.list.ListAccumulator; import net.yacy.data.list.XMLBlacklistImporter; import net.yacy.document.parser.html.CharacterCoding; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.FileUtils; import net.yacy.peers.Seed; import net.yacy.repository.Blacklist; @@ -137,7 +137,7 @@ public class sharedBlacklist_p { // download the blacklist try { // get List - final DigestURI u = new DigestURI(downloadURLOld); + final DigestURL u = new DigestURL(downloadURLOld); otherBlacklist = FileUtils.strings(u.get(agent)); } catch (final Exception e) { @@ -155,7 +155,7 @@ public class 
sharedBlacklist_p { prop.putHTML("page_source", downloadURL); try { - final DigestURI u = new DigestURI(downloadURL); + final DigestURL u = new DigestURL(downloadURL); otherBlacklist = FileUtils.strings(u.get(agent)); } catch (final Exception e) { prop.put("status", STATUS_URL_PROBLEM); diff --git a/htroot/solr/select.java b/htroot/solr/select.java index 3d5a1e402..69df3e110 100644 --- a/htroot/solr/select.java +++ b/htroot/solr/select.java @@ -27,7 +27,7 @@ import java.util.Map; import javax.servlet.ServletException; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.federate.solr.Ranking; import net.yacy.cora.federate.solr.SolrServlet; import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector; diff --git a/htroot/yacy/crawlReceipt.java b/htroot/yacy/crawlReceipt.java index d83aa38d6..a3a3318e6 100644 --- a/htroot/yacy/crawlReceipt.java +++ b/htroot/yacy/crawlReceipt.java @@ -29,7 +29,7 @@ import java.io.IOException; -import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.data.ResultURLs; diff --git a/htroot/yacy/idx.java b/htroot/yacy/idx.java index e5c1afd04..7662a6b08 100644 --- a/htroot/yacy/idx.java +++ b/htroot/yacy/idx.java @@ -24,7 +24,7 @@ import java.util.Iterator; -import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.protocol.RequestHeader; import net.yacy.kelondro.rwi.ReferenceContainer; import net.yacy.kelondro.rwi.ReferenceContainerCache; diff --git a/htroot/yacy/message.java b/htroot/yacy/message.java index 83cbde532..3544c4186 100644 --- a/htroot/yacy/message.java +++ b/htroot/yacy/message.java @@ -35,7 +35,7 @@ import java.text.SimpleDateFormat; import java.util.Date; import java.util.Locale; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import 
net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index 456b4a898..77316289d 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -36,10 +36,11 @@ import java.util.Map; import java.util.TreeMap; import java.util.TreeSet; -import net.yacy.cora.document.ASCII; import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.document.analysis.Classification.ContentDomain; -import net.yacy.cora.document.RSSMessage; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.feed.RSSMessage; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.lod.vocabulary.Tagging; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.HeaderFramework; @@ -50,7 +51,6 @@ import net.yacy.cora.sorting.WeakPriorityBlockingQueue; import net.yacy.cora.storage.HandleSet; import net.yacy.cora.util.ByteBuffer; import net.yacy.cora.util.SpaceExceededException; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.data.word.WordReferenceFactory; import net.yacy.kelondro.data.word.WordReferenceRow; @@ -241,7 +241,7 @@ public final class search { null, false, null, - DigestURI.TLD_any_zone_filter, + DigestURL.TLD_any_zone_filter, client, false, indexSegment, @@ -305,7 +305,7 @@ public final class search { constraint, false, null, - DigestURI.TLD_any_zone_filter, + DigestURL.TLD_any_zone_filter, client, false, sb.index, diff --git a/htroot/yacy/transferRWI.java b/htroot/yacy/transferRWI.java index 436ed9ddc..bd6f18e67 100644 --- a/htroot/yacy/transferRWI.java +++ b/htroot/yacy/transferRWI.java @@ -32,9 +32,9 @@ import java.util.HashSet; import java.util.Iterator; import java.util.Set; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.RSSMessage; -import net.yacy.cora.document.UTF8; +import 
net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.feed.RSSMessage; import net.yacy.cora.federate.yacy.Distribution; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; diff --git a/htroot/yacy/transferURL.java b/htroot/yacy/transferURL.java index a010a64b8..f28fe9137 100644 --- a/htroot/yacy/transferURL.java +++ b/htroot/yacy/transferURL.java @@ -33,8 +33,8 @@ import java.util.Map; import java.util.Set; import net.yacy.cora.date.GenericFormatter; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.RSSMessage; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.feed.RSSMessage; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.data.ResultURLs; diff --git a/htroot/yacy/urls.java b/htroot/yacy/urls.java index 3fa8cdd00..f14e71225 100644 --- a/htroot/yacy/urls.java +++ b/htroot/yacy/urls.java @@ -28,12 +28,12 @@ import java.io.IOException; import java.util.Date; import net.yacy.cora.date.GenericFormatter; -import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.protocol.RequestHeader; import net.yacy.crawler.data.NoticedURL; import net.yacy.crawler.data.ZURL.FailCategory; import net.yacy.crawler.retrieval.Request; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.peers.Protocol; import net.yacy.search.Switchboard; @@ -65,7 +65,7 @@ public class urls { final long timeout = System.currentTimeMillis() + maxTime; int c = 0; Request entry; - DigestURI referrer; + DigestURL referrer; while ((maxCount > 0) && (System.currentTimeMillis() < timeout) && (sb.crawlQueues.noticeURL.stackSize(stackType) > 0)) { @@ -112,7 +112,7 @@ public class urls { final int count = urlhashes.length() / 12; int c = 0; URIMetadataNode 
entry; - DigestURI referrer; + DigestURL referrer; for (int i = 0; i < count; i++) { entry = sb.index.fulltext().getMetadata(ASCII.getBytes(urlhashes.substring(12 * i, 12 * (i + 1)))); if (entry == null) continue; diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 80e55f630..6e028f53e 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -42,9 +42,9 @@ import java.util.regex.PatternSyntaxException; import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.document.analysis.Classification.ContentDomain; -import net.yacy.cora.document.RSSMessage; -import net.yacy.cora.document.UTF8; - +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.feed.RSSMessage; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.federate.opensearch.OpenSearchConnector; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.geo.GeoLocation; @@ -62,7 +62,6 @@ import net.yacy.document.Condenser; import net.yacy.document.Document; import net.yacy.document.LibraryProvider; import net.yacy.document.Parser; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.util.Bitfield; import net.yacy.kelondro.util.Formatter; @@ -606,7 +605,7 @@ public class yacysearch { return prop; } final String bookmarkHash = post.get("bookmarkref", ""); // urlhash - final DigestURI url = indexSegment.fulltext().getURL(UTF8.getBytes(bookmarkHash)); + final DigestURL url = indexSegment.fulltext().getURL(UTF8.getBytes(bookmarkHash)); if ( url != null ) { try { sb.tables.bookmarks.createBookmark( @@ -658,8 +657,8 @@ public class yacysearch { clustersearch && global ? QueryParams.Searchdom.CLUSTER : (global && indexReceiveGranted ? 
QueryParams.Searchdom.GLOBAL : QueryParams.Searchdom.LOCAL), constraint, true, - DigestURI.hosthashess(sb.getConfig("search.excludehosth", "")), - DigestURI.TLD_any_zone_filter, + DigestURL.hosthashess(sb.getConfig("search.excludehosth", "")), + DigestURL.TLD_any_zone_filter, client, authenticated, indexSegment, diff --git a/htroot/yacysearch_location.java b/htroot/yacysearch_location.java index 3112a7aaf..7a55b606d 100644 --- a/htroot/yacysearch_location.java +++ b/htroot/yacysearch_location.java @@ -23,7 +23,7 @@ import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.TimeUnit; -import net.yacy.cora.document.RSSMessage; +import net.yacy.cora.document.feed.RSSMessage; import net.yacy.cora.federate.opensearch.SRURSSConnector; import net.yacy.cora.geo.GeoLocation; import net.yacy.cora.protocol.Domains; diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java index f37897e14..570dc5d2a 100644 --- a/htroot/yacysearchitem.java +++ b/htroot/yacysearchitem.java @@ -28,11 +28,12 @@ import java.net.MalformedURLException; import java.util.List; import net.yacy.cora.date.GenericFormatter; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.MultiProtocolURI; -import net.yacy.cora.document.RSSMessage; import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.document.analysis.Classification.ContentDomain; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.feed.RSSMessage; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.HeaderFramework; @@ -41,7 +42,6 @@ import net.yacy.cora.protocol.RequestHeader.FileType; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.data.Cache; import net.yacy.data.URLLicense; -import net.yacy.kelondro.data.meta.DigestURI; import 
net.yacy.kelondro.util.Formatter; import net.yacy.peers.NewsPool; import net.yacy.peers.Seed; @@ -116,13 +116,13 @@ public class yacysearchitem { final ResultEntry result = theSearch.oneResult(item, timeout); if (result == null) return prop; // no content final String resultUrlstring = result.urlstring(); - final DigestURI resultURL = result.url(); + final DigestURL resultURL = result.url(); final String target = sb.getConfig(resultUrlstring.matches(target_special_pattern) ? SwitchboardConstants.SEARCH_TARGET_SPECIAL : SwitchboardConstants.SEARCH_TARGET_DEFAULT, "_self"); final int port = resultURL.getPort(); - DigestURI faviconURL = null; + DigestURL faviconURL = null; if ((fileType == FileType.HTML || fileType == FileType.JSON) && !sb.isIntranetMode()) try { - faviconURL = new DigestURI(resultURL.getProtocol() + "://" + resultURL.getHost() + ((port != -1) ? (":" + port) : "") + "/favicon.ico"); + faviconURL = new DigestURL(resultURL.getProtocol() + "://" + resultURL.getHost() + ((port != -1) ? 
(":" + port) : "") + "/favicon.ico"); } catch (final MalformedURLException e1) { ConcurrentLog.logException(e1); faviconURL = null; @@ -166,7 +166,7 @@ public class yacysearchitem { // check if url is allowed to view if (sb.getConfig("proxyURL.rewriteURLs", "all").equals("domainlist")) { try { - if (sb.crawlStacker.urlInAcceptedDomain(new DigestURI (modifyURL)) == null) { + if (sb.crawlStacker.urlInAcceptedDomain(new DigestURL (modifyURL)) == null) { modifyURL = "./proxy.html?url="+modifyURL; } } catch (final MalformedURLException e) { @@ -177,7 +177,7 @@ public class yacysearchitem { if (sb.getConfig("proxyURL.rewriteURLs", "all").equals("yacy")) { try { - if ((new DigestURI (modifyURL).getHost().endsWith(".yacy"))) { + if ((new DigestURL (modifyURL).getHost().endsWith(".yacy"))) { modifyURL = "./proxy.html?url="+modifyURL; } } catch (final MalformedURLException e) { @@ -245,7 +245,7 @@ public class yacysearchitem { prop.put("content_heuristic_name", heuristic.heuristicName); } EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(theSearch.query.id(true), SearchEventType.FINALIZATION, "" + item, 0, 0), false); - final String ext = MultiProtocolURI.getFileExtension(resultFileName).toLowerCase(); + final String ext = MultiProtocolURL.getFileExtension(resultFileName).toLowerCase(); if (ext.equals("png") || ext.equals("jpg") || ext.equals("gif")) { final String license = URLLicense.aquireLicense(resultURL); prop.put("content_code", license); diff --git a/htroot/yacysearchtrailer.java b/htroot/yacysearchtrailer.java index 3214e666d..ac576f339 100644 --- a/htroot/yacysearchtrailer.java +++ b/htroot/yacysearchtrailer.java @@ -27,8 +27,8 @@ import java.util.Iterator; import java.util.Map; -import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.analysis.Classification; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.lod.vocabulary.Tagging; import net.yacy.cora.protocol.RequestHeader; import 
net.yacy.cora.sorting.ScoreMap; @@ -346,7 +346,7 @@ public class yacysearchtrailer { if (count == 0) { break; } - nav = "%2Fvocabulary%2F" + navname + "%2F" + MultiProtocolURI.escape(Tagging.encodePrintname(name)).toString(); + nav = "%2Fvocabulary%2F" + navname + "%2F" + MultiProtocolURL.escape(Tagging.encodePrintname(name)).toString(); queryStringForUrl = theSearch.query.getQueryGoal().getOriginalQueryString(true); p = queryStringForUrl.indexOf(nav); if (p < 0) { diff --git a/source/net/yacy/contentcontrol/SMWListSyncThread.java b/source/net/yacy/contentcontrol/SMWListSyncThread.java index 97ef5989d..b4d211155 100644 --- a/source/net/yacy/contentcontrol/SMWListSyncThread.java +++ b/source/net/yacy/contentcontrol/SMWListSyncThread.java @@ -5,7 +5,7 @@ import java.io.InputStreamReader; import java.net.MalformedURLException; import java.net.URL; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.http.HTTPClient; import net.yacy.cora.util.ConcurrentLog; diff --git a/source/net/yacy/cora/document/analysis/Classification.java b/source/net/yacy/cora/document/analysis/Classification.java index 61bfd5e31..470c1e98f 100644 --- a/source/net/yacy/cora/document/analysis/Classification.java +++ b/source/net/yacy/cora/document/analysis/Classification.java @@ -28,7 +28,7 @@ import java.util.Map.Entry; import java.util.Properties; import java.util.Set; -import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.util.CommonPattern; public class Classification { @@ -200,11 +200,11 @@ public class Classification { return ext == null ? "application/octet-stream" : mimeTable.getProperty(ext.toLowerCase(), dfltMime); } - public static String url2mime(final MultiProtocolURI url, final String dfltMime) { - return url == null ? 
"application/octet-stream" : ext2mime(MultiProtocolURI.getFileExtension(url.getFileName()), dfltMime); + public static String url2mime(final MultiProtocolURL url, final String dfltMime) { + return url == null ? "application/octet-stream" : ext2mime(MultiProtocolURL.getFileExtension(url.getFileName()), dfltMime); } - public static String url2mime(final MultiProtocolURI url) { - return url == null ? "application/octet-stream" : ext2mime(MultiProtocolURI.getFileExtension(url.getFileName())); + public static String url2mime(final MultiProtocolURL url) { + return url == null ? "application/octet-stream" : ext2mime(MultiProtocolURL.getFileExtension(url.getFileName())); } } diff --git a/source/net/yacy/cora/document/ASCII.java b/source/net/yacy/cora/document/encoding/ASCII.java similarity index 99% rename from source/net/yacy/cora/document/ASCII.java rename to source/net/yacy/cora/document/encoding/ASCII.java index 578616fa6..fe4d88b07 100644 --- a/source/net/yacy/cora/document/ASCII.java +++ b/source/net/yacy/cora/document/encoding/ASCII.java @@ -24,7 +24,7 @@ * If not, see . */ -package net.yacy.cora.document; +package net.yacy.cora.document.encoding; import java.util.Comparator; diff --git a/source/net/yacy/cora/document/UTF8.java b/source/net/yacy/cora/document/encoding/UTF8.java similarity index 99% rename from source/net/yacy/cora/document/UTF8.java rename to source/net/yacy/cora/document/encoding/UTF8.java index 4f098402f..1d6de94a1 100644 --- a/source/net/yacy/cora/document/UTF8.java +++ b/source/net/yacy/cora/document/encoding/UTF8.java @@ -22,7 +22,7 @@ * If not, see . 
*/ -package net.yacy.cora.document; +package net.yacy.cora.document.encoding; import java.io.UnsupportedEncodingException; import java.nio.charset.Charset; diff --git a/source/net/yacy/cora/document/Channel.java b/source/net/yacy/cora/document/feed/Channel.java similarity index 97% rename from source/net/yacy/cora/document/Channel.java rename to source/net/yacy/cora/document/feed/Channel.java index 337284af6..2200ae3ff 100644 --- a/source/net/yacy/cora/document/Channel.java +++ b/source/net/yacy/cora/document/feed/Channel.java @@ -24,7 +24,8 @@ * If not, see . */ -package net.yacy.cora.document; +package net.yacy.cora.document.feed; + public interface Channel extends Iterable { diff --git a/source/net/yacy/cora/document/Channels.java b/source/net/yacy/cora/document/feed/Channels.java similarity index 96% rename from source/net/yacy/cora/document/Channels.java rename to source/net/yacy/cora/document/feed/Channels.java index f2cb6f58f..d601486f1 100644 --- a/source/net/yacy/cora/document/Channels.java +++ b/source/net/yacy/cora/document/feed/Channels.java @@ -24,7 +24,7 @@ * If not, see . */ -package net.yacy.cora.document; +package net.yacy.cora.document.feed; public class Channels { diff --git a/source/net/yacy/cora/document/Hit.java b/source/net/yacy/cora/document/feed/Hit.java similarity index 98% rename from source/net/yacy/cora/document/Hit.java rename to source/net/yacy/cora/document/feed/Hit.java index a8b5bc2b0..51a8afb23 100644 --- a/source/net/yacy/cora/document/Hit.java +++ b/source/net/yacy/cora/document/feed/Hit.java @@ -24,7 +24,7 @@ * If not, see . 
*/ -package net.yacy.cora.document; +package net.yacy.cora.document.feed; import java.util.Date; import java.util.List; diff --git a/source/net/yacy/cora/document/RSSFeed.java b/source/net/yacy/cora/document/feed/RSSFeed.java similarity index 92% rename from source/net/yacy/cora/document/RSSFeed.java rename to source/net/yacy/cora/document/feed/RSSFeed.java index e39a66113..d0ab89f56 100644 --- a/source/net/yacy/cora/document/RSSFeed.java +++ b/source/net/yacy/cora/document/feed/RSSFeed.java @@ -18,7 +18,7 @@ * If not, see . */ -package net.yacy.cora.document; +package net.yacy.cora.document.feed; import java.net.MalformedURLException; import java.util.Collections; @@ -29,6 +29,7 @@ import java.util.LinkedHashMap; import java.util.Map; import java.util.Set; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.util.ConcurrentLog; public class RSSFeed implements Iterable { @@ -53,11 +54,11 @@ public class RSSFeed implements Iterable { * @param links * @param source */ - public RSSFeed(Set links, String source) { + public RSSFeed(Set links, String source) { this(Integer.MAX_VALUE); String u; RSSMessage message; - for (MultiProtocolURI uri: links) { + for (MultiProtocolURL uri: links) { u = uri.toNormalform(true); message = new RSSMessage(u, "", u); message.setAuthor(source); @@ -81,10 +82,10 @@ public class RSSFeed implements Iterable { return this.imageURL; } - public Set getLinks() { - Set links = new HashSet(); + public Set getLinks() { + Set links = new HashSet(); for (RSSMessage message: this.messages.values()) { - try {links.add(new MultiProtocolURI(message.getLink()));} catch (final MalformedURLException e) {} + try {links.add(new MultiProtocolURL(message.getLink()));} catch (final MalformedURLException e) {} } return links; } diff --git a/source/net/yacy/cora/document/RSSMessage.java b/source/net/yacy/cora/document/feed/RSSMessage.java similarity index 98% rename from source/net/yacy/cora/document/RSSMessage.java rename to 
source/net/yacy/cora/document/feed/RSSMessage.java index 1904d4ccc..ee39fe545 100644 --- a/source/net/yacy/cora/document/RSSMessage.java +++ b/source/net/yacy/cora/document/feed/RSSMessage.java @@ -22,7 +22,7 @@ * If not, see . */ -package net.yacy.cora.document; +package net.yacy.cora.document.feed; import java.text.ParseException; import java.util.ArrayList; @@ -37,6 +37,7 @@ import java.util.Set; import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.date.ISO8601Formatter; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.lod.vocabulary.DublinCore; import net.yacy.cora.lod.vocabulary.Geo; import net.yacy.cora.protocol.HeaderFramework; @@ -123,7 +124,7 @@ public class RSSMessage implements Hit, Comparable, Comparator(); if (title.length() > 0) this.map.put(Token.title.name(), title); if (description.length() > 0) this.map.put(Token.description.name(), description); diff --git a/source/net/yacy/cora/document/RSSReader.java b/source/net/yacy/cora/document/feed/RSSReader.java similarity index 98% rename from source/net/yacy/cora/document/RSSReader.java rename to source/net/yacy/cora/document/feed/RSSReader.java index b89a96a31..fdf2919b8 100644 --- a/source/net/yacy/cora/document/RSSReader.java +++ b/source/net/yacy/cora/document/feed/RSSReader.java @@ -18,7 +18,7 @@ * If not, see . 
*/ -package net.yacy.cora.document; +package net.yacy.cora.document.feed; import java.io.BufferedInputStream; import java.io.ByteArrayInputStream; @@ -30,7 +30,8 @@ import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; -import net.yacy.cora.document.RSSMessage.Token; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.feed.RSSMessage.Token; import org.xml.sax.Attributes; import org.xml.sax.EntityResolver; diff --git a/source/net/yacy/cora/document/id/AnchorURL.java b/source/net/yacy/cora/document/id/AnchorURL.java new file mode 100644 index 000000000..d9c01065f --- /dev/null +++ b/source/net/yacy/cora/document/id/AnchorURL.java @@ -0,0 +1,68 @@ +/** + * Anchor + * Copyright 2013 by Michael Peter Christen + * first published 15.09.2013 on http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . 
+ */ + + +package net.yacy.cora.document.id; + +import java.net.MalformedURLException; +import java.util.Properties; + +public class AnchorURL extends DigestURL { + + private static final long serialVersionUID = 1586579902179962086L; + + private Properties properties; // may contain additional url properties, such as given in html a href-links + + public AnchorURL(final String url) throws MalformedURLException { + super(url); + this.properties = new Properties(); + } + + public AnchorURL(final MultiProtocolURL baseURL, final String relPath) throws MalformedURLException { + super(baseURL, relPath); + this.properties = new Properties(); + } + + public AnchorURL(final String protocol, final String host, final int port, final String path) throws MalformedURLException { + super(protocol, host, port, path); + this.properties = new Properties(); + } + + public Properties getProperties() { + return this.properties; + } + + public static AnchorURL newAnchor(final DigestURL baseURL, String relPath) throws MalformedURLException { + if (relPath.startsWith("//")) { + // patch for urls starting with "//" which can be found in the wild + relPath = (baseURL == null) ? 
"http:" + relPath : baseURL.getProtocol() + ":" + relPath; + } + if ((baseURL == null) || + isHTTP(relPath) || + isHTTPS(relPath) || + isFTP(relPath) || + isFile(relPath) || + isSMB(relPath)/*|| + relPath.contains(":") && patternMail.matcher(relPath.toLowerCase()).find()*/) { + return new AnchorURL(relPath); + } + return new AnchorURL(baseURL, relPath); + } +} diff --git a/source/net/yacy/kelondro/data/meta/DigestURI.java b/source/net/yacy/cora/document/id/DigestURL.java similarity index 77% rename from source/net/yacy/kelondro/data/meta/DigestURI.java rename to source/net/yacy/cora/document/id/DigestURL.java index debb0b7e3..48e9cee6c 100644 --- a/source/net/yacy/kelondro/data/meta/DigestURI.java +++ b/source/net/yacy/cora/document/id/DigestURL.java @@ -1,28 +1,24 @@ -// DigestURI.java -// (C) 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany -// first published 13.07.2006 on http://yacy.net -// -// $LastChangedDate$ -// $LastChangedRevision$ -// $LastChangedBy$ -// -// LICENSE -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. 
-// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -package net.yacy.kelondro.data.meta; +/** + * DigestURL + * Copyright 2006 by Michael Peter Christen + * first published 13.07.2006 on http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + +package net.yacy.cora.document.id; // this class exist to provide a system-wide normal form representation of urls, // and to prevent that java.net.URL usage causes DNS queries which are used in java.net. 
@@ -31,18 +27,17 @@ import java.io.File; import java.io.Serializable; import java.net.MalformedURLException; import java.util.HashSet; +import java.util.Properties; import java.util.Set; import java.util.regex.Pattern; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.Digest; import net.yacy.cora.protocol.Domains; import net.yacy.cora.util.ByteArray; import net.yacy.cora.util.CommonPattern; import net.yacy.cora.util.ConcurrentLog; -import net.yacy.kelondro.index.RowHandleSet; /** * URI-object providing YaCy-hash computation @@ -51,13 +46,14 @@ import net.yacy.kelondro.index.RowHandleSet; * For URIs pointing to resources not globally available, * the domainhash-part gets one reserved value */ -public class DigestURI extends MultiProtocolURI implements Serializable { +public class DigestURL extends MultiProtocolURL implements Serializable { private static final long serialVersionUID = -1173233022912141885L; public static final int TLD_any_zone_filter = 255; // from TLD zones can be filtered during search; this is the catch-all filter // class variables private byte[] hash; + private Properties properties; // may contain additional url properties, such as given in html a href-links /** * Shortcut, calculate hash for shorted url/hostname @@ -67,9 +63,9 @@ public class DigestURI extends MultiProtocolURI implements Serializable { public static String hosthash(final String host) { String h = host; if (!h.startsWith("http://")) h = "http://" + h; - DigestURI url = null; + DigestURL url = null; try { - url = new DigestURI(h); + url = new DigestURL(h); } catch (final MalformedURLException e) { ConcurrentLog.logException(e); return null; @@ -111,16 +107,17 @@ public class DigestURI extends MultiProtocolURI implements Serializable { /** * DigestURI from File */ - public DigestURI(final File file) throws MalformedURLException { + 
public DigestURL(final File file) throws MalformedURLException { this("file", "", -1, file.getAbsolutePath()); } /** * DigestURI from URI string */ - public DigestURI(final String url) throws MalformedURLException { + public DigestURL(final String url) throws MalformedURLException { super(url); this.hash = null; + this.properties = new Properties(); } /** @@ -129,43 +126,36 @@ public class DigestURI extends MultiProtocolURI implements Serializable { * @param hash already calculated hash for url * @throws MalformedURLException */ - public DigestURI(final String url, final byte[] hash) throws MalformedURLException { + public DigestURL(final String url, final byte[] hash) throws MalformedURLException { super(url); this.hash = hash; + this.properties = new Properties(); } - - /** - * DigestURI from general URI - * @param u - */ - /* - private DigestURI(final MultiProtocolURI u) { - super(u); - this.hash = (u instanceof DigestURI) ? ((DigestURI) u).hash : null; - } - */ /** * DigestURI from general URI, hash already calculated * @param baseURL * @param hash */ - public DigestURI(final MultiProtocolURI baseURL, final byte[] hash) { + public DigestURL(final MultiProtocolURL baseURL, final byte[] hash) { super(baseURL); this.hash = hash; + this.properties = new Properties(); } - public DigestURI(final MultiProtocolURI baseURL, final String relPath) throws MalformedURLException { + public DigestURL(final MultiProtocolURL baseURL, final String relPath) throws MalformedURLException { super(baseURL, relPath); this.hash = null; + this.properties = new Properties(); } - public DigestURI(final String protocol, final String host, final int port, final String path) throws MalformedURLException { + public DigestURL(final String protocol, final String host, final int port, final String path) throws MalformedURLException { super(protocol, host, port, path); this.hash = null; + this.properties = new Properties(); } - public static DigestURI newURL(final DigestURI baseURL, String 
relPath) throws MalformedURLException { + public static DigestURL newURL(final DigestURL baseURL, String relPath) throws MalformedURLException { if (relPath.startsWith("//")) { // patch for urls starting with "//" which can be found in the wild relPath = (baseURL == null) ? "http:" + relPath : baseURL.getProtocol() + ":" + relPath; @@ -177,13 +167,17 @@ public class DigestURI extends MultiProtocolURI implements Serializable { isFile(relPath) || isSMB(relPath)/*|| relPath.contains(":") && patternMail.matcher(relPath.toLowerCase()).find()*/) { - return new DigestURI(relPath); + return new DigestURL(relPath); } - return new DigestURI(baseURL, relPath); + return new DigestURL(baseURL, relPath); } private int hashCache = Integer.MIN_VALUE; // if this is used in a compare method many times, a cache is useful + public Properties getProperties() { + return this.properties; + } + @Override public int hashCode() { if (this.hashCache == Integer.MIN_VALUE) { @@ -302,25 +296,6 @@ public class DigestURI extends MultiProtocolURI implements Serializable { public final boolean probablyRootURL() { return this.path.length() <= 1 || rootPattern.matcher(this.path).matches(); } - - public RowHandleSet getPossibleRootHashes() { - RowHandleSet rootCandidates = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 10); - String rootStub = this.getProtocol() + "://" + this.getHost(); - try { - rootCandidates.put(new DigestURI(rootStub).hash()); - rootCandidates.put(new DigestURI(rootStub + "/").hash()); - rootCandidates.put(new DigestURI(rootStub + "/index.htm").hash()); - rootCandidates.put(new DigestURI(rootStub + "/index.html").hash()); - rootCandidates.put(new DigestURI(rootStub + "/index.php").hash()); - rootCandidates.put(new DigestURI(rootStub + "/home.htm").hash()); - rootCandidates.put(new DigestURI(rootStub + "/home.html").hash()); - rootCandidates.put(new DigestURI(rootStub + "/home.php").hash()); - rootCandidates.put(new DigestURI(rootStub + 
"/default.htm").hash()); - rootCandidates.put(new DigestURI(rootStub + "/default.html").hash()); - rootCandidates.put(new DigestURI(rootStub + "/default.php").hash()); - } catch (final Throwable e) {} - return rootCandidates; - } private static final String hosthash5(final String protocol, final String host, final int port) { if (host == null) { diff --git a/source/net/yacy/cora/document/MultiProtocolURI.java b/source/net/yacy/cora/document/id/MultiProtocolURL.java similarity index 98% rename from source/net/yacy/cora/document/MultiProtocolURI.java rename to source/net/yacy/cora/document/id/MultiProtocolURL.java index 950f077f4..bff784868 100644 --- a/source/net/yacy/cora/document/MultiProtocolURI.java +++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java @@ -23,7 +23,7 @@ */ -package net.yacy.cora.document; +package net.yacy.cora.document.id; import java.io.BufferedInputStream; import java.io.ByteArrayInputStream; @@ -47,9 +47,10 @@ import java.util.regex.Pattern; import jcifs.smb.SmbException; import jcifs.smb.SmbFile; import jcifs.smb.SmbFileInputStream; -import net.yacy.cora.document.Punycode.PunycodeException; import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.document.analysis.Classification.ContentDomain; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.Punycode.PunycodeException; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.TimeoutRequest; @@ -61,9 +62,9 @@ import net.yacy.cora.util.CommonPattern; * MultiProtocolURI provides a URL object for multiple protocols like http, https, ftp, smb and file * */ -public class MultiProtocolURI implements Serializable, Comparable { +public class MultiProtocolURL implements Serializable, Comparable { - public static final MultiProtocolURI POISON = new MultiProtocolURI(); // poison pill for concurrent link generators + public static final MultiProtocolURL POISON = new 
MultiProtocolURL(); // poison pill for concurrent link generators private static final Pattern ampPattern = Pattern.compile(Pattern.quote("&")); private static final long serialVersionUID = -1173233022912141884L; @@ -96,7 +97,7 @@ public class MultiProtocolURI implements Serializable, Comparable= 0 && host.charAt(0) != '[') host = '[' + host + ']'; // IPv6 host must be enclosed in square brackets this.protocol = protocol; @@ -948,8 +949,8 @@ public class MultiProtocolURI implements Serializable, Comparable uniqueURLs, SolrInputDocument sid, DigestURI url) { + public boolean postprocessing_doublecontent(Segment segment, Set uniqueURLs, SolrInputDocument sid, DigestURL url) { boolean changed = false; // FIND OUT IF THIS IS A DOUBLE DOCUMENT String hostid = url.hosthash(); @@ -149,7 +149,7 @@ public class SchemaConfiguration extends Configuration implements Serializable { return changed; } - public boolean postprocessing_clickdepth(Segment segment, SolrDocument doc, SolrInputDocument sid, DigestURI url, SchemaDeclaration clickdepthfield) { + public boolean postprocessing_clickdepth(Segment segment, SolrDocument doc, SolrInputDocument sid, DigestURL url, SchemaDeclaration clickdepthfield) { if (!this.contains(clickdepthfield)) return false; // get new click depth and compare with old Integer oldclickdepth = (Integer) doc.getFieldValue(clickdepthfield.getSolrFieldName()); @@ -165,7 +165,7 @@ public class SchemaConfiguration extends Configuration implements Serializable { return false; } - public boolean postprocessing_references(ReferenceReportCache rrCache, SolrDocument doc, SolrInputDocument sid, DigestURI url, Map hostExtentCount) { + public boolean postprocessing_references(ReferenceReportCache rrCache, SolrDocument doc, SolrInputDocument sid, DigestURL url, Map hostExtentCount) { if (!(this.contains(CollectionSchema.references_i) || this.contains(CollectionSchema.references_internal_i) || this.contains(CollectionSchema.references_external_i) || 
this.contains(CollectionSchema.references_exthosts_i))) return false; diff --git a/source/net/yacy/cora/federate/solr/SolrServlet.java b/source/net/yacy/cora/federate/solr/SolrServlet.java index 6e3ebca81..05506615f 100644 --- a/source/net/yacy/cora/federate/solr/SolrServlet.java +++ b/source/net/yacy/cora/federate/solr/SolrServlet.java @@ -40,10 +40,9 @@ import javax.servlet.http.HttpServlet; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector; -import org.apache.lucene.document.Document; import org.apache.solr.common.SolrException; import org.apache.solr.common.params.MultiMapSolrParams; import org.apache.solr.core.SolrCore; @@ -165,7 +164,7 @@ public class SolrServlet implements Filter { int sz = ids.size(); for (int i = 0; i < sz; i++) { int id = iterator.nextDoc(); - Document doc = searcher.doc(id); + searcher.doc(id); } } } diff --git a/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java index 1795899e3..67b7a321b 100644 --- a/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java @@ -33,7 +33,7 @@ import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.TimeUnit; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.sorting.ClusteredScoreMap; import net.yacy.cora.sorting.ReversibleScoreMap; import net.yacy.cora.util.LookAheadIterator; diff --git a/source/net/yacy/cora/federate/solr/connector/ConcurrentUpdateSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/ConcurrentUpdateSolrConnector.java index 89d0faa4d..47fae1aab 100644 --- 
a/source/net/yacy/cora/federate/solr/connector/ConcurrentUpdateSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/ConcurrentUpdateSolrConnector.java @@ -30,7 +30,7 @@ import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; -import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.sorting.ReversibleScoreMap; import net.yacy.cora.storage.HandleSet; import net.yacy.cora.util.ConcurrentLog; diff --git a/source/net/yacy/cora/federate/solr/connector/ShardSelection.java b/source/net/yacy/cora/federate/solr/connector/ShardSelection.java index 16aeb3f1a..f317ab9d0 100644 --- a/source/net/yacy/cora/federate/solr/connector/ShardSelection.java +++ b/source/net/yacy/cora/federate/solr/connector/ShardSelection.java @@ -27,7 +27,7 @@ import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.util.concurrent.atomic.AtomicLong; -import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.util.ConcurrentLog; import net.yacy.search.schema.CollectionSchema; diff --git a/source/net/yacy/cora/federate/solr/instance/RemoteInstance.java b/source/net/yacy/cora/federate/solr/instance/RemoteInstance.java index 3cbcc53b6..f9fa62cef 100644 --- a/source/net/yacy/cora/federate/solr/instance/RemoteInstance.java +++ b/source/net/yacy/cora/federate/solr/instance/RemoteInstance.java @@ -28,7 +28,7 @@ import java.util.Collection; import java.util.HashMap; import java.util.Map; -import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.protocol.Domains; import net.yacy.cora.util.ConcurrentLog; import net.yacy.search.schema.CollectionSchema; @@ -113,9 +113,9 @@ public class RemoteInstance implements SolrInstance { // Make a http client, connect using authentication. 
An url like // http://127.0.0.1:8983/solr/shard0 // is proper, and contains the core name as last element in the path - final MultiProtocolURI u; + final MultiProtocolURL u; try { - u = new MultiProtocolURI(this.solrurl + this.defaultCoreName); + u = new MultiProtocolURL(this.solrurl + this.defaultCoreName); } catch (final MalformedURLException e) { throw new IOException(e.getMessage()); } @@ -222,9 +222,9 @@ public class RemoteInstance implements SolrInstance { if (s != null) return s; // create new http server if (this.client != null) { - final MultiProtocolURI u; + final MultiProtocolURL u; try { - u = new MultiProtocolURI(this.solrurl + name); + u = new MultiProtocolURL(this.solrurl + name); } catch (final MalformedURLException e) { return null; } diff --git a/source/net/yacy/cora/federate/solr/responsewriter/OpensearchResponseWriter.java b/source/net/yacy/cora/federate/solr/responsewriter/OpensearchResponseWriter.java index 3a266d43b..27301eef5 100644 --- a/source/net/yacy/cora/federate/solr/responsewriter/OpensearchResponseWriter.java +++ b/source/net/yacy/cora/federate/solr/responsewriter/OpensearchResponseWriter.java @@ -31,8 +31,8 @@ import java.util.List; import java.util.Map; import java.util.Set; -import net.yacy.cora.document.MultiProtocolURI; -import net.yacy.cora.document.RSSMessage; +import net.yacy.cora.document.feed.RSSMessage; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.lod.vocabulary.DublinCore; import net.yacy.cora.lod.vocabulary.Geo; import net.yacy.cora.lod.vocabulary.YaCyMetadata; @@ -181,7 +181,7 @@ public class OpensearchResponseWriter implements QueryResponseWriter { String u = value.stringValue(); solitaireTag(writer, RSSMessage.Token.link.name(), u); try { - MultiProtocolURI url = new MultiProtocolURI(u); + MultiProtocolURL url = new MultiProtocolURL(u); solitaireTag(writer, YaCyMetadata.host.getURIref(), url.getHost()); solitaireTag(writer, YaCyMetadata.path.getURIref(), url.getPath()); solitaireTag(writer, 
YaCyMetadata.file.getURIref(), url.getFileName()); diff --git a/source/net/yacy/cora/federate/solr/responsewriter/YJsonResponseWriter.java b/source/net/yacy/cora/federate/solr/responsewriter/YJsonResponseWriter.java index edd1f2a66..16e4eab90 100644 --- a/source/net/yacy/cora/federate/solr/responsewriter/YJsonResponseWriter.java +++ b/source/net/yacy/cora/federate/solr/responsewriter/YJsonResponseWriter.java @@ -29,7 +29,7 @@ import java.util.HashMap; import java.util.List; import java.util.Map; -import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.federate.solr.responsewriter.OpensearchResponseWriter.ResHead; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.data.URLLicense; @@ -134,7 +134,7 @@ public class YJsonResponseWriter implements QueryResponseWriter { List fields = doc.getFields(); int fieldc = fields.size(); List texts = new ArrayList(); - MultiProtocolURI url = null; + MultiProtocolURL url = null; String urlhash = null; List descriptions = new ArrayList(); String title = ""; @@ -153,12 +153,12 @@ public class YJsonResponseWriter implements QueryResponseWriter { if (CollectionSchema.sku.getSolrFieldName().equals(fieldName)) { String u = value.stringValue(); try { - url = new MultiProtocolURI(u); + url = new MultiProtocolURL(u); String filename = url.getFileName(); solitaireTag(writer, "link", u); solitaireTag(writer, "file", filename); // get image license - if (MultiProtocolURI.isImage(filename)) URLLicense.aquireLicense(urlhash, url.toNormalform(true)); + if (MultiProtocolURL.isImage(filename)) URLLicense.aquireLicense(urlhash, url.toNormalform(true)); } catch (final MalformedURLException e) {} continue; } diff --git a/source/net/yacy/cora/federate/yacy/Distribution.java b/source/net/yacy/cora/federate/yacy/Distribution.java index b472bb2ee..9382dd133 100644 --- a/source/net/yacy/cora/federate/yacy/Distribution.java +++ b/source/net/yacy/cora/federate/yacy/Distribution.java 
@@ -20,8 +20,8 @@ package net.yacy.cora.federate.yacy; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.order.Base64Order; /** diff --git a/source/net/yacy/cora/federate/yacy/Peer.java b/source/net/yacy/cora/federate/yacy/Peer.java index 6b359c65a..ee6503685 100644 --- a/source/net/yacy/cora/federate/yacy/Peer.java +++ b/source/net/yacy/cora/federate/yacy/Peer.java @@ -23,7 +23,7 @@ package net.yacy.cora.federate.yacy; import java.io.Serializable; import java.util.HashMap; -import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.order.Base64Order; /** diff --git a/source/net/yacy/cora/federate/yacy/Peers.java b/source/net/yacy/cora/federate/yacy/Peers.java index 238dd8892..a3faeb680 100644 --- a/source/net/yacy/cora/federate/yacy/Peers.java +++ b/source/net/yacy/cora/federate/yacy/Peers.java @@ -29,7 +29,7 @@ import java.util.Random; import java.util.Set; import java.util.TreeMap; -import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.federate.yacy.api.Network; import net.yacy.cora.order.Base64Order; import net.yacy.cora.protocol.http.HTTPClient; diff --git a/source/net/yacy/cora/geo/GeoLocation.java b/source/net/yacy/cora/geo/GeoLocation.java index 45cb68be5..2d180ed46 100644 --- a/source/net/yacy/cora/geo/GeoLocation.java +++ b/source/net/yacy/cora/geo/GeoLocation.java @@ -24,7 +24,7 @@ package net.yacy.cora.geo; import java.util.Comparator; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; public class GeoLocation extends IntegerGeoPoint implements Comparable, Comparator { diff --git a/source/net/yacy/cora/lod/JenaTripleStore.java b/source/net/yacy/cora/lod/JenaTripleStore.java index d7ecdad2c..a4f93b60f 100644 --- a/source/net/yacy/cora/lod/JenaTripleStore.java +++ 
b/source/net/yacy/cora/lod/JenaTripleStore.java @@ -14,7 +14,7 @@ import java.util.Iterator; import java.util.Map.Entry; import java.util.concurrent.ConcurrentHashMap; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.lod.vocabulary.DCTerms; import net.yacy.cora.lod.vocabulary.Geo; import net.yacy.cora.lod.vocabulary.HttpHeader; diff --git a/source/net/yacy/cora/lod/Literal.java b/source/net/yacy/cora/lod/Literal.java index c5794c5c2..94b0bea70 100644 --- a/source/net/yacy/cora/lod/Literal.java +++ b/source/net/yacy/cora/lod/Literal.java @@ -26,7 +26,7 @@ package net.yacy.cora.lod; import java.util.regex.Pattern; -import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.document.id.MultiProtocolURL; /** * A literal is the possible value for a predicate. @@ -55,7 +55,7 @@ public interface Literal { * assigned. * @return an url to a knowledge authority for the literal */ - public MultiProtocolURI getSubject(); + public MultiProtocolURL getSubject(); /** * if a resource is poorly annotated with metadata an it shall diff --git a/source/net/yacy/cora/lod/Node.java b/source/net/yacy/cora/lod/Node.java index d1b0fc34a..c2f279b70 100644 --- a/source/net/yacy/cora/lod/Node.java +++ b/source/net/yacy/cora/lod/Node.java @@ -27,7 +27,7 @@ package net.yacy.cora.lod; import java.util.HashMap; import java.util.Map; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.lod.vocabulary.Rdf; /** diff --git a/source/net/yacy/cora/lod/vocabulary/CreativeCommons.java b/source/net/yacy/cora/lod/vocabulary/CreativeCommons.java index 97950651a..d21e11bf2 100644 --- a/source/net/yacy/cora/lod/vocabulary/CreativeCommons.java +++ b/source/net/yacy/cora/lod/vocabulary/CreativeCommons.java @@ -29,7 +29,7 @@ import java.util.HashSet; import java.util.Set; import java.util.regex.Pattern; -import net.yacy.cora.document.MultiProtocolURI; +import 
net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.lod.Literal; import net.yacy.cora.lod.Vocabulary; @@ -74,7 +74,7 @@ public enum CreativeCommons implements Vocabulary { Sharing("Sharing", "http://creativecommons.org/ns#Permission", ".*"); String terminal; - MultiProtocolURI subject; + MultiProtocolURL subject; Pattern discoveryPattern; private PermitLiteral( @@ -83,7 +83,7 @@ public enum CreativeCommons implements Vocabulary { String discoveryPattern) { this.terminal = terminal; try { - this.subject = subject == null ? null : new MultiProtocolURI(subject); + this.subject = subject == null ? null : new MultiProtocolURL(subject); } catch (final MalformedURLException e) { this.subject = null; } @@ -94,7 +94,7 @@ public enum CreativeCommons implements Vocabulary { public String getTerminal() { return this.terminal; } @Override - public MultiProtocolURI getSubject() { return this.subject; } + public MultiProtocolURL getSubject() { return this.subject; } @Override public Pattern getDiscoveryPattern() { return this.discoveryPattern; } @@ -110,7 +110,7 @@ public enum CreativeCommons implements Vocabulary { LesserCopyleft("Lesser Copyleft", "http://creativecommons.org/ns#Requirement", ".*"); String terminal; - MultiProtocolURI subject; + MultiProtocolURL subject; Pattern discoveryPattern; private RequirementLiteral( @@ -119,7 +119,7 @@ public enum CreativeCommons implements Vocabulary { String discoveryPattern) { this.terminal = terminal; try { - this.subject = subject == null ? null : new MultiProtocolURI(subject); + this.subject = subject == null ? 
null : new MultiProtocolURL(subject); } catch (final MalformedURLException e) { this.subject = null; } @@ -130,7 +130,7 @@ public enum CreativeCommons implements Vocabulary { public String getTerminal() { return this.terminal; } @Override - public MultiProtocolURI getSubject() { return this.subject; } + public MultiProtocolURL getSubject() { return this.subject; } @Override public Pattern getDiscoveryPattern() { return this.discoveryPattern; } @@ -142,7 +142,7 @@ public enum CreativeCommons implements Vocabulary { HighIncomeNationUse("High Income Nation Use", "http://creativecommons.org/ns#Prohibition", ".*"); String terminal; - MultiProtocolURI subject; + MultiProtocolURL subject; Pattern discoveryPattern; private ProhibitionLiteral( @@ -151,7 +151,7 @@ public enum CreativeCommons implements Vocabulary { String discoveryPattern) { this.terminal = terminal; try { - this.subject = subject == null ? null : new MultiProtocolURI(subject); + this.subject = subject == null ? null : new MultiProtocolURL(subject); } catch (final MalformedURLException e) { this.subject = null; } @@ -162,7 +162,7 @@ public enum CreativeCommons implements Vocabulary { public String getTerminal() { return this.terminal; } @Override - public MultiProtocolURI getSubject() { return this.subject; } + public MultiProtocolURL getSubject() { return this.subject; } @Override public Pattern getDiscoveryPattern() { return this.discoveryPattern; } diff --git a/source/net/yacy/cora/lod/vocabulary/YaCyMetadata.java b/source/net/yacy/cora/lod/vocabulary/YaCyMetadata.java index ee7662761..2b8ecde4d 100644 --- a/source/net/yacy/cora/lod/vocabulary/YaCyMetadata.java +++ b/source/net/yacy/cora/lod/vocabulary/YaCyMetadata.java @@ -27,7 +27,7 @@ package net.yacy.cora.lod.vocabulary; import java.util.Set; -import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.lod.Literal; import net.yacy.cora.lod.Vocabulary; diff --git a/source/net/yacy/cora/order/Base64Order.java 
b/source/net/yacy/cora/order/Base64Order.java index 2a0d6dab8..64d5eeec4 100644 --- a/source/net/yacy/cora/order/Base64Order.java +++ b/source/net/yacy/cora/order/Base64Order.java @@ -23,7 +23,7 @@ package net.yacy.cora.order; import java.io.Serializable; import java.util.Comparator; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; //ATTENTION! THIS CLASS SHALL NOT IMPORT FROM OTHER PACKAGES THAN CORA AND JRE //BECAUSE OTHERWISE THE DEBIAN INSTALLER FAILS! diff --git a/source/net/yacy/cora/order/Digest.java b/source/net/yacy/cora/order/Digest.java index 96f489f3f..619beaab8 100644 --- a/source/net/yacy/cora/order/Digest.java +++ b/source/net/yacy/cora/order/Digest.java @@ -38,7 +38,7 @@ import java.util.concurrent.Future; import java.util.concurrent.LinkedBlockingDeque; import java.util.concurrent.LinkedBlockingQueue; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.storage.ARC; import net.yacy.cora.storage.ConcurrentARC; import net.yacy.cora.util.Memory; diff --git a/source/net/yacy/cora/order/StringOrder.java b/source/net/yacy/cora/order/StringOrder.java index 56a0493b7..56f230e80 100644 --- a/source/net/yacy/cora/order/StringOrder.java +++ b/source/net/yacy/cora/order/StringOrder.java @@ -23,7 +23,7 @@ package net.yacy.cora.order; import java.io.Serializable; import java.util.Comparator; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; public class StringOrder implements Comparator, Serializable { diff --git a/source/net/yacy/cora/protocol/HeaderFramework.java b/source/net/yacy/cora/protocol/HeaderFramework.java index c7352e399..43fb259a3 100644 --- a/source/net/yacy/cora/protocol/HeaderFramework.java +++ b/source/net/yacy/cora/protocol/HeaderFramework.java @@ -39,11 +39,11 @@ import java.util.TreeMap; import java.util.Vector; import java.util.concurrent.ConcurrentHashMap; -import net.yacy.cora.document.ASCII; -import 
net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.util.CommonPattern; import net.yacy.cora.util.NumberTools; -import net.yacy.kelondro.data.meta.DigestURI; /** @@ -568,7 +568,7 @@ public class HeaderFramework extends TreeMap implements Map conProp) throws MalformedURLException { + public static DigestURL getRequestURL(final HashMap conProp) throws MalformedURLException { String host = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HOST); final String path = (String) conProp.get(HeaderFramework.CONNECTION_PROP_PATH); // always starts with leading '/' final String args = (String) conProp.get(HeaderFramework.CONNECTION_PROP_ARGS); // may be null if no args were given @@ -582,7 +582,7 @@ public class HeaderFramework extends TreeMap implements Map parts, final boolean usegzip) throws IOException { - final MultiProtocolURI url = new MultiProtocolURI(uri); + final MultiProtocolURL url = new MultiProtocolURL(uri); return POSTbytes(url, url.getHost(), parts, usegzip); } @@ -448,7 +448,7 @@ public class HTTPClient { * @return response body * @throws IOException */ - public byte[] POSTbytes(final MultiProtocolURI url, final String vhost, final Map post, final boolean usegzip) throws IOException { + public byte[] POSTbytes(final MultiProtocolURL url, final String vhost, final Map post, final boolean usegzip) throws IOException { final HttpPost httpPost = new HttpPost(url.toNormalform(true)); httpPost.addHeader(new BasicHeader("Connection", "close")); // don't keep alive, prevent CLOSE_WAIT state @@ -480,7 +480,7 @@ public class HTTPClient { * @throws IOException */ public byte[] POSTbytes(final String uri, final InputStream instream, final long length) throws IOException { - final MultiProtocolURI url = new MultiProtocolURI(uri); + final MultiProtocolURL url = new MultiProtocolURL(uri); final HttpPost httpPost = new 
HttpPost(url.toNormalform(true)); httpPost.addHeader(new BasicHeader("Connection", "close")); // don't keep alive, prevent CLOSE_WAIT state String host = url.getHost(); diff --git a/source/net/yacy/cora/protocol/http/LinkExtractor.java b/source/net/yacy/cora/protocol/http/LinkExtractor.java index 59b13c016..793549286 100644 --- a/source/net/yacy/cora/protocol/http/LinkExtractor.java +++ b/source/net/yacy/cora/protocol/http/LinkExtractor.java @@ -28,18 +28,18 @@ import java.net.MalformedURLException; import java.util.WeakHashMap; import java.util.regex.Pattern; -import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.document.id.MultiProtocolURL; public class LinkExtractor { private static final char lb = '<', rb = '>', dquotes = '"', space = ' '; private static final Object PRESENT = new Object(); - private WeakHashMap links; + private WeakHashMap links; private Pattern blackpattern; public LinkExtractor(Pattern blackpattern) { - this.links = new WeakHashMap(); + this.links = new WeakHashMap(); this.blackpattern = blackpattern; } @@ -55,7 +55,7 @@ public class LinkExtractor { if (u.endsWith(".")) u = u.substring(0, u.length() - 1); // remove the '.' 
that was appended above s = p + 1; if (this.blackpattern.matcher(u).matches()) continue; - try {links.put(new MultiProtocolURI(u), PRESENT);} catch (final MalformedURLException e) {} + try {links.put(new MultiProtocolURL(u), PRESENT);} catch (final MalformedURLException e) {} } } @@ -63,10 +63,10 @@ public class LinkExtractor { * return the links in the text in the order as they appear * @return a list of urls */ - public MultiProtocolURI[] getLinks() { - MultiProtocolURI[] urls = new MultiProtocolURI[this.links.size()]; + public MultiProtocolURL[] getLinks() { + MultiProtocolURL[] urls = new MultiProtocolURL[this.links.size()]; int i = 0; - for (MultiProtocolURI uri: this.links.keySet()) urls[i++] = uri; + for (MultiProtocolURL uri: this.links.keySet()) urls[i++] = uri; return urls; } diff --git a/source/net/yacy/cora/storage/AbstractMapStore.java b/source/net/yacy/cora/storage/AbstractMapStore.java index df030c3e2..899a472ad 100644 --- a/source/net/yacy/cora/storage/AbstractMapStore.java +++ b/source/net/yacy/cora/storage/AbstractMapStore.java @@ -30,7 +30,7 @@ import java.util.Iterator; import java.util.Map; import java.util.Set; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; public abstract class AbstractMapStore implements MapStore { diff --git a/source/net/yacy/cora/storage/KeyList.java b/source/net/yacy/cora/storage/KeyList.java index c259e4e0b..37cccec6f 100644 --- a/source/net/yacy/cora/storage/KeyList.java +++ b/source/net/yacy/cora/storage/KeyList.java @@ -36,7 +36,7 @@ import java.util.Map; import java.util.concurrent.ConcurrentHashMap; import java.util.zip.GZIPInputStream; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; /** * a key list is a file which contains a list of key words; each line one word diff --git a/source/net/yacy/cora/util/ByteArray.java b/source/net/yacy/cora/util/ByteArray.java index 5ea2ca74a..e117e9670 100644 --- a/source/net/yacy/cora/util/ByteArray.java +++ 
b/source/net/yacy/cora/util/ByteArray.java @@ -22,7 +22,7 @@ package net.yacy.cora.util; import java.util.HashMap; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; /** diff --git a/source/net/yacy/cora/util/ByteBuffer.java b/source/net/yacy/cora/util/ByteBuffer.java index 8ff26bf67..208d18103 100644 --- a/source/net/yacy/cora/util/ByteBuffer.java +++ b/source/net/yacy/cora/util/ByteBuffer.java @@ -27,7 +27,7 @@ import java.util.Collection; import java.util.Iterator; import java.util.List; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; public final class ByteBuffer extends OutputStream { diff --git a/source/net/yacy/cora/document/JSONArray.java b/source/net/yacy/cora/util/JSONArray.java similarity index 99% rename from source/net/yacy/cora/document/JSONArray.java rename to source/net/yacy/cora/util/JSONArray.java index 7590b1b95..9bbfc1779 100644 --- a/source/net/yacy/cora/document/JSONArray.java +++ b/source/net/yacy/cora/util/JSONArray.java @@ -22,7 +22,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -package net.yacy.cora.document; +package net.yacy.cora.util; import java.io.IOException; import java.io.Writer; diff --git a/source/net/yacy/cora/document/JSONException.java b/source/net/yacy/cora/util/JSONException.java similarity index 91% rename from source/net/yacy/cora/document/JSONException.java rename to source/net/yacy/cora/util/JSONException.java index 0aed962da..56cb23346 100644 --- a/source/net/yacy/cora/document/JSONException.java +++ b/source/net/yacy/cora/util/JSONException.java @@ -4,7 +4,7 @@ * @version 2008-09-18 */ -package net.yacy.cora.document; +package net.yacy.cora.util; public class JSONException extends Exception { /** diff --git a/source/net/yacy/cora/document/JSONObject.java b/source/net/yacy/cora/util/JSONObject.java similarity index 99% rename from source/net/yacy/cora/document/JSONObject.java rename to source/net/yacy/cora/util/JSONObject.java index 8afbbfdc4..c3ef229e0 100644 --- a/source/net/yacy/cora/document/JSONObject.java +++ b/source/net/yacy/cora/util/JSONObject.java @@ -22,7 +22,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package net.yacy.cora.document; +package net.yacy.cora.util; import java.io.IOException; import java.io.Writer; diff --git a/source/net/yacy/cora/document/JSONTokener.java b/source/net/yacy/cora/util/JSONTokener.java similarity index 99% rename from source/net/yacy/cora/document/JSONTokener.java rename to source/net/yacy/cora/util/JSONTokener.java index e35f74111..2d68be092 100644 --- a/source/net/yacy/cora/document/JSONTokener.java +++ b/source/net/yacy/cora/util/JSONTokener.java @@ -30,7 +30,7 @@ SOFTWARE. 
* @version 2010-02-02 */ -package net.yacy.cora.document; +package net.yacy.cora.util; import java.io.BufferedReader; import java.io.IOException; diff --git a/source/net/yacy/crawler/Balancer.java b/source/net/yacy/crawler/Balancer.java index f00510ff8..67cbabeb5 100644 --- a/source/net/yacy/crawler/Balancer.java +++ b/source/net/yacy/crawler/Balancer.java @@ -40,8 +40,9 @@ import java.util.concurrent.ConcurrentMap; import org.openjena.atlas.logging.Log; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.order.Base64Order; import net.yacy.cora.protocol.ClientIdentification; @@ -55,7 +56,6 @@ import net.yacy.crawler.data.CrawlProfile; import net.yacy.crawler.data.Latency; import net.yacy.crawler.retrieval.Request; import net.yacy.crawler.robots.RobotsTxt; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.index.BufferedObjectIndex; import net.yacy.kelondro.index.Row; @@ -301,7 +301,7 @@ public class Balancer { * @param crawlURL * @return the sleep time in milliseconds; may be negative for no sleep time */ - private long getDomainSleepTime(final RobotsTxt robots, final CrawlProfile profileEntry, final DigestURI crawlURL) { + private long getDomainSleepTime(final RobotsTxt robots, final CrawlProfile profileEntry, final DigestURL crawlURL) { if (profileEntry == null) return 0; long sleeptime = ( profileEntry.cacheStrategy() == CacheStrategy.CACHEONLY || @@ -319,7 +319,7 @@ public class Balancer { * @param crawlURL * @return */ - private long getRobotsTime(final RobotsTxt robots, final DigestURI crawlURL, ClientIdentification.Agent agent) { + private long getRobotsTime(final RobotsTxt robots, final DigestURL crawlURL, ClientIdentification.Agent agent) { long sleeptime = 
Latency.waitingRobots(crawlURL, robots, agent); // this uses the robots.txt database and may cause a loading of robots.txt from the server return sleeptime < 0 ? 0 : sleeptime; } diff --git a/source/net/yacy/crawler/CrawlQueue.java b/source/net/yacy/crawler/CrawlQueue.java index 3c6777479..ff6a4637e 100644 --- a/source/net/yacy/crawler/CrawlQueue.java +++ b/source/net/yacy/crawler/CrawlQueue.java @@ -24,8 +24,9 @@ import java.io.File; import java.io.IOException; import java.util.Iterator; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.order.Base64Order; import net.yacy.cora.protocol.ClientIdentification; @@ -37,7 +38,6 @@ import net.yacy.crawler.data.CrawlProfile; import net.yacy.crawler.data.Latency; import net.yacy.crawler.retrieval.Request; import net.yacy.crawler.robots.RobotsTxt; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.index.BufferedObjectIndex; import net.yacy.kelondro.index.Row; @@ -210,7 +210,7 @@ public class CrawlQueue { * @param crawlURL * @return the sleep time in milliseconds; may be negative for no sleep time */ - private long getDomainSleepTime(final RobotsTxt robots, final CrawlProfile profileEntry, final DigestURI crawlURL) { + private long getDomainSleepTime(final RobotsTxt robots, final CrawlProfile profileEntry, final DigestURL crawlURL) { if (profileEntry == null) return 0; long sleeptime = ( profileEntry.cacheStrategy() == CacheStrategy.CACHEONLY || @@ -228,7 +228,7 @@ public class CrawlQueue { * @param crawlURL * @return */ - private long getRobotsTime(final RobotsTxt robots, final DigestURI crawlURL, ClientIdentification.Agent agent) { + private long getRobotsTime(final RobotsTxt robots, final DigestURL crawlURL, 
ClientIdentification.Agent agent) { long sleeptime = Latency.waitingRobots(crawlURL, robots, agent); // this uses the robots.txt database and may cause a loading of robots.txt from the server return sleeptime < 0 ? 0 : sleeptime; } diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java index df0f49d34..d21b487dd 100644 --- a/source/net/yacy/crawler/CrawlStacker.java +++ b/source/net/yacy/crawler/CrawlStacker.java @@ -31,16 +31,16 @@ import java.util.ArrayList; import java.util.Date; import java.util.List; import java.util.Locale; -import java.util.Map; -import java.util.Properties; import java.util.concurrent.BlockingQueue; import java.util.concurrent.atomic.AtomicInteger; import net.yacy.contentcontrol.ContentControlFilterUpdateThread; -import net.yacy.cora.document.ASCII; import net.yacy.cora.document.analysis.Classification.ContentDomain; -import net.yacy.cora.document.MultiProtocolURI; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.AnchorURL; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.order.Base64Order; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.ftp.FTPClient; @@ -58,7 +58,6 @@ import net.yacy.crawler.retrieval.Request; import net.yacy.crawler.retrieval.SMBLoader; import net.yacy.crawler.robots.RobotsTxt; import net.yacy.kelondro.data.citation.CitationReference; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.rwi.IndexCell; import net.yacy.kelondro.workflow.WorkflowProcessor; import net.yacy.peers.SeedDB; @@ -167,7 +166,7 @@ public final class CrawlStacker { if (this.log.isFinest()) this.log.finest("ENQUEUE " + entry.url() + ", referer=" + entry.referrerhash() + ", initiator=" + ((entry.initiator() == null) ? 
"" : ASCII.String(entry.initiator())) + ", name=" + entry.name() + ", appdate=" + entry.appdate() + ", depth=" + entry.depth()); this.requestQueue.enQueue(entry); } - public void enqueueEntriesAsynchronous(final byte[] initiator, final String profileHandle, final Map hyperlinks) { + public void enqueueEntriesAsynchronous(final byte[] initiator, final String profileHandle, final List hyperlinks) { new Thread() { @Override public void run() { @@ -177,12 +176,11 @@ public final class CrawlStacker { }.start(); } - private void enqueueEntries(final byte[] initiator, final String profileHandle, final Map hyperlinks, final boolean replace) { - for (final Map.Entry e: hyperlinks.entrySet()) { - if (e.getKey() == null) continue; + private void enqueueEntries(final byte[] initiator, final String profileHandle, final List hyperlinks, final boolean replace) { + for (final DigestURL url: hyperlinks) { + if (url == null) continue; // delete old entry, if exists to force a re-load of the url (thats wanted here) - final DigestURI url = e.getKey(); final byte[] urlhash = url.hash(); if (replace) { this.indexSegment.fulltext().remove(urlhash); @@ -197,7 +195,7 @@ public final class CrawlStacker { u = u + "/index.html"; } try { - final byte[] uh = new DigestURI(u).hash(); + final byte[] uh = new DigestURL(u).hash(); this.indexSegment.fulltext().remove(uh); this.nextQueue.noticeURL.removeByURLHash(uh); this.nextQueue.errorURL.remove(uh); @@ -213,7 +211,7 @@ public final class CrawlStacker { initiator, url, null, - e.getValue().getProperty("name", ""), + url.getProperties().getProperty("name", ""), new Date(), profileHandle, 0, @@ -238,9 +236,9 @@ public final class CrawlStacker { while ((entry = queue.take()) != FTPClient.POISON_entryInfo) { // delete old entry, if exists to force a re-load of the url (thats wanted here) - DigestURI url = null; + DigestURL url = null; try { - url = new DigestURI("ftp://" + host + (port == 21 ? 
"" : ":" + port) + MultiProtocolURI.escape(entry.name)); + url = new DigestURL("ftp://" + host + (port == 21 ? "" : ":" + port) + MultiProtocolURL.escape(entry.name)); } catch (final MalformedURLException e) { continue; } @@ -256,7 +254,7 @@ public final class CrawlStacker { initiator, url, null, - MultiProtocolURI.unescape(entry.name), + MultiProtocolURL.unescape(entry.name), entry.date, profileHandle, 0, @@ -277,7 +275,7 @@ public final class CrawlStacker { * @param url * @return null if successfull, a reason string if not successful */ - public String stackSimpleCrawl(final DigestURI url) { + public String stackSimpleCrawl(final DigestURL url) { final CrawlProfile pe = this.crawler.defaultSurrogateProfile; return stackCrawl(new Request( this.peers.mySeed().hash.getBytes(), @@ -373,7 +371,7 @@ public final class CrawlStacker { return null; } - public String checkAcceptance(final DigestURI url, final CrawlProfile profile, final int depth) { + public String checkAcceptance(final DigestURL url, final CrawlProfile profile, final int depth) { // check if the protocol is supported final String urlProtocol = url.getProtocol(); @@ -512,7 +510,7 @@ public final class CrawlStacker { * @param url * @return null if the url can be accepted, a string containing a rejection reason if the url cannot be accepted */ - public String urlInAcceptedDomain(final DigestURI url) { + public String urlInAcceptedDomain(final DigestURL url) { // returns true if the url can be accepted according to network.unit.domain if (url == null) return "url is null"; // check domainList from network-definition @@ -560,7 +558,7 @@ public final class CrawlStacker { // returns true if the url can be accepted according to network.unit.domain if (urlhash == null) return "url is null"; // check if this is a local address and we are allowed to index local pages: - final boolean local = DigestURI.isLocal(urlhash); + final boolean local = DigestURL.isLocal(urlhash); if (this.acceptLocalURLs && local) return 
null; if (this.acceptGlobalURLs && !local) return null; return (local) ? diff --git a/source/net/yacy/crawler/CrawlSwitchboard.java b/source/net/yacy/crawler/CrawlSwitchboard.java index f721d49be..d2ecf0437 100644 --- a/source/net/yacy/crawler/CrawlSwitchboard.java +++ b/source/net/yacy/crawler/CrawlSwitchboard.java @@ -36,8 +36,8 @@ import java.util.Set; import java.util.TreeMap; import java.util.concurrent.ConcurrentHashMap; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.NaturalOrder; diff --git a/source/net/yacy/crawler/data/Cache.java b/source/net/yacy/crawler/data/Cache.java index ec08d3c68..f1d72354f 100644 --- a/source/net/yacy/crawler/data/Cache.java +++ b/source/net/yacy/crawler/data/Cache.java @@ -41,7 +41,8 @@ import java.util.HashMap; import java.util.Map; import java.util.concurrent.BlockingQueue; -import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.order.Base64Order; import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.storage.HandleSet; @@ -50,7 +51,6 @@ import net.yacy.cora.util.SpaceExceededException; import net.yacy.kelondro.blob.ArrayStack; import net.yacy.kelondro.blob.Compressor; import net.yacy.kelondro.blob.MapHeap; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.index.RowHandleSet; @@ -191,7 +191,7 @@ public final class Cache { fileDB.close(true); } - public static void store(final DigestURI url, final ResponseHeader responseHeader, final byte[] file) throws IOException { + public static void store(final DigestURL url, final ResponseHeader responseHeader, final byte[] file) throws IOException { if (maxCacheSize == 0) return; if 
(responseHeader == null) throw new IOException("Cache.store of url " + url.toString() + " not possible: responseHeader == null"); if (file == null) throw new IOException("Cache.store of url " + url.toString() + " not possible: file == null"); diff --git a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java index a07bdf125..03efd21b9 100644 --- a/source/net/yacy/crawler/data/CrawlProfile.java +++ b/source/net/yacy/crawler/data/CrawlProfile.java @@ -26,17 +26,17 @@ package net.yacy.crawler.data; import java.text.DateFormat; +import java.util.Collection; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.Map; -import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicInteger; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; -import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.Digest; @@ -560,13 +560,13 @@ public class CrawlProfile extends ConcurrentHashMap implements M return System.currentTimeMillis() - (60000L * oldTimeMinutes); } - public static String siteFilter(final Set uris) { + public static String siteFilter(final Collection uris) { final StringBuilder filter = new StringBuilder(); - for (final MultiProtocolURI uri: uris) filter.append('|').append(mustMatchFilterFullDomain(uri)); + for (final MultiProtocolURL uri: uris) filter.append('|').append(mustMatchFilterFullDomain(uri)); return filter.length() > 0 ? 
filter.substring(1) : CrawlProfile.MATCH_ALL_STRING; } - public static String mustMatchFilterFullDomain(final MultiProtocolURI uri) { + public static String mustMatchFilterFullDomain(final MultiProtocolURL uri) { String host = uri.getHost(); if (host == null) return uri.getProtocol() + ".*"; if (host.startsWith("www.")) host = host.substring(4); @@ -575,13 +575,13 @@ public class CrawlProfile extends ConcurrentHashMap implements M return new StringBuilder(host.length() + 20).append(protocol).append("://(www.)?").append(Pattern.quote(host)).append(".*").toString(); } - public static String subpathFilter(final Set uris) { + public static String subpathFilter(final Collection uris) { final StringBuilder filter = new StringBuilder(); - for (final MultiProtocolURI uri: uris) filter.append('|').append(mustMatchSubpath(uri)); + for (final MultiProtocolURL uri: uris) filter.append('|').append(mustMatchSubpath(uri)); return filter.length() > 0 ? filter.substring(1) : CrawlProfile.MATCH_ALL_STRING; } - public static String mustMatchSubpath(final MultiProtocolURI uri) { + public static String mustMatchSubpath(final MultiProtocolURL uri) { String u = uri.toNormalform(true); if (!u.endsWith("/")) {int p = u.lastIndexOf("/"); if (p > 0) u = u.substring(0, p + 1);} return new StringBuilder(u.length() + 5).append(Pattern.quote(u)).append(".*").toString(); diff --git a/source/net/yacy/crawler/data/CrawlQueues.java b/source/net/yacy/crawler/data/CrawlQueues.java index bff9bce48..04ecfb924 100644 --- a/source/net/yacy/crawler/data/CrawlQueues.java +++ b/source/net/yacy/crawler/data/CrawlQueues.java @@ -35,10 +35,11 @@ import java.util.Iterator; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.Hit; -import net.yacy.cora.document.RSSFeed; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; +import 
net.yacy.cora.document.feed.Hit; +import net.yacy.cora.document.feed.RSSFeed; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.order.Base64Order; import net.yacy.cora.protocol.ConnectionInfo; @@ -49,7 +50,6 @@ import net.yacy.crawler.data.ZURL.FailCategory; import net.yacy.crawler.retrieval.Request; import net.yacy.crawler.retrieval.Response; import net.yacy.crawler.robots.RobotsTxtEntry; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.workflow.WorkflowJob; import net.yacy.peers.DHTSelection; @@ -167,7 +167,7 @@ public class CrawlQueues { this.errorURL.remove(hash); } - public DigestURI getURL(final byte[] urlhash) { + public DigestURL getURL(final byte[] urlhash) { assert urlhash != null; if (urlhash == null || urlhash.length == 0) { return null; @@ -317,7 +317,7 @@ public class CrawlQueues { if (profile != null) { // check if the protocol is supported - final DigestURI url = urlEntry.url(); + final DigestURL url = urlEntry.url(); final String urlProtocol = url.getProtocol(); if (this.sb.loader.isSupportedProtocol(urlProtocol)) { if (this.log.isFine()) { @@ -502,19 +502,19 @@ public class CrawlQueues { } // parse the rss - DigestURI url, referrer; + DigestURL url, referrer; Date loaddate; for (final Hit item: feed) { //System.out.println("URL=" + item.getLink() + ", desc=" + item.getDescription() + ", pubDate=" + item.getPubDate()); // put url on remote crawl stack try { - url = new DigestURI(item.getLink()); + url = new DigestURL(item.getLink()); } catch (final MalformedURLException e) { continue; } try { - referrer = new DigestURI(item.getReferrer()); + referrer = new DigestURL(item.getReferrer()); } catch (final MalformedURLException e) { referrer = null; } @@ -548,7 +548,7 @@ public class CrawlQueues { * @param url * @return */ - private static String urlToString(final DigestURI url) { + private static String urlToString(final 
DigestURL url) { return (url == null ? "null" : url.toNormalform(true)); } diff --git a/source/net/yacy/crawler/data/Latency.java b/source/net/yacy/crawler/data/Latency.java index f6edbaca6..34bd026eb 100644 --- a/source/net/yacy/crawler/data/Latency.java +++ b/source/net/yacy/crawler/data/Latency.java @@ -29,11 +29,11 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; -import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.crawler.robots.RobotsTxt; import net.yacy.crawler.robots.RobotsTxtEntry; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.MemoryControl; @@ -50,7 +50,7 @@ public class Latency { * @param url * @param robotsCrawlDelay the crawl-delay given by the robots; 0 if not exist */ - public static void updateAfterSelection(final DigestURI url, final long robotsCrawlDelay) { + public static void updateAfterSelection(final DigestURL url, final long robotsCrawlDelay) { final String host = url.getHost(); if (host == null) return; String hosthash = url.hosthash(); @@ -67,7 +67,7 @@ public class Latency { * @param url * @param time the time to load the file in milliseconds */ - public static void updateBeforeLoad(final DigestURI url) { + public static void updateBeforeLoad(final DigestURL url) { final String host = url.getHost(); if (host == null) return; String hosthash = url.hosthash(); @@ -86,7 +86,7 @@ public class Latency { * @param url * @param time the time to load the file in milliseconds */ - public static void updateAfterLoad(final DigestURI url, final long time) { + public static void updateAfterLoad(final DigestURL url, final long time) { final String host = url.getHost(); if (host == null) return; String hosthash = url.hosthash(); @@ -100,7 +100,7 @@ public class Latency { 
} } - private static Host host(final DigestURI url) { + private static Host host(final DigestURL url) { final String host = url.getHost(); if (host == null) return null; return map.get(url.hosthash()); @@ -119,7 +119,7 @@ public class Latency { * @param thisAgents * @return the waiting time in milliseconds; 0 if not known; -1 if host gives us special rights */ - public static int waitingRobots(final MultiProtocolURI url, final RobotsTxt robots, final ClientIdentification.Agent agent) { + public static int waitingRobots(final MultiProtocolURL url, final RobotsTxt robots, final ClientIdentification.Agent agent) { int robotsDelay = 0; RobotsTxtEntry robotsEntry = robots.getEntry(url, agent); robotsDelay = (robotsEntry == null) ? 0 : robotsEntry.getCrawlDelayMillis(); @@ -187,7 +187,7 @@ public class Latency { * @param agent * @return the remaining waiting time in milliseconds. can be negative to reflect the due-time after a possible nex loading time */ - public static int waitingRemaining(final DigestURI url, final RobotsTxt robots, final ClientIdentification.Agent agent) { + public static int waitingRemaining(final DigestURL url, final RobotsTxt robots, final ClientIdentification.Agent agent) { // first check if the domain was _ever_ accessed before final Host host = host(url); @@ -200,7 +200,7 @@ public class Latency { // for CGI accesses, we double the minimum time // mostly there is a database access in the background // which creates a lot of unwanted IO on target site - if (MultiProtocolURI.isCGI(url.getFileName())) waiting = waiting * 2; + if (MultiProtocolURL.isCGI(url.getFileName())) waiting = waiting * 2; // if we have accessed the domain many times, get slower (the flux factor) if (!local) waiting += host.flux(waiting); @@ -219,7 +219,7 @@ public class Latency { return Math.min(60000, waiting) - timeSinceLastAccess; } - public static String waitingRemainingExplain(final DigestURI url, final RobotsTxt robots, final ClientIdentification.Agent agent) { + 
public static String waitingRemainingExplain(final DigestURL url, final RobotsTxt robots, final ClientIdentification.Agent agent) { // first check if the domain was _ever_ accessed before final Host host = host(url); @@ -234,7 +234,7 @@ public class Latency { // for CGI accesses, we double the minimum time // mostly there is a database access in the background // which creates a lot of unwanted IO on target site - if (MultiProtocolURI.isCGI(url.getFileName())) { waiting = waiting * 2; s.append(", isCGI = true -> double"); } + if (MultiProtocolURL.isCGI(url.getFileName())) { waiting = waiting * 2; s.append(", isCGI = true -> double"); } // if we have accessed the domain many times, get slower (the flux factor) int flux = host.flux(waiting); diff --git a/source/net/yacy/crawler/data/ResultImages.java b/source/net/yacy/crawler/data/ResultImages.java index 1d9fdab5a..adeecd40c 100644 --- a/source/net/yacy/crawler/data/ResultImages.java +++ b/source/net/yacy/crawler/data/ResultImages.java @@ -31,11 +31,11 @@ import java.util.Queue; import java.util.Set; import java.util.concurrent.LinkedBlockingQueue; -import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.storage.SizeLimitedSet; import net.yacy.document.Document; import net.yacy.document.parser.html.ImageEntry; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.MemoryControl; @@ -54,14 +54,14 @@ public class ResultImages { // the same images may be linked from different pages private static final Set doubleCheck = new SizeLimitedSet(10000); - public static void registerImages(final DigestURI source, final Document document, final boolean privateEntry) { + public static void registerImages(final DigestURL source, final Document document, final boolean privateEntry) { if (document == null) return; if (source == null) return; if (MemoryControl.shortStatus()) clearQueues(); 
limitQueues(1000); - final Map images = document.getImages(); + final Map images = document.getImages(); for (final ImageEntry image: images.values()) { // do a double-check; attention: this can be time-consuming since this possibly needs a DNS-lookup if (image == null || image.url() == null) continue; @@ -74,7 +74,7 @@ public class ResultImages { image.height() > 100 && image.width() < 1200 && image.height() < 1000 && - !"gif".equals(MultiProtocolURI.getFileExtension(image.url().getFileName()))) { + !"gif".equals(MultiProtocolURL.getFileExtension(image.url().getFileName()))) { // && ((urlString.lastIndexOf(".jpg") != -1)) || // ((urlString.lastIndexOf(".png") != -1)){ @@ -158,8 +158,8 @@ public class ResultImages { public static class OriginEntry { public ImageEntry imageEntry; - public MultiProtocolURI baseURL; - public OriginEntry(final ImageEntry imageEntry, final MultiProtocolURI baseURL) { + public MultiProtocolURL baseURL; + public OriginEntry(final ImageEntry imageEntry, final MultiProtocolURL baseURL) { this.imageEntry = imageEntry; this.baseURL = baseURL; } diff --git a/source/net/yacy/crawler/data/ZURL.java b/source/net/yacy/crawler/data/ZURL.java index 226907325..af0c47a4c 100644 --- a/source/net/yacy/crawler/data/ZURL.java +++ b/source/net/yacy/crawler/data/ZURL.java @@ -37,14 +37,14 @@ import java.util.concurrent.LinkedBlockingQueue; import org.apache.solr.common.SolrInputDocument; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.federate.solr.FailType; import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.NaturalOrder; import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.SpaceExceededException; import net.yacy.crawler.retrieval.Request; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.index.Index; import net.yacy.kelondro.index.Row; @@ -211,7 +211,7 @@ public 
class ZURL implements Iterable { public ArrayList list(int max) { final ArrayList l = new ArrayList(); - DigestURI url; + DigestURL url; for (final ZURL.Entry entry: this) { if (entry == null) continue; url = entry.url(); @@ -330,7 +330,7 @@ public class ZURL implements Iterable { return; } - public DigestURI url() { + public DigestURL url() { return this.bentry.url(); } diff --git a/source/net/yacy/crawler/retrieval/FTPLoader.java b/source/net/yacy/crawler/retrieval/FTPLoader.java index 080117d01..148853636 100644 --- a/source/net/yacy/crawler/retrieval/FTPLoader.java +++ b/source/net/yacy/crawler/retrieval/FTPLoader.java @@ -32,9 +32,10 @@ import java.io.IOException; import java.io.PrintStream; import java.util.Date; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.MultiProtocolURI; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.ResponseHeader; @@ -44,7 +45,6 @@ import net.yacy.crawler.data.CrawlProfile; import net.yacy.crawler.data.Latency; import net.yacy.crawler.data.ZURL.FailCategory; import net.yacy.document.TextParser; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.search.Switchboard; public class FTPLoader { @@ -72,7 +72,7 @@ public class FTPLoader { Latency.updateBeforeLoad(request.url()); final long start = System.currentTimeMillis(); - final DigestURI entryUrl = request.url(); + final DigestURL entryUrl = request.url(); final String fullPath = getPath(entryUrl); // the return value @@ -119,7 +119,7 @@ public class FTPLoader { // directory -> get list of files final RequestHeader requestHeader = new RequestHeader(); if (request.referrerhash() != null) { - final DigestURI u = this.sb.getURL(request.referrerhash()); + final 
DigestURL u = this.sb.getURL(request.referrerhash()); if (u != null) requestHeader.put(RequestHeader.REFERER, u.toNormalform(true)); } @@ -176,7 +176,7 @@ public class FTPLoader { /** * establish a connection to the ftp server (open, login, set transfer mode) */ - private boolean openConnection(final FTPClient ftpClient, final DigestURI entryUrl) { + private boolean openConnection(final FTPClient ftpClient, final DigestURL entryUrl) { // get username and password final String userInfo = entryUrl.getUserInfo(); String userName = "anonymous", userPwd = "anonymous"; @@ -215,7 +215,7 @@ public class FTPLoader { private Response getFile(final FTPClient ftpClient, final Request request, final boolean acceptOnlyParseable) throws IOException { // determine the mimetype of the resource - final DigestURI url = request.url(); + final DigestURL url = request.url(); final String mime = TextParser.mimeOf(url); final String path = getPath(url); @@ -225,7 +225,7 @@ public class FTPLoader { // create response header final RequestHeader requestHeader = new RequestHeader(); if (request.referrerhash() != null) { - final DigestURI refurl = this.sb.getURL(request.referrerhash()); + final DigestURL refurl = this.sb.getURL(request.referrerhash()); if (refurl != null) requestHeader.put(RequestHeader.REFERER, refurl.toNormalform(true)); } final ResponseHeader responseHeader = new ResponseHeader(200); @@ -281,8 +281,8 @@ public class FTPLoader { * @param entryUrl * @return */ - private String getPath(final MultiProtocolURI entryUrl) { - return MultiProtocolURI.unescape(entryUrl.getPath()).replace("\"", "\"\""); + private String getPath(final MultiProtocolURL entryUrl) { + return MultiProtocolURL.unescape(entryUrl.getPath()).replace("\"", "\"\""); } } diff --git a/source/net/yacy/crawler/retrieval/FileLoader.java b/source/net/yacy/crawler/retrieval/FileLoader.java index f4aeb4576..06d8bde3c 100644 --- a/source/net/yacy/crawler/retrieval/FileLoader.java +++ 
b/source/net/yacy/crawler/retrieval/FileLoader.java @@ -30,10 +30,11 @@ import java.util.ArrayList; import java.util.Date; import java.util.List; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.MultiProtocolURI; -import net.yacy.cora.document.UTF8; import net.yacy.cora.document.analysis.Classification; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; @@ -42,7 +43,6 @@ import net.yacy.cora.protocol.ftp.FTPClient; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.data.CrawlProfile; import net.yacy.document.TextParser; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.FileUtils; import net.yacy.search.Switchboard; @@ -59,12 +59,12 @@ public class FileLoader { } public Response load(final Request request, boolean acceptOnlyParseable) throws IOException { - DigestURI url = request.url(); + DigestURL url = request.url(); if (!url.getProtocol().equals("file")) throw new IOException("wrong loader for FileLoader: " + url.getProtocol()); RequestHeader requestHeader = new RequestHeader(); if (request.referrerhash() != null) { - DigestURI ur = this.sb.getURL(request.referrerhash()); + DigestURL ur = this.sb.getURL(request.referrerhash()); if (ur != null) requestHeader.put(RequestHeader.REFERER, ur.toNormalform(true)); } @@ -96,7 +96,7 @@ public class FileLoader { } // create response header - String mime = Classification.ext2mime(MultiProtocolURI.getFileExtension(url.getFileName())); + String mime = Classification.ext2mime(MultiProtocolURL.getFileExtension(url.getFileName())); ResponseHeader responseHeader = new ResponseHeader(200); responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new 
Date(url.lastModified()))); responseHeader.put(HeaderFramework.CONTENT_TYPE, mime); diff --git a/source/net/yacy/crawler/retrieval/HTTPLoader.java b/source/net/yacy/crawler/retrieval/HTTPLoader.java index 248a9e8d5..2383cc128 100644 --- a/source/net/yacy/crawler/retrieval/HTTPLoader.java +++ b/source/net/yacy/crawler/retrieval/HTTPLoader.java @@ -27,7 +27,8 @@ package net.yacy.crawler.retrieval; import java.io.IOException; import java.util.Date; -import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; @@ -37,7 +38,6 @@ import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.data.CrawlProfile; import net.yacy.crawler.data.Latency; import net.yacy.crawler.data.ZURL.FailCategory; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.io.ByteCount; import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.search.Switchboard; @@ -86,7 +86,7 @@ public final class HTTPLoader { throw new IOException("retry counter exceeded for URL " + request.url().toString() + ". 
Processing aborted."); } - DigestURI url = request.url(); + DigestURL url = request.url(); final String host = url.getHost(); if (host == null || host.length() < 2) throw new IOException("host is not well-formed: '" + host + "'"); @@ -107,7 +107,7 @@ public final class HTTPLoader { if(yacyResolver != null) { final String yAddress = yacyResolver.resolve(host); if(yAddress != null) { - url = new DigestURI(url.getProtocol() + "://" + yAddress + path); + url = new DigestURL(url.getProtocol() + "://" + yAddress + path); } } @@ -117,7 +117,7 @@ public final class HTTPLoader { // create a request header final RequestHeader requestHeader = new RequestHeader(); requestHeader.put(HeaderFramework.USER_AGENT, agent.userAgent); - DigestURI refererURL = null; + DigestURL refererURL = null; if (request.referrerhash() != null) refererURL = this.sb.getURL(request.referrerhash()); if (refererURL != null) requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true)); requestHeader.put(HeaderFramework.ACCEPT, this.sb.getConfig("crawler.http.accept", DEFAULT_ACCEPT)); @@ -150,7 +150,7 @@ public final class HTTPLoader { } // normalize URL - final DigestURI redirectionUrl = DigestURI.newURL(request.url(), redirectionUrlString); + final DigestURL redirectionUrl = DigestURL.newURL(request.url(), redirectionUrlString); // restart crawling with new url this.log.info("CRAWLER Redirection detected ('" + client.getHttpResponse().getStatusLine() + "') for URL " + requestURLString); @@ -283,7 +283,7 @@ public final class HTTPLoader { } // normalizing URL - final DigestURI redirectionUrl = DigestURI.newURL(request.url(), redirectionUrlString); + final DigestURL redirectionUrl = DigestURL.newURL(request.url(), redirectionUrlString); // if we are already doing a shutdown we don't need to retry crawling diff --git a/source/net/yacy/crawler/retrieval/RSSLoader.java b/source/net/yacy/crawler/retrieval/RSSLoader.java index 80dffa45a..039483386 100644 --- 
a/source/net/yacy/crawler/retrieval/RSSLoader.java +++ b/source/net/yacy/crawler/retrieval/RSSLoader.java @@ -33,11 +33,12 @@ import java.util.List; import java.util.Map; import java.util.regex.Pattern; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.RSSFeed; -import net.yacy.cora.document.RSSMessage; -import net.yacy.cora.document.RSSReader; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.feed.RSSFeed; +import net.yacy.cora.document.feed.RSSMessage; +import net.yacy.cora.document.feed.RSSReader; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.order.Base64Order; import net.yacy.cora.protocol.ClientIdentification; @@ -48,7 +49,6 @@ import net.yacy.cora.util.SpaceExceededException; import net.yacy.crawler.HarvestProcess; import net.yacy.data.WorkTables; import net.yacy.kelondro.blob.Tables; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.search.Switchboard; import net.yacy.server.serverObjects; @@ -57,12 +57,12 @@ public class RSSLoader extends Thread { public static final ARC indexTriggered = new ComparableARC(1000, Base64Order.enhancedCoder); - private final DigestURI urlf; + private final DigestURL urlf; private final Switchboard sb; private final Map collections; private final ClientIdentification.Agent agent; - public RSSLoader(final Switchboard sb, final DigestURI urlf, final Map collections, final ClientIdentification.Agent agent) { + public RSSLoader(final Switchboard sb, final DigestURL urlf, final Map collections, final ClientIdentification.Agent agent) { this.sb = sb; this.urlf = urlf; this.collections = collections; @@ -94,13 +94,13 @@ public class RSSLoader extends Thread { recordAPI(this.sb, null, this.urlf, feed, 7, "seldays"); } - public static void indexAllRssFeed(final Switchboard sb, 
final DigestURI url, final RSSFeed feed, Map collections) { + public static void indexAllRssFeed(final Switchboard sb, final DigestURL url, final RSSFeed feed, Map collections) { int loadCount = 0; - List list = new ArrayList(); - Map urlmap = new HashMap(); + List list = new ArrayList(); + Map urlmap = new HashMap(); for (final RSSMessage message: feed) { try { - final DigestURI messageurl = new DigestURI(message.getLink()); + final DigestURL messageurl = new DigestURL(message.getLink()); if (indexTriggered.containsKey(messageurl.hash())) continue; urlmap.put(ASCII.String(messageurl.hash()), messageurl); } catch (final IOException e) { @@ -108,7 +108,7 @@ public class RSSLoader extends Thread { } } Map existingids = sb.urlExists(urlmap.keySet()); - for (final Map.Entry e: urlmap.entrySet()) { + for (final Map.Entry e: urlmap.entrySet()) { if (existingids.get(e.getKey()) != null) continue; list.add(e.getValue()); indexTriggered.insertIfAbsent(ASCII.getBytes(e.getKey()), new Date()); @@ -141,7 +141,7 @@ public class RSSLoader extends Thread { } - public static void recordAPI(final Switchboard sb, final String apicall_pk, final DigestURI url, final RSSFeed feed, final int repeat_time, final String repeat_unit) { + public static void recordAPI(final Switchboard sb, final String apicall_pk, final DigestURL url, final RSSFeed feed, final int repeat_time, final String repeat_unit) { // record API action byte[] pk = null; final serverObjects post = new serverObjects(); diff --git a/source/net/yacy/crawler/retrieval/Request.java b/source/net/yacy/crawler/retrieval/Request.java index de8cfd5bb..227a7f0d9 100644 --- a/source/net/yacy/crawler/retrieval/Request.java +++ b/source/net/yacy/crawler/retrieval/Request.java @@ -29,11 +29,11 @@ package net.yacy.crawler.retrieval; import java.io.IOException; import java.util.Date; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.ASCII; +import 
net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.NaturalOrder; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.index.Row; import net.yacy.kelondro.util.Bitfield; @@ -84,7 +84,7 @@ public class Request extends WorkflowJob private byte[] initiator; // the initiator hash, is NULL or "" if it is the own proxy; // if this is generated by a crawl, the own peer hash in entered private byte[] refhash; // the url's referrer hash - private DigestURI url; // the url as string + private DigestURL url; // the url as string private String name; // the name of the url, from anchor tag name private long appdate; // the time when the url was first time appeared. private String profileHandle; // the name of the fetch profile @@ -102,7 +102,7 @@ public class Request extends WorkflowJob * @param url * @param referrerhash */ - public Request(final DigestURI url, final byte[] referrerhash) { + public Request(final DigestURL url, final byte[] referrerhash) { this(null, url, referrerhash, null, null, null, 0, 0, 0, 0); } @@ -121,7 +121,7 @@ public class Request extends WorkflowJob */ public Request( final byte[] initiator, - final DigestURI url, + final DigestURL url, final byte[] referrerhash, final String name, final Date appdate, @@ -166,7 +166,7 @@ public class Request extends WorkflowJob this.initiator = entry.getColBytes(1, true); this.initiator = (this.initiator == null) ? null : ((this.initiator.length == 0) ? null : this.initiator); - this.url = new DigestURI(urlstring, entry.getPrimaryKeyBytes()); + this.url = new DigestURL(urlstring, entry.getPrimaryKeyBytes()); this.refhash = (entry.empty(3)) ? null : entry.getColBytes(3, true); this.name = (entry.empty(4)) ? 
"" : entry.getColUTF8(4).trim(); this.appdate = entry.getColLong(5); @@ -230,12 +230,12 @@ public class Request extends WorkflowJob return rowdef.newEntry(entry); } - public DigestURI url() { + public DigestURL url() { // the url return this.url; } - public void redirectURL(final DigestURI redirectedURL) { + public void redirectURL(final DigestURL redirectedURL) { // replace old URL by new one. This should only be used in case of url redirection this.url = redirectedURL; } diff --git a/source/net/yacy/crawler/retrieval/Response.java b/source/net/yacy/crawler/retrieval/Response.java index 0d19bb763..717a65324 100644 --- a/source/net/yacy/crawler/retrieval/Response.java +++ b/source/net/yacy/crawler/retrieval/Response.java @@ -29,10 +29,11 @@ package net.yacy.crawler.retrieval; import java.util.Date; import net.yacy.cora.date.GenericFormatter; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.MultiProtocolURI; -import net.yacy.cora.document.UTF8; import net.yacy.cora.document.analysis.Classification; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.ResponseHeader; @@ -42,7 +43,6 @@ import net.yacy.crawler.data.ResultURLs.EventOrigin; import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.TextParser; -import net.yacy.kelondro.data.meta.DigestURI; public class Response { @@ -69,8 +69,8 @@ public class Response { private final boolean fromCache; // doctype calculation - public static char docType(final MultiProtocolURI url) { - String ext = MultiProtocolURI.getFileExtension(url.getFileName()); + public static char docType(final MultiProtocolURL url) { + String ext = MultiProtocolURL.getFileExtension(url.getFileName()); if (ext == null) return DT_UNKNOWN; 
if (ext.equals(".gif")) return DT_IMAGE; if (ext.equals(".ico")) return DT_IMAGE; @@ -171,7 +171,7 @@ public class Response { // request and response headers may be zero in case that we process surrogates this.requestHeader = new RequestHeader(); this.responseHeader = new ResponseHeader(200); - this.responseHeader.put(HeaderFramework.CONTENT_TYPE, Classification.ext2mime(MultiProtocolURI.getFileExtension(request.url().getFileName()), "text/plain")); // tell parser how to handle the content + this.responseHeader.put(HeaderFramework.CONTENT_TYPE, Classification.ext2mime(MultiProtocolURL.getFileExtension(request.url().getFileName()), "text/plain")); // tell parser how to handle the content if (!request.isEmpty()) this.responseHeader.put(HeaderFramework.CONTENT_LENGTH, Long.toString(request.size())); this.profile = profile; this.status = QUEUE_STATE_FRESH; @@ -210,7 +210,7 @@ public class Response { return this.request.name(); } - public DigestURI url() { + public DigestURL url() { return this.request.url(); } @@ -293,7 +293,7 @@ public class Response { return "dynamic_post"; } - if (MultiProtocolURI.isCGI(MultiProtocolURI.getFileExtension(url().getFileName()))) { + if (MultiProtocolURL.isCGI(MultiProtocolURL.getFileExtension(url().getFileName()))) { return "dynamic_cgi"; } @@ -392,7 +392,7 @@ public class Response { if (url().isPOST()) { return false; } - if (MultiProtocolURI.isCGI(MultiProtocolURI.getFileExtension(url().getFileName()))) { + if (MultiProtocolURL.isCGI(MultiProtocolURL.getFileExtension(url().getFileName()))) { return false; } @@ -543,7 +543,7 @@ public class Response { if (url().isPOST()) { return "Dynamic_(POST)"; } - if (MultiProtocolURI.isCGI(MultiProtocolURI.getFileExtension(url().getFileName()))) { + if (MultiProtocolURL.isCGI(MultiProtocolURL.getFileExtension(url().getFileName()))) { return "Dynamic_(CGI)"; } } @@ -686,7 +686,7 @@ public class Response { // CGI access makes the page very individual, and therefore not usable in caches if 
(!profile().crawlingQ()) { if (url().isPOST()) { return "Dynamic_(POST)"; } - if (MultiProtocolURI.isCGI(MultiProtocolURI.getFileExtension(url().getFileName()))) { return "Dynamic_(CGI)"; } + if (MultiProtocolURL.isCGI(MultiProtocolURL.getFileExtension(url().getFileName()))) { return "Dynamic_(CGI)"; } } // -authorization cases in request @@ -759,12 +759,12 @@ public class Response { return this.responseHeader.getCharacterEncoding(); } - public DigestURI referrerURL() { + public DigestURL referrerURL() { if (this.requestHeader == null) return null; try { final String r = this.requestHeader.get(RequestHeader.REFERER, null); if (r == null) return null; - return new DigestURI(r); + return new DigestURL(r); } catch (final Exception e) { return null; } @@ -775,7 +775,7 @@ public class Response { final String u = this.requestHeader.get(RequestHeader.REFERER, ""); if (u == null || u.isEmpty()) return null; try { - return new DigestURI(u).hash(); + return new DigestURL(u).hash(); } catch (final Exception e) { return null; } diff --git a/source/net/yacy/crawler/retrieval/SMBLoader.java b/source/net/yacy/crawler/retrieval/SMBLoader.java index f8b87d4f1..56c2adca1 100644 --- a/source/net/yacy/crawler/retrieval/SMBLoader.java +++ b/source/net/yacy/crawler/retrieval/SMBLoader.java @@ -38,10 +38,11 @@ import java.util.List; import jcifs.smb.SmbException; import jcifs.smb.SmbFile; import jcifs.smb.SmbFileInputStream; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.MultiProtocolURI; -import net.yacy.cora.document.UTF8; import net.yacy.cora.document.analysis.Classification; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; @@ -50,7 +51,6 @@ import 
net.yacy.cora.protocol.ftp.FTPClient; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.data.CrawlProfile; import net.yacy.document.TextParser; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.FileUtils; import net.yacy.search.Switchboard; @@ -70,12 +70,12 @@ public class SMBLoader { public Response load(final Request request, boolean acceptOnlyParseable) throws IOException { - DigestURI url = request.url(); + DigestURL url = request.url(); if (!url.getProtocol().equals("smb")) throw new IOException("wrong loader for SMBLoader: " + url.getProtocol()); RequestHeader requestHeader = new RequestHeader(); if (request.referrerhash() != null) { - DigestURI ur = this.sb.getURL(request.referrerhash()); + DigestURL ur = this.sb.getURL(request.referrerhash()); if (ur != null) requestHeader.put(RequestHeader.REFERER, ur.toNormalform(true)); } @@ -87,7 +87,7 @@ public class SMBLoader { List list = new ArrayList(); for (String s: l) { if (s.startsWith(".")) continue; - s = MultiProtocolURI.escape(s).toString(); + s = MultiProtocolURL.escape(s).toString(); if (!s.endsWith("/") && !s.endsWith("\\")) { // check if this is a directory SmbFile sf = new SmbFile(u + s); @@ -114,7 +114,7 @@ public class SMBLoader { } // create response header - String mime = Classification.ext2mime(MultiProtocolURI.getFileExtension(url.getFileName())); + String mime = Classification.ext2mime(MultiProtocolURL.getFileExtension(url.getFileName())); ResponseHeader responseHeader = new ResponseHeader(200); responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date(url.lastModified()))); responseHeader.put(HeaderFramework.CONTENT_TYPE, mime); diff --git a/source/net/yacy/crawler/retrieval/SitemapImporter.java b/source/net/yacy/crawler/retrieval/SitemapImporter.java index e6a7b88a1..55ea340d3 100644 --- a/source/net/yacy/crawler/retrieval/SitemapImporter.java +++ b/source/net/yacy/crawler/retrieval/SitemapImporter.java @@ -28,13 +28,13 
@@ package net.yacy.crawler.retrieval; import java.net.MalformedURLException; import java.util.Date; -import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.HarvestProcess; import net.yacy.crawler.data.CrawlProfile; import net.yacy.document.parser.sitemapParser; import net.yacy.document.parser.sitemapParser.URLEntry; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.search.Switchboard; @@ -42,10 +42,10 @@ public class SitemapImporter extends Thread { private CrawlProfile crawlingProfile = null; private static final ConcurrentLog logger = new ConcurrentLog("SITEMAP"); - private DigestURI siteMapURL = null; + private DigestURL siteMapURL = null; private final Switchboard sb; - public SitemapImporter(final Switchboard sb, final DigestURI sitemapURL, final CrawlProfile profileEntry) { + public SitemapImporter(final Switchboard sb, final DigestURL sitemapURL, final CrawlProfile profileEntry) { assert sitemapURL != null; this.sb = sb; this.siteMapURL = sitemapURL; @@ -72,9 +72,9 @@ public class SitemapImporter extends Thread { // get the url hash byte[] nexturlhash = null; - DigestURI url = null; + DigestURL url = null; try { - url = new DigestURI(entry.url()); + url = new DigestURL(entry.url()); nexturlhash = url.hash(); } catch (final MalformedURLException e1) { } diff --git a/source/net/yacy/crawler/robots/RobotsTxt.java b/source/net/yacy/crawler/robots/RobotsTxt.java index 61074d89a..a109bf505 100644 --- a/source/net/yacy/crawler/robots/RobotsTxt.java +++ b/source/net/yacy/crawler/robots/RobotsTxt.java @@ -35,7 +35,8 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; import java.util.regex.Pattern; -import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.document.id.DigestURL; +import 
net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.HeaderFramework; @@ -45,7 +46,6 @@ import net.yacy.crawler.retrieval.Request; import net.yacy.crawler.retrieval.Response; import net.yacy.data.WorkTables; import net.yacy.kelondro.blob.BEncodedHeap; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.repository.LoaderDispatcher; public class RobotsTxt { @@ -89,7 +89,7 @@ public class RobotsTxt { return this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME).size(); } - public RobotsTxtEntry getEntry(final MultiProtocolURI theURL, final ClientIdentification.Agent agent) { + public RobotsTxtEntry getEntry(final MultiProtocolURL theURL, final ClientIdentification.Agent agent) { if (theURL == null) throw new IllegalArgumentException(); if (!theURL.getProtocol().startsWith("http")) return null; return getEntry(getHostPort(theURL), agent, true); @@ -151,9 +151,9 @@ public class RobotsTxt { } // generating the proper url to download the robots txt - DigestURI robotsURL = null; + DigestURL robotsURL = null; try { - robotsURL = new DigestURI((urlHostPort.endsWith(":443") ? "https://" : "http://") + urlHostPort + "/robots.txt"); + robotsURL = new DigestURL((urlHostPort.endsWith(":443") ? 
"https://" : "http://") + urlHostPort + "/robots.txt"); } catch (final MalformedURLException e) { log.severe("Unable to generate robots.txt URL for host:port '" + urlHostPort + "'.", e); robotsURL = null; @@ -182,7 +182,7 @@ public class RobotsTxt { return robotsTxt4Host; } - public void ensureExist(final MultiProtocolURI theURL, final ClientIdentification.Agent agent, boolean concurrent) { + public void ensureExist(final MultiProtocolURL theURL, final ClientIdentification.Agent agent, boolean concurrent) { if (theURL.isLocal()) return; final String urlHostPort = getHostPort(theURL); if (urlHostPort == null) return; @@ -207,9 +207,9 @@ public class RobotsTxt { if (robotsTable.containsKey(robotsTable.encodedKey(urlHostPort))) return; // generating the proper url to download the robots txt - DigestURI robotsURL = null; + DigestURL robotsURL = null; try { - robotsURL = new DigestURI((urlHostPort.endsWith(":443") ? "https://" : "http://") + urlHostPort + "/robots.txt"); + robotsURL = new DigestURL((urlHostPort.endsWith(":443") ? 
"https://" : "http://") + urlHostPort + "/robots.txt"); } catch (final MalformedURLException e) { log.severe("Unable to generate robots.txt URL for host:port '" + urlHostPort + "'.", e); robotsURL = null; @@ -237,7 +237,7 @@ public class RobotsTxt { if (concurrent) t.start(); else t.run(); } - private void processOldEntry(RobotsTxtEntry robotsTxt4Host, DigestURI robotsURL, BEncodedHeap robotsTable) { + private void processOldEntry(RobotsTxtEntry robotsTxt4Host, DigestURL robotsURL, BEncodedHeap robotsTable) { // no robots.txt available, make an entry to prevent that the robots loading is done twice if (robotsTxt4Host == null) { // generate artificial entry @@ -265,7 +265,7 @@ public class RobotsTxt { } } - private void processNewEntry(DigestURI robotsURL, Response response, final String[] thisAgents) { + private void processNewEntry(DigestURL robotsURL, Response response, final String[] thisAgents) { final byte[] robotsTxt = response.getContent(); //Log.logInfo("RobotsTxt", "robots of " + robotsURL.toNormalform(true, true) + ":\n" + ((robotsTxt == null) ? 
"null" : UTF8.String(robotsTxt))); // debug TODO remove RobotsTxtParser parserResult; @@ -309,7 +309,7 @@ public class RobotsTxt { } } - static final String getHostPort(final MultiProtocolURI theURL) { + static final String getHostPort(final MultiProtocolURL theURL) { int port = theURL.getPort(); if (port == -1) { if (theURL.getProtocol().equalsIgnoreCase("http")) { diff --git a/source/net/yacy/crawler/robots/RobotsTxtEntry.java b/source/net/yacy/crawler/robots/RobotsTxtEntry.java index 8bc3fc404..293f9b127 100644 --- a/source/net/yacy/crawler/robots/RobotsTxtEntry.java +++ b/source/net/yacy/crawler/robots/RobotsTxtEntry.java @@ -36,9 +36,9 @@ import java.util.LinkedList; import java.util.List; import java.util.Map; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.MultiProtocolURI; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.util.ByteArray; @@ -94,7 +94,7 @@ public class RobotsTxtEntry { } protected RobotsTxtEntry( - final MultiProtocolURI theURL, + final MultiProtocolURL theURL, final List allowPathList, final List disallowPathList, final Date loadedDate, @@ -177,11 +177,11 @@ public class RobotsTxtEntry { * get the sitemap url * @return the sitemap url or null if no sitemap url is given */ - public MultiProtocolURI getSitemap() { + public MultiProtocolURL getSitemap() { final String url = this.mem.containsKey(SITEMAP)? 
UTF8.String(this.mem.get(SITEMAP)): null; if (url == null) return null; try { - return new MultiProtocolURI(url); + return new MultiProtocolURL(url); } catch (final MalformedURLException e) { return null; } @@ -228,7 +228,7 @@ public class RobotsTxtEntry { return 0; } - public boolean isDisallowed(final MultiProtocolURI subpathURL) { + public boolean isDisallowed(final MultiProtocolURL subpathURL) { String path = subpathURL.getFile(); if (this.mem == null) { this.info = "no robots file available"; diff --git a/source/net/yacy/crawler/robots/RobotsTxtParser.java b/source/net/yacy/crawler/robots/RobotsTxtParser.java index dc589ea2e..a5e54939e 100644 --- a/source/net/yacy/crawler/robots/RobotsTxtParser.java +++ b/source/net/yacy/crawler/robots/RobotsTxtParser.java @@ -38,7 +38,7 @@ import java.io.InputStreamReader; import java.util.ArrayList; import java.util.regex.Pattern; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; /* * A class for Parsing robots.txt files. 
diff --git a/source/net/yacy/data/BlogBoard.java b/source/net/yacy/data/BlogBoard.java index 303081776..c1ec79f15 100644 --- a/source/net/yacy/data/BlogBoard.java +++ b/source/net/yacy/data/BlogBoard.java @@ -48,7 +48,7 @@ import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import net.yacy.cora.date.GenericFormatter; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.NaturalOrder; import net.yacy.cora.protocol.Domains; diff --git a/source/net/yacy/data/BlogBoardComments.java b/source/net/yacy/data/BlogBoardComments.java index 719e0cf38..9c1e03ba9 100644 --- a/source/net/yacy/data/BlogBoardComments.java +++ b/source/net/yacy/data/BlogBoardComments.java @@ -45,7 +45,7 @@ import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.NaturalOrder; import net.yacy.cora.protocol.Domains; diff --git a/source/net/yacy/data/BookmarkDate.java b/source/net/yacy/data/BookmarkDate.java index 15d9ea6cd..4c6df65d4 100644 --- a/source/net/yacy/data/BookmarkDate.java +++ b/source/net/yacy/data/BookmarkDate.java @@ -37,8 +37,7 @@ import java.util.Iterator; import java.util.List; import java.util.Map; - -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.order.NaturalOrder; import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.SpaceExceededException; diff --git a/source/net/yacy/data/BookmarkHelper.java b/source/net/yacy/data/BookmarkHelper.java index ecdc33cdb..2d9dfd91f 100644 --- a/source/net/yacy/data/BookmarkHelper.java +++ b/source/net/yacy/data/BookmarkHelper.java @@ -32,13 +32,11 @@ import java.io.UnsupportedEncodingException; import 
java.io.Writer; import java.net.MalformedURLException; import java.text.ParseException; +import java.util.ArrayList; +import java.util.Collection; import java.util.Date; -import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Properties; import java.util.Set; import java.util.TreeSet; @@ -47,14 +45,15 @@ import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import net.yacy.cora.date.ISO8601Formatter; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.AnchorURL; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.util.ConcurrentLog; import net.yacy.data.BookmarksDB.Bookmark; import net.yacy.data.BookmarksDB.Tag; import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.TransformerWriter; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.util.FileUtils; @@ -116,7 +115,7 @@ public class BookmarkHelper { // bookmarksDB's Import/Export functions // -------------------------------------- - public static int importFromBookmarks(final BookmarksDB db, final DigestURI baseURL, final String input, final String tag, final boolean importPublic){ + public static int importFromBookmarks(final BookmarksDB db, final DigestURL baseURL, final String input, final String tag, final boolean importPublic){ try { // convert string to input stream final ByteArrayInputStream byteIn = new ByteArrayInputStream(UTF8.getBytes(input)); @@ -129,13 +128,12 @@ public class BookmarkHelper { } } - private static int importFromBookmarks(final BookmarksDB db, final DigestURI baseURL, final InputStreamReader input, final String tag, final boolean importPublic){ + private static int importFromBookmarks(final 
BookmarksDB db, final DigestURL baseURL, final InputStreamReader input, final String tag, final boolean importPublic){ int importCount = 0; - Map links = new HashMap(); + Collection links = new ArrayList(); String title; - DigestURI url; Bookmark bm; final Set tags=ListManager.string2set(tag); //this allow multiple default tags try { @@ -147,9 +145,8 @@ public class BookmarkHelper { writer.close(); links = scraper.getAnchors(); } catch (final IOException e) { ConcurrentLog.warn("BOOKMARKS", "error during load of links: "+ e.getClass() +" "+ e.getMessage());} - for (final Entry link: links.entrySet()) { - url = link.getKey(); - title = link.getValue().getProperty("name", ""); + for (final DigestURL url: links) { + title = url.getProperties().getProperty("name", ""); ConcurrentLog.info("BOOKMARKS", "links.get(url)"); if ("".equals(title)) {//cannot be displayed title = url.toString(); diff --git a/source/net/yacy/data/BookmarksDB.java b/source/net/yacy/data/BookmarksDB.java index c078d57aa..1c11b4b15 100644 --- a/source/net/yacy/data/BookmarksDB.java +++ b/source/net/yacy/data/BookmarksDB.java @@ -36,12 +36,12 @@ import java.util.Set; import java.util.TreeSet; import java.util.concurrent.ConcurrentHashMap; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.order.NaturalOrder; import net.yacy.cora.util.ConcurrentLog; import net.yacy.kelondro.blob.MapHeap; -import net.yacy.kelondro.data.meta.DigestURI; public class BookmarksDB { @@ -502,7 +502,7 @@ public class BookmarksDB { loadTimestamp(); } - public Bookmark(final DigestURI url) { + public Bookmark(final DigestURL url) { this.entry = new HashMap(); this.urlHash = ASCII.String(url.hash()); this.entry.put(BOOKMARK_URL, url.toNormalform(false)); @@ -526,11 +526,11 @@ public class BookmarksDB { } public Bookmark(final String url) throws 
MalformedURLException { - this(new DigestURI((url.indexOf("://") < 0) ? "http://" + url : url)); + this(new DigestURL((url.indexOf("://") < 0) ? "http://" + url : url)); } public Bookmark(final Map map) throws MalformedURLException { - this(ASCII.String((new DigestURI(map.get(BOOKMARK_URL))).hash()), map); + this(ASCII.String((new DigestURL(map.get(BOOKMARK_URL))).hash()), map); } Map toMap() { diff --git a/source/net/yacy/data/MessageBoard.java b/source/net/yacy/data/MessageBoard.java index 614e0ca06..b7eab1cff 100644 --- a/source/net/yacy/data/MessageBoard.java +++ b/source/net/yacy/data/MessageBoard.java @@ -32,7 +32,7 @@ import java.util.Locale; import java.util.Map; import java.util.TimeZone; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.NaturalOrder; import net.yacy.cora.util.ConcurrentLog; diff --git a/source/net/yacy/data/URLLicense.java b/source/net/yacy/data/URLLicense.java index 63153fabe..2f0f2cded 100644 --- a/source/net/yacy/data/URLLicense.java +++ b/source/net/yacy/data/URLLicense.java @@ -29,9 +29,9 @@ package net.yacy.data; import java.util.Collections; import java.util.Map; -import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.storage.SizeLimitedMap; -import net.yacy.kelondro.data.meta.DigestURI; public class URLLicense { @@ -41,7 +41,7 @@ public class URLLicense { private static final int maxQueue = 10000; private static final Map permissions = Collections.synchronizedMap(new SizeLimitedMap(maxQueue)); - public static String aquireLicense(final DigestURI url) { + public static String aquireLicense(final DigestURL url) { if (url == null) return ""; // generate license key String license = ASCII.String(url.hash()); diff --git a/source/net/yacy/data/UserDB.java b/source/net/yacy/data/UserDB.java index 584acddf5..8659e0805 100644 --- 
a/source/net/yacy/data/UserDB.java +++ b/source/net/yacy/data/UserDB.java @@ -36,7 +36,7 @@ import java.util.Iterator; import java.util.Map; import java.util.Random; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.CloneableIterator; import net.yacy.cora.order.Digest; diff --git a/source/net/yacy/data/WorkTables.java b/source/net/yacy/data/WorkTables.java index d237ac3fd..0c3692393 100644 --- a/source/net/yacy/data/WorkTables.java +++ b/source/net/yacy/data/WorkTables.java @@ -39,8 +39,9 @@ import java.util.Map; import java.util.TreeMap; import net.yacy.cora.date.GenericFormatter; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.order.Base64Order; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.http.HTTPClient; @@ -49,7 +50,6 @@ import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.SpaceExceededException; import net.yacy.data.ymark.YMarkTables; import net.yacy.kelondro.blob.Tables; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.rwi.IndexCell; import net.yacy.search.Switchboard; @@ -323,7 +323,7 @@ public class WorkTables extends Tables { row.put(WorkTables.TABLE_API_COL_DATE_NEXT_EXEC, new Date(d)); } - public void failURLsRegisterMissingWord(IndexCell indexCell, final DigestURI url, HandleSet queryHashes, final String reason) { + public void failURLsRegisterMissingWord(IndexCell indexCell, final DigestURL url, HandleSet queryHashes, final String reason) { // remove words from index if (indexCell != null) { diff --git a/source/net/yacy/data/wiki/WikiBoard.java b/source/net/yacy/data/wiki/WikiBoard.java index 4d2f16e71..37998529a 100644 --- 
a/source/net/yacy/data/wiki/WikiBoard.java +++ b/source/net/yacy/data/wiki/WikiBoard.java @@ -35,8 +35,8 @@ import java.util.Locale; import java.util.Map; import java.util.TimeZone; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.NaturalOrder; import net.yacy.cora.protocol.Domains; diff --git a/source/net/yacy/data/ymark/TablesRowComparator.java b/source/net/yacy/data/ymark/TablesRowComparator.java index 0d51e07ea..5bae5a61d 100644 --- a/source/net/yacy/data/ymark/TablesRowComparator.java +++ b/source/net/yacy/data/ymark/TablesRowComparator.java @@ -2,7 +2,7 @@ package net.yacy.data.ymark; import java.util.Comparator; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.kelondro.blob.Tables; public class TablesRowComparator implements Comparator { diff --git a/source/net/yacy/data/ymark/YMarkAutoTagger.java b/source/net/yacy/data/ymark/YMarkAutoTagger.java index 2d6bd1bed..2b35a08fa 100644 --- a/source/net/yacy/data/ymark/YMarkAutoTagger.java +++ b/source/net/yacy/data/ymark/YMarkAutoTagger.java @@ -10,7 +10,8 @@ import java.util.TreeMap; import java.util.TreeSet; import java.util.concurrent.ArrayBlockingQueue; -import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.util.ConcurrentLog; @@ -21,7 +22,6 @@ import net.yacy.document.LibraryProvider; import net.yacy.document.Parser.Failure; import net.yacy.document.SentenceReader; import net.yacy.document.WordTokenizer; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.word.Word; import net.yacy.repository.LoaderDispatcher; @@ -60,10 +60,10 @@ public class 
YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle } private static Document loadDocument(final String url, final LoaderDispatcher loader, ClientIdentification.Agent agent) throws IOException { - DigestURI uri; + DigestURL uri; Response response; try { - uri = new DigestURI(url); + uri = new DigestURL(url); } catch (final MalformedURLException e) { ConcurrentLog.warn(YMarkTables.BOOKMARKS_LOG, "loadDocument failed due to malformed url: "+url); return null; @@ -161,7 +161,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle } final String clean = YMarkUtil.cleanTagsString(buffer.toString()); if(clean.equals(YMarkEntry.BOOKMARK.TAGS.deflt())) { - return MultiProtocolURI.getFileExtension(document.dc_source().getFileName()); + return MultiProtocolURL.getFileExtension(document.dc_source().getFileName()); } return clean; } finally { diff --git a/source/net/yacy/data/ymark/YMarkCrawlStart.java b/source/net/yacy/data/ymark/YMarkCrawlStart.java index d545e6f6c..1eea74ea9 100644 --- a/source/net/yacy/data/ymark/YMarkCrawlStart.java +++ b/source/net/yacy/data/ymark/YMarkCrawlStart.java @@ -32,7 +32,8 @@ import java.util.HashMap; import java.util.Iterator; import java.util.regex.Pattern; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.crawler.CrawlSwitchboard; @@ -40,7 +41,6 @@ import net.yacy.crawler.data.CrawlProfile; import net.yacy.crawler.retrieval.Request; import net.yacy.data.WorkTables; import net.yacy.kelondro.blob.Tables; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.search.Switchboard; public class YMarkCrawlStart extends HashMap{ @@ -162,7 +162,7 @@ public class YMarkCrawlStart extends HashMap{ protected static String crawlStart( final Switchboard sb, - final DigestURI startURL, + final DigestURL 
startURL, final String urlMustMatch, final String urlMustNotMatch, final int depth, diff --git a/source/net/yacy/data/ymark/YMarkDate.java b/source/net/yacy/data/ymark/YMarkDate.java index 34d8f876a..a1ba23388 100644 --- a/source/net/yacy/data/ymark/YMarkDate.java +++ b/source/net/yacy/data/ymark/YMarkDate.java @@ -31,7 +31,7 @@ import java.text.SimpleDateFormat; import java.util.Date; import net.yacy.cora.date.ISO8601Formatter; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; public class YMarkDate { diff --git a/source/net/yacy/data/ymark/YMarkEntry.java b/source/net/yacy/data/ymark/YMarkEntry.java index 0ed54adc8..3b6162cb2 100644 --- a/source/net/yacy/data/ymark/YMarkEntry.java +++ b/source/net/yacy/data/ymark/YMarkEntry.java @@ -33,11 +33,11 @@ import java.util.HashMap; import java.util.Map; import java.util.TreeMap; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.data.CrawlProfile; import net.yacy.document.content.DCEntry; import net.yacy.kelondro.blob.Tables; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.search.Switchboard; public class YMarkEntry extends TreeMap { @@ -237,7 +237,7 @@ public class YMarkEntry extends TreeMap { } public void crawl(final YMarkCrawlStart.CRAWLSTART type, final boolean medialink, final Switchboard sb) throws MalformedURLException { - final DigestURI url = new DigestURI(this.get(BOOKMARK.URL.key())); + final DigestURL url = new DigestURL(this.get(BOOKMARK.URL.key())); switch(type) { case SINGLE: YMarkCrawlStart.crawlStart(sb, url, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, 0, true, medialink); diff --git a/source/net/yacy/data/ymark/YMarkMetadata.java b/source/net/yacy/data/ymark/YMarkMetadata.java index 5814f5c6e..3bc88dd7b 100644 --- a/source/net/yacy/data/ymark/YMarkMetadata.java +++ b/source/net/yacy/data/ymark/YMarkMetadata.java @@ -31,19 +31,19 @@ import java.net.MalformedURLException; import 
java.util.EnumMap; import net.yacy.cora.date.ISO8601Formatter; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.crawler.retrieval.Response; import net.yacy.document.Document; import net.yacy.document.Parser.Failure; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.repository.LoaderDispatcher; import net.yacy.search.index.Segment; public class YMarkMetadata { - private DigestURI uri; + private DigestURL uri; Document document; Segment indexSegment; @@ -67,13 +67,13 @@ public class YMarkMetadata { AUTOTAG } - public YMarkMetadata(final DigestURI uri) { + public YMarkMetadata(final DigestURL uri) { this.uri = uri; this.document = null; this.indexSegment = null; } - public YMarkMetadata(final DigestURI uri, final Segment indexSegment) { + public YMarkMetadata(final DigestURL uri, final Segment indexSegment) { this.uri = uri; this.document = null; this.indexSegment = indexSegment; @@ -88,7 +88,7 @@ public class YMarkMetadata { public YMarkMetadata(final Document document) { this.document = document; try { - this.uri = new DigestURI(this.document.dc_identifier()); + this.uri = new DigestURL(this.document.dc_identifier()); } catch (final MalformedURLException e) { this.uri = null; } diff --git a/source/net/yacy/data/ymark/YMarkRDF.java b/source/net/yacy/data/ymark/YMarkRDF.java index 5e0796141..95b4371e4 100644 --- a/source/net/yacy/data/ymark/YMarkRDF.java +++ b/source/net/yacy/data/ymark/YMarkRDF.java @@ -6,7 +6,7 @@ import java.util.HashMap; import java.util.Iterator; import java.util.Map; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.lod.vocabulary.AnnoteaA; import net.yacy.cora.lod.vocabulary.AnnoteaB; import net.yacy.cora.lod.vocabulary.DCElements; 
diff --git a/source/net/yacy/data/ymark/YMarkTables.java b/source/net/yacy/data/ymark/YMarkTables.java index 3811191c5..673c6ed7c 100644 --- a/source/net/yacy/data/ymark/YMarkTables.java +++ b/source/net/yacy/data/ymark/YMarkTables.java @@ -42,7 +42,8 @@ import java.util.concurrent.ConcurrentHashMap; import javax.swing.event.ChangeEvent; import javax.swing.event.ChangeListener; -import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.SpaceExceededException; @@ -53,7 +54,6 @@ import net.yacy.kelondro.blob.TableColumnIndexException; import net.yacy.kelondro.blob.Tables; import net.yacy.kelondro.blob.Tables.Row; import net.yacy.kelondro.blob.TablesColumnIndex; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.repository.LoaderDispatcher; public class YMarkTables { @@ -364,10 +364,10 @@ public class YMarkTables { } public void createBookmark(final LoaderDispatcher loader, final String url, final ClientIdentification.Agent agent, final String bmk_user, final boolean autotag, final String tagsString, final String foldersString) throws IOException, Failure { - createBookmark(loader, new DigestURI(url), agent, bmk_user, autotag, tagsString, foldersString); + createBookmark(loader, new DigestURL(url), agent, bmk_user, autotag, tagsString, foldersString); } - public void createBookmark(final LoaderDispatcher loader, final DigestURI url, final ClientIdentification.Agent agent, final String bmk_user, final boolean autotag, final String tagsString, final String foldersString) throws IOException, Failure { + public void createBookmark(final LoaderDispatcher loader, final DigestURL url, final ClientIdentification.Agent agent, final String bmk_user, final boolean autotag, final String tagsString, final String foldersString) throws IOException, Failure { final YMarkEntry bmk_entry 
= new YMarkEntry(false); final YMarkMetadata meta = new YMarkMetadata(url); diff --git a/source/net/yacy/data/ymark/YMarkUtil.java b/source/net/yacy/data/ymark/YMarkUtil.java index 4ce277506..e9a19f7eb 100644 --- a/source/net/yacy/data/ymark/YMarkUtil.java +++ b/source/net/yacy/data/ymark/YMarkUtil.java @@ -30,8 +30,8 @@ import java.net.MalformedURLException; import java.util.HashSet; import java.util.Iterator; -import net.yacy.cora.document.UTF8; -import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.DigestURL; import net.yacy.kelondro.data.word.Word; public class YMarkUtil { @@ -48,7 +48,7 @@ public class YMarkUtil { * @see net.yacy.kelondro.data.meta.DigestURI.DigestURI(String url, byte[] hash).hash() */ public final static byte[] getBookmarkId(String url) throws MalformedURLException { - return (new DigestURI(url)).hash(); + return (new DigestURL(url)).hash(); } /** diff --git a/source/net/yacy/dbtest.java b/source/net/yacy/dbtest.java index f6fb0ce72..de1f68c8b 100644 --- a/source/net/yacy/dbtest.java +++ b/source/net/yacy/dbtest.java @@ -11,7 +11,7 @@ import java.util.Random; import javax.imageio.ImageIO; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.CloneableIterator; import net.yacy.cora.order.NaturalOrder; diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java index 86dde3561..bceda9b8c 100644 --- a/source/net/yacy/document/Condenser.java +++ b/source/net/yacy/document/Condenser.java @@ -39,18 +39,18 @@ import java.util.TreeMap; import org.apache.solr.common.params.MapSolrParams; -import net.yacy.cora.document.ASCII; import net.yacy.cora.document.WordCache; import net.yacy.cora.document.analysis.Classification.ContentDomain; import net.yacy.cora.document.analysis.EnhancedTextProfileSignature; -import net.yacy.cora.document.MultiProtocolURI; 
+import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.federate.solr.Ranking; import net.yacy.cora.language.synonyms.SynonymLibrary; import net.yacy.cora.lod.vocabulary.Tagging; import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.language.Identificator; import net.yacy.document.parser.html.ImageEntry; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReferenceRow; import net.yacy.kelondro.util.Bitfield; @@ -113,7 +113,7 @@ public final class Condenser { // add the URL components to the word list insertTextToWords(new SentenceReader(document.dc_source().toTokens()), 0, WordReferenceRow.flag_app_dc_identifier, this.RESULT_FLAGS, false, meaningLib); - Map.Entry entry; + Map.Entry entry; if (indexText) { createCondensement(document.getTextString(), meaningLib, doAutotagging); // the phrase counter: @@ -165,7 +165,7 @@ public final class Condenser { if (indexMedia) { // add anchor descriptions: here, we also add the url components // audio - Iterator> i = document.getAudiolinks().entrySet().iterator(); + Iterator> i = document.getAudiolinks().entrySet().iterator(); while (i.hasNext()) { entry = i.next(); insertTextToWords(new SentenceReader(entry.getKey().toNormalform(true)), 99, flag_cat_hasaudio, this.RESULT_FLAGS, false, meaningLib); @@ -191,7 +191,7 @@ public final class Condenser { // images final Iterator j = document.getImages().values().iterator(); ImageEntry ientry; - MultiProtocolURI url; + MultiProtocolURL url; while (j.hasNext()) { ientry = j.next(); url = ientry.url(); diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index 6fada35c9..e8e06e51b 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -5,10 +5,6 @@ //first published on http://www.anomic.de //Frankfurt, 
Germany, 2005 // -// $LastChangedDate$ -// $LastChangedRevision$ -// $LastChangedBy$ -// //This program is free software; you can redistribute it and/or modify //it under the terms of the GNU General Public License as published by //the Free Software Foundation; either version 2 of the License, or @@ -44,31 +40,32 @@ import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; +import java.util.LinkedHashMap; import java.util.LinkedHashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; -import java.util.Properties; import java.util.Set; import java.util.TreeSet; import net.yacy.cora.date.ISO8601Formatter; -import net.yacy.cora.document.MultiProtocolURI; -import net.yacy.cora.document.UTF8; import net.yacy.cora.document.analysis.Classification; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.AnchorURL; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.lod.vocabulary.Tagging; import net.yacy.cora.util.ByteBuffer; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.retrieval.Request; import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.ImageEntry; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.FileUtils; public class Document { - private final DigestURI source; // the source url + private final DigestURL source; // the source url private final String mimeType; // mimeType as taken from http header private final String charset; // the charset of the document private final List keywords; // most resources provide a keyword field @@ -78,16 +75,16 @@ public class Document { private final List sections; // if present: more titles/headlines appearing in the document private final List descriptions; // an abstract, if present: short content description private Object text; // the clear text, all that is visible - private final Map 
anchors; // all links embedded as clickeable entities (anchor tags) - private final Map rss; // all embedded rss feeds - private final Map images; // all visible pictures in document + private final Collection anchors; // all links embedded as clickeable entities (anchor tags) + private final LinkedHashMap rss; // all embedded rss feeds + private final LinkedHashMap images; // all visible pictures in document // the anchors and images - Maps are URL-to-EntityDescription mappings. // The EntityDescription appear either as visible text in anchors or as alternative // text in image tags. - private Map audiolinks, videolinks, applinks, hyperlinks; - private Map inboundlinks, outboundlinks; + private LinkedHashMap audiolinks, videolinks, applinks, hyperlinks; + private LinkedHashMap inboundlinks, outboundlinks; private Map emaillinks; - private MultiProtocolURI favicon; + private MultiProtocolURL favicon; private boolean resorted; private final Set languages; private final boolean indexingDenied; @@ -96,7 +93,7 @@ public class Document { private final Map> generic_facets; // a map from vocabulary names to the set of tags for that vocabulary which apply for this document private final Date date; - public Document(final DigestURI location, final String mimeType, final String charset, + public Document(final DigestURL location, final String mimeType, final String charset, final Object parserObject, final Set languages, final String[] keywords, @@ -105,9 +102,9 @@ public class Document { final String[] sections, final List abstrcts, final double lon, final double lat, final Object text, - final Map anchors, - final Map rss, - final Map images, + final Collection anchors, + final LinkedHashMap rss, + final LinkedHashMap images, final boolean indexingDenied, final Date date) { this.source = location; @@ -129,9 +126,9 @@ public class Document { this.lon = 0.0d; this.lat = 0.0d; } - this.anchors = (anchors == null) ? new HashMap(0) : anchors; - this.rss = (rss == null) ? 
new HashMap(0) : rss; - this.images = (images == null) ? new HashMap() : images; + this.anchors = (anchors == null) ? new ArrayList(0) : anchors; + this.rss = (rss == null) ? new LinkedHashMap(0) : rss; + this.images = (images == null) ? new LinkedHashMap() : images; this.publisher = publisher; this.hyperlinks = null; this.audiolinks = null; @@ -308,7 +305,7 @@ dc_rights return this.source.toNormalform(true); } - public DigestURI dc_source() { + public DigestURL dc_source() { return this.source; } @@ -405,13 +402,13 @@ dc_rights return this.keywords; } - public Map getAnchors() { + public Collection getAnchors() { // returns all links embedded as anchors (clickeable entities) // this is a url(String)/text(String) map return this.anchors; } - public Map getRSS() { + public Map getRSS() { // returns all links embedded as anchors (clickeable entities) // this is a url(String)/text(String) map return this.rss; @@ -420,30 +417,30 @@ dc_rights // the next three methods provide a calculated view on the getAnchors/getImages: - public Map getHyperlinks() { + public Map getHyperlinks() { // this is a subset of the getAnchor-set: only links to other hyperrefs if (!this.resorted) resortLinks(); return this.hyperlinks; } - public Map getAudiolinks() { + public Map getAudiolinks() { if (!this.resorted) resortLinks(); return this.audiolinks; } - public Map getVideolinks() { + public Map getVideolinks() { if (!this.resorted) resortLinks(); return this.videolinks; } - public Map getImages() { + public Map getImages() { // returns all links enbedded as pictures (visible in document) // this resturns a htmlFilterImageEntry collection if (!this.resorted) resortLinks(); return this.images; } - public Map getApplinks() { + public Map getApplinks() { if (!this.resorted) resortLinks(); return this.applinks; } @@ -471,27 +468,25 @@ dc_rights synchronized (this) { if (this.resorted) return; // extract hyperlinks, medialinks and emaillinks from anchorlinks - DigestURI url; String u; int 
extpos, qpos; String ext = null; final String thishost = this.source.getHost(); - this.inboundlinks = new HashMap(); - this.outboundlinks = new HashMap(); - this.hyperlinks = new HashMap(); - this.videolinks = new HashMap(); - this.audiolinks = new HashMap(); - this.applinks = new HashMap(); - this.emaillinks = new HashMap(); - final Map collectedImages = new HashMap(); // this is a set that is collected now and joined later to the imagelinks - for (final Map.Entry entry: collectedImages.entrySet()) { + this.inboundlinks = new LinkedHashMap(); + this.outboundlinks = new LinkedHashMap(); + this.hyperlinks = new LinkedHashMap(); + this.videolinks = new LinkedHashMap(); + this.audiolinks = new LinkedHashMap(); + this.applinks = new LinkedHashMap(); + this.emaillinks = new LinkedHashMap(); + final Map collectedImages = new HashMap(); // this is a set that is collected now and joined later to the imagelinks + for (final Map.Entry entry: collectedImages.entrySet()) { if (entry.getKey().getHost().equals(thishost)) this.inboundlinks.put(entry.getKey(), "image"); else this.outboundlinks.put(entry.getKey(), "image"); } - for (final Map.Entry entry: this.anchors.entrySet()) { - url = entry.getKey(); + for (final DigestURL url: this.anchors) { if (url == null) continue; - final boolean noindex = entry.getValue().getProperty("rel", "").toLowerCase().indexOf("noindex",0) >= 0; - final boolean nofollow = entry.getValue().getProperty("rel", "").toLowerCase().indexOf("nofollow",0) >= 0; + final boolean noindex = url.getProperties().getProperty("rel", "").toLowerCase().indexOf("noindex",0) >= 0; + final boolean nofollow = url.getProperties().getProperty("rel", "").toLowerCase().indexOf("nofollow",0) >= 0; if ((thishost == null && url.getHost() == null) || ((thishost != null && url.getHost() != null) && (url.getHost().endsWith(thishost) || @@ -501,7 +496,7 @@ dc_rights this.outboundlinks.put(url, "anchor" + (noindex ? " noindex" : "") + (nofollow ? 
" nofollow" : "")); } u = url.toNormalform(true); - final String name = entry.getValue().getProperty("name", ""); + final String name = url.getProperties().getProperty("name", ""); if (u.startsWith("mailto:")) { this.emaillinks.put(u.substring(7), name); } else { @@ -515,7 +510,7 @@ dc_rights if (Classification.isMediaExtension(ext)) { // this is not a normal anchor, its a media link if (Classification.isImageExtension(ext)) { - ContentScraper.addImage(collectedImages, new ImageEntry(url, name, -1, -1, -1)); + collectedImages.put(url, new ImageEntry(url, name, -1, -1, -1)); } else if (Classification.isAudioExtension(ext)) this.audiolinks.put(url, name); else if (Classification.isVideoExtension(ext)) this.videolinks.put(url, name); @@ -528,7 +523,7 @@ dc_rights } // add image links that we collected from the anchors to the image map - ContentScraper.addAllImages(this.images, collectedImages); + this.images.putAll(collectedImages); // expand the hyperlinks: // we add artificial hyperlinks to the hyperlink set @@ -550,21 +545,21 @@ dc_rights } } - public static Map allSubpaths(final Collection links) { + public static Map allSubpaths(final Collection links) { // links is either a Set of Strings (urls) or a Set of // htmlFilterImageEntries final Set h = new HashSet(); Iterator i = links.iterator(); Object o; - MultiProtocolURI url; + MultiProtocolURL url; String u; int pos; int l; while (i.hasNext()) try { o = i.next(); - if (o instanceof MultiProtocolURI) url = (MultiProtocolURI) o; - else if (o instanceof String) url = new MultiProtocolURI((String) o); + if (o instanceof MultiProtocolURL) url = (MultiProtocolURL) o; + else if (o instanceof String) url = new MultiProtocolURL((String) o); else if (o instanceof ImageEntry) url = ((ImageEntry) o).url(); else { assert false; @@ -585,11 +580,11 @@ dc_rights } catch (final MalformedURLException e) { } // now convert the strings to yacyURLs i = h.iterator(); - final Map v = new HashMap(); + final Map v = new HashMap(); while 
(i.hasNext()) { u = (String) i.next(); try { - url = new MultiProtocolURI(u); + url = new MultiProtocolURL(u); v.put(url, "sub"); } catch (final MalformedURLException e) { } @@ -597,23 +592,23 @@ dc_rights return v; } - private static Map allReflinks(final Collection links) { + private static Map allReflinks(final Collection links) { // links is either a Set of Strings (with urls) or // htmlFilterImageEntries // we find all links that are part of a reference inside a url - final Map v = new HashMap(); + final Map v = new HashMap(); final Iterator i = links.iterator(); Object o; - DigestURI url = null; + DigestURL url = null; String u; int pos; loop: while (i.hasNext()) try { o = i.next(); - if (o instanceof DigestURI) - url = (DigestURI) o; + if (o instanceof DigestURL) + url = (DigestURL) o; else if (o instanceof String) - url = new DigestURI((String) o); + url = new DigestURL((String) o); else if (o instanceof ImageEntry) url = ((ImageEntry) o).url(); else { @@ -627,7 +622,7 @@ dc_rights u = u.substring(pos); while ((pos = u.toLowerCase().indexOf("http://", 7)) > 0) u = u.substring(pos); - url = new DigestURI(u); + url = new DigestURL(u); if (!(v.containsKey(url))) v.put(url, "ref"); continue loop; @@ -637,7 +632,7 @@ dc_rights u = "http:/" + u.substring(pos); while ((pos = u.toLowerCase().indexOf("/www.", 7)) > 0) u = "http:/" + u.substring(pos); - url = new DigestURI(u); + url = new DigestURL(u); if (!(v.containsKey(url))) v.put(url, "ref"); continue loop; @@ -659,23 +654,23 @@ dc_rights } FileUtils.copy(doc.getTextStream(), (ByteArrayOutputStream) this.text); - this.anchors.putAll(doc.getAnchors()); + this.anchors.addAll(doc.getAnchors()); this.rss.putAll(doc.getRSS()); - ContentScraper.addAllImages(this.images, doc.getImages()); + this.images.putAll(doc.getImages()); } } /** * @return the {@link URL} to the favicon that belongs to the document */ - public MultiProtocolURI getFavicon() { + public MultiProtocolURL getFavicon() { return this.favicon; } /** * 
@param faviconURL the {@link URL} to the favicon that belongs to the document */ - public void setFavicon(final MultiProtocolURI faviconURL) { + public void setFavicon(final MultiProtocolURL faviconURL) { this.favicon = faviconURL; } @@ -699,14 +694,14 @@ dc_rights return c; } - public Set inboundLinks() { + public LinkedHashMap inboundLinks() { if (this.inboundlinks == null) resortLinks(); - return (this.inboundlinks == null) ? null : this.inboundlinks.keySet(); + return (this.inboundlinks == null) ? null : this.inboundlinks; } - public Set outboundLinks() { + public LinkedHashMap outboundLinks() { if (this.outboundlinks == null) resortLinks(); - return (this.outboundlinks == null) ? null : this.outboundlinks.keySet(); + return (this.outboundlinks == null) ? null : this.outboundlinks; } public boolean indexingDenied() { @@ -774,7 +769,7 @@ dc_rights * @param docs * @return */ - public static Document mergeDocuments(final DigestURI location, final String globalMime, final Document[] docs) { + public static Document mergeDocuments(final DigestURL location, final String globalMime, final Document[] docs) { if (docs == null || docs.length == 0) return null; if (docs.length == 1) return docs[0]; @@ -786,9 +781,9 @@ dc_rights final List descriptions = new ArrayList(); final Collection titles = new LinkedHashSet(); final Collection sectionTitles = new LinkedHashSet(); - final Map anchors = new HashMap(); - final Map rss = new HashMap(); - final Map images = new HashMap(); + final List anchors = new ArrayList(); + final LinkedHashMap rss = new LinkedHashMap(); + final LinkedHashMap images = new LinkedHashMap(); double lon = 0.0d, lat = 0.0d; Date date = new Date(); @@ -825,9 +820,9 @@ dc_rights ConcurrentLog.logException(e); } } - anchors.putAll(doc.getAnchors()); + anchors.addAll(doc.getAnchors()); rss.putAll(doc.getRSS()); - ContentScraper.addAllImages(images, doc.getImages()); + images.putAll(doc.getImages()); if (doc.lon() != 0.0 && doc.lat() != 0.0) { lon = 
doc.lon(); lat = doc.lat(); } if (doc.date.before(date)) date = doc.date; } @@ -865,24 +860,24 @@ dc_rights date); } - public static Map getHyperlinks(final Document[] documents) { - final Map result = new HashMap(); + public static Map getHyperlinks(final Document[] documents) { + final Map result = new HashMap(); for (final Document d: documents) { result.putAll(d.getHyperlinks()); final Object parser = d.getParserObject(); if (parser instanceof ContentScraper) { final ContentScraper html = (ContentScraper) parser; String refresh = html.getRefreshPath(); - if (refresh != null && refresh.length() > 0) try {result.put(new DigestURI(refresh), "refresh");} catch (final MalformedURLException e) {} - DigestURI canonical = html.getCanonical(); + if (refresh != null && refresh.length() > 0) try {result.put(new DigestURL(refresh), "refresh");} catch (final MalformedURLException e) {} + DigestURL canonical = html.getCanonical(); if (canonical != null) result.put(canonical, "canonical"); } } return result; } - public static Map getImagelinks(final Document[] documents) { - final Map result = new HashMap(); + public static Map getImagelinks(final Document[] documents) { + final Map result = new HashMap(); for (final Document d: documents) { for (final ImageEntry imageReference : d.getImages().values()) { // construct a image name which contains the document title to enhance the search process for images @@ -892,30 +887,30 @@ dc_rights return result; } - public static Map getAudiolinks(final Document[] documents) { - final Map result = new HashMap(); + public static Map getAudiolinks(final Document[] documents) { + final Map result = new HashMap(); for (final Document d: documents) { - for (Map.Entry e: d.audiolinks.entrySet()) { + for (Map.Entry e: d.audiolinks.entrySet()) { result.put(e.getKey(), description(d, e.getValue())); } } return result; } - public static Map getVideolinks(final Document[] documents) { - final Map result = new HashMap(); + public static Map 
getVideolinks(final Document[] documents) { + final Map result = new HashMap(); for (final Document d: documents) { - for (Map.Entry e: d.videolinks.entrySet()) { + for (Map.Entry e: d.videolinks.entrySet()) { result.put(e.getKey(), description(d, e.getValue())); } } return result; } - public static Map getApplinks(final Document[] documents) { - final Map result = new HashMap(); + public static Map getApplinks(final Document[] documents) { + final Map result = new HashMap(); for (final Document d: documents) { - for (Map.Entry e: d.applinks.entrySet()) { + for (Map.Entry e: d.applinks.entrySet()) { result.put(e.getKey(), description(d, e.getValue())); } } diff --git a/source/net/yacy/document/LibraryProvider.java b/source/net/yacy/document/LibraryProvider.java index 592e1ac0b..70dfecfdc 100644 --- a/source/net/yacy/document/LibraryProvider.java +++ b/source/net/yacy/document/LibraryProvider.java @@ -42,8 +42,8 @@ import java.util.TreeSet; import java.util.zip.ZipException; import java.util.zip.ZipFile; -import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.WordCache; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.geo.GeonamesLocation; import net.yacy.cora.geo.OpenGeoDBLocation; import net.yacy.cora.geo.OverarchingLocation; @@ -90,7 +90,7 @@ public class LibraryProvider { private Dictionary(final String nickname, final String url) { try { - this.filename = (new MultiProtocolURI(url)).getFileName(); + this.filename = (new MultiProtocolURL(url)).getFileName(); } catch (final MalformedURLException e ) { assert false; } diff --git a/source/net/yacy/document/Parser.java b/source/net/yacy/document/Parser.java index 74a80d1b6..f930dcfd5 100644 --- a/source/net/yacy/document/Parser.java +++ b/source/net/yacy/document/Parser.java @@ -26,8 +26,8 @@ package net.yacy.document; import java.io.InputStream; import java.util.Set; -import net.yacy.cora.document.MultiProtocolURI; -import net.yacy.kelondro.data.meta.DigestURI; +import 
net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.MultiProtocolURL; public interface Parser { @@ -54,7 +54,7 @@ public interface Parser { * @throws InterruptedException */ public Document[] parse( - DigestURI url, + DigestURL url, String mimeType, String charset, InputStream source @@ -91,22 +91,22 @@ public interface Parser { public class Failure extends Exception { private static final long serialVersionUID = 2278214953869122883L; - private MultiProtocolURI url = null; + private MultiProtocolURL url = null; public Failure() { super(); } - public Failure(final String message, final MultiProtocolURI url) { + public Failure(final String message, final MultiProtocolURL url) { super(message + "; url = " + url.toNormalform(true)); this.url = url; } - public Failure(final String message, final MultiProtocolURI url, Throwable e) { + public Failure(final String message, final MultiProtocolURL url, Throwable e) { super(message + "; url = " + url.toNormalform(true), e); this.url = url; } - public MultiProtocolURI getURL() { + public MultiProtocolURL getURL() { return this.url; } } diff --git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java index 23fd860c4..52ebf8f7b 100644 --- a/source/net/yacy/document/TextParser.java +++ b/source/net/yacy/document/TextParser.java @@ -33,8 +33,9 @@ import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; -import net.yacy.cora.document.MultiProtocolURI; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.document.parser.audioTagParser; import net.yacy.document.parser.bzipParser; import net.yacy.document.parser.csvParser; @@ -63,7 +64,6 @@ import net.yacy.document.parser.zipParser; import net.yacy.document.parser.augment.AugmentParser; import net.yacy.document.parser.images.genericImageParser; import 
net.yacy.document.parser.rdfa.impl.RDFaParser; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.MemoryControl; import net.yacy.search.Switchboard; @@ -156,7 +156,7 @@ public final class TextParser { } public static Document[] parseSource( - final DigestURI location, + final DigestURL location, final String mimeType, final String charset, final File sourceFile @@ -186,7 +186,7 @@ public final class TextParser { } public static Document[] parseSource( - final DigestURI location, + final DigestURL location, String mimeType, final String charset, final byte[] content @@ -197,7 +197,7 @@ public final class TextParser { try { idioms = parsers(location, mimeType); } catch (final Parser.Failure e) { - final String errorMsg = "Parser Failure for extension '" + MultiProtocolURI.getFileExtension(location.getFileName()) + "' or mimetype '" + mimeType + "': " + e.getMessage(); + final String errorMsg = "Parser Failure for extension '" + MultiProtocolURL.getFileExtension(location.getFileName()) + "' or mimetype '" + mimeType + "': " + e.getMessage(); AbstractParser.log.warn(errorMsg); throw new Parser.Failure(errorMsg, location); } @@ -209,7 +209,7 @@ public final class TextParser { } public static Document[] parseSource( - final DigestURI location, + final DigestURL location, String mimeType, final String charset, final long contentLength, @@ -221,7 +221,7 @@ public final class TextParser { try { idioms = parsers(location, mimeType); } catch (final Parser.Failure e) { - final String errorMsg = "Parser Failure for extension '" + MultiProtocolURI.getFileExtension(location.getFileName()) + "' or mimetype '" + mimeType + "': " + e.getMessage(); + final String errorMsg = "Parser Failure for extension '" + MultiProtocolURL.getFileExtension(location.getFileName()) + "' or mimetype '" + mimeType + "': " + e.getMessage(); AbstractParser.log.warn(errorMsg); throw new Parser.Failure(errorMsg, location); } @@ -248,14 
+248,14 @@ public final class TextParser { } private static Document[] parseSource( - final DigestURI location, + final DigestURL location, final String mimeType, final Parser parser, final String charset, final InputStream sourceStream ) throws Parser.Failure { if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing '" + location + "' from stream"); - final String fileExt = MultiProtocolURI.getFileExtension(location.getFileName()); + final String fileExt = MultiProtocolURL.getFileExtension(location.getFileName()); final String documentCharset = htmlParser.patchCharsetEncoding(charset); assert parser != null; @@ -269,13 +269,13 @@ public final class TextParser { } private static Document[] parseSource( - final DigestURI location, + final DigestURL location, final String mimeType, final Set parsers, final String charset, final byte[] sourceArray ) throws Parser.Failure { - final String fileExt = MultiProtocolURI.getFileExtension(location.getFileName()); + final String fileExt = MultiProtocolURL.getFileExtension(location.getFileName()); if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "' from byte[]"); final String documentCharset = htmlParser.patchCharsetEncoding(charset); assert !parsers.isEmpty(); @@ -337,7 +337,7 @@ public final class TextParser { * @param mimeType * @return returns null if the content is supported. If the content is not supported, return a error string. */ - public static String supports(final MultiProtocolURI url, final String mimeType) { + public static String supports(final MultiProtocolURL url, final String mimeType) { try { // try to get a parser. If this works, we don't need the parser itself, we just return null to show that everything is ok. 
final Set idioms = parsers(url, mimeType); @@ -361,11 +361,11 @@ public final class TextParser { * @return a list of Idiom parsers that may be appropriate for the given criteria * @throws Parser.Failure */ - private static Set parsers(final MultiProtocolURI url, String mimeType1) throws Parser.Failure { + private static Set parsers(final MultiProtocolURL url, String mimeType1) throws Parser.Failure { final Set idioms = new LinkedHashSet(2); // LinkedSet to maintain order (genericParser should be last) // check extension - String ext = MultiProtocolURI.getFileExtension(url.getFileName()); + String ext = MultiProtocolURL.getFileExtension(url.getFileName()); Set idiom; if (ext != null && ext.length() > 0) { ext = ext.toLowerCase(); @@ -430,12 +430,12 @@ public final class TextParser { * @param extention * @return an error if the extension is not supported, null otherwise */ - public static String supportsExtension(final MultiProtocolURI url) { - return supportsExtension(MultiProtocolURI.getFileExtension(url.getFileName()).toLowerCase()); + public static String supportsExtension(final MultiProtocolURL url) { + return supportsExtension(MultiProtocolURL.getFileExtension(url.getFileName()).toLowerCase()); } - public static String mimeOf(final MultiProtocolURI url) { - return mimeOf(MultiProtocolURI.getFileExtension(url.getFileName())); + public static String mimeOf(final MultiProtocolURL url) { + return mimeOf(MultiProtocolURL.getFileExtension(url.getFileName())); } public static String mimeOf(final String ext) { diff --git a/source/net/yacy/document/content/DCEntry.java b/source/net/yacy/document/content/DCEntry.java index adb822bb2..42d57d33f 100644 --- a/source/net/yacy/document/content/DCEntry.java +++ b/source/net/yacy/document/content/DCEntry.java @@ -40,9 +40,9 @@ import java.util.TreeMap; import org.apache.solr.common.params.MultiMapSolrParams; import net.yacy.cora.date.ISO8601Formatter; +import net.yacy.cora.document.id.DigestURL; import 
net.yacy.cora.util.ConcurrentLog; import net.yacy.document.Document; -import net.yacy.kelondro.data.meta.DigestURI; public class DCEntry extends MultiMapSolrParams { @@ -61,7 +61,7 @@ public class DCEntry extends MultiMapSolrParams { } public DCEntry( - DigestURI url, + DigestURL url, Date date, String title, String author, @@ -115,7 +115,7 @@ public class DCEntry extends MultiMapSolrParams { } } - public DigestURI getIdentifier(boolean useRelationAsAlternative) { + public DigestURL getIdentifier(boolean useRelationAsAlternative) { String u = this.get("url"); if (u == null) u = this.get("dc:identifier"); if (u == null) return useRelationAsAlternative ? getRelation() : null; @@ -125,10 +125,10 @@ public class DCEntry extends MultiMapSolrParams { u = bestU(urls); } try { - return new DigestURI(u); + return new DigestURL(u); } catch (final MalformedURLException e) { if (useRelationAsAlternative) { - DigestURI relation = this.getRelation(); + DigestURL relation = this.getRelation(); if (relation != null) return relation; ConcurrentLog.warn("DCEntry", "getIdentifier: url is bad, relation also: " + e.getMessage()); } @@ -137,7 +137,7 @@ public class DCEntry extends MultiMapSolrParams { } } - public DigestURI getRelation() { + public DigestURL getRelation() { String u = this.get("dc:relation"); if (u == null) return null; String[] urls = u.split(";"); @@ -146,7 +146,7 @@ public class DCEntry extends MultiMapSolrParams { u = bestU(urls); } try { - return new DigestURI(u); + return new DigestURL(u); } catch (final MalformedURLException e) { ConcurrentLog.warn("DCEntry", "getRelation: url is bad: " + e.getMessage()); return null; diff --git a/source/net/yacy/document/content/dao/ImportDump.java b/source/net/yacy/document/content/dao/ImportDump.java index ff5a5d58b..1ae468a61 100644 --- a/source/net/yacy/document/content/dao/ImportDump.java +++ b/source/net/yacy/document/content/dao/ImportDump.java @@ -30,7 +30,7 @@ import java.io.IOException; import java.sql.SQLException; 
import java.sql.Statement; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.util.ConcurrentLog; import net.yacy.kelondro.util.FileUtils; diff --git a/source/net/yacy/document/content/dao/PhpBB3Dao.java b/source/net/yacy/document/content/dao/PhpBB3Dao.java index 412b13ba8..d443ceac6 100644 --- a/source/net/yacy/document/content/dao/PhpBB3Dao.java +++ b/source/net/yacy/document/content/dao/PhpBB3Dao.java @@ -39,10 +39,10 @@ import java.util.HashMap; import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.BlockingQueue; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.content.DCEntry; import net.yacy.document.content.SurrogateReader; -import net.yacy.kelondro.data.meta.DigestURI; public class PhpBB3Dao implements Dao { @@ -208,9 +208,9 @@ public class PhpBB3Dao implements Dao { } protected DCEntry parseResultSet(ResultSet rs) throws SQLException, MalformedURLException { - DigestURI url; + DigestURL url; int item = rs.getInt("post_id"); - url = new DigestURI(this.urlstub + "/viewtopic.php?t=" + item); + url = new DigestURL(this.urlstub + "/viewtopic.php?t=" + item); String subject = rs.getString("post_subject"); String text = xmlCleaner(rs.getString("post_text")); String user = getUser(rs.getInt("poster_id")); @@ -271,7 +271,7 @@ public class PhpBB3Dao implements Dao { ) { try { // generate output file name and attributes - String targethost = new DigestURI(this.urlstub).getHost(); + String targethost = new DigestURL(this.urlstub).getHost(); int fc = 0; File outputfiletmp = null, outputfile = null; diff --git a/source/net/yacy/document/importer/MediawikiImporter.java b/source/net/yacy/document/importer/MediawikiImporter.java index 89ce40307..b6fa2f445 100644 --- a/source/net/yacy/document/importer/MediawikiImporter.java +++ b/source/net/yacy/document/importer/MediawikiImporter.java @@ -50,7 +50,8 @@ import 
java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import java.util.zip.GZIPInputStream; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.util.ByteBuffer; import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.NumberTools; @@ -60,7 +61,6 @@ import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.TextParser; import net.yacy.document.content.SurrogateReader; -import net.yacy.kelondro.data.meta.DigestURI; import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; @@ -193,7 +193,7 @@ public class MediawikiImporter extends Thread implements Importer { q = this.urlStub.lastIndexOf('/'); if (q > 0) this.urlStub = this.urlStub.substring(0, q + 1); } - final DigestURI uri = new DigestURI(this.urlStub); + final DigestURL uri = new DigestURL(this.urlStub); this.hostport = uri.getHost(); if (uri.getPort() != 80) this.hostport += ":" + uri.getPort(); continue; @@ -501,7 +501,7 @@ public class MediawikiImporter extends Thread implements Importer { public class wikiparserrecord { public String title; String source, html, hostport, urlStub; - DigestURI url; + DigestURL url; Document document; public wikiparserrecord(final String hostport, final String urlStub, final String title, final StringBuilder sb) { this.title = title; @@ -520,7 +520,7 @@ public class MediawikiImporter extends Thread implements Importer { } public void genDocument() throws Parser.Failure { try { - this.url = new DigestURI(this.urlStub + this.title); + this.url = new DigestURL(this.urlStub + this.title); final Document[] parsed = TextParser.parseSource(this.url, "text/html", "UTF-8", UTF8.getBytes(this.html)); this.document = Document.mergeDocuments(this.url, "text/html", parsed); // the wiki parser is not able to find the proper title in the source text, so it must be set here diff --git 
a/source/net/yacy/document/importer/OAIListFriendsLoader.java b/source/net/yacy/document/importer/OAIListFriendsLoader.java index 13ad764bc..d492f9065 100644 --- a/source/net/yacy/document/importer/OAIListFriendsLoader.java +++ b/source/net/yacy/document/importer/OAIListFriendsLoader.java @@ -39,12 +39,12 @@ import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.retrieval.Response; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.FileUtils; import net.yacy.repository.LoaderDispatcher; @@ -63,7 +63,7 @@ public class OAIListFriendsLoader implements Serializable { listFriends.putAll(moreFriends); if (loader != null) for (final Map.Entry oaiFriend: listFriends.entrySet()) { try { - loader.loadIfNotExistBackground(new DigestURI(oaiFriend.getKey()), oaiFriend.getValue(), Integer.MAX_VALUE, null, agent); + loader.loadIfNotExistBackground(new DigestURL(oaiFriend.getKey()), oaiFriend.getValue(), Integer.MAX_VALUE, null, agent); } catch (final MalformedURLException e) { } } @@ -88,7 +88,7 @@ public class OAIListFriendsLoader implements Serializable { Map m; for (final Map.Entry oaiFriend: listFriends.entrySet()) try { if (!oaiFriend.getValue().exists()) { - final Response response = loader == null ? null : loader.load(loader.request(new DigestURI(oaiFriend.getKey()), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, agent); + final Response response = loader == null ? 
null : loader.load(loader.request(new DigestURL(oaiFriend.getKey()), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, agent); if (response != null) FileUtils.copy(response.getContent(), oaiFriend.getValue()); } diff --git a/source/net/yacy/document/importer/OAIPMHImporter.java b/source/net/yacy/document/importer/OAIPMHImporter.java index ae55b51ac..222ea279e 100644 --- a/source/net/yacy/document/importer/OAIPMHImporter.java +++ b/source/net/yacy/document/importer/OAIPMHImporter.java @@ -33,9 +33,9 @@ import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import net.yacy.cora.date.GenericFormatter; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.util.ConcurrentLog; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.repository.LoaderDispatcher; import net.yacy.search.Switchboard; @@ -53,7 +53,7 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable finishedJobs = new ConcurrentHashMap(); private final LoaderDispatcher loader; - private DigestURI source; + private DigestURL source; private int recordsCount, chunkCount, completeListSize; private final long startTime; private long finishTime; @@ -62,7 +62,7 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable { int recordCounter; - private final DigestURI source; + private final DigestURL source; - public ResumptionToken(final DigestURI source, final byte[] b) throws IOException { + public ResumptionToken(final DigestURL source, final byte[] b) throws IOException { super((Collator) insensitiveCollator.clone()); this.source = source; this.recordCounter = 0; @@ -105,7 +105,7 @@ public class ResumptionToken extends TreeMap { * @param url * @return a string containing the url up to and including the '?' 
*/ - public static String truncatedURL(final DigestURI url) { + public static String truncatedURL(final DigestURL url) { String u = url.toNormalform(true); final int i = u.indexOf('?'); if (i > 0) u = u.substring(0, i + 1); @@ -126,7 +126,7 @@ public class ResumptionToken extends TreeMap { * @return * @throws IOException in case that no follow-up url can be generated; i.e. if the expiration date is exceeded */ - public DigestURI resumptionURL() throws IOException { + public DigestURL resumptionURL() throws IOException { // decide which kind of encoding strategy was used to get a resumptionToken: final String token = getToken(); @@ -136,7 +136,7 @@ public class ResumptionToken extends TreeMap { // encoded state if (token.indexOf("from=",0) >= 0) { - return new DigestURI(url + "verb=ListRecords&" + token); + return new DigestURL(url + "verb=ListRecords&" + token); } // cached result set @@ -147,7 +147,7 @@ public class ResumptionToken extends TreeMap { // the resumption token is still fresh } final String u = url + "verb=ListRecords&resumptionToken=" + escape(token); - return new DigestURI(u); + return new DigestURL(u); } public static StringBuilder escape(final String s) { diff --git a/source/net/yacy/document/parser/audioTagParser.java b/source/net/yacy/document/parser/audioTagParser.java index 82044c2cc..e405996d0 100644 --- a/source/net/yacy/document/parser/audioTagParser.java +++ b/source/net/yacy/document/parser/audioTagParser.java @@ -35,12 +35,12 @@ import java.util.HashSet; import java.util.List; import java.util.Set; -import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; -import net.yacy.kelondro.data.meta.DigestURI; import org.jaudiotagger.audio.AudioFile; import org.jaudiotagger.audio.AudioFileIO; @@ -69,13 +69,13 @@ 
public class audioTagParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final DigestURI location, final String mimeType, + public Document[] parse(final DigestURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { String filename = location.getFileName(); - final String fileext = '.' + MultiProtocolURI.getFileExtension(filename); - filename = filename.isEmpty() ? location.toTokens() : MultiProtocolURI.unescape(filename); + final String fileext = '.' + MultiProtocolURL.getFileExtension(filename); + filename = filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename); String mime = mimeType; // fix mimeType diff --git a/source/net/yacy/document/parser/augment/AugmentParser.java b/source/net/yacy/document/parser/augment/AugmentParser.java index 0d4bff25f..803169617 100644 --- a/source/net/yacy/document/parser/augment/AugmentParser.java +++ b/source/net/yacy/document/parser/augment/AugmentParser.java @@ -6,13 +6,13 @@ import java.util.HashSet; import java.util.Iterator; import java.util.Set; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.util.ConcurrentLog; import net.yacy.data.ymark.YMarkUtil; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.parser.rdfa.impl.RDFaParser; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.search.Switchboard; @@ -35,7 +35,7 @@ public class AugmentParser extends AbstractParser implements Parser { } @Override - public Document[] parse(DigestURI url, String mimeType, String charset, InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse(DigestURL url, String mimeType, String charset, InputStream source) throws Parser.Failure, InterruptedException { Document[] htmlDocs = this.rdfaParser.parse(url, mimeType, charset, source); @@ -58,7 +58,7 @@ public class 
AugmentParser extends AbstractParser implements Parser { } } */ - private void parseAndAugment(Document origDoc, DigestURI url, @SuppressWarnings("unused") String mimeType, @SuppressWarnings("unused") String charset) { + private void parseAndAugment(Document origDoc, DigestURL url, @SuppressWarnings("unused") String mimeType, @SuppressWarnings("unused") String charset) { Iterator it; try { diff --git a/source/net/yacy/document/parser/bzipParser.java b/source/net/yacy/document/parser/bzipParser.java index 3f33f974f..4e2f0ab7b 100644 --- a/source/net/yacy/document/parser/bzipParser.java +++ b/source/net/yacy/document/parser/bzipParser.java @@ -31,11 +31,11 @@ import java.io.File; import java.io.FileOutputStream; import java.io.InputStream; +import net.yacy.cora.document.id.DigestURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.TextParser; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.FileUtils; import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; @@ -56,7 +56,7 @@ public class bzipParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final DigestURI location, final String mimeType, + public Document[] parse(final DigestURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { diff --git a/source/net/yacy/document/parser/csvParser.java b/source/net/yacy/document/parser/csvParser.java index 6d50cbc93..b690485d9 100644 --- a/source/net/yacy/document/parser/csvParser.java +++ b/source/net/yacy/document/parser/csvParser.java @@ -33,10 +33,10 @@ import java.util.ArrayList; import java.util.Date; import java.util.List; +import net.yacy.cora.document.id.DigestURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; -import net.yacy.kelondro.data.meta.DigestURI; /** * 
a parser for comma-separated values @@ -51,7 +51,7 @@ public class csvParser extends AbstractParser implements Parser { } @Override - public Document[] parse(DigestURI location, String mimeType, String charset, InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse(DigestURL location, String mimeType, String charset, InputStream source) throws Parser.Failure, InterruptedException { // construct a document using all cells of the document // the first row is used as headline // all lines are artificially terminated by a '.' to separate them as sentence for the condenser. diff --git a/source/net/yacy/document/parser/docParser.java b/source/net/yacy/document/parser/docParser.java index 3a402a1c0..297adf3f2 100644 --- a/source/net/yacy/document/parser/docParser.java +++ b/source/net/yacy/document/parser/docParser.java @@ -30,10 +30,10 @@ package net.yacy.document.parser; import java.io.InputStream; import java.util.Date; +import net.yacy.cora.document.id.DigestURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; -import net.yacy.kelondro.data.meta.DigestURI; import org.apache.poi.hwpf.extractor.WordExtractor; @@ -54,7 +54,7 @@ public class docParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final DigestURI location, final String mimeType, + public Document[] parse(final DigestURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { diff --git a/source/net/yacy/document/parser/dwgParser.java b/source/net/yacy/document/parser/dwgParser.java index 6289b1a57..7dff5137e 100644 --- a/source/net/yacy/document/parser/dwgParser.java +++ b/source/net/yacy/document/parser/dwgParser.java @@ -25,10 +25,10 @@ package net.yacy.document.parser; import java.io.InputStream; +import net.yacy.cora.document.id.DigestURL; import net.yacy.document.AbstractParser; import 
net.yacy.document.Document; import net.yacy.document.Parser; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.MemoryControl; import org.apache.poi.util.StringUtil; @@ -60,7 +60,7 @@ public class dwgParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse(final DigestURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { // check memory for parser if (!MemoryControl.request(200 * 1024 * 1024, true)) diff --git a/source/net/yacy/document/parser/genericParser.java b/source/net/yacy/document/parser/genericParser.java index 8fa1d85ff..e285f0624 100644 --- a/source/net/yacy/document/parser/genericParser.java +++ b/source/net/yacy/document/parser/genericParser.java @@ -27,11 +27,11 @@ package net.yacy.document.parser; import java.io.InputStream; import java.util.Date; -import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; -import net.yacy.kelondro.data.meta.DigestURI; /** * this parser can parse just anything because it uses only the uri/file/path information @@ -45,7 +45,7 @@ public class genericParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final DigestURI location, final String mimeType, + public Document[] parse(final DigestURL location, final String mimeType, final String charset, final InputStream source1) throws Parser.Failure, InterruptedException { String filename = location.getFileName(); @@ -56,7 +56,7 @@ public class genericParser extends AbstractParser implements Parser { this, null, null, - singleList(filename.isEmpty() ? 
location.toTokens() : MultiProtocolURI.unescape(filename)), // title + singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title "", // author location.getHost(), null, diff --git a/source/net/yacy/document/parser/gzipParser.java b/source/net/yacy/document/parser/gzipParser.java index 287ea5108..72d9c952d 100644 --- a/source/net/yacy/document/parser/gzipParser.java +++ b/source/net/yacy/document/parser/gzipParser.java @@ -32,11 +32,11 @@ import java.io.FileOutputStream; import java.io.InputStream; import java.util.zip.GZIPInputStream; +import net.yacy.cora.document.id.DigestURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.TextParser; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.FileUtils; @@ -55,7 +55,7 @@ public class gzipParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse(final DigestURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { File tempFile = null; Document[] docs = null; diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 1e9dd363c..7b36537fa 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -36,10 +36,10 @@ import java.util.ArrayList; import java.util.Date; import java.util.HashSet; import java.util.Iterator; +import java.util.LinkedHashMap; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; -import java.util.Map.Entry; import java.util.Properties; import java.util.Set; import java.util.regex.Matcher; @@ -48,7 +48,9 @@ 
import java.util.regex.Pattern; import javax.swing.event.EventListenerList; import net.yacy.cora.date.ISO8601Formatter; -import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.document.id.AnchorURL; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.sorting.ClusteredScoreMap; import net.yacy.cora.storage.SizeLimitedMap; import net.yacy.cora.storage.SizeLimitedSet; @@ -57,7 +59,6 @@ import net.yacy.cora.util.NumberTools; import net.yacy.document.SentenceReader; import net.yacy.document.parser.htmlParser; import net.yacy.document.parser.html.Evaluation.Element; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.io.CharBuffer; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.ISO639; @@ -125,13 +126,13 @@ public class ContentScraper extends AbstractScraper implements Scraper { } // class variables: collectors for links - private final Map anchors; - private final Map rss, css; - private final Set script, frames, iframes; - private final Map embeds; // urlhash/embed relation - private final Map images; // urlhash/image relation + private final List anchors; + private final LinkedHashMap rss, css; + private final LinkedHashMap embeds; // urlhash/embed relation + private final List images; + private final Set script, frames, iframes; private final Map metas; - private final Map hreflang, navigation; + private final Map hreflang, navigation; private LinkedHashSet titles; //private String headline; private List[] headlines; @@ -140,20 +141,20 @@ public class ContentScraper extends AbstractScraper implements Scraper { private final CharBuffer content; private final EventListenerList htmlFilterEventListeners; private double lon, lat; - private DigestURI canonical, publisher; + private DigestURL canonical, publisher; private final int maxLinks; private int breadcrumbs; /** - * {@link MultiProtocolURI} to the favicon that belongs to the document + * 
{@link MultiProtocolURL} to the favicon that belongs to the document */ - private MultiProtocolURI favicon; + private MultiProtocolURL favicon; /** - * The document root {@link MultiProtocolURI} + * The document root {@link MultiProtocolURL} */ - private DigestURI root; + private DigestURL root; /** * evaluation scores: count appearance of specific attributes @@ -161,7 +162,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { private final Evaluation evaluationScores; @SuppressWarnings("unchecked") - public ContentScraper(final DigestURI root, int maxLinks) { + public ContentScraper(final DigestURL root, int maxLinks) { // the root value here will not be used to load the resource. // it is only the reference for relative links super(linkTags0, linkTags1); @@ -169,17 +170,17 @@ public class ContentScraper extends AbstractScraper implements Scraper { this.root = root; this.maxLinks = maxLinks; this.evaluationScores = new Evaluation(); - this.rss = new SizeLimitedMap(maxLinks); - this.css = new SizeLimitedMap(maxLinks); - this.anchors = new SizeLimitedMap(maxLinks); - this.images = new SizeLimitedMap(maxLinks); - this.embeds = new SizeLimitedMap(maxLinks); - this.frames = new SizeLimitedSet(maxLinks); - this.iframes = new SizeLimitedSet(maxLinks); + this.rss = new SizeLimitedMap(maxLinks); + this.css = new SizeLimitedMap(maxLinks); + this.anchors = new ArrayList(); + this.images = new ArrayList(); + this.embeds = new SizeLimitedMap(maxLinks); + this.frames = new SizeLimitedSet(maxLinks); + this.iframes = new SizeLimitedSet(maxLinks); this.metas = new SizeLimitedMap(maxLinks); - this.hreflang = new SizeLimitedMap(maxLinks); - this.navigation = new SizeLimitedMap(maxLinks); - this.script = new SizeLimitedSet(maxLinks); + this.hreflang = new SizeLimitedMap(maxLinks); + this.navigation = new SizeLimitedMap(maxLinks); + this.script = new SizeLimitedSet(maxLinks); this.titles = new LinkedHashSet(); this.headlines = new ArrayList[6]; for (int i = 0; 
i < this.headlines.length; i++) this.headlines[i] = new ArrayList(); @@ -202,19 +203,6 @@ public class ContentScraper extends AbstractScraper implements Scraper { this.content.trimToSize(); } - private void mergeAnchors(final DigestURI url, final Properties p) { - final Properties p0 = this.anchors.get(url); - if (p0 == null) { - this.anchors.put(url, p); - return; - } - // merge properties - for (final Entry entry: p.entrySet()) { - if (entry.getValue() != null && entry.getValue().toString().length() > 0) p0.put(entry.getKey(), entry.getValue()); - } - this.anchors.put(url, p0); - } - @Override public void scrapeText(final char[] newtext, final String insideTag) { // System.out.println("SCRAPE: " + UTF8.String(newtext)); @@ -290,7 +278,6 @@ public class ContentScraper extends AbstractScraper implements Scraper { // find http links inside text s = 0; String u; - DigestURI url; while (s < b.length()) { p = find(b, dpssp, s); if (p == Integer.MAX_VALUE) break; @@ -302,8 +289,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { if (u.endsWith(".")) u = u.substring(0, u.length() - 1); // remove the '.' that was appended above s = p + 6; try { - url = new DigestURI(u); - mergeAnchors(url, new Properties()); + this.anchors.add(new AnchorURL(u)); continue; } catch (final MalformedURLException e) {} } @@ -325,9 +311,9 @@ public class ContentScraper extends AbstractScraper implements Scraper { return (p < 0) ? 
Integer.MAX_VALUE : p; } - private DigestURI absolutePath(final String relativePath) { + private AnchorURL absolutePath(final String relativePath) { try { - return DigestURI.newURL(this.root, relativePath); + return AnchorURL.newAnchor(this.root, relativePath); } catch (final Exception e) { return null; } @@ -339,24 +325,25 @@ public class ContentScraper extends AbstractScraper implements Scraper { final String src = tagopts.getProperty("src", EMPTY_STRING); try { if (src.length() > 0) { - final DigestURI url = absolutePath(src); + final DigestURL url = absolutePath(src); if (url != null) { final int width = Integer.parseInt(tagopts.getProperty("width", "-1")); final int height = Integer.parseInt(tagopts.getProperty("height", "-1")); final ImageEntry ie = new ImageEntry(url, tagopts.getProperty("alt", EMPTY_STRING), width, height, -1); - addImage(this.images, ie); + this.images.add(ie); } } } catch (final NumberFormatException e) {} this.evaluationScores.match(Element.imgpath, src); } else if(tagname.equalsIgnoreCase("base")) { try { - this.root = new DigestURI(tagopts.getProperty("href", EMPTY_STRING)); + this.root = new DigestURL(tagopts.getProperty("href", EMPTY_STRING)); } catch (final MalformedURLException e) {} } else if (tagname.equalsIgnoreCase("frame")) { - final DigestURI src = absolutePath(tagopts.getProperty("src", EMPTY_STRING)); + final AnchorURL src = absolutePath(tagopts.getProperty("src", EMPTY_STRING)); tagopts.put("src", src.toNormalform(true)); - mergeAnchors(src, tagopts /* with property "name" */); + src.getProperties().putAll(tagopts); + this.anchors.add(src); this.frames.add(src); this.evaluationScores.match(Element.framepath, src.toNormalform(true)); } else if (tagname.equalsIgnoreCase("body")) { @@ -392,13 +379,14 @@ public class ContentScraper extends AbstractScraper implements Scraper { final String href = tagopts.getProperty("href", EMPTY_STRING); if (href.length() > 0) { tagopts.put("nme", areatitle); - DigestURI url = 
absolutePath(href); + AnchorURL url = absolutePath(href); tagopts.put("href", url.toNormalform(true)); - mergeAnchors(url, tagopts); + url.getProperties().putAll(tagopts); + this.anchors.add(url); } } else if (tagname.equalsIgnoreCase("link")) { final String href = tagopts.getProperty("href", EMPTY_STRING); - final DigestURI newLink = absolutePath(href); + final AnchorURL newLink = absolutePath(href); if (newLink != null) { tagopts.put("href", newLink.toNormalform(true)); @@ -409,11 +397,12 @@ public class ContentScraper extends AbstractScraper implements Scraper { if (rel.equalsIgnoreCase("shortcut icon")) { final ImageEntry ie = new ImageEntry(newLink, linktitle, -1, -1, -1); - this.images.put(ie.url(), ie); + this.images.add(ie); this.favicon = newLink; } else if (rel.equalsIgnoreCase("canonical")) { tagopts.put("name", this.titles.size() == 0 ? "" : this.titles.iterator().next()); - mergeAnchors(newLink, tagopts); + newLink.getProperties().putAll(tagopts); + this.anchors.add(newLink); this.canonical = newLink; } else if (rel.equalsIgnoreCase("publisher")) { this.publisher = newLink; @@ -428,35 +417,39 @@ public class ContentScraper extends AbstractScraper implements Scraper { this.evaluationScores.match(Element.csspath, href); } else if (!rel.equalsIgnoreCase("stylesheet") && !rel.equalsIgnoreCase("alternate stylesheet")) { tagopts.put("name", linktitle); - mergeAnchors(newLink, tagopts); + newLink.getProperties().putAll(tagopts); + this.anchors.add(newLink); } } } else if(tagname.equalsIgnoreCase("embed")) { final String src = tagopts.getProperty("src", EMPTY_STRING); try { if (src.length() > 0) { - final DigestURI url = absolutePath(src); + final AnchorURL url = absolutePath(src); if (url != null) { final int width = Integer.parseInt(tagopts.getProperty("width", "-1")); final int height = Integer.parseInt(tagopts.getProperty("height", "-1")); tagopts.put("src", url.toNormalform(true)); final EmbedEntry ie = new EmbedEntry(url, width, height, 
tagopts.getProperty("type", EMPTY_STRING), tagopts.getProperty("pluginspage", EMPTY_STRING)); this.embeds.put(url, ie); - mergeAnchors(url, tagopts); + url.getProperties().putAll(tagopts); + this.anchors.add(url); } } } catch (final NumberFormatException e) {} } else if(tagname.equalsIgnoreCase("param")) { final String name = tagopts.getProperty("name", EMPTY_STRING); if (name.equalsIgnoreCase("movie")) { - DigestURI url = absolutePath(tagopts.getProperty("value", EMPTY_STRING)); + AnchorURL url = absolutePath(tagopts.getProperty("value", EMPTY_STRING)); tagopts.put("value", url.toNormalform(true)); - mergeAnchors(url, tagopts /* with property "name" */); + url.getProperties().putAll(tagopts); + this.anchors.add(url); } } else if (tagname.equalsIgnoreCase("iframe")) { - final DigestURI src = absolutePath(tagopts.getProperty("src", EMPTY_STRING)); + final AnchorURL src = absolutePath(tagopts.getProperty("src", EMPTY_STRING)); tagopts.put("src", src.toNormalform(true)); - mergeAnchors(src, tagopts /* with property "name" */); + src.getProperties().putAll(tagopts); + this.anchors.add(src); this.iframes.add(src); this.evaluationScores.match(Element.iframepath, src.toNormalform(true)); } else if (tagname.equalsIgnoreCase("html")) { @@ -474,58 +467,59 @@ public class ContentScraper extends AbstractScraper implements Scraper { // System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + UTF8.String(text)); if (tagname.equalsIgnoreCase("a") && text.length < 2048) { final String href = tagopts.getProperty("href", EMPTY_STRING); - DigestURI url; + AnchorURL url; if ((href.length() > 0) && ((url = absolutePath(href)) != null)) { - final String ext = MultiProtocolURI.getFileExtension(url.getFileName()); + final String ext = MultiProtocolURL.getFileExtension(url.getFileName()); if (ext.equals("png") || ext.equals("gif") || ext.equals("jpg") || ext.equals("jpeg") || ext.equals("tiff") || ext.equals("tif")) { // special handling of such 
urls: put them to the image urls - final ImageEntry ie = new ImageEntry(url, recursiveParse(text), -1, -1, -1); - addImage(this.images, ie); + final ImageEntry ie = new ImageEntry(url, recursiveParse(url, text), -1, -1, -1); + this.images.add(ie); } else { - tagopts.put("text", recursiveParse(text)); + tagopts.put("text", recursiveParse(url, text)); tagopts.put("href", url.toNormalform(true)); // we must assign this because the url may have resolved backpaths and may not be absolute - mergeAnchors(url, tagopts); + url.getProperties().putAll(tagopts); + this.anchors.add(url); } } this.evaluationScores.match(Element.apath, href); } final String h; if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) { - h = recursiveParse(text); + h = recursiveParse(null, text); if (h.length() > 0) this.headlines[0].add(h); } else if((tagname.equalsIgnoreCase("h2")) && (text.length < 1024)) { - h = recursiveParse(text); + h = recursiveParse(null, text); if (h.length() > 0) this.headlines[1].add(h); } else if ((tagname.equalsIgnoreCase("h3")) && (text.length < 1024)) { - h = recursiveParse(text); + h = recursiveParse(null, text); if (h.length() > 0) this.headlines[2].add(h); } else if ((tagname.equalsIgnoreCase("h4")) && (text.length < 1024)) { - h = recursiveParse(text); + h = recursiveParse(null, text); if (h.length() > 0) this.headlines[3].add(h); } else if ((tagname.equalsIgnoreCase("h5")) && (text.length < 1024)) { - h = recursiveParse(text); + h = recursiveParse(null, text); if (h.length() > 0) this.headlines[4].add(h); } else if ((tagname.equalsIgnoreCase("h6")) && (text.length < 1024)) { - h = recursiveParse(text); + h = recursiveParse(null, text); if (h.length() > 0) this.headlines[5].add(h); } else if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) { - String t = recursiveParse(text); + String t = recursiveParse(null, text); this.titles.add(t); this.evaluationScores.match(Element.title, t); } else if ((tagname.equalsIgnoreCase("b")) && (text.length < 
1024)) { - h = recursiveParse(text); + h = recursiveParse(null, text); if (h.length() > 0) this.bold.inc(h); } else if ((tagname.equalsIgnoreCase("strong")) && (text.length < 1024)) { - h = recursiveParse(text); + h = recursiveParse(null, text); if (h.length() > 0) this.bold.inc(h); } else if ((tagname.equalsIgnoreCase("i")) && (text.length < 1024)) { - h = recursiveParse(text); + h = recursiveParse(null, text); if (h.length() > 0) this.italic.inc(h); } else if ((tagname.equalsIgnoreCase("u")) && (text.length < 1024)) { - h = recursiveParse(text); + h = recursiveParse(null, text); if (h.length() > 0) this.underline.inc(h); } else if ((tagname.equalsIgnoreCase("li")) && (text.length < 1024)) { - h = recursiveParse(text); + h = recursiveParse(null, text); if (h.length() > 0) this.li.add(h); } else if (tagname.equalsIgnoreCase("script")) { final String src = tagopts.getProperty("src", EMPTY_STRING); @@ -547,7 +541,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { this.evaluationScores.match(Element.comment, LB.matcher(new String(comment)).replaceAll(" ")); } - private String recursiveParse(final char[] inlineHtml) { + private String recursiveParse(final DigestURL linkurl, final char[] inlineHtml) { if (inlineHtml.length < 14) return cleanLine(CharacterCoding.html2unicode(stripAllTags(inlineHtml))); // start a new scraper to parse links inside this text @@ -565,10 +559,16 @@ public class ContentScraper extends AbstractScraper implements Scraper { } catch (final IOException e) { } } - for (final Map.Entry entry: scraper.getAnchors().entrySet()) { - mergeAnchors(entry.getKey(), entry.getValue()); + for (final AnchorURL entry: scraper.getAnchors()) { + this.anchors.add(entry); + } + for (ImageEntry ie: scraper.images) { + if (linkurl != null) { + ie.setLinkurl(linkurl); + ie.setAnchortext(new String(inlineHtml)); + } + this.images.add(ie); } - this.images.putAll(scraper.images); String line = 
cleanLine(CharacterCoding.html2unicode(stripAllTags(scraper.content.getChars()))); scraper.close(); @@ -647,21 +647,21 @@ public class ContentScraper extends AbstractScraper implements Scraper { return this.li.toArray(new String[this.li.size()]); } - public DigestURI[] getFlash() { + public DigestURL[] getFlash() { String ext; - ArrayList f = new ArrayList(); - for (final DigestURI url: this.anchors.keySet()) { - ext = MultiProtocolURI.getFileExtension(url.getFileName()); + ArrayList f = new ArrayList(); + for (final DigestURL url: this.anchors) { + ext = MultiProtocolURL.getFileExtension(url.getFileName()); if (ext == null) continue; if (ext.equals("swf")) f.add(url); } - return f.toArray(new DigestURI[f.size()]); + return f.toArray(new DigestURL[f.size()]); } public boolean containsFlash() { String ext; - for (final MultiProtocolURI url: this.anchors.keySet()) { - ext = MultiProtocolURI.getFileExtension(url.getFileName()); + for (final MultiProtocolURL url: this.anchors) { + ext = MultiProtocolURL.getFileExtension(url.getFileName()); if (ext == null) continue; if (ext.equals("swf")) return true; } @@ -681,48 +681,48 @@ public class ContentScraper extends AbstractScraper implements Scraper { } } - public Map getAnchors() { + public List getAnchors() { // returns a url (String) / name (String) relation return this.anchors; } - public Map getRSS() { + public LinkedHashMap getRSS() { // returns a url (String) / name (String) relation return this.rss; } - public Map getCSS() { + public Map getCSS() { // returns a url (String) / name (String) relation return this.css; } - public Set getFrames() { + public Set getFrames() { // returns a url (String) / name (String) relation return this.frames; } - public Set getIFrames() { + public Set getIFrames() { // returns a url (String) / name (String) relation return this.iframes; } - public Set getScript() { + public Set getScript() { return this.script; } - public DigestURI getCanonical() { + public DigestURL getCanonical() { 
return this.canonical; } - public DigestURI getPublisherLink() { + public DigestURL getPublisherLink() { return this.publisher; } - public Map getHreflang() { + public Map getHreflang() { return this.hreflang; } - public Map getNavigation() { + public Map getNavigation() { return this.navigation; } @@ -730,11 +730,11 @@ public class ContentScraper extends AbstractScraper implements Scraper { * get all images * @return a map of */ - public Map getImages() { + public List getImages() { return this.images; } - public Map getEmbeds() { + public Map getEmbeds() { return this.embeds; } @@ -743,9 +743,9 @@ public class ContentScraper extends AbstractScraper implements Scraper { } /** - * @return the {@link MultiProtocolURI} to the favicon that belongs to the document + * @return the {@link MultiProtocolURL} to the favicon that belongs to the document */ - public MultiProtocolURI getFavicon() { + public MultiProtocolURL getFavicon() { return this.favicon; } @@ -1013,36 +1013,18 @@ public class ContentScraper extends AbstractScraper implements Scraper { if (page == null) throw new IOException("no content in file " + file.toString()); // scrape document to look up charset - final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page),"UTF-8", new DigestURI("http://localhost"),null,false, maxLinks); + final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page),"UTF-8", new DigestURL("http://localhost"),null,false, maxLinks); String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset()); htmlFilter.close(); if (charset == null) charset = Charset.defaultCharset().toString(); // scrape content - final ContentScraper scraper = new ContentScraper(new DigestURI("http://localhost"), maxLinks); + final ContentScraper scraper = new ContentScraper(new DigestURL("http://localhost"), maxLinks); final Writer writer = new TransformerWriter(null, null, scraper, null, false); FileUtils.copy(new 
ByteArrayInputStream(page), writer, Charset.forName(charset)); writer.close(); return scraper; } - public static void addAllImages(final Map a, final Map b) { - final Iterator> i = b.entrySet().iterator(); - Map.Entry ie; - while (i.hasNext()) { - ie = i.next(); - addImage(a, ie.getValue()); - } - } - - public static void addImage(final Map a, final ImageEntry ie) { - if (a.containsKey(ie.url())) { - // in case of a collision, take that image that has the better image size tags - if ((ie.height() > 0) && (ie.width() > 0)) a.put(ie.url(), ie); - } else { - a.put(ie.url(), ie); - } - } - } diff --git a/source/net/yacy/document/parser/html/ContentTransformer.java b/source/net/yacy/document/parser/html/ContentTransformer.java index 22f61a088..e4dbf6238 100644 --- a/source/net/yacy/document/parser/html/ContentTransformer.java +++ b/source/net/yacy/document/parser/html/ContentTransformer.java @@ -32,7 +32,7 @@ import java.util.ArrayList; import java.util.Properties; import java.util.TreeSet; -import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; import net.yacy.kelondro.io.CharBuffer; public class ContentTransformer extends AbstractTransformer implements Transformer { diff --git a/source/net/yacy/document/parser/html/EmbedEntry.java b/source/net/yacy/document/parser/html/EmbedEntry.java index cbaaffb2a..36134a273 100644 --- a/source/net/yacy/document/parser/html/EmbedEntry.java +++ b/source/net/yacy/document/parser/html/EmbedEntry.java @@ -20,15 +20,15 @@ package net.yacy.document.parser.html; -import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.cora.document.id.DigestURL; public class EmbedEntry { - private final DigestURI url; + private final DigestURL url; private final int width, height; private final String type, pluginspage; - public EmbedEntry(final DigestURI url, int width, int height, String type, String pluginspage) { + public EmbedEntry(final DigestURL url, int width, int height, String type, String pluginspage) { 
this.url = url; this.width = width; this.height = height; @@ -36,7 +36,7 @@ public class EmbedEntry { this.pluginspage = pluginspage; } - public DigestURI getUrl() { + public DigestURL getUrl() { return this.url; } diff --git a/source/net/yacy/document/parser/html/ImageEntry.java b/source/net/yacy/document/parser/html/ImageEntry.java index 37419fffc..f1d160f05 100644 --- a/source/net/yacy/document/parser/html/ImageEntry.java +++ b/source/net/yacy/document/parser/html/ImageEntry.java @@ -26,26 +26,62 @@ package net.yacy.document.parser.html; import java.util.Comparator; -import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.cora.document.id.DigestURL; public class ImageEntry implements Comparable, Comparator { - private final DigestURI url; + private final DigestURL imageurl; + private DigestURL linkurl; private final String alt; + private String anchortext; private final int width, height; private final long fileSize; - public ImageEntry(final DigestURI url, final String alt, final int width, final int height, long fileSize) { - assert url != null; - this.url = url; + /** + * an ImageEntry represents the appearance of an image in a document. It considers also that an image can be used as a button for a web link + * and stores the web link also. + * @param imageurl the link to the image + * @param linkurl the link which is called when the image is pressed on a web browser. null if the image was not used as link button + * @param anchortext the text inside the anchor body where the image link appears (including the image tag). 
null if the image was not used as link button + * @param alt the alt text in the alt tag + * @param width the width of the image if known, or -1 if unknown + * @param height the height of the image if known, or -1 if unknown + * @param fileSize the number of bytes that the image uses on file or -1 if unknown + */ + public ImageEntry( + final DigestURL imageurl, + final String alt, + final int width, + final int height, + long fileSize) { + assert imageurl != null; + this.imageurl = imageurl; + this.linkurl = null; + this.anchortext = null; this.alt = alt; this.width = width; this.height = height; this.fileSize = fileSize; } - public DigestURI url() { - return this.url; + public DigestURL url() { + return this.imageurl; + } + + public void setLinkurl(DigestURL linkurl) { + this.linkurl = linkurl; + } + + public DigestURL linkurl() { + return this.linkurl; + } + + public void setAnchortext(String anchortext) { + this.anchortext = anchortext; + } + + public String anchortext() { + return this.anchortext; } public String alt() { @@ -66,7 +102,8 @@ public class ImageEntry implements Comparable, Comparator 0 ? " alt=\"" + this.alt + "\"" : "") + (this.width >= 0 ? " width=\"" + this.width + "\"" : "") + (this.height >= 0 ? 
" height=\"" + this.height + "\"" : "") + @@ -80,8 +117,8 @@ public class ImageEntry implements Comparable, Comparator> 9) & 0x7FFF)) << 16) | (this.url.hashCode() & 0xFFFF); + return /*0x7FFF0000 |*/ (this.imageurl.hashCode() & 0xFFFF); + return ((0x7FFF - (((this.width * this.height) >> 9) & 0x7FFF)) << 16) | (this.imageurl.hashCode() & 0xFFFF); } @Override @@ -90,13 +127,13 @@ public class ImageEntry implements Comparable, Comparator ohc) return 1; - return this.url.toString().compareTo((h).url.toString()); + return this.imageurl.toString().compareTo((h).imageurl.toString()); } @Override diff --git a/source/net/yacy/document/parser/html/ScraperInputStream.java b/source/net/yacy/document/parser/html/ScraperInputStream.java index a19b1388c..ac2a763f8 100644 --- a/source/net/yacy/document/parser/html/ScraperInputStream.java +++ b/source/net/yacy/document/parser/html/ScraperInputStream.java @@ -35,7 +35,7 @@ import java.io.UnsupportedEncodingException; import java.io.Writer; import java.util.Properties; -import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.cora.document.id.DigestURL; public class ScraperInputStream extends InputStream implements ScraperListener { @@ -58,7 +58,7 @@ public class ScraperInputStream extends InputStream implements ScraperListener { public ScraperInputStream( final InputStream inStream, final String inputStreamCharset, - final DigestURI rooturl, + final DigestURL rooturl, final Transformer transformer, final boolean passbyIfBinarySuspect, final int maxLinks diff --git a/source/net/yacy/document/parser/html/TransformerWriter.java b/source/net/yacy/document/parser/html/TransformerWriter.java index 175ac1934..f26161be8 100644 --- a/source/net/yacy/document/parser/html/TransformerWriter.java +++ b/source/net/yacy/document/parser/html/TransformerWriter.java @@ -44,9 +44,9 @@ import java.nio.charset.Charset; import java.util.Enumeration; import java.util.Properties; -import net.yacy.cora.document.UTF8; +import 
net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.util.ConcurrentLog; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.io.CharBuffer; @@ -545,7 +545,7 @@ public final class TransformerWriter extends Writer { System.exit(0); final char[] buffer = new char[512]; try { - final ContentScraper scraper = new ContentScraper(new DigestURI("http://localhost:8090"), 1000); + final ContentScraper scraper = new ContentScraper(new DigestURL("http://localhost:8090"), 1000); final Transformer transformer = new ContentTransformer(); final Reader is = new FileReader(args[0]); final FileOutputStream fos = new FileOutputStream(new File(args[0] + ".out")); diff --git a/source/net/yacy/document/parser/htmlParser.java b/source/net/yacy/document/parser/htmlParser.java index 2b5c05f93..ca1fd935f 100644 --- a/source/net/yacy/document/parser/htmlParser.java +++ b/source/net/yacy/document/parser/htmlParser.java @@ -32,17 +32,19 @@ import java.net.MalformedURLException; import java.nio.charset.Charset; import java.nio.charset.IllegalCharsetNameException; import java.nio.charset.UnsupportedCharsetException; +import java.util.LinkedHashMap; import java.util.regex.Pattern; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.parser.html.CharacterCoding; import net.yacy.document.parser.html.ContentScraper; +import net.yacy.document.parser.html.ImageEntry; import net.yacy.document.parser.html.ScraperInputStream; import net.yacy.document.parser.html.TransformerWriter; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.FileUtils; import com.ibm.icu.text.CharsetDetector; @@ -86,7 +88,7 @@ public class htmlParser extends AbstractParser implements Parser { @Override public Document[] parse( - final DigestURI location, + final 
DigestURL location, final String mimeType, final String documentCharset, final InputStream sourceStream) throws Parser.Failure, InterruptedException { @@ -110,7 +112,7 @@ public class htmlParser extends AbstractParser implements Parser { * @param scraper * @return */ - private static Document transformScraper(final DigestURI location, final String mimeType, final String charSet, final ContentScraper scraper) { + private static Document transformScraper(final DigestURL location, final String mimeType, final String charSet, final ContentScraper scraper) { final String[] sections = new String[ scraper.getHeadlines(1).length + scraper.getHeadlines(2).length + @@ -124,6 +126,8 @@ public class htmlParser extends AbstractParser implements Parser { sections[p++] = headline; } } + LinkedHashMap noDoubleImages = new LinkedHashMap(); + for (ImageEntry ie: scraper.getImages()) noDoubleImages.put(ie.url(), ie); final Document ppd = new Document( location, mimeType, @@ -140,7 +144,7 @@ public class htmlParser extends AbstractParser implements Parser { scraper.getText(), scraper.getAnchors(), scraper.getRSS(), - scraper.getImages(), + noDoubleImages, scraper.indexingDenied(), scraper.getDate()); ppd.setFavicon(scraper.getFavicon()); @@ -149,7 +153,7 @@ public class htmlParser extends AbstractParser implements Parser { } public static ContentScraper parseToScraper( - final DigestURI location, + final DigestURL location, final String documentCharset, InputStream sourceStream, final int maxLinks) throws Parser.Failure, IOException { @@ -297,9 +301,9 @@ public class htmlParser extends AbstractParser implements Parser { public static void main(final String[] args) { // test parsing of a url - DigestURI url; + DigestURL url; try { - url = new DigestURI(args[0]); + url = new DigestURL(args[0]); final byte[] content = url.get(ClientIdentification.yacyInternetCrawlerAgent); final Document[] document = new htmlParser().parse(url, "text/html", null, new ByteArrayInputStream(content)); final 
String title = document[0].dc_title(); diff --git a/source/net/yacy/document/parser/images/genericImageParser.java b/source/net/yacy/document/parser/images/genericImageParser.java index 39bf0df8d..7deeb71b7 100644 --- a/source/net/yacy/document/parser/images/genericImageParser.java +++ b/source/net/yacy/document/parser/images/genericImageParser.java @@ -39,20 +39,21 @@ import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; +import java.util.LinkedHashMap; import java.util.List; -import java.util.Properties; import java.util.Set; import javax.imageio.ImageIO; -import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.document.id.AnchorURL; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.parser.html.ImageEntry; import net.yacy.document.parser.images.bmpParser.IMAGEMAP; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.FileUtils; import com.drew.imaging.jpeg.JpegProcessingException; @@ -94,7 +95,7 @@ public class genericImageParser extends AbstractParser implements Parser { @Override public Document[] parse( - final DigestURI location, + final DigestURL location, final String mimeType, final String documentCharset, final InputStream sourceStream) throws Parser.Failure, InterruptedException { @@ -105,7 +106,7 @@ public class genericImageParser extends AbstractParser implements Parser { String keywords = null; List descriptions = new ArrayList(); String filename = location.getFileName(); - String ext = MultiProtocolURI.getFileExtension(filename); + String ext = MultiProtocolURL.getFileExtension(filename); double gpslat = 0; double gpslon = 0; if (mimeType.equals("image/bmp") || ext.equals("bmp")) { @@ -197,13 +198,13 @@ public class genericImageParser extends 
AbstractParser implements Parser { } final HashSet languages = new HashSet(); - final HashMap anchors = new HashMap(); - final HashMap images = new HashMap(); + final List anchors = new ArrayList(); + final LinkedHashMap images = new LinkedHashMap(); // add this image to the map of images final String infoString = ii.info.toString(); images.put(ii.location, new ImageEntry(location, "", ii.width, ii.height, -1)); - if (title == null || title.isEmpty()) title = MultiProtocolURI.unescape(filename); + if (title == null || title.isEmpty()) title = MultiProtocolURL.unescape(filename); return new Document[]{new Document( location, @@ -237,7 +238,7 @@ public class genericImageParser extends AbstractParser implements Parser { } public static ImageInfo parseJavaImage( - final DigestURI location, + final DigestURL location, final InputStream sourceStream) throws Parser.Failure { BufferedImage image = null; try { @@ -252,7 +253,7 @@ public class genericImageParser extends AbstractParser implements Parser { } public static ImageInfo parseJavaImage( - final DigestURI location, + final DigestURL location, final BufferedImage image) { final ImageInfo ii = new ImageInfo(location); ii.image = image; @@ -289,12 +290,12 @@ public class genericImageParser extends AbstractParser implements Parser { } public static class ImageInfo { - public DigestURI location; + public DigestURL location; public BufferedImage image; public StringBuilder info; public int height; public int width; - public ImageInfo(final DigestURI location) { + public ImageInfo(final DigestURL location) { this.location = location; this.image = null; this.info = new StringBuilder(); @@ -308,10 +309,10 @@ public class genericImageParser extends AbstractParser implements Parser { public static void main(final String[] args) { final File image = new File(args[0]); final genericImageParser parser = new genericImageParser(); - DigestURI uri; + DigestURL uri; try { - uri = new DigestURI("http://localhost/" + image.getName()); - 
final Document[] document = parser.parse(uri, "image/" + MultiProtocolURI.getFileExtension(uri.getFileName()), "UTF-8", new FileInputStream(image)); + uri = new DigestURL("http://localhost/" + image.getName()); + final Document[] document = parser.parse(uri, "image/" + MultiProtocolURL.getFileExtension(uri.getFileName()), "UTF-8", new FileInputStream(image)); System.out.println(document[0].toString()); } catch (final MalformedURLException e) { e.printStackTrace(); diff --git a/source/net/yacy/document/parser/mmParser.java b/source/net/yacy/document/parser/mmParser.java index 773e0a71f..a0bcc6ceb 100644 --- a/source/net/yacy/document/parser/mmParser.java +++ b/source/net/yacy/document/parser/mmParser.java @@ -34,11 +34,11 @@ import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.DigestURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; -import net.yacy.kelondro.data.meta.DigestURI; import org.xml.sax.Attributes; import org.xml.sax.SAXException; @@ -70,7 +70,7 @@ public class mmParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final DigestURI location, final String mimeType, + public Document[] parse(final DigestURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { diff --git a/source/net/yacy/document/parser/odtParser.java b/source/net/yacy/document/parser/odtParser.java index 19dcab095..c30f03c62 100644 --- a/source/net/yacy/document/parser/odtParser.java +++ b/source/net/yacy/document/parser/odtParser.java @@ -42,13 +42,13 @@ import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; -import net.yacy.cora.document.UTF8; 
+import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.DigestURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.parser.xml.ODContentHandler; import net.yacy.document.parser.xml.ODMetaHandler; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.io.CharBuffer; import net.yacy.kelondro.util.FileUtils; @@ -107,7 +107,7 @@ public class odtParser extends AbstractParser implements Parser { return parser; } - private Document[] parse(final DigestURI location, final String mimeType, @SuppressWarnings("unused") final String charset, final File dest) + private Document[] parse(final DigestURL location, final String mimeType, @SuppressWarnings("unused") final String charset, final File dest) throws Parser.Failure, InterruptedException { CharBuffer writer = null; @@ -214,7 +214,7 @@ public class odtParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse(final DigestURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { File dest = null; try { // creating a tempfile diff --git a/source/net/yacy/document/parser/ooxmlParser.java b/source/net/yacy/document/parser/ooxmlParser.java index 0f473956d..d4a52ae24 100644 --- a/source/net/yacy/document/parser/ooxmlParser.java +++ b/source/net/yacy/document/parser/ooxmlParser.java @@ -42,14 +42,14 @@ import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.AbstractParser; 
import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.parser.xml.ODContentHandler; import net.yacy.document.parser.xml.ODMetaHandler; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.io.CharBuffer; import net.yacy.kelondro.util.FileUtils; @@ -91,7 +91,7 @@ public class ooxmlParser extends AbstractParser implements Parser { return parser; } - private Document[] parse(final DigestURI location, final String mimeType, @SuppressWarnings("unused") final String charset, final File dest) throws Parser.Failure, InterruptedException { + private Document[] parse(final DigestURL location, final String mimeType, @SuppressWarnings("unused") final String charset, final File dest) throws Parser.Failure, InterruptedException { CharBuffer writer = null; try { @@ -201,7 +201,7 @@ public class ooxmlParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse(final DigestURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { File dest = null; try { // creating a tempfile diff --git a/source/net/yacy/document/parser/pdfParser.java b/source/net/yacy/document/parser/pdfParser.java index dbf6d1991..b759798ec 100644 --- a/source/net/yacy/document/parser/pdfParser.java +++ b/source/net/yacy/document/parser/pdfParser.java @@ -55,12 +55,12 @@ import org.apache.pdfbox.pdmodel.font.PDType1Font; import org.apache.pdfbox.pdmodel.font.PDType3Font; import org.apache.pdfbox.util.PDFTextStripper; -import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import 
net.yacy.document.Parser; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.io.CharBuffer; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.MemoryControl; @@ -84,7 +84,7 @@ public class pdfParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse(final DigestURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { // check memory for parser if (!MemoryControl.request(200 * 1024 * 1024, false)) @@ -141,7 +141,7 @@ public class pdfParser extends AbstractParser implements Parser { info = null; if (docTitle == null || docTitle.isEmpty()) { - docTitle = MultiProtocolURI.unescape(location.getFileName()); + docTitle = MultiProtocolURL.unescape(location.getFileName()); } final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE); byte[] contentBytes = new byte[0]; diff --git a/source/net/yacy/document/parser/pptParser.java b/source/net/yacy/document/parser/pptParser.java index 59c770627..4f0128b6a 100644 --- a/source/net/yacy/document/parser/pptParser.java +++ b/source/net/yacy/document/parser/pptParser.java @@ -31,11 +31,11 @@ import java.io.BufferedInputStream; import java.io.InputStream; import java.util.Date; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; -import net.yacy.kelondro.data.meta.DigestURI; import org.apache.poi.hslf.extractor.PowerPointExtractor; @@ -60,7 +60,7 @@ public class pptParser extends AbstractParser implements Parser { * all extracted information about the parsed document */ @Override - public Document[] parse(final DigestURI location, final String mimeType, + public Document[] 
parse(final DigestURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { try { diff --git a/source/net/yacy/document/parser/psParser.java b/source/net/yacy/document/parser/psParser.java index f112927e2..707b7a3ed 100644 --- a/source/net/yacy/document/parser/psParser.java +++ b/source/net/yacy/document/parser/psParser.java @@ -36,10 +36,10 @@ import java.io.InputStream; import java.io.InputStreamReader; import java.util.Date; +import net.yacy.cora.document.id.DigestURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.FileUtils; @@ -84,7 +84,7 @@ public class psParser extends AbstractParser implements Parser { } - private Document[] parse(final DigestURI location, final String mimeType, @SuppressWarnings("unused") final String charset, final File sourceFile) throws Parser.Failure, InterruptedException { + private Document[] parse(final DigestURL location, final String mimeType, @SuppressWarnings("unused") final String charset, final File sourceFile) throws Parser.Failure, InterruptedException { File outputFile = null; try { @@ -256,7 +256,7 @@ public class psParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final DigestURI location, final String mimeType, + public Document[] parse(final DigestURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { diff --git a/source/net/yacy/document/parser/rdfParser.java b/source/net/yacy/document/parser/rdfParser.java index d14dacc37..5079f0475 100644 --- a/source/net/yacy/document/parser/rdfParser.java +++ b/source/net/yacy/document/parser/rdfParser.java @@ -30,10 +30,10 @@ import java.util.ArrayList; import java.util.Date; import java.util.List; +import net.yacy.cora.document.id.DigestURL; import 
net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; -import net.yacy.kelondro.data.meta.DigestURI; public class rdfParser extends AbstractParser implements Parser { @@ -45,7 +45,7 @@ public class rdfParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final DigestURI url, final String mimeType, + public Document[] parse(final DigestURL url, final String mimeType, final String charset, final InputStream source) throws Failure, InterruptedException { diff --git a/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java b/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java index 0820cd84c..cf8accb70 100644 --- a/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java +++ b/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java @@ -17,13 +17,13 @@ import java.util.Date; import java.util.HashSet; import java.util.Set; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.parser.htmlParser; import net.yacy.document.parser.rdfa.IRDFaTriple; -import net.yacy.kelondro.data.meta.DigestURI; /** * @author fgandon @@ -46,7 +46,7 @@ public class RDFaParser extends AbstractParser implements Parser { } @Override - public Document[] parse(DigestURI url, String mimeType, + public Document[] parse(DigestURL url, String mimeType, String charset, InputStream source) throws Failure, InterruptedException { @@ -67,7 +67,7 @@ public class RDFaParser extends AbstractParser implements Parser { return htmlDocs; } - private static Document parseRDFa(DigestURI url, String mimeType, + private static Document parseRDFa(DigestURL url, String mimeType, String charset, InputStream source) { RDFaTripleImpl triple; IRDFaTriple[] allTriples = null; @@ -95,7 +95,7 @@ public class RDFaParser extends AbstractParser implements Parser { return doc; 
} - private Document[] parseHtml(DigestURI url, String mimeType, + private Document[] parseHtml(DigestURL url, String mimeType, String charset, InputStream source) throws Failure, InterruptedException { @@ -110,7 +110,7 @@ public class RDFaParser extends AbstractParser implements Parser { return htmlDocs; } - private static Document convertAllTriplesToDocument(DigestURI url, + private static Document convertAllTriplesToDocument(DigestURL url, String mimeType, String charset, IRDFaTriple[] allTriples) { //Set languages = new HashSet(2); @@ -178,7 +178,7 @@ public class RDFaParser extends AbstractParser implements Parser { if (aReader != null) { RDFaParser aParser = new RDFaParser(); try { - aParser.parse(new DigestURI(args[0]),"","",aURL.openStream()); + aParser.parse(new DigestURL(args[0]),"","",aURL.openStream()); } catch (final FileNotFoundException e) { e.printStackTrace(); } catch (final IOException e) { diff --git a/source/net/yacy/document/parser/rssParser.java b/source/net/yacy/document/parser/rssParser.java index af8cb186e..231883904 100644 --- a/source/net/yacy/document/parser/rssParser.java +++ b/source/net/yacy/document/parser/rssParser.java @@ -29,22 +29,21 @@ import java.io.IOException; import java.io.InputStream; import java.net.MalformedURLException; import java.util.ArrayList; -import java.util.HashMap; import java.util.HashSet; +import java.util.LinkedHashMap; import java.util.List; -import java.util.Map; -import java.util.Properties; import java.util.Set; -import net.yacy.cora.document.Hit; -import net.yacy.cora.document.RSSFeed; -import net.yacy.cora.document.RSSReader; +import net.yacy.cora.document.feed.Hit; +import net.yacy.cora.document.feed.RSSFeed; +import net.yacy.cora.document.feed.RSSReader; +import net.yacy.cora.document.id.AnchorURL; +import net.yacy.cora.document.id.DigestURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.TextParser; import 
net.yacy.document.parser.html.ImageEntry; -import net.yacy.kelondro.data.meta.DigestURI; public class rssParser extends AbstractParser implements Parser { @@ -59,7 +58,7 @@ public class rssParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final DigestURI url, final String mimeType, + public Document[] parse(final DigestURL url, final String mimeType, final String charset, final InputStream source) throws Failure, InterruptedException { RSSReader rssReader; @@ -72,18 +71,17 @@ public class rssParser extends AbstractParser implements Parser { final RSSFeed feed = rssReader.getFeed(); //RSSMessage channel = feed.getChannel(); final List docs = new ArrayList(); - DigestURI uri; + AnchorURL uri; Set languages; - Map anchors; + List anchors; Document doc; for (final Hit item: feed) try { - uri = new DigestURI(item.getLink()); + uri = new AnchorURL(item.getLink()); languages = new HashSet(); languages.add(item.getLanguage()); - anchors = new HashMap(); - Properties p = new Properties(); - p.put("name", item.getTitle()); - anchors.put(uri, p); + anchors = new ArrayList(); + uri.getProperties().put("name", item.getTitle()); + anchors.add(uri); doc = new Document( uri, TextParser.mimeOf(url), @@ -101,7 +99,7 @@ public class rssParser extends AbstractParser implements Parser { null, anchors, null, - new HashMap(), + new LinkedHashMap(), false, item.getPubDate()); docs.add(doc); diff --git a/source/net/yacy/document/parser/rtfParser.java b/source/net/yacy/document/parser/rtfParser.java index 1a1495aa7..1ac87a76a 100644 --- a/source/net/yacy/document/parser/rtfParser.java +++ b/source/net/yacy/document/parser/rtfParser.java @@ -33,10 +33,10 @@ import java.util.Date; import javax.swing.text.DefaultStyledDocument; import javax.swing.text.rtf.RTFEditorKit; +import net.yacy.cora.document.id.DigestURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; -import 
net.yacy.kelondro.data.meta.DigestURI; public class rtfParser extends AbstractParser implements Parser { @@ -52,7 +52,7 @@ public class rtfParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final DigestURI location, final String mimeType, + public Document[] parse(final DigestURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { diff --git a/source/net/yacy/document/parser/sevenzipParser.java b/source/net/yacy/document/parser/sevenzipParser.java index b7c7464d0..b58d7e6a7 100644 --- a/source/net/yacy/document/parser/sevenzipParser.java +++ b/source/net/yacy/document/parser/sevenzipParser.java @@ -34,12 +34,12 @@ import java.io.InputStream; import java.io.OutputStream; import java.util.Date; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.TextParser; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.FileUtils; import SevenZip.ArchiveExtractCallback; import SevenZip.IInStream; @@ -55,7 +55,7 @@ public class sevenzipParser extends AbstractParser implements Parser { this.SUPPORTED_MIME_TYPES.add("application/x-7z-compressed"); } - public Document parse(final DigestURI location, final String mimeType, final String charset, final IInStream source) throws Parser.Failure, InterruptedException { + public Document parse(final DigestURL location, final String mimeType, final String charset, final IInStream source) throws Parser.Failure, InterruptedException { final Document doc = new Document( location, mimeType, @@ -100,12 +100,12 @@ public class sevenzipParser extends AbstractParser implements Parser { } } - public Document parse(final DigestURI location, final String mimeType, final String charset, final byte[] source) throws Parser.Failure, InterruptedException { + 
public Document parse(final DigestURL location, final String mimeType, final String charset, final byte[] source) throws Parser.Failure, InterruptedException { return parse(location, mimeType, charset, new ByteArrayIInStream(source)); } @Override - public Document[] parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse(final DigestURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { try { final ByteArrayOutputStream cfos = new ByteArrayOutputStream(); FileUtils.copy(source, cfos); @@ -169,7 +169,7 @@ public class sevenzipParser extends AbstractParser implements Parser { Document[] theDocs; // workaround for relative links in file, normally '#' shall be used behind the location, see // below for reversion of the effects - final DigestURI url = DigestURI.newURL(this.doc.dc_source(), this.prefix + "/" + super.filePath); + final DigestURL url = DigestURL.newURL(this.doc.dc_source(), this.prefix + "/" + super.filePath); final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1)); theDocs = TextParser.parseSource(url, mime, null, this.cfos.toByteArray()); diff --git a/source/net/yacy/document/parser/sidAudioParser.java b/source/net/yacy/document/parser/sidAudioParser.java index aa3cf643d..799ef60a2 100644 --- a/source/net/yacy/document/parser/sidAudioParser.java +++ b/source/net/yacy/document/parser/sidAudioParser.java @@ -31,10 +31,10 @@ import java.util.Date; import java.util.HashMap; import java.util.Map; +import net.yacy.cora.document.id.DigestURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; -import net.yacy.kelondro.data.meta.DigestURI; // this is a new implementation of this parser idiom using multiple documents as result set @@ -57,7 +57,7 @@ public class 
sidAudioParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final DigestURI location, final String mimeType, + public Document[] parse(final DigestURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { try { diff --git a/source/net/yacy/document/parser/sitemapParser.java b/source/net/yacy/document/parser/sitemapParser.java index ce42595fa..17a472f6d 100644 --- a/source/net/yacy/document/parser/sitemapParser.java +++ b/source/net/yacy/document/parser/sitemapParser.java @@ -31,7 +31,7 @@ import java.net.MalformedURLException; import java.text.ParseException; import java.util.ArrayList; import java.util.Date; -import java.util.HashMap; +import java.util.LinkedHashMap; import java.util.List; import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.BlockingQueue; @@ -40,6 +40,7 @@ import java.util.zip.GZIPInputStream; import javax.xml.parsers.DocumentBuilderFactory; import net.yacy.cora.date.ISO8601Formatter; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.ResponseHeader; @@ -50,7 +51,6 @@ import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.TextParser; import net.yacy.document.parser.html.ImageEntry; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.io.ByteCountInputStream; import org.w3c.dom.CharacterData; @@ -68,17 +68,17 @@ public class sitemapParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final DigestURI url, final String mimeType, + public Document[] parse(final DigestURL url, final String mimeType, final String charset, final InputStream source) throws Failure, InterruptedException { final List docs = new ArrayList(); SitemapReader sitemap = new SitemapReader(source, 
ClientIdentification.yacyInternetCrawlerAgent); sitemap.start(); - DigestURI uri; + DigestURL uri; Document doc; URLEntry item; while ((item = sitemap.take()) != POISON_URLEntry) try { - uri = new DigestURI(item.loc); + uri = new DigestURL(item.loc); doc = new Document( uri, TextParser.mimeOf(url), @@ -95,7 +95,7 @@ public class sitemapParser extends AbstractParser implements Parser { null, null, null, - new HashMap(), + new LinkedHashMap(), false, new Date()); docs.add(doc); @@ -108,7 +108,7 @@ public class sitemapParser extends AbstractParser implements Parser { return da; } - public static SitemapReader parse(final DigestURI sitemapURL, final ClientIdentification.Agent agent) throws IOException { + public static SitemapReader parse(final DigestURL sitemapURL, final ClientIdentification.Agent agent) throws IOException { // download document ConcurrentLog.info("SitemapReader", "loading sitemap from " + sitemapURL.toNormalform(true)); final RequestHeader requestHeader = new RequestHeader(); @@ -160,7 +160,7 @@ public class sitemapParser extends AbstractParser implements Parser { String url = new SitemapEntry((Element) sitemapNodes.item(i)).url(); if (url != null && url.length() > 0) { try { - final SitemapReader r = parse(new DigestURI(url), agent); + final SitemapReader r = parse(new DigestURL(url), agent); r.start(); URLEntry item; while ((item = r.take()) != POISON_URLEntry) { diff --git a/source/net/yacy/document/parser/swfParser.java b/source/net/yacy/document/parser/swfParser.java index 311420a8a..6be3f8edb 100644 --- a/source/net/yacy/document/parser/swfParser.java +++ b/source/net/yacy/document/parser/swfParser.java @@ -31,15 +31,13 @@ import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.Date; -import java.util.HashMap; import java.util.List; -import java.util.Map; -import java.util.Properties; +import net.yacy.cora.document.id.AnchorURL; +import net.yacy.cora.document.id.DigestURL; import 
net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; -import net.yacy.kelondro.data.meta.DigestURI; import pt.tumba.parser.swf.SWF2HTML; public class swfParser extends AbstractParser implements Parser { @@ -58,7 +56,7 @@ public class swfParser extends AbstractParser implements Parser { * all extracted information about the parsed document */ @Override - public Document[] parse(final DigestURI location, final String mimeType, + public Document[] parse(final DigestURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { @@ -81,7 +79,7 @@ public class swfParser extends AbstractParser implements Parser { final String[] sections = null; final List abstrct = new ArrayList(); //TreeSet images = null; - final Map anchors = new HashMap(); + final List anchors = new ArrayList(); int urls = 0; int urlStart = -1; int urlEnd = 0; @@ -98,9 +96,9 @@ public class swfParser extends AbstractParser implements Parser { urlEnd = contents.indexOf(linebreak,urlStart); url = contents.substring(urlStart,urlEnd); urlnr = Integer.toString(++urls).toString(); - final Properties p = new Properties(); - p.put("name", urlnr); - anchors.put(new DigestURI(url), p); + AnchorURL u = new AnchorURL(url); + u.getProperties().put("name", urlnr); + anchors.add(u); contents = contents.substring(0,urlStart)+contents.substring(urlEnd); } diff --git a/source/net/yacy/document/parser/tarParser.java b/source/net/yacy/document/parser/tarParser.java index df8f32f66..f70715218 100644 --- a/source/net/yacy/document/parser/tarParser.java +++ b/source/net/yacy/document/parser/tarParser.java @@ -33,13 +33,13 @@ import java.util.ArrayList; import java.util.List; import java.util.zip.GZIPInputStream; -import net.yacy.cora.document.MultiProtocolURI; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.DigestURL; +import 
net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.TextParser; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.FileUtils; import org.apache.tools.tar.TarEntry; @@ -61,11 +61,11 @@ public class tarParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final DigestURI url, final String mimeType, final String charset, InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse(final DigestURL url, final String mimeType, final String charset, InputStream source) throws Parser.Failure, InterruptedException { final List docacc = new ArrayList(); Document[] subDocs = null; - final String ext = MultiProtocolURI.getFileExtension(url.getFileName()).toLowerCase(); + final String ext = MultiProtocolURL.getFileExtension(url.getFileName()).toLowerCase(); if (ext.equals("gz") || ext.equals("tgz")) { try { source = new GZIPInputStream(source); @@ -90,7 +90,7 @@ public class tarParser extends AbstractParser implements Parser { try { tmp = FileUtils.createTempFile(this.getClass(), name); FileUtils.copy(tis, tmp, entry.getSize()); - subDocs = TextParser.parseSource(DigestURI.newURL(url, "#" + name), mime, null, tmp); + subDocs = TextParser.parseSource(DigestURL.newURL(url, "#" + name), mime, null, tmp); if (subDocs == null) continue; for (final Document d: subDocs) docacc.add(d); } catch (final Parser.Failure e) { diff --git a/source/net/yacy/document/parser/torrentParser.java b/source/net/yacy/document/parser/torrentParser.java index 4dcf33d4d..5c3ff5d1f 100644 --- a/source/net/yacy/document/parser/torrentParser.java +++ b/source/net/yacy/document/parser/torrentParser.java @@ -32,14 +32,14 @@ import java.util.Date; import java.util.List; import java.util.Map; -import net.yacy.cora.document.MultiProtocolURI; -import net.yacy.cora.document.UTF8; +import 
net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.document.AbstractParser; import net.yacy.document.Condenser; import net.yacy.document.Document; import net.yacy.document.LibraryProvider; import net.yacy.document.Parser; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.util.BDecoder; import net.yacy.kelondro.util.BDecoder.BObject; @@ -56,7 +56,7 @@ public class torrentParser extends AbstractParser implements Parser { } @Override - public Document[] parse(DigestURI location, String mimeType, String charset, InputStream source) + public Document[] parse(DigestURL location, String mimeType, String charset, InputStream source) throws Parser.Failure, InterruptedException { byte[] b = null; try { @@ -93,7 +93,7 @@ public class torrentParser extends AbstractParser implements Parser { final BObject nameo = info.get("name"); if (nameo != null) title = UTF8.String(nameo.getString()); } - if (title == null || title.isEmpty()) title = MultiProtocolURI.unescape(location.getFileName()); + if (title == null || title.isEmpty()) title = MultiProtocolURL.unescape(location.getFileName()); return new Document[]{new Document( location, mimeType, @@ -119,7 +119,7 @@ public class torrentParser extends AbstractParser implements Parser { try { byte[] b = FileUtils.read(new File(args[0])); torrentParser parser = new torrentParser(); - Document[] d = parser.parse(new DigestURI("http://localhost/test.torrent"), null, "UTF-8", new ByteArrayInputStream(b)); + Document[] d = parser.parse(new DigestURL("http://localhost/test.torrent"), null, "UTF-8", new ByteArrayInputStream(b)); Condenser c = new Condenser(d[0], true, true, LibraryProvider.dymLib, LibraryProvider.synonyms, false); Map w = c.words(); for (Map.Entry e: w.entrySet()) System.out.println("Word: " + e.getKey() + " - " + e.getValue().posInText); diff --git 
a/source/net/yacy/document/parser/vcfParser.java b/source/net/yacy/document/parser/vcfParser.java index 25726314c..99b46f281 100644 --- a/source/net/yacy/document/parser/vcfParser.java +++ b/source/net/yacy/document/parser/vcfParser.java @@ -38,14 +38,14 @@ import java.util.HashMap; import java.util.Iterator; import java.util.LinkedList; import java.util.List; -import java.util.Properties; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.AnchorURL; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.order.Base64Order; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; -import net.yacy.kelondro.data.meta.DigestURI; /** * Vcard specification: http://www.imc.org/pdi/vcard-21.txt @@ -65,14 +65,14 @@ public class vcfParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final DigestURI url, final String mimeType, final String charset, final InputStream source) + public Document[] parse(final DigestURL url, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { try { final StringBuilder parsedTitle = new StringBuilder(); final StringBuilder parsedDataText = new StringBuilder(); final HashMap parsedData = new HashMap(); - final HashMap anchors = new HashMap(); + final List anchors = new ArrayList(); final LinkedList parsedNames = new LinkedList(); boolean useLastLine = false; @@ -179,10 +179,9 @@ public class vcfParser extends AbstractParser implements Parser { parsedData.clear(); } else if (key.toUpperCase().startsWith("URL")) { try { - final DigestURI newURL = new DigestURI(value); - final Properties p = new Properties(); - p.put("name", newURL.toString()); - anchors.put(newURL, p); + final AnchorURL newURL = new AnchorURL(value); + newURL.getProperties().put("name", newURL.toString()); + anchors.add(newURL); //parsedData.put(key,value); } 
catch (final MalformedURLException ex) {/* ignore this */} } else if ( diff --git a/source/net/yacy/document/parser/vsdParser.java b/source/net/yacy/document/parser/vsdParser.java index 6ea7b3885..cd392d28f 100644 --- a/source/net/yacy/document/parser/vsdParser.java +++ b/source/net/yacy/document/parser/vsdParser.java @@ -32,11 +32,11 @@ import java.util.ArrayList; import java.util.Date; import java.util.List; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; -import net.yacy.kelondro.data.meta.DigestURI; import org.apache.poi.hdgf.extractor.VisioTextExtractor; import org.apache.poi.hpsf.SummaryInformation; @@ -66,7 +66,7 @@ public class vsdParser extends AbstractParser implements Parser { * all extracted information about the parsed document */ @Override - public Document[] parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) + public Document[] parse(final DigestURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { Document theDoc = null; diff --git a/source/net/yacy/document/parser/xlsParser.java b/source/net/yacy/document/parser/xlsParser.java index 850f4287a..ccd3d7d7e 100644 --- a/source/net/yacy/document/parser/xlsParser.java +++ b/source/net/yacy/document/parser/xlsParser.java @@ -30,11 +30,11 @@ package net.yacy.document.parser; import java.io.InputStream; import java.util.Date; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; -import net.yacy.kelondro.data.meta.DigestURI; import org.apache.poi.hssf.eventusermodel.HSSFEventFactory; import org.apache.poi.hssf.eventusermodel.HSSFListener; @@ -66,7 +66,7 @@ public class xlsParser extends 
AbstractParser implements Parser { * all extracted information about the parsed document */ @Override - public Document[] parse(final DigestURI location, final String mimeType, + public Document[] parse(final DigestURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { return new XLSHSSFListener().parse(location, mimeType, charset, source); @@ -86,7 +86,7 @@ public class xlsParser extends AbstractParser implements Parser { * parses the source documents and returns a Document containing * all extracted information about the parsed document */ - public Document[] parse(final DigestURI location, final String mimeType, + public Document[] parse(final DigestURL location, final String mimeType, @SuppressWarnings("unused") final String charset, final InputStream source) throws Parser.Failure, InterruptedException { try { diff --git a/source/net/yacy/document/parser/zipParser.java b/source/net/yacy/document/parser/zipParser.java index 48695ae3a..aaaeb1527 100644 --- a/source/net/yacy/document/parser/zipParser.java +++ b/source/net/yacy/document/parser/zipParser.java @@ -32,11 +32,11 @@ import java.util.List; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; +import net.yacy.cora.document.id.DigestURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.TextParser; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.MemoryControl; @@ -60,7 +60,7 @@ public class zipParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final DigestURI url, final String mimeType, + public Document[] parse(final DigestURL url, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { // check memory for parser @@ -86,7 +86,7 @@ public class zipParser extends 
AbstractParser implements Parser { try { tmp = FileUtils.createTempFile(this.getClass(), name); FileUtils.copy(zis, tmp, entry.getSize()); - final DigestURI virtualURL = DigestURI.newURL(url, "#" + name); + final DigestURL virtualURL = DigestURL.newURL(url, "#" + name); //this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false)); docs = TextParser.parseSource(virtualURL, mime, null, tmp); if (docs == null) continue; diff --git a/source/net/yacy/interaction/AugmentHtmlStream.java b/source/net/yacy/interaction/AugmentHtmlStream.java index 4a7945789..eff2cd833 100644 --- a/source/net/yacy/interaction/AugmentHtmlStream.java +++ b/source/net/yacy/interaction/AugmentHtmlStream.java @@ -8,13 +8,13 @@ import java.io.InputStream; import java.io.StringReader; import java.net.URLEncoder; -import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.http.HTTPClient; import net.yacy.cora.util.ConcurrentLog; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.search.Switchboard; import net.yacy.server.http.ServerSideIncludes; @@ -83,7 +83,7 @@ public class AugmentHtmlStream { } - public static StringBuilder process(StringBuilder data, DigestURI url, RequestHeader requestHeader) { + public static StringBuilder process(StringBuilder data, DigestURL url, RequestHeader requestHeader) { String action = requestHeader.get("YACYACTION"); requestHeader.remove("YACYACTION"); diff --git a/source/net/yacy/interaction/Interaction.java b/source/net/yacy/interaction/Interaction.java index ae5428b6d..1bf14f1b6 100644 --- a/source/net/yacy/interaction/Interaction.java +++ b/source/net/yacy/interaction/Interaction.java @@ -8,7 +8,8 @@ import java.util.Iterator; import java.util.Map; import java.util.Set; -import 
net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; @@ -17,7 +18,6 @@ import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.SpaceExceededException; import net.yacy.data.UserDB; import net.yacy.kelondro.blob.Tables.Row; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.peers.Seed; import net.yacy.search.Switchboard; @@ -86,7 +86,7 @@ public class Interaction { String domain = url; try { - DigestURI uri = new DigestURI (url); + DigestURL uri = new DigestURL (url); domain = uri.getHost(); } catch (final MalformedURLException e) { @@ -105,9 +105,9 @@ public class Interaction { String result = ""; - DigestURI uri; + DigestURL uri; try { - uri = new DigestURI (url); + uri = new DigestURL (url); result = UTF8.String(uri.hash()); diff --git a/source/net/yacy/kelondro/blob/ArrayStack.java b/source/net/yacy/kelondro/blob/ArrayStack.java index 6630d1b49..7f91a8a3e 100644 --- a/source/net/yacy/kelondro/blob/ArrayStack.java +++ b/source/net/yacy/kelondro/blob/ArrayStack.java @@ -50,8 +50,8 @@ import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; import net.yacy.cora.date.GenericFormatter; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.order.ByteOrder; import net.yacy.cora.order.CloneableIterator; import net.yacy.cora.order.NaturalOrder; diff --git a/source/net/yacy/kelondro/blob/BEncodedHeap.java b/source/net/yacy/kelondro/blob/BEncodedHeap.java index 455c758f1..58b4867e3 100644 --- a/source/net/yacy/kelondro/blob/BEncodedHeap.java +++ b/source/net/yacy/kelondro/blob/BEncodedHeap.java @@ -36,8 +36,8 @@ import java.util.Set; import java.util.TreeSet; import 
java.util.regex.Pattern; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.ByteOrder; import net.yacy.cora.order.CloneableIterator; diff --git a/source/net/yacy/kelondro/blob/BEncodedHeapBag.java b/source/net/yacy/kelondro/blob/BEncodedHeapBag.java index 5a9029737..3b7ae63c9 100644 --- a/source/net/yacy/kelondro/blob/BEncodedHeapBag.java +++ b/source/net/yacy/kelondro/blob/BEncodedHeapBag.java @@ -35,7 +35,7 @@ import java.util.List; import java.util.Map; import net.yacy.cora.date.GenericFormatter; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.ByteOrder; import net.yacy.cora.order.CloneableIterator; diff --git a/source/net/yacy/kelondro/blob/BEncodedHeapShard.java b/source/net/yacy/kelondro/blob/BEncodedHeapShard.java index 8f3723587..180c46877 100644 --- a/source/net/yacy/kelondro/blob/BEncodedHeapShard.java +++ b/source/net/yacy/kelondro/blob/BEncodedHeapShard.java @@ -31,8 +31,8 @@ import java.util.List; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.ByteOrder; import net.yacy.cora.order.CloneableIterator; diff --git a/source/net/yacy/kelondro/blob/Heap.java b/source/net/yacy/kelondro/blob/Heap.java index 019c77fb6..db4b4bdb6 100644 --- a/source/net/yacy/kelondro/blob/Heap.java +++ b/source/net/yacy/kelondro/blob/Heap.java @@ -34,7 +34,7 @@ import java.util.Map; import java.util.SortedMap; import java.util.TreeMap; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import 
net.yacy.cora.order.ByteOrder; import net.yacy.cora.order.CloneableIterator; import net.yacy.cora.order.NaturalOrder; diff --git a/source/net/yacy/kelondro/blob/HeapModifier.java b/source/net/yacy/kelondro/blob/HeapModifier.java index 41db04c2d..1e0d14a20 100644 --- a/source/net/yacy/kelondro/blob/HeapModifier.java +++ b/source/net/yacy/kelondro/blob/HeapModifier.java @@ -28,7 +28,7 @@ import java.io.File; import java.io.IOException; import java.util.SortedMap; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.order.ByteOrder; import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.SpaceExceededException; diff --git a/source/net/yacy/kelondro/blob/HeapReader.java b/source/net/yacy/kelondro/blob/HeapReader.java index 03f525127..ceacf2078 100644 --- a/source/net/yacy/kelondro/blob/HeapReader.java +++ b/source/net/yacy/kelondro/blob/HeapReader.java @@ -36,8 +36,8 @@ import java.util.Map; import java.util.Map.Entry; import java.util.concurrent.ExecutionException; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.order.ByteOrder; import net.yacy.cora.order.CloneableIterator; import net.yacy.cora.order.Digest; diff --git a/source/net/yacy/kelondro/blob/HeapWriter.java b/source/net/yacy/kelondro/blob/HeapWriter.java index 7aa3c970c..6ef001733 100644 --- a/source/net/yacy/kelondro/blob/HeapWriter.java +++ b/source/net/yacy/kelondro/blob/HeapWriter.java @@ -30,7 +30,7 @@ import java.io.File; import java.io.FileOutputStream; import java.io.IOException; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.order.ByteOrder; import net.yacy.cora.storage.HandleMap; import net.yacy.cora.util.ConcurrentLog; diff --git a/source/net/yacy/kelondro/blob/MapColumnIndex.java b/source/net/yacy/kelondro/blob/MapColumnIndex.java index 
e3822cd7a..f99d1ce05 100644 --- a/source/net/yacy/kelondro/blob/MapColumnIndex.java +++ b/source/net/yacy/kelondro/blob/MapColumnIndex.java @@ -33,7 +33,7 @@ import java.util.Iterator; import java.util.Map; import java.util.TreeMap; -import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.order.NaturalOrder; /** diff --git a/source/net/yacy/kelondro/blob/MapDataMining.java b/source/net/yacy/kelondro/blob/MapDataMining.java index 9c6fc7d6f..f82ac01ef 100644 --- a/source/net/yacy/kelondro/blob/MapDataMining.java +++ b/source/net/yacy/kelondro/blob/MapDataMining.java @@ -38,7 +38,7 @@ import java.util.Locale; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.ByteOrder; import net.yacy.cora.order.CloneableIterator; diff --git a/source/net/yacy/kelondro/blob/MapHeap.java b/source/net/yacy/kelondro/blob/MapHeap.java index a69674eac..f0780e3a3 100644 --- a/source/net/yacy/kelondro/blob/MapHeap.java +++ b/source/net/yacy/kelondro/blob/MapHeap.java @@ -44,7 +44,7 @@ import java.util.concurrent.BlockingQueue; import java.util.concurrent.ConcurrentHashMap; import net.yacy.cora.date.GenericFormatter; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.order.ByteOrder; import net.yacy.cora.order.CloneableIterator; import net.yacy.cora.order.NaturalOrder; diff --git a/source/net/yacy/kelondro/blob/Tables.java b/source/net/yacy/kelondro/blob/Tables.java index f8ee3d596..17c663162 100644 --- a/source/net/yacy/kelondro/blob/Tables.java +++ b/source/net/yacy/kelondro/blob/Tables.java @@ -44,8 +44,8 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.regex.Pattern; import net.yacy.cora.date.GenericFormatter; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.UTF8; +import 
net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.util.ByteArray; import net.yacy.cora.util.ByteBuffer; import net.yacy.cora.util.ConcurrentLog; diff --git a/source/net/yacy/kelondro/blob/TablesColumnIndex.java b/source/net/yacy/kelondro/blob/TablesColumnIndex.java index 04f0ffd80..536e4e9f4 100644 --- a/source/net/yacy/kelondro/blob/TablesColumnIndex.java +++ b/source/net/yacy/kelondro/blob/TablesColumnIndex.java @@ -28,7 +28,7 @@ import java.util.Iterator; import java.util.Map; import java.util.Set; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.order.NaturalOrder; diff --git a/source/net/yacy/kelondro/data/citation/CitationReference.java b/source/net/yacy/kelondro/data/citation/CitationReference.java index 0c2186d47..a72ed2fe8 100644 --- a/source/net/yacy/kelondro/data/citation/CitationReference.java +++ b/source/net/yacy/kelondro/data/citation/CitationReference.java @@ -24,7 +24,7 @@ import java.io.Serializable; import java.util.Collection; import net.yacy.cora.date.MicroDate; -import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.order.Base64Order; import net.yacy.cora.util.ByteArray; import net.yacy.kelondro.data.word.Word; diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java index b893aaff9..3e3c73b42 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java @@ -33,11 +33,12 @@ import java.util.regex.Pattern; import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.date.MicroDate; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.MultiProtocolURI; -import net.yacy.cora.document.UTF8; import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.document.analysis.Classification.ContentDomain; +import 
net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.federate.solr.SolrType; import net.yacy.cora.lod.vocabulary.Tagging; import net.yacy.cora.order.Base64Order; @@ -61,7 +62,7 @@ public class URIMetadataNode { private byte[] hash = null; private String urlRaw = null, keywords = null; - private DigestURI url = null; + private DigestURL url = null; private Bitfield flags = null; private int imagec = -1, audioc = -1, videoc = -1, appc = -1; private double lat = Double.NaN, lon = Double.NaN; @@ -79,7 +80,7 @@ public class URIMetadataNode { this.hash = ASCII.getBytes(getString(CollectionSchema.id)); this.urlRaw = getString(CollectionSchema.sku); try { - this.url = new DigestURI(this.urlRaw, this.hash); + this.url = new DigestURL(this.urlRaw, this.hash); } catch (final MalformedURLException e) { ConcurrentLog.logException(e); this.url = null; @@ -110,7 +111,7 @@ public class URIMetadataNode { return getDate(CollectionSchema.last_modified); } - public DigestURI url() { + public DigestURL url() { return this.url; } @@ -202,7 +203,7 @@ public class URIMetadataNode { if (flags == null) { this.flags = new Bitfield(); if (dc_subject() != null && dc_subject().indexOf("indexof") >= 0) this.flags.set(Condenser.flag_cat_indexof, true); - ContentDomain cd = Classification.getContentDomain(MultiProtocolURI.getFileExtension(this.url().getFileName())); + ContentDomain cd = Classification.getContentDomain(MultiProtocolURL.getFileExtension(this.url().getFileName())); if (lon() != 0.0d || lat() != 0.0d) this.flags.set(Condenser.flag_cat_haslocation, true); if (cd == ContentDomain.IMAGE || limage() > 0) this.flags.set(Condenser.flag_cat_hasimage, true); if (cd == ContentDomain.AUDIO || laudio() > 0) this.flags.set(Condenser.flag_cat_hasaudio, true); diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java 
b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java index b1797de38..2bf071661 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java @@ -34,8 +34,9 @@ import java.util.Properties; import java.util.regex.Pattern; import net.yacy.cora.date.GenericFormatter; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.lod.vocabulary.Tagging; import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.Digest; @@ -118,10 +119,10 @@ public class URIMetadataRow { // generates an plasmaLURLEntry using the properties from the argument // the property names must correspond to the one from toString //System.out.println("DEBUG-ENTRY: prop=" + prop.toString()); - DigestURI url; + DigestURL url; String urls = crypt.simpleDecode(prop.getProperty("url", "")); try { - url = new DigestURI(urls); + url = new DigestURL(urls); } catch (final MalformedURLException e) { throw new kelondroException("bad url: " + urls); } @@ -210,7 +211,7 @@ public class URIMetadataRow { } private static byte[] encodeComp( - final DigestURI url, + final DigestURL url, final String dc_title, final String dc_creator, final String dc_subject, @@ -252,7 +253,7 @@ public class URIMetadataRow { return this.metadata().matches(matcher); } - public DigestURI url() { + public DigestURL url() { return this.metadata().url(); } @@ -465,7 +466,7 @@ public class URIMetadataRow { } private class Components { - private DigestURI url; + private DigestURL url; private String urlRaw; private byte[] urlHash; private final String dc_title, dc_creator, dc_subject, dc_publisher; @@ -493,10 +494,10 @@ public class URIMetadataRow { if (this.url != null) return matcher.matcher(this.url.toNormalform(true).toLowerCase()).matches(); return false; } - public DigestURI url() { + 
public DigestURL url() { if (this.url == null) { try { - this.url = new DigestURI(this.urlRaw, this.urlHash); + this.url = new DigestURL(this.urlRaw, this.urlHash); } catch (final MalformedURLException e) { this.url = null; } diff --git a/source/net/yacy/kelondro/data/navigation/NavigationReferenceRow.java b/source/net/yacy/kelondro/data/navigation/NavigationReferenceRow.java index 18375ddbc..40bf60454 100644 --- a/source/net/yacy/kelondro/data/navigation/NavigationReferenceRow.java +++ b/source/net/yacy/kelondro/data/navigation/NavigationReferenceRow.java @@ -28,7 +28,7 @@ package net.yacy.kelondro.data.navigation; import java.util.Collection; -import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.order.Base64Order; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.index.Column; diff --git a/source/net/yacy/kelondro/data/navigation/NavigationReferenceVars.java b/source/net/yacy/kelondro/data/navigation/NavigationReferenceVars.java index e1771296c..a0bc1db86 100644 --- a/source/net/yacy/kelondro/data/navigation/NavigationReferenceVars.java +++ b/source/net/yacy/kelondro/data/navigation/NavigationReferenceVars.java @@ -29,7 +29,7 @@ package net.yacy.kelondro.data.navigation; import java.io.Serializable; import java.util.Collection; -import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; import net.yacy.kelondro.index.Row.Entry; import net.yacy.kelondro.rwi.AbstractReference; import net.yacy.kelondro.rwi.Reference; diff --git a/source/net/yacy/kelondro/data/word/WordReferenceFactory.java b/source/net/yacy/kelondro/data/word/WordReferenceFactory.java index e29413ca5..a7db4c03a 100644 --- a/source/net/yacy/kelondro/data/word/WordReferenceFactory.java +++ b/source/net/yacy/kelondro/data/word/WordReferenceFactory.java @@ -35,7 +35,7 @@ import java.util.Set; import java.util.SortedMap; import java.util.TreeMap; -import net.yacy.cora.document.ASCII; +import 
net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.util.ByteBuffer; import net.yacy.kelondro.index.Row; import net.yacy.kelondro.index.Row.Entry; diff --git a/source/net/yacy/kelondro/data/word/WordReferenceRow.java b/source/net/yacy/kelondro/data/word/WordReferenceRow.java index 74333e498..45614a3a8 100644 --- a/source/net/yacy/kelondro/data/word/WordReferenceRow.java +++ b/source/net/yacy/kelondro/data/word/WordReferenceRow.java @@ -30,7 +30,7 @@ import java.util.ArrayList; import java.util.Collection; import net.yacy.cora.date.MicroDate; -import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.order.Base64Order; import net.yacy.cora.util.ByteArray; import net.yacy.kelondro.index.Column; diff --git a/source/net/yacy/kelondro/data/word/WordReferenceVars.java b/source/net/yacy/kelondro/data/word/WordReferenceVars.java index 8ecb4c17b..8cc47ecd9 100644 --- a/source/net/yacy/kelondro/data/word/WordReferenceVars.java +++ b/source/net/yacy/kelondro/data/word/WordReferenceVars.java @@ -33,9 +33,9 @@ import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; import net.yacy.cora.date.MicroDate; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.MultiProtocolURI; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.order.Base64Order; import net.yacy.cora.util.ByteArray; import net.yacy.cora.util.ConcurrentLog; @@ -83,7 +83,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc this.positions = new LinkedBlockingQueue(); this.positions.add(1); String urlNormalform = md.url().toNormalform(true); - this.urlcomps = MultiProtocolURI.urlComps(urlNormalform).length; + this.urlcomps = MultiProtocolURL.urlComps(urlNormalform).length; this.urllength = urlNormalform.length(); this.virtualAge = -1; 
// compute that later // the following fields cannot be computed here very easy and are just filled with dummy values diff --git a/source/net/yacy/kelondro/index/IndexTest.java b/source/net/yacy/kelondro/index/IndexTest.java index 8d180e87e..375b05b04 100644 --- a/source/net/yacy/kelondro/index/IndexTest.java +++ b/source/net/yacy/kelondro/index/IndexTest.java @@ -32,8 +32,8 @@ import java.util.HashMap; import java.util.Random; import java.util.TreeMap; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.order.Base64Order; import net.yacy.cora.storage.HandleMap; import net.yacy.cora.util.ByteArray; diff --git a/source/net/yacy/kelondro/index/Row.java b/source/net/yacy/kelondro/index/Row.java index 391b0b3a5..85240805c 100644 --- a/source/net/yacy/kelondro/index/Row.java +++ b/source/net/yacy/kelondro/index/Row.java @@ -37,8 +37,8 @@ import java.util.StringTokenizer; import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.ConcurrentHashMap; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.order.AbstractOrder; import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.ByteOrder; diff --git a/source/net/yacy/kelondro/index/RowCollection.java b/source/net/yacy/kelondro/index/RowCollection.java index 056bc2b02..442d47c2e 100644 --- a/source/net/yacy/kelondro/index/RowCollection.java +++ b/source/net/yacy/kelondro/index/RowCollection.java @@ -33,8 +33,8 @@ import java.util.List; import java.util.Random; import java.util.concurrent.Callable; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.order.Base64Order; import 
net.yacy.cora.order.ByteOrder; import net.yacy.cora.order.NaturalOrder; diff --git a/source/net/yacy/kelondro/index/RowHandleSet.java b/source/net/yacy/kelondro/index/RowHandleSet.java index 7c9fe34cf..49b5038ce 100644 --- a/source/net/yacy/kelondro/index/RowHandleSet.java +++ b/source/net/yacy/kelondro/index/RowHandleSet.java @@ -38,7 +38,7 @@ import java.io.Serializable; import java.util.Iterator; import java.util.Set; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.order.ByteOrder; import net.yacy.cora.order.CloneableIterator; import net.yacy.cora.order.NaturalOrder; diff --git a/source/net/yacy/kelondro/index/RowSet.java b/source/net/yacy/kelondro/index/RowSet.java index 43b7e152a..d43ef65f1 100644 --- a/source/net/yacy/kelondro/index/RowSet.java +++ b/source/net/yacy/kelondro/index/RowSet.java @@ -34,8 +34,8 @@ import java.util.Map; import java.util.Random; import java.util.TreeMap; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.CloneableIterator; import net.yacy.cora.order.NaturalOrder; diff --git a/source/net/yacy/kelondro/io/AbstractWriter.java b/source/net/yacy/kelondro/io/AbstractWriter.java index 2ad3a8ce5..f0da9b959 100644 --- a/source/net/yacy/kelondro/io/AbstractWriter.java +++ b/source/net/yacy/kelondro/io/AbstractWriter.java @@ -32,7 +32,7 @@ import java.util.HashMap; import java.util.Iterator; import java.util.Map; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.util.ByteBuffer; diff --git a/source/net/yacy/kelondro/io/BufferedRecords.java b/source/net/yacy/kelondro/io/BufferedRecords.java index c98e2b417..68656adce 100644 --- a/source/net/yacy/kelondro/io/BufferedRecords.java +++ b/source/net/yacy/kelondro/io/BufferedRecords.java @@ -29,7 +29,7 @@ 
import java.io.IOException; import java.util.Map; import java.util.TreeMap; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.util.ConcurrentLog; import net.yacy.kelondro.util.FileUtils; diff --git a/source/net/yacy/kelondro/io/CachedRecords.java b/source/net/yacy/kelondro/io/CachedRecords.java index 70366d0a0..6b98a7332 100644 --- a/source/net/yacy/kelondro/io/CachedRecords.java +++ b/source/net/yacy/kelondro/io/CachedRecords.java @@ -30,7 +30,7 @@ import java.io.FileOutputStream; import java.io.IOException; import java.io.RandomAccessFile; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.util.ConcurrentLog; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.MemoryControl; diff --git a/source/net/yacy/kelondro/io/CharBuffer.java b/source/net/yacy/kelondro/io/CharBuffer.java index 401e2da83..5edc4b54a 100644 --- a/source/net/yacy/kelondro/io/CharBuffer.java +++ b/source/net/yacy/kelondro/io/CharBuffer.java @@ -31,7 +31,7 @@ import java.io.IOException; import java.io.Writer; import java.util.Properties; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; public final class CharBuffer extends Writer { diff --git a/source/net/yacy/kelondro/io/Records.java b/source/net/yacy/kelondro/io/Records.java index 0f3d30959..02d139f92 100644 --- a/source/net/yacy/kelondro/io/Records.java +++ b/source/net/yacy/kelondro/io/Records.java @@ -30,7 +30,7 @@ import java.io.FileOutputStream; import java.io.IOException; import java.io.RandomAccessFile; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.util.ConcurrentLog; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.MemoryControl; diff --git a/source/net/yacy/kelondro/rwi/ReferenceContainer.java b/source/net/yacy/kelondro/rwi/ReferenceContainer.java index def34d070..d33cdec49 100644 --- 
a/source/net/yacy/kelondro/rwi/ReferenceContainer.java +++ b/source/net/yacy/kelondro/rwi/ReferenceContainer.java @@ -35,7 +35,7 @@ import java.util.Iterator; import java.util.List; import java.util.TreeMap; -import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.ByteOrder; import net.yacy.cora.storage.HandleSet; diff --git a/source/net/yacy/kelondro/rwi/ReferenceIterator.java b/source/net/yacy/kelondro/rwi/ReferenceIterator.java index 4ce6e0e7a..6acb1eeff 100644 --- a/source/net/yacy/kelondro/rwi/ReferenceIterator.java +++ b/source/net/yacy/kelondro/rwi/ReferenceIterator.java @@ -30,7 +30,7 @@ import java.io.File; import java.io.IOException; import java.util.Map; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.order.CloneableIterator; import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.LookAheadIterator; diff --git a/source/net/yacy/kelondro/table/Relations.java b/source/net/yacy/kelondro/table/Relations.java index 3a5d5ec2a..da2116c48 100644 --- a/source/net/yacy/kelondro/table/Relations.java +++ b/source/net/yacy/kelondro/table/Relations.java @@ -30,7 +30,7 @@ import java.io.File; import java.io.IOException; import java.util.HashMap; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.order.NaturalOrder; import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.NumberTools; diff --git a/source/net/yacy/kelondro/table/SQLTable.java b/source/net/yacy/kelondro/table/SQLTable.java index fa2f947ae..df8b9a5b6 100644 --- a/source/net/yacy/kelondro/table/SQLTable.java +++ b/source/net/yacy/kelondro/table/SQLTable.java @@ -39,8 +39,8 @@ import java.util.List; import java.util.Map; import java.util.TreeMap; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.ASCII; +import 
net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.order.CloneableIterator; import net.yacy.cora.util.ConcurrentLog; import net.yacy.kelondro.index.Index; diff --git a/source/net/yacy/kelondro/table/Table.java b/source/net/yacy/kelondro/table/Table.java index e0faf9577..fb2d9ef7d 100644 --- a/source/net/yacy/kelondro/table/Table.java +++ b/source/net/yacy/kelondro/table/Table.java @@ -38,7 +38,7 @@ import java.util.Map; import java.util.TreeMap; import java.util.TreeSet; -import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.order.CloneableIterator; import net.yacy.cora.order.NaturalOrder; import net.yacy.cora.storage.HandleMap; diff --git a/source/net/yacy/kelondro/util/BDecoder.java b/source/net/yacy/kelondro/util/BDecoder.java index 4c86ae310..d12b0fd30 100644 --- a/source/net/yacy/kelondro/util/BDecoder.java +++ b/source/net/yacy/kelondro/util/BDecoder.java @@ -36,8 +36,8 @@ import java.util.LinkedHashMap; import java.util.List; import java.util.Map; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; public class BDecoder { diff --git a/source/net/yacy/kelondro/util/BEncoder.java b/source/net/yacy/kelondro/util/BEncoder.java index 28ed02c25..7dd7060e8 100644 --- a/source/net/yacy/kelondro/util/BEncoder.java +++ b/source/net/yacy/kelondro/util/BEncoder.java @@ -29,7 +29,7 @@ import java.io.IOException; import java.util.HashMap; import java.util.Map; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.kelondro.util.BDecoder.BObject; public class BEncoder { diff --git a/source/net/yacy/kelondro/util/FileUtils.java b/source/net/yacy/kelondro/util/FileUtils.java index ee210a882..4a637491a 100644 --- a/source/net/yacy/kelondro/util/FileUtils.java +++ b/source/net/yacy/kelondro/util/FileUtils.java @@ -53,7 +53,7 @@ import 
java.util.concurrent.ConcurrentHashMap; import java.util.regex.Pattern; import java.util.zip.GZIPInputStream; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.storage.Files; import net.yacy.cora.util.ConcurrentLog; diff --git a/source/net/yacy/kelondro/util/OS.java b/source/net/yacy/kelondro/util/OS.java index c600cd9c8..502d2e655 100644 --- a/source/net/yacy/kelondro/util/OS.java +++ b/source/net/yacy/kelondro/util/OS.java @@ -33,7 +33,7 @@ import java.util.Map; import java.util.Properties; import java.util.Vector; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.NumberTools; import net.yacy.server.serverCore; diff --git a/source/net/yacy/kelondro/util/SetTools.java b/source/net/yacy/kelondro/util/SetTools.java index 9cd054b42..3306e8372 100644 --- a/source/net/yacy/kelondro/util/SetTools.java +++ b/source/net/yacy/kelondro/util/SetTools.java @@ -43,7 +43,7 @@ import java.util.SortedSet; import java.util.TreeMap; import java.util.TreeSet; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.storage.HandleSet; import net.yacy.cora.util.ConcurrentLog; diff --git a/source/net/yacy/peers/Accessible.java b/source/net/yacy/peers/Accessible.java index 15f47d1d0..c3209bb63 100644 --- a/source/net/yacy/peers/Accessible.java +++ b/source/net/yacy/peers/Accessible.java @@ -30,7 +30,7 @@ package net.yacy.peers; import java.io.File; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.kelondro.util.FileUtils; import net.yacy.search.Switchboard; diff --git a/source/net/yacy/peers/DHTSelection.java b/source/net/yacy/peers/DHTSelection.java index 268357814..cfc05a1d7 100644 --- a/source/net/yacy/peers/DHTSelection.java +++ b/source/net/yacy/peers/DHTSelection.java @@ -36,7 +36,7 @@ import java.util.SortedMap; import 
java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; -import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.federate.yacy.Distribution; import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.Digest; diff --git a/source/net/yacy/peers/Dispatcher.java b/source/net/yacy/peers/Dispatcher.java index b206ba3cc..427549e88 100644 --- a/source/net/yacy/peers/Dispatcher.java +++ b/source/net/yacy/peers/Dispatcher.java @@ -32,7 +32,7 @@ import java.util.List; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; -import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.federate.yacy.Distribution; import net.yacy.cora.order.Base64Order; import net.yacy.cora.storage.HandleSet; diff --git a/source/net/yacy/peers/EventChannel.java b/source/net/yacy/peers/EventChannel.java index c21a7d151..25ac25f9b 100644 --- a/source/net/yacy/peers/EventChannel.java +++ b/source/net/yacy/peers/EventChannel.java @@ -29,8 +29,8 @@ import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; -import net.yacy.cora.document.RSSFeed; -import net.yacy.cora.document.RSSMessage; +import net.yacy.cora.document.feed.RSSFeed; +import net.yacy.cora.document.feed.RSSMessage; public enum EventChannel { TEST, diff --git a/source/net/yacy/peers/Network.java b/source/net/yacy/peers/Network.java index 1d2b47f0b..2b4935205 100644 --- a/source/net/yacy/peers/Network.java +++ b/source/net/yacy/peers/Network.java @@ -50,12 +50,12 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.Semaphore; import net.yacy.cora.date.GenericFormatter; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.RSSFeed; -import net.yacy.cora.document.RSSMessage; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.feed.RSSFeed; +import 
net.yacy.cora.document.feed.RSSMessage; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.protocol.Domains; import net.yacy.cora.util.ConcurrentLog; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.peers.operation.yacySeedUploadFile; import net.yacy.peers.operation.yacySeedUploadFtp; import net.yacy.peers.operation.yacySeedUploadScp; @@ -753,7 +753,7 @@ public class Network } // ensure that the seed file url is configured properly - DigestURI seedURL; + DigestURL seedURL; try { final String seedURLStr = sb.peers.mySeed().get(Seed.SEEDLISTURL, ""); if ( seedURLStr.isEmpty() ) { @@ -763,7 +763,7 @@ public class Network "https://")) ) { throw new MalformedURLException("Unsupported protocol."); } - seedURL = new DigestURI(seedURLStr); + seedURL = new DigestURL(seedURLStr); final String host = seedURL.getHost(); if (Domains.isLocalhost(host)) { // check seedlist reacheable final String errorMsg = "seedURL in localhost rejected (localhost can't be reached from outside)"; diff --git a/source/net/yacy/peers/NewsDB.java b/source/net/yacy/peers/NewsDB.java index a01ac8a51..cef2912a4 100644 --- a/source/net/yacy/peers/NewsDB.java +++ b/source/net/yacy/peers/NewsDB.java @@ -54,7 +54,7 @@ import java.util.Map.Entry; import java.util.Properties; import net.yacy.cora.date.GenericFormatter; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.NaturalOrder; import net.yacy.cora.util.ConcurrentLog; diff --git a/source/net/yacy/peers/NewsPool.java b/source/net/yacy/peers/NewsPool.java index 0be6eeac3..7d946359a 100644 --- a/source/net/yacy/peers/NewsPool.java +++ b/source/net/yacy/peers/NewsPool.java @@ -53,9 +53,9 @@ import java.util.Map; import java.util.Properties; import java.util.Set; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.SpaceExceededException; -import 
net.yacy.kelondro.data.meta.DigestURI; import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.search.Switchboard; @@ -343,13 +343,13 @@ public class NewsPool { if (record.created().getTime() == 0) return; final Map attributes = record.attributes(); if (attributes.containsKey("url")){ - if (Switchboard.urlBlacklist.isListed(BlacklistType.NEWS, new DigestURI(attributes.get("url")))){ + if (Switchboard.urlBlacklist.isListed(BlacklistType.NEWS, new DigestURL(attributes.get("url")))){ System.out.println("DEBUG: ignored news-entry url blacklisted: " + attributes.get("url")); return; } } if (attributes.containsKey("startURL")){ - if (Switchboard.urlBlacklist.isListed(BlacklistType.NEWS, new DigestURI(attributes.get("startURL")))){ + if (Switchboard.urlBlacklist.isListed(BlacklistType.NEWS, new DigestURL(attributes.get("startURL")))){ System.out.println("DEBUG: ignored news-entry url blacklisted: " + attributes.get("startURL")); return; } diff --git a/source/net/yacy/peers/NewsQueue.java b/source/net/yacy/peers/NewsQueue.java index f06bc8f98..6d37ed7f9 100644 --- a/source/net/yacy/peers/NewsQueue.java +++ b/source/net/yacy/peers/NewsQueue.java @@ -50,7 +50,7 @@ import java.util.HashSet; import java.util.Iterator; import net.yacy.cora.date.GenericFormatter; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.order.NaturalOrder; import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.SpaceExceededException; diff --git a/source/net/yacy/peers/PeerActions.java b/source/net/yacy/peers/PeerActions.java index 0ce06780b..c7bc831b4 100644 --- a/source/net/yacy/peers/PeerActions.java +++ b/source/net/yacy/peers/PeerActions.java @@ -26,8 +26,8 @@ package net.yacy.peers; import java.util.Map; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.RSSMessage; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.feed.RSSMessage; import net.yacy.cora.storage.ConcurrentARC; 
import net.yacy.kelondro.util.MapTools; import net.yacy.peers.operation.yacyVersion; diff --git a/source/net/yacy/peers/Protocol.java b/source/net/yacy/peers/Protocol.java index fe11e4512..d2732b95d 100644 --- a/source/net/yacy/peers/Protocol.java +++ b/source/net/yacy/peers/Protocol.java @@ -59,17 +59,13 @@ import java.util.TreeMap; import net.yacy.migration; import net.yacy.cora.date.GenericFormatter; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.JSONArray; -import net.yacy.cora.document.JSONException; -import net.yacy.cora.document.JSONObject; -import net.yacy.cora.document.JSONTokener; -import net.yacy.cora.document.MultiProtocolURI; -import net.yacy.cora.document.RSSFeed; -import net.yacy.cora.document.RSSMessage; -import net.yacy.cora.document.RSSReader; -import net.yacy.cora.document.UTF8; import net.yacy.cora.document.analysis.Classification; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.feed.RSSFeed; +import net.yacy.cora.document.feed.RSSMessage; +import net.yacy.cora.document.feed.RSSReader; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.federate.opensearch.SRURSSConnector; import net.yacy.cora.federate.solr.connector.RemoteSolrConnector; import net.yacy.cora.federate.solr.connector.SolrConnector; @@ -85,6 +81,10 @@ import net.yacy.cora.sorting.ReversibleScoreMap; import net.yacy.cora.storage.HandleSet; import net.yacy.cora.util.ByteBuffer; import net.yacy.cora.util.ConcurrentLog; +import net.yacy.cora.util.JSONArray; +import net.yacy.cora.util.JSONException; +import net.yacy.cora.util.JSONObject; +import net.yacy.cora.util.JSONTokener; import net.yacy.cora.util.SpaceExceededException; import net.yacy.crawler.data.ResultURLs; import net.yacy.crawler.data.ResultURLs.EventOrigin; @@ -154,7 +154,7 @@ public final class Protocol { final HTTPClient httpClient = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent); 
httpClient.setTimout(timeout); return httpClient.POSTbytes( - new MultiProtocolURI("http://" + targetAddress + "/yacy/" + filename), + new MultiProtocolURL("http://" + targetAddress + "/yacy/" + filename), Seed.b64Hash2hexHash(targetPeerHash) + ".yacyh", parts, false); @@ -197,7 +197,7 @@ public final class Protocol { final HTTPClient httpClient = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent, 30000); content = httpClient.POSTbytes( - new MultiProtocolURI("http://" + address + "/yacy/hello.html"), + new MultiProtocolURL("http://" + address + "/yacy/hello.html"), Seed.b64Hash2hexHash(otherHash) + ".yacyh", parts, false); @@ -517,7 +517,7 @@ public final class Protocol { // final byte[] result = HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(new MultiProtocolURI("http://" + target.getClusterAddress() + "/yacy/urls.xml"), (int) maxTime, target.getHexHash() + ".yacyh", parts); final HTTPClient httpClient = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent, (int) maxTime); final byte[] result = - httpClient.POSTbytes(new MultiProtocolURI("http://" + httpClient.POSTbytes(new MultiProtocolURL("http://" + target.getClusterAddress() + "/yacy/urls.xml"), target.getHexHash() + ".yacyh", parts, false); final RSSReader reader = RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, result); @@ -938,7 +938,7 @@ public final class Protocol { } final HTTPClient httpClient = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent, 8000); - byte[] a = httpClient.POSTbytes(new MultiProtocolURI("http://" + hostaddress + "/yacy/search.html"), hostname, parts, false); + byte[] a = httpClient.POSTbytes(new MultiProtocolURL("http://" + hostaddress + "/yacy/search.html"), hostname, parts, false); if (a != null && a.length > 200000) { // there is something wrong. This is too large, maybe a hack on the other side? 
a = null; @@ -1294,7 +1294,7 @@ public final class Protocol { final HTTPClient httpClient = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent, 10000); final byte[] content = httpClient.POSTbytes( - new MultiProtocolURI("http://" + address + "/yacy/crawlReceipt.html"), + new MultiProtocolURL("http://" + address + "/yacy/crawlReceipt.html"), target.getHexHash() + ".yacyh", parts, false); @@ -1473,7 +1473,7 @@ public final class Protocol { final HTTPClient httpClient = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent, timeout); final byte[] content = httpClient.POSTbytes( - new MultiProtocolURI("http://" + address + "/yacy/transferRWI.html"), + new MultiProtocolURL("http://" + address + "/yacy/transferRWI.html"), targetSeed.getHexHash() + ".yacyh", parts, gzipBody); @@ -1531,7 +1531,7 @@ public final class Protocol { final HTTPClient httpClient = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent, timeout); final byte[] content = httpClient.POSTbytes( - new MultiProtocolURI("http://" + address + "/yacy/transferURL.html"), + new MultiProtocolURL("http://" + address + "/yacy/transferURL.html"), targetSeed.getHexHash() + ".yacyh", parts, gzipBody); @@ -1564,7 +1564,7 @@ public final class Protocol { final HTTPClient httpclient = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent, 15000); final byte[] content = httpclient.POSTbytes( - new MultiProtocolURI("http://" + address + "/yacy/profile.html"), + new MultiProtocolURL("http://" + address + "/yacy/profile.html"), targetSeed.getHexHash() + ".yacyh", parts, false); diff --git a/source/net/yacy/peers/RemoteSearch.java b/source/net/yacy/peers/RemoteSearch.java index 73e468d27..53685298e 100644 --- a/source/net/yacy/peers/RemoteSearch.java +++ b/source/net/yacy/peers/RemoteSearch.java @@ -32,7 +32,7 @@ import java.util.SortedMap; import org.apache.solr.client.solrj.SolrQuery; -import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; import 
net.yacy.cora.storage.HandleSet; import net.yacy.cora.util.ConcurrentLog; import net.yacy.repository.Blacklist; diff --git a/source/net/yacy/peers/Seed.java b/source/net/yacy/peers/Seed.java index 0765a14f2..82e45db1e 100644 --- a/source/net/yacy/peers/Seed.java +++ b/source/net/yacy/peers/Seed.java @@ -60,8 +60,8 @@ import java.util.regex.Pattern; import net.yacy.cora.date.AbstractFormatter; import net.yacy.cora.date.GenericFormatter; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.federate.yacy.Distribution; import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.Digest; diff --git a/source/net/yacy/peers/SeedDB.java b/source/net/yacy/peers/SeedDB.java index 06d7e9f07..85cd90f68 100644 --- a/source/net/yacy/peers/SeedDB.java +++ b/source/net/yacy/peers/SeedDB.java @@ -38,8 +38,9 @@ import java.util.TreeMap; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.federate.yacy.Distribution; import net.yacy.cora.order.Base64Order; import net.yacy.cora.protocol.ClientIdentification; @@ -50,7 +51,6 @@ import net.yacy.cora.protocol.http.HTTPClient; import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.SpaceExceededException; import net.yacy.kelondro.blob.MapDataMining; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.kelondroException; @@ -760,7 +760,7 @@ public final class SeedDB implements AlternativeDomainNames { protected String uploadSeedList(final yacySeedUploader uploader, final serverSwitch sb, final SeedDB seedDB, - final 
DigestURI seedURL) throws Exception { + final DigestURL seedURL) throws Exception { // upload a seed file, if possible if (seedURL == null) throw new NullPointerException("UPLOAD - Error: URL not given"); @@ -801,7 +801,7 @@ public final class SeedDB implements AlternativeDomainNames { return log; } - private static Iterator downloadSeedFile(final DigestURI seedURL) throws IOException { + private static Iterator downloadSeedFile(final DigestURL seedURL) throws IOException { // Configure http headers final RequestHeader reqHeader = new RequestHeader(); reqHeader.put(HeaderFramework.PRAGMA, "no-cache"); diff --git a/source/net/yacy/peers/Transmission.java b/source/net/yacy/peers/Transmission.java index 9beafd8c3..47799fd7c 100644 --- a/source/net/yacy/peers/Transmission.java +++ b/source/net/yacy/peers/Transmission.java @@ -32,7 +32,7 @@ import java.util.List; import java.util.Random; import java.util.Set; -import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.storage.HandleSet; import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.SpaceExceededException; diff --git a/source/net/yacy/peers/graphics/NetworkGraph.java b/source/net/yacy/peers/graphics/NetworkGraph.java index e60301e27..ef9e2f495 100644 --- a/source/net/yacy/peers/graphics/NetworkGraph.java +++ b/source/net/yacy/peers/graphics/NetworkGraph.java @@ -34,9 +34,9 @@ import java.util.Date; import java.util.Iterator; import java.util.List; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.Hit; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.feed.Hit; import net.yacy.cora.federate.yacy.Distribution; import net.yacy.cora.util.ConcurrentLog; import net.yacy.peers.EventChannel; diff --git a/source/net/yacy/peers/graphics/OSMTile.java b/source/net/yacy/peers/graphics/OSMTile.java index b120ecd48..49381b77a 100644 --- 
a/source/net/yacy/peers/graphics/OSMTile.java +++ b/source/net/yacy/peers/graphics/OSMTile.java @@ -34,12 +34,12 @@ import java.util.List; import javax.imageio.ImageIO; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.data.Cache; import net.yacy.crawler.retrieval.Response; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.search.Switchboard; import net.yacy.visualization.RasterPlotter; @@ -101,9 +101,9 @@ public class OSMTile { } public static BufferedImage getSingleTile(final tileCoordinates tile, final int retry) { - DigestURI tileURL; + DigestURL tileURL; try { - tileURL = new DigestURI(tile.url(retry)); + tileURL = new DigestURL(tile.url(retry)); } catch (final MalformedURLException e) { return null; } diff --git a/source/net/yacy/peers/graphics/WebStructureGraph.java b/source/net/yacy/peers/graphics/WebStructureGraph.java index aa7a23c47..8dc512ce5 100644 --- a/source/net/yacy/peers/graphics/WebStructureGraph.java +++ b/source/net/yacy/peers/graphics/WebStructureGraph.java @@ -45,8 +45,9 @@ import java.util.concurrent.LinkedBlockingQueue; import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.date.MicroDate; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.order.Base64Order; import net.yacy.cora.sorting.ClusteredScoreMap; import net.yacy.cora.sorting.ReversibleScoreMap; @@ -54,7 +55,6 @@ import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.LookAheadIterator; import net.yacy.cora.util.SpaceExceededException; import net.yacy.document.Document; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.index.Row; import net.yacy.kelondro.index.Row.Entry; import 
net.yacy.kelondro.rwi.AbstractReference; @@ -81,10 +81,10 @@ public class WebStructureGraph { private final static LearnObject leanrefObjectPOISON = new LearnObject(null, null); private static class LearnObject { - private final DigestURI url; - private final Set globalRefURLs; + private final DigestURL url; + private final Set globalRefURLs; - private LearnObject(final DigestURI url, final Set globalRefURLs) { + private LearnObject(final DigestURL url, final Set globalRefURLs) { this.url = url; this.globalRefURLs = globalRefURLs; } @@ -157,13 +157,13 @@ public class WebStructureGraph { this.structure_new.clear(); } - public void generateCitationReference(final DigestURI url, final Document document) { + public void generateCitationReference(final DigestURL url, final Document document) { // generate citation reference - final Map hl = document.getHyperlinks(); - final Iterator it = hl.keySet().iterator(); - final HashSet globalRefURLs = new HashSet(); + final Map hl = document.getHyperlinks(); + final Iterator it = hl.keySet().iterator(); + final HashSet globalRefURLs = new HashSet(); final String refhost = url.getHost(); - DigestURI u; + DigestURL u; int maxref = 1000; while ( it.hasNext() && maxref-- > 0 ) { u = it.next(); @@ -189,8 +189,8 @@ public class WebStructureGraph { } } - public void generateCitationReference(final DigestURI from, final DigestURI to) { - final HashSet globalRefURLs = new HashSet(); + public void generateCitationReference(final DigestURL from, final DigestURL to) { + final HashSet globalRefURLs = new HashSet(); final String refhost = from.getHost(); if (refhost != null && to.getHost() != null && !to.getHost().equals(refhost)) globalRefURLs.add(to); final LearnObject lro = new LearnObject(from, globalRefURLs); @@ -586,7 +586,7 @@ public class WebStructureGraph { private void learnrefs(final LearnObject lro) { final Set refhosts = new HashSet(); String hosthash; - for ( final DigestURI u : lro.globalRefURLs ) { + for ( final DigestURL u : 
lro.globalRefURLs ) { if (Switchboard.getSwitchboard().shallTerminate()) break; hosthash = ASCII.String(u.hash(), 6, 6); if (!exists(hosthash)) { @@ -597,7 +597,7 @@ public class WebStructureGraph { } refhosts.add(hosthash); } - final DigestURI url = lro.url; + final DigestURL url = lro.url; hosthash = ASCII.String(url.hash(), 6, 6); // parse the new reference string and join it with the stored references diff --git a/source/net/yacy/peers/operation/yacyRelease.java b/source/net/yacy/peers/operation/yacyRelease.java index 703b1eb11..95f0c4a35 100644 --- a/source/net/yacy/peers/operation/yacyRelease.java +++ b/source/net/yacy/peers/operation/yacyRelease.java @@ -37,15 +37,17 @@ import java.security.NoSuchAlgorithmException; import java.security.PublicKey; import java.security.SignatureException; import java.util.ArrayList; +import java.util.Collection; import java.util.List; import java.util.Map; -import java.util.Properties; import java.util.SortedSet; import java.util.TreeSet; import java.util.concurrent.ConcurrentHashMap; -import net.yacy.cora.document.MultiProtocolURI; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.AnchorURL; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.order.Base64Order; import net.yacy.cora.protocol.ClientIdentification; @@ -55,7 +57,6 @@ import net.yacy.cora.storage.Files; import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.Document; import net.yacy.document.parser.tarParser; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.io.CharBuffer; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.OS; @@ -75,17 +76,17 @@ public final class yacyRelease extends yacyVersion { public final static List latestReleaseLocations = new ArrayList(); // will be initialized with value in 
defaults/yacy.network.freeworld.unit public static String startParameter = ""; - private MultiProtocolURI url; + private MultiProtocolURL url; private File releaseFile; private PublicKey publicKey; - public yacyRelease(final MultiProtocolURI url) { + public yacyRelease(final MultiProtocolURL url) { super(url.getFileName(), url.getHost()); this.url = url; } - private yacyRelease(final MultiProtocolURI url, final PublicKey publicKey) { + private yacyRelease(final MultiProtocolURL url, final PublicKey publicKey) { this(url); this.publicKey = publicKey; } @@ -95,7 +96,7 @@ public final class yacyRelease extends yacyVersion { this.releaseFile = releaseFile; } - public MultiProtocolURI getUrl() { + public MultiProtocolURL getUrl() { return this.url; } @@ -236,7 +237,7 @@ public final class yacyRelease extends yacyVersion { // returns the version info if successful, null otherwise Document scraper; try { - final DigestURI uri = location.getLocationURL(); + final DigestURL uri = location.getLocationURL(); Thread.currentThread().setName("allReleaseFrom - host " + uri.getHost()); // makes it more easy to see which release blocks process in thread dump scraper = Switchboard.getSwitchboard().loader.loadDocument(uri, CacheStrategy.NOCACHE, null, ClientIdentification.yacyInternetCrawlerAgent); } catch (final IOException e) { @@ -244,10 +245,10 @@ public final class yacyRelease extends yacyVersion { } // analyze links in scraper resource, and find link to latest release in it - final Map anchors = scraper.getAnchors(); // a url (String) / name (String) relation + final Collection anchors = scraper.getAnchors(); // a url (String) / name (String) relation final TreeSet mainReleases = new TreeSet(); final TreeSet devReleases = new TreeSet(); - for (final DigestURI url : anchors.keySet()) { + for (final DigestURL url : anchors) { try { final yacyRelease release = new yacyRelease(url, location.getPublicKey()); //System.out.println("r " + release.toAnchor()); diff --git 
a/source/net/yacy/peers/operation/yacySeedUploadScp.java b/source/net/yacy/peers/operation/yacySeedUploadScp.java index bb8ba77fc..f4c086e5a 100644 --- a/source/net/yacy/peers/operation/yacySeedUploadScp.java +++ b/source/net/yacy/peers/operation/yacySeedUploadScp.java @@ -32,7 +32,7 @@ import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.server.serverSwitch; import com.jcraft.jsch.Channel; diff --git a/source/net/yacy/peers/operation/yacyUpdateLocation.java b/source/net/yacy/peers/operation/yacyUpdateLocation.java index 5e2c62bee..4e3cad907 100644 --- a/source/net/yacy/peers/operation/yacyUpdateLocation.java +++ b/source/net/yacy/peers/operation/yacyUpdateLocation.java @@ -29,7 +29,7 @@ package net.yacy.peers.operation; import java.security.PublicKey; -import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.cora.document.id.DigestURL; /** @@ -37,15 +37,15 @@ import net.yacy.kelondro.data.meta.DigestURI; * */ public class yacyUpdateLocation { - private final DigestURI locationURL; + private final DigestURL locationURL; private final PublicKey publicKey; - public yacyUpdateLocation(DigestURI locationURL, PublicKey publicKey) { + public yacyUpdateLocation(DigestURL locationURL, PublicKey publicKey) { this.locationURL = locationURL; this.publicKey = publicKey; } - public DigestURI getLocationURL() { + public DigestURL getLocationURL() { return this.locationURL; } public PublicKey getPublicKey() { diff --git a/source/net/yacy/repository/Blacklist.java b/source/net/yacy/repository/Blacklist.java index 1129d2e1b..e4105b89c 100644 --- a/source/net/yacy/repository/Blacklist.java +++ b/source/net/yacy/repository/Blacklist.java @@ -45,11 +45,11 @@ import java.util.concurrent.ConcurrentMap; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; +import net.yacy.cora.document.id.DigestURL; import 
net.yacy.cora.storage.HandleSet; import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.SpaceExceededException; import net.yacy.data.ListManager; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.index.RowHandleSet; @@ -405,7 +405,7 @@ public class Blacklist { * @param url Entry to be checked * @return Whether the given entry is blacklisted */ - public final boolean isListed(final BlacklistType blacklistType, final DigestURI url) { + public final boolean isListed(final BlacklistType blacklistType, final DigestURL url) { if (url == null) { throw new IllegalArgumentException("url may not be null"); } diff --git a/source/net/yacy/repository/FilterEngine.java b/source/net/yacy/repository/FilterEngine.java index be34989f5..51037b6a2 100644 --- a/source/net/yacy/repository/FilterEngine.java +++ b/source/net/yacy/repository/FilterEngine.java @@ -11,9 +11,9 @@ import java.util.TreeSet; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.storage.HashARC; import net.yacy.cora.util.ConcurrentLog; -import net.yacy.kelondro.data.meta.DigestURI; /** * a URL filter engine for black and white lists @@ -51,7 +51,7 @@ public class FilterEngine { } } - protected HashARC> cachedUrlHashs = null; + protected HashARC> cachedUrlHashs = null; protected Map> hostpaths_matchable = null; protected Map> hostpaths_notmatchable = null; @@ -60,7 +60,7 @@ public class FilterEngine { // prepare the data structure this.hostpaths_matchable = new HashMap>(); this.hostpaths_notmatchable = new HashMap>(); - this.cachedUrlHashs = new HashARC>(CACHE_SIZE); + this.cachedUrlHashs = new HashARC>(CACHE_SIZE); } public void clear() { @@ -112,7 +112,7 @@ public class FilterEngine { this.hostpaths_notmatchable.remove(host); } - public boolean isListed(final DigestURI url, final EnumSet 
type) { + public boolean isListed(final DigestURL url, final EnumSet type) { // trival anwser if (url.getHost() == null) return false; diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java index e92af6cef..776383dc3 100644 --- a/source/net/yacy/repository/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -38,8 +38,9 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.Semaphore; import java.util.concurrent.TimeUnit; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.HeaderFramework; @@ -58,7 +59,6 @@ import net.yacy.crawler.retrieval.SMBLoader; import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.TextParser; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.FileUtils; import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.search.Switchboard; @@ -74,7 +74,7 @@ public final class LoaderDispatcher { private final FTPLoader ftpLoader; private final SMBLoader smbLoader; private final FileLoader fileLoader; - private final ConcurrentHashMap loaderSteering; // a map that delivers a 'finish' semaphore for urls + private final ConcurrentHashMap loaderSteering; // a map that delivers a 'finish' semaphore for urls private final ConcurrentLog log; public LoaderDispatcher(final Switchboard sb) { @@ -87,7 +87,7 @@ public final class LoaderDispatcher { this.ftpLoader = new FTPLoader(sb, this.log); this.smbLoader = new SMBLoader(sb, this.log); this.fileLoader = new FileLoader(sb, this.log); - this.loaderSteering = new ConcurrentHashMap(); + this.loaderSteering = new ConcurrentHashMap(); } public 
boolean isSupportedProtocol(final String protocol) { @@ -108,7 +108,7 @@ public final class LoaderDispatcher { * @return the request object */ public Request request( - final DigestURI url, + final DigestURL url, final boolean forText, final boolean global ) { @@ -132,7 +132,7 @@ public final class LoaderDispatcher { 0); } - public void load(final DigestURI url, final CacheStrategy cacheStratgy, final int maxFileSize, final File targetFile, BlacklistType blacklistType, ClientIdentification.Agent agent) throws IOException { + public void load(final DigestURL url, final CacheStrategy cacheStratgy, final int maxFileSize, final File targetFile, BlacklistType blacklistType, ClientIdentification.Agent agent) throws IOException { final byte[] b = load(request(url, false, true), cacheStratgy, maxFileSize, blacklistType, agent).getContent(); if (b == null) throw new IOException("load == null"); @@ -182,7 +182,7 @@ public final class LoaderDispatcher { */ private Response loadInternal(final Request request, CacheStrategy cacheStrategy, final int maxFileSize, final BlacklistType blacklistType, ClientIdentification.Agent agent) throws IOException { // get the protocol of the next URL - final DigestURI url = request.url(); + final DigestURL url = request.url(); if (url.isFile() || url.isSMB()) cacheStrategy = CacheStrategy.NOCACHE; // load just from the file system final String protocol = url.getProtocol(); final String host = url.getHost(); @@ -207,7 +207,7 @@ public final class LoaderDispatcher { // in case that we want to return the cached content in the next step final RequestHeader requestHeader = new RequestHeader(); requestHeader.put(HeaderFramework.USER_AGENT, agent.userAgent); - DigestURI refererURL = null; + DigestURL refererURL = null; if (request.referrerhash() != null) refererURL = this.sb.getURL(request.referrerhash()); if (refererURL != null) requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true)); final Response response = new Response( @@ -317,7 
+317,7 @@ public final class LoaderDispatcher { return response; } - private int protocolMaxFileSize(final DigestURI url) { + private int protocolMaxFileSize(final DigestURL url) { if (url.isHTTP() || url.isHTTPS()) return this.sb.getConfigInt("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE); if (url.isFTP()) @@ -348,7 +348,7 @@ public final class LoaderDispatcher { // load resource final Response response = load(request, cacheStrategy, maxFileSize, blacklistType, agent); - final DigestURI url = request.url(); + final DigestURL url = request.url(); if (response == null) throw new IOException("no Response for url " + url); // if it is still not available, report an error @@ -358,11 +358,11 @@ public final class LoaderDispatcher { return response.parse(); } - public Document loadDocument(final DigestURI location, final CacheStrategy cachePolicy, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException { + public Document loadDocument(final DigestURL location, final CacheStrategy cachePolicy, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException { // load resource Request request = request(location, true, false); final Response response = this.load(request, cachePolicy, blacklistType, agent); - final DigestURI url = request.url(); + final DigestURL url = request.url(); if (response == null) throw new IOException("no Response for url " + url); // if it is still not available, report an error @@ -384,7 +384,7 @@ public final class LoaderDispatcher { * @return a map from URLs to the anchor texts of the urls * @throws IOException */ - public final Map loadLinks(final DigestURI url, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException { + public final Map loadLinks(final DigestURL url, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException { final Response 
response = load(request(url, true, false), cacheStrategy, Integer.MAX_VALUE, blacklistType, agent); if (response == null) throw new IOException("response == null"); final ResponseHeader responseHeader = response.getResponseHeader(); @@ -414,24 +414,24 @@ public final class LoaderDispatcher { } } - public void loadIfNotExistBackground(final DigestURI url, final File cache, final int maxFileSize, BlacklistType blacklistType, final ClientIdentification.Agent agent) { + public void loadIfNotExistBackground(final DigestURL url, final File cache, final int maxFileSize, BlacklistType blacklistType, final ClientIdentification.Agent agent) { new Loader(url, cache, maxFileSize, CacheStrategy.IFEXIST, blacklistType, agent).start(); } - public void loadIfNotExistBackground(final DigestURI url, final int maxFileSize, BlacklistType blacklistType, final ClientIdentification.Agent agent) { + public void loadIfNotExistBackground(final DigestURL url, final int maxFileSize, BlacklistType blacklistType, final ClientIdentification.Agent agent) { new Loader(url, null, maxFileSize, CacheStrategy.IFEXIST, blacklistType, agent).start(); } private class Loader extends Thread { - private final DigestURI url; + private final DigestURL url; private final File cache; private final int maxFileSize; private final CacheStrategy cacheStrategy; private final BlacklistType blacklistType; private final ClientIdentification.Agent agent; - public Loader(final DigestURI url, final File cache, final int maxFileSize, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final ClientIdentification.Agent agent) { + public Loader(final DigestURL url, final File cache, final int maxFileSize, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final ClientIdentification.Agent agent) { this.url = url; this.cache = cache; this.maxFileSize = maxFileSize; diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 2fe0affdf..bb2a8b74c 100644 --- 
a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -87,14 +87,15 @@ import org.apache.solr.common.SolrInputDocument; import net.yacy.contentcontrol.ContentControlFilterUpdateThread; import net.yacy.contentcontrol.SMWListSyncThread; import net.yacy.cora.date.GenericFormatter; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.MultiProtocolURI; -import net.yacy.cora.document.RSSFeed; -import net.yacy.cora.document.RSSMessage; -import net.yacy.cora.document.RSSReader; -import net.yacy.cora.document.UTF8; import net.yacy.cora.document.WordCache; import net.yacy.cora.document.analysis.Classification; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.feed.RSSFeed; +import net.yacy.cora.document.feed.RSSMessage; +import net.yacy.cora.document.feed.RSSReader; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.federate.solr.Ranking; import net.yacy.cora.federate.solr.SchemaConfiguration; import net.yacy.cora.federate.solr.instance.RemoteInstance; @@ -155,7 +156,6 @@ import net.yacy.document.parser.pdfParser; import net.yacy.document.parser.html.Evaluation; import net.yacy.gui.Tray; import net.yacy.kelondro.blob.Tables; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.rwi.ReferenceContainer; @@ -376,7 +376,7 @@ public final class Switchboard extends serverSwitch { // init sessionid name file final String sessionidNamesFile = getConfig("sessionidNamesFile", "defaults/sessionid.names"); this.log.config("Loading sessionid file " + sessionidNamesFile); - MultiProtocolURI.initSessionIDNames(FileUtils.loadList(new File(getAppPath(), sessionidNamesFile))); + MultiProtocolURL.initSessionIDNames(FileUtils.loadList(new File(getAppPath(), sessionidNamesFile))); // init tables 
this.tables = new WorkTables(this.workPath); @@ -1177,10 +1177,10 @@ public final class Switchboard extends serverSwitch { if ( location.isEmpty() ) { break; } - DigestURI locationURL; + DigestURL locationURL; try { // try to parse url - locationURL = new DigestURI(location); + locationURL = new DigestURL(location); } catch (final MalformedURLException e ) { break; } @@ -1588,10 +1588,10 @@ public final class Switchboard extends serverSwitch { this.crawlQueues.removeURL(hash); } - public DigestURI getURL(final byte[] urlhash) { + public DigestURL getURL(final byte[] urlhash) { if (urlhash == null) return null; if (urlhash.length == 0) return null; - final DigestURI url = this.index.fulltext().getURL(urlhash); + final DigestURL url = this.index.fulltext().getURL(urlhash); if (url != null) return url; return this.crawlQueues.getURL(urlhash); } @@ -1788,7 +1788,7 @@ public final class Switchboard extends serverSwitch { // in the noIndexReason is set, indexing is not allowed if ( noIndexReason != null ) { // log cause and close queue - final DigestURI referrerURL = response.referrerURL(); + final DigestURL referrerURL = response.referrerURL(); //if (log.isFine()) log.logFine("deQueue: not indexed any word in URL " + response.url() + "; cause: " + noIndexReason); addURLtoErrorDB( response.url(), @@ -2511,7 +2511,7 @@ public final class Switchboard extends serverSwitch { ) ) { // get the hyperlinks - final Map hl = Document.getHyperlinks(documents); + final Map hl = Document.getHyperlinks(documents); boolean loadImages = getConfigBool("crawler.load.image", true); if (loadImages) hl.putAll(Document.getImagelinks(documents)); @@ -2524,8 +2524,8 @@ public final class Switchboard extends serverSwitch { } // insert those hyperlinks to the crawler - MultiProtocolURI nextUrl; - for ( final Map.Entry nextEntry : hl.entrySet() ) { + MultiProtocolURL nextUrl; + for ( final Map.Entry nextEntry : hl.entrySet() ) { // check for interruption checkInterruption(); @@ -2550,7 +2550,7 @@ 
public final class Switchboard extends serverSwitch { try { this.crawlStacker.enqueueEntry(new Request( response.initiator(), - new DigestURI(u), + new DigestURL(u), response.url().hash(), nextEntry.getValue(), new Date(), @@ -2697,8 +2697,8 @@ public final class Switchboard extends serverSwitch { // CREATE INDEX final String dc_title = document.dc_title(); - final DigestURI url = document.dc_source(); - final DigestURI referrerURL = queueEntry.referrerURL(); + final DigestURL url = document.dc_source(); + final DigestURL referrerURL = queueEntry.referrerURL(); EventOrigin processCase = queueEntry.processCase(this.peers.mySeed().hash); CrawlProfile profile = queueEntry.profile(); @@ -2757,7 +2757,7 @@ public final class Switchboard extends serverSwitch { feed.addMessage(new RSSMessage("Indexed web page", dc_title, queueEntry.url(), ASCII.String(queueEntry.url().hash()))); // store rss feeds in document into rss table - for ( final Map.Entry rssEntry : document.getRSS().entrySet() ) { + for ( final Map.Entry rssEntry : document.getRSS().entrySet() ) { final Tables.Data rssRow = new Tables.Data(); rssRow.put("referrer", url.hash()); rssRow.put("url", UTF8.getBytes(rssEntry.getKey().toNormalform(true))); @@ -2806,29 +2806,29 @@ public final class Switchboard extends serverSwitch { } public final void addAllToIndex( - final DigestURI url, - final Map links, + final DigestURL url, + final Map links, final SearchEvent searchEvent, final String heuristicName, final Map collections) { - List urls = new ArrayList(); + List urls = new ArrayList(); // add the landing page to the index. 
should not load that again since it should be in the cache if (url != null) { urls.add(url); } // check if some of the links match with the query - final Map matcher = searchEvent.query.separateMatches(links); + final Map matcher = searchEvent.query.separateMatches(links); // take the matcher and load them all - for (final Map.Entry entry : matcher.entrySet()) { - urls.add(new DigestURI(entry.getKey(), (byte[]) null)); + for (final Map.Entry entry : matcher.entrySet()) { + urls.add(new DigestURL(entry.getKey(), (byte[]) null)); } // take then the no-matcher and load them also - for (final Map.Entry entry : links.entrySet()) { - urls.add(new DigestURI(entry.getKey(), (byte[]) null)); + for (final Map.Entry entry : links.entrySet()) { + urls.add(new DigestURL(entry.getKey(), (byte[]) null)); } addToIndex(urls, searchEvent, heuristicName, collections); } @@ -2845,11 +2845,11 @@ public final class Switchboard extends serverSwitch { this.crawlQueues.removeURL(urlhash); } - public void stackURLs(Set rootURLs, final CrawlProfile profile, final Set successurls, final Map failurls) { + public void stackURLs(Set rootURLs, final CrawlProfile profile, final Set successurls, final Map failurls) { if (rootURLs == null || rootURLs.size() == 0) return; List stackthreads = new ArrayList(); // do this concurrently - for (DigestURI url: rootURLs) { - final DigestURI turl = url; + for (DigestURL url: rootURLs) { + final DigestURL turl = url; Thread t = new Thread() { public void run() { String failreason; @@ -2870,7 +2870,7 @@ public final class Switchboard extends serverSwitch { * @param url * @return null if this was ok. 
If this failed, return a string with a fail reason */ - public String stackUrl(CrawlProfile profile, DigestURI url) { + public String stackUrl(CrawlProfile profile, DigestURL url) { byte[] handle = ASCII.getBytes(profile.handle()); @@ -2986,15 +2986,15 @@ public final class Switchboard extends serverSwitch { * @throws IOException * @throws Parser.Failure */ - public void addToIndex(final Collection urls, final SearchEvent searchEvent, final String heuristicName, final Map collections) { - Map urlmap = new HashMap(); - for (DigestURI url: urls) urlmap.put(ASCII.String(url.hash()), url); + public void addToIndex(final Collection urls, final SearchEvent searchEvent, final String heuristicName, final Map collections) { + Map urlmap = new HashMap(); + for (DigestURL url: urls) urlmap.put(ASCII.String(url.hash()), url); if (searchEvent != null) { for (String id: urlmap.keySet()) searchEvent.addHeuristic(ASCII.getBytes(id), heuristicName, true); } final Set existing = this.index.exists(urlmap.keySet()); final List requests = new ArrayList(); - for (Map.Entry e: urlmap.entrySet()) { + for (Map.Entry e: urlmap.entrySet()) { final String urlName = e.getValue().toNormalform(true); if (existing.contains(e.getKey())) { this.log.info("addToIndex: double " + urlName); @@ -3014,7 +3014,7 @@ public final class Switchboard extends serverSwitch { @Override public void run() { for (Request request: requests) { - DigestURI url = request.url(); + DigestURL url = request.url(); String urlName = url.toNormalform(true); Thread.currentThread().setName("Switchboard.addToIndex:" + urlName); try { @@ -3066,13 +3066,13 @@ public final class Switchboard extends serverSwitch { * @param url the url that shall be indexed * @param asglobal true adds the url to global crawl queue (for remote crawling), false to the local crawler */ - public void addToCrawler(final Collection urls, final boolean asglobal) { - Map urlmap = new HashMap(); - for (DigestURI url: urls) urlmap.put(ASCII.String(url.hash()), 
url); + public void addToCrawler(final Collection urls, final boolean asglobal) { + Map urlmap = new HashMap(); + for (DigestURL url: urls) urlmap.put(ASCII.String(url.hash()), url); Set existingids = this.index.exists(urlmap.keySet()); - for (Map.Entry e: urlmap.entrySet()) { + for (Map.Entry e: urlmap.entrySet()) { if (existingids.contains(e.getKey())) continue; // double - DigestURI url = e.getValue(); + DigestURL url = e.getValue(); final Request request = this.loader.request(url, true, true); final CrawlProfile profile = this.crawler.getActive(ASCII.getBytes(request.profileHandle())); final String acceptedError = this.crawlStacker.checkAcceptance(url, profile, 0); @@ -3373,7 +3373,7 @@ public final class Switchboard extends serverSwitch { } private void addURLtoErrorDB( - final DigestURI url, + final DigestURL url, final CrawlProfile profile, final byte[] referrerHash, final byte[] initiator, @@ -3408,20 +3408,20 @@ public final class Switchboard extends serverSwitch { } // get the links for a specific site - DigestURI url; + DigestURL url; try { - url = new DigestURI(r); + url = new DigestURL(r); } catch (final MalformedURLException e ) { ConcurrentLog.logException(e); return; } - final Map links; + final Map links; searchEvent.oneFeederStarted(); try { links = Switchboard.this.loader.loadLinks(url, CacheStrategy.NOCACHE, BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent); if ( links != null ) { - final Iterator i = links.keySet().iterator(); + final Iterator i = links.keySet().iterator(); while ( i.hasNext() ) { if ( !i.next().getHost().endsWith(host) ) { i.remove(); @@ -3447,23 +3447,23 @@ public final class Switchboard extends serverSwitch { public void run() { // get the links for a specific site - final DigestURI startUrl; + final DigestURL startUrl; try { - startUrl = new DigestURI(url); + startUrl = new DigestURL(url); } catch (final MalformedURLException e) { ConcurrentLog.logException(e); return; } - final Map links; - DigestURI 
url; + final Map links; + DigestURL url; try { links = Switchboard.this.loader.loadLinks(startUrl, CacheStrategy.IFFRESH, BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent); if (links != null) { if (links.size() < 1000) { // limit to 1000 to skip large index pages - final Iterator i = links.keySet().iterator(); + final Iterator i = links.keySet().iterator(); final boolean globalcrawljob = Switchboard.this.getConfigBool(SwitchboardConstants.HEURISTIC_SEARCHRESULTS_CRAWLGLOBAL,false); - Collection urls = new ArrayList(); + Collection urls = new ArrayList(); while (i.hasNext()) { url = i.next(); boolean islocal = (url.getHost() == null && startUrl.getHost() == null) || (url.getHost() != null && startUrl.getHost() != null && url.getHost().contentEquals(startUrl.getHost())); @@ -3509,9 +3509,9 @@ public final class Switchboard extends serverSwitch { urlpattern.substring(0, p) + queryString.trim().replaceAll(" ", "+") + urlpattern.substring(p + 1); - final DigestURI url; + final DigestURL url; try { - url = new DigestURI(MultiProtocolURI.unescape(urlString)); + url = new DigestURL(MultiProtocolURL.unescape(urlString)); } catch (final MalformedURLException e1 ) { ConcurrentLog.warn("heuristicRSS", "url not well-formed: '" + urlString + "'"); return; @@ -3527,11 +3527,11 @@ public final class Switchboard extends serverSwitch { //System.out.println("BLEKKO: " + UTF8.String(resource)); rss = resource == null ? 
null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource); if ( rss != null ) { - final Map links = new TreeMap(); - DigestURI uri; + final Map links = new TreeMap(); + DigestURL uri; for ( final RSSMessage message : rss.getFeed() ) { try { - uri = new DigestURI(message.getLink()); + uri = new DigestURL(message.getLink()); links.put(uri, message.getTitle()); } catch (final MalformedURLException e ) { } @@ -3649,7 +3649,7 @@ public final class Switchboard extends serverSwitch { public void run() { // load the seed list try { - DigestURI url = new DigestURI(seedListFileURL); + DigestURL url = new DigestURL(seedListFileURL); //final long start = System.currentTimeMillis(); final RequestHeader reqHeader = new RequestHeader(); reqHeader.put(HeaderFramework.PRAGMA, "no-cache"); diff --git a/source/net/yacy/search/index/DocumentIndex.java b/source/net/yacy/search/index/DocumentIndex.java index 658e01fb1..cc4bed207 100644 --- a/source/net/yacy/search/index/DocumentIndex.java +++ b/source/net/yacy/search/index/DocumentIndex.java @@ -34,13 +34,13 @@ import java.util.concurrent.LinkedBlockingQueue; import org.apache.solr.common.SolrInputDocument; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.Condenser; import net.yacy.document.Document; import net.yacy.document.LibraryProvider; import net.yacy.document.TextParser; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.workflow.WorkflowProcessor; import net.yacy.search.schema.CollectionConfiguration; import net.yacy.search.schema.WebgraphConfiguration; @@ -52,14 +52,14 @@ import net.yacy.search.schema.WebgraphConfiguration; */ public class DocumentIndex extends Segment { - private static DigestURI poison; + private static DigestURL poison; static { try { - poison = new DigestURI("file://."); + poison = new DigestURL("file://."); } catch (final MalformedURLException e ) { } } - BlockingQueue 
queue; // a queue of document ID's + BlockingQueue queue; // a queue of document ID's private final Worker[] worker; CallbackListener callback; @@ -80,7 +80,7 @@ public class DocumentIndex extends Segment { super.fulltext().connectLocalSolr(); super.fulltext().writeWebgraph(true); this.callback = callback; - this.queue = new LinkedBlockingQueue(WorkflowProcessor.availableCPU * 300); + this.queue = new LinkedBlockingQueue(WorkflowProcessor.availableCPU * 300); this.worker = new Worker[WorkflowProcessor.availableCPU]; for ( int i = 0; i < WorkflowProcessor.availableCPU; i++ ) { this.worker[i] = new Worker(i); @@ -96,7 +96,7 @@ public class DocumentIndex extends Segment { @Override public void run() { - DigestURI f; + DigestURL f; SolrInputDocument[] resultRows; try { while ( (f = DocumentIndex.this.queue.take()) != poison ) { @@ -134,7 +134,7 @@ public class DocumentIndex extends Segment { this.queue.clear(); } - private SolrInputDocument[] add(final DigestURI url) throws IOException { + private SolrInputDocument[] add(final DigestURL url) throws IOException { if ( url == null ) { throw new IOException("file = null"); } @@ -183,7 +183,7 @@ public class DocumentIndex extends Segment { * * @param start */ - public void addConcurrent(final DigestURI start) throws IOException { + public void addConcurrent(final DigestURL start) throws IOException { assert (start != null); assert (start.canRead()) : start.toString(); if ( !start.isDirectory() ) { @@ -194,10 +194,10 @@ public class DocumentIndex extends Segment { return; } final String[] s = start.list(); - DigestURI w; + DigestURL w; for ( final String t : s ) { try { - w = new DigestURI(start, t); + w = new DigestURL(start, t); if ( w.canRead() && !w.isHidden() ) { if ( w.isDirectory() ) { addConcurrent(w); @@ -240,9 +240,9 @@ public class DocumentIndex extends Segment { public interface CallbackListener { - public void commit(DigestURI f, SolrInputDocument resultRow); + public void commit(DigestURL f, SolrInputDocument 
resultRow); - public void fail(DigestURI f, String failReason); + public void fail(DigestURL f, String failReason); } } diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index 423e2110c..f3496e72f 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -42,8 +42,9 @@ import java.util.regex.Pattern; import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.date.ISO8601Formatter; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.federate.solr.connector.AbstractSolrConnector; import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector; import net.yacy.cora.federate.solr.connector.RemoteSolrConnector; @@ -62,7 +63,6 @@ import net.yacy.cora.storage.ZIPReader; import net.yacy.cora.storage.ZIPWriter; import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.parser.html.CharacterCoding; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.WordReferenceVars; @@ -319,7 +319,7 @@ public final class Fulltext { return x; } - public DigestURI getURL(final byte[] urlHash) { + public DigestURL getURL(final byte[] urlHash) { if (urlHash == null || this.getDefaultConnector() == null) return null; String x; @@ -330,7 +330,7 @@ public final class Fulltext { } if (x == null) return null; try { - DigestURI uri = new DigestURI(x, urlHash); + DigestURL uri = new DigestURL(x, urlHash); return uri; } catch (final MalformedURLException e) { return null; @@ -531,8 +531,8 @@ public final class Fulltext { * @param concurrently if true, then the method returnes immediately and runs concurrently */ public int remove(final String basepath, Date 
freshdate) { - DigestURI uri; - try {uri = new DigestURI(basepath);} catch (final MalformedURLException e) {return 0;} + DigestURL uri; + try {uri = new DigestURL(basepath);} catch (final MalformedURLException e) {return 0;} final String host = uri.getHost(); final String collectionQuery = CollectionSchema.host_s.getSolrFieldName() + ":\"" + host + "\"" + ((freshdate != null && freshdate.before(new Date())) ? (" AND " + CollectionSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : ""); @@ -829,12 +829,12 @@ public final class Fulltext { pw.println(url); } if (this.format == 1) { - if (title != null) pw.println("" + CharacterCoding.unicode2xml((String) title.iterator().next(), true) + ""); + if (title != null) pw.println("" + CharacterCoding.unicode2xml((String) title.iterator().next(), true) + ""); } if (this.format == 2) { pw.println(""); if (title != null) pw.println("" + CharacterCoding.unicode2xml((String) title.iterator().next(), true) + ""); - pw.println("" + MultiProtocolURI.escape(url) + ""); + pw.println("" + MultiProtocolURL.escape(url) + ""); if (author != null && !author.isEmpty()) pw.println("" + CharacterCoding.unicode2xml(author, true) + ""); if (descriptions != null && descriptions.length > 0) { for (String d: descriptions) pw.println("" + CharacterCoding.unicode2xml(d, true) + ""); @@ -889,7 +889,7 @@ public final class Fulltext { count += 10; // make some more to prevent that we have to do this again after deletions too soon. 
if (count < 0 || domainScore.sizeSmaller(count)) count = domainScore.size(); this.statsDump = new ArrayList(); - DigestURI url; + DigestURL url; while (j.hasNext()) { urlhash = j.next(); if (urlhash == null) continue; diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index e6092b92e..abf4fc67e 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -41,9 +41,10 @@ import java.util.regex.Pattern; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrInputDocument; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.MultiProtocolURI; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.federate.solr.connector.AbstractSolrConnector; import net.yacy.cora.federate.solr.connector.SolrConnector; import net.yacy.cora.federate.yacy.CacheStrategy; @@ -62,7 +63,6 @@ import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.kelondro.data.citation.CitationReference; import net.yacy.kelondro.data.citation.CitationReferenceFactory; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReference; @@ -216,10 +216,10 @@ public class Segment { * @return the clickdepth level or 999 if the root url cannot be found or a recursion limit is reached * @throws IOException */ - public int getClickDepth(final DigestURI url) throws IOException { + public int getClickDepth(final DigestURL url) throws IOException { final byte[] searchhash = url.hash(); - RowHandleSet rootCandidates = url.getPossibleRootHashes(); + RowHandleSet rootCandidates = getPossibleRootHashes(url); RowHandleSet ignore = new 
RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 100); // a set of urlhashes to be ignored. This is generated from all hashes that are seen during recursion to prevent enless loops RowHandleSet levelhashes = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 1); // all hashes of a clickdepth. The first call contains the target hash only and therefore just one entry @@ -267,6 +267,25 @@ public class Segment { } return 999; } + + private static RowHandleSet getPossibleRootHashes(DigestURL url) { + RowHandleSet rootCandidates = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 10); + String rootStub = url.getProtocol() + "://" + url.getHost(); + try { + rootCandidates.put(new DigestURL(rootStub).hash()); + rootCandidates.put(new DigestURL(rootStub + "/").hash()); + rootCandidates.put(new DigestURL(rootStub + "/index.htm").hash()); + rootCandidates.put(new DigestURL(rootStub + "/index.html").hash()); + rootCandidates.put(new DigestURL(rootStub + "/index.php").hash()); + rootCandidates.put(new DigestURL(rootStub + "/home.htm").hash()); + rootCandidates.put(new DigestURL(rootStub + "/home.html").hash()); + rootCandidates.put(new DigestURL(rootStub + "/home.php").hash()); + rootCandidates.put(new DigestURL(rootStub + "/default.htm").hash()); + rootCandidates.put(new DigestURL(rootStub + "/default.html").hash()); + rootCandidates.put(new DigestURL(rootStub + "/default.php").hash()); + } catch (final Throwable e) {} + return rootCandidates; + } public ReferenceReportCache getReferenceReportCache() { return new ReferenceReportCache(); @@ -428,7 +447,7 @@ public class Segment { * @param stub * @return an iterator for all matching urls */ - public Iterator urlSelector(final MultiProtocolURI stub, final long maxtime, final int maxcount) { + public Iterator urlSelector(final MultiProtocolURL stub, final long maxtime, final int maxcount) { final BlockingQueue 
docQueue; final String urlstub; if (stub == null) { @@ -436,15 +455,15 @@ public class Segment { urlstub = null; } else { final String host = stub.getHost(); - String hh = DigestURI.hosthash(host); + String hh = DigestURL.hosthash(host); docQueue = this.fulltext.getDefaultConnector().concurrentDocumentsByQuery(CollectionSchema.host_id_s + ":\"" + hh + "\"", 0, Integer.MAX_VALUE, maxtime, maxcount, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName()); urlstub = stub.toNormalform(true); } // now filter the stub from the iterated urls - return new LookAheadIterator() { + return new LookAheadIterator() { @Override - protected DigestURI next0() { + protected DigestURL next0() { while (true) { SolrDocument doc; try { @@ -456,9 +475,9 @@ public class Segment { if (doc == null || doc == AbstractSolrConnector.POISON_DOCUMENT) return null; String u = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()); String id = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()); - DigestURI url; + DigestURL url; try { - url = new DigestURI(u, ASCII.getBytes(id)); + url = new DigestURL(u, ASCII.getBytes(id)); } catch (final MalformedURLException e) { continue; } @@ -498,7 +517,7 @@ public class Segment { } private static String votedLanguage( - final DigestURI url, + final DigestURL url, final String urlNormalform, final Document document, final Condenser condenser) { @@ -573,8 +592,8 @@ public class Segment { } public SolrInputDocument storeDocument( - final DigestURI url, - final DigestURI referrerURL, + final DigestURL url, + final DigestURL referrerURL, final Map collections, final ResponseHeader responseHeader, final Document document, @@ -635,7 +654,7 @@ public class Segment { int outlinksSame = document.inboundLinks().size(); int outlinksOther = document.outboundLinks().size(); final int urlLength = urlNormalform.length(); - final int urlComps = MultiProtocolURI.urlComps(url.toString()).length; + final int urlComps = 
MultiProtocolURL.urlComps(url.toString()).length; // create a word prototype which is re-used for all entries if ((this.termIndex != null && storeToRWI) || searchEvent != null) { @@ -728,7 +747,7 @@ public class Segment { if (urlhash == null) return 0; // determine the url string - final DigestURI url = fulltext().getURL(urlhash); + final DigestURL url = fulltext().getURL(urlhash); if (url == null) return 0; try { diff --git a/source/net/yacy/search/query/AccessTracker.java b/source/net/yacy/search/query/AccessTracker.java index f299f2cec..eee6e3d38 100644 --- a/source/net/yacy/search/query/AccessTracker.java +++ b/source/net/yacy/search/query/AccessTracker.java @@ -35,8 +35,8 @@ import java.util.Iterator; import java.util.LinkedList; import net.yacy.cora.date.GenericFormatter; -import net.yacy.cora.document.UTF8; import net.yacy.cora.document.WordCache; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.util.ConcurrentLog; import net.yacy.kelondro.util.MemoryControl; diff --git a/source/net/yacy/search/query/QueryModifier.java b/source/net/yacy/search/query/QueryModifier.java index e287c8f3f..9908f8646 100644 --- a/source/net/yacy/search/query/QueryModifier.java +++ b/source/net/yacy/search/query/QueryModifier.java @@ -24,8 +24,8 @@ import java.util.ArrayList; import org.apache.solr.common.params.CommonParams; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.util.CommonPattern; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.search.schema.CollectionSchema; import net.yacy.server.serverObjects; @@ -104,7 +104,7 @@ public class QueryModifier { while ( sitehost.endsWith(".") ) { sitehost = sitehost.substring(0, sitehost.length() - 1); } - sitehash = DigestURI.hosthash(sitehost); + sitehash = DigestURL.hosthash(sitehost); add("site:" + sitehost); } diff --git a/source/net/yacy/search/query/QueryParams.java b/source/net/yacy/search/query/QueryParams.java index 6fcdd6ef5..f4c540839 100644 --- 
a/source/net/yacy/search/query/QueryParams.java +++ b/source/net/yacy/search/query/QueryParams.java @@ -40,9 +40,10 @@ import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.SolrQuery.SortClause; import org.apache.solr.common.params.FacetParams; -import net.yacy.cora.document.ASCII; import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.document.analysis.Classification.ContentDomain; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.federate.solr.Ranking; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.geo.GeoLocation; @@ -53,7 +54,6 @@ import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.SpaceExceededException; import net.yacy.document.Condenser; import net.yacy.document.LibraryProvider; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReferenceRow; import net.yacy.kelondro.index.RowHandleSet; @@ -148,7 +148,7 @@ public final class QueryParams { this.targetlang = "en"; this.metatags = new ArrayList(0); this.domType = Searchdom.LOCAL; - this.zonecode = DigestURI.TLD_any_zone_filter; + this.zonecode = DigestURL.TLD_any_zone_filter; this.constraint = constraint; this.allofconstraint = false; this.snippetCacheStrategy = null; @@ -550,11 +550,11 @@ public final class QueryParams { return this.queryGoal; } - public final Map separateMatches(final Map links) { - final Map matcher = new HashMap(); - final Iterator > i = links.entrySet().iterator(); - Map.Entry entry; - DigestURI url; + public final Map separateMatches(final Map links) { + final Map matcher = new HashMap(); + final Iterator > i = links.entrySet().iterator(); + Map.Entry entry; + DigestURL url; String anchorText; while (i.hasNext()) { entry = i.next(); diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index 864318075..a34300d82 
100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -44,11 +44,12 @@ import java.util.concurrent.atomic.AtomicInteger; import java.util.regex.Pattern; import net.yacy.contentcontrol.ContentControlFilterUpdateThread; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.MultiProtocolURI; -import net.yacy.cora.document.UTF8; import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.document.analysis.Classification.ContentDomain; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.federate.yacy.Distribution; import net.yacy.cora.lod.vocabulary.Tagging; @@ -68,7 +69,6 @@ import net.yacy.document.Condenser; import net.yacy.document.LargeNumberCache; import net.yacy.document.LibraryProvider; import net.yacy.document.TextParser; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; @@ -96,6 +96,7 @@ import net.yacy.search.schema.CollectionSchema; import net.yacy.search.snippet.ResultEntry; import net.yacy.search.snippet.TextSnippet; import net.yacy.search.snippet.TextSnippet.ResultClass; + import org.apache.solr.common.SolrDocument; public final class SearchEvent { @@ -844,7 +845,7 @@ public final class SearchEvent { } // filter out media links in text search, if wanted - String ext = MultiProtocolURI.getFileExtension(iEntry.url().getFileName()); + String ext = MultiProtocolURL.getFileExtension(iEntry.url().getFileName()); if (this.query.contentdom == ContentDomain.TEXT && Classification.isImageExtension(ext) && this.excludeintext_image) { if (log.isFine()) log.fine("dropped Node: file name domain does not match"); continue pollloop; @@ 
-1037,7 +1038,7 @@ public final class SearchEvent { } // filter out media links in text search, if wanted - String ext = MultiProtocolURI.getFileExtension(page.url().getFileName()); + String ext = MultiProtocolURL.getFileExtension(page.url().getFileName()); if (this.query.contentdom == ContentDomain.TEXT && Classification.isImageExtension(ext) && this.excludeintext_image) { if (log.isFine()) log.fine("dropped RWI: file name domain does not match"); continue; @@ -1267,8 +1268,8 @@ public final class SearchEvent { // apply 'common-sense' heuristic using references final String urlstring = rentry.url().toNormalform(true); - final String[] urlcomps = MultiProtocolURI.urlComps(urlstring); - final String[] descrcomps = MultiProtocolURI.splitpattern.split(rentry.title().toLowerCase()); + final String[] urlcomps = MultiProtocolURL.urlComps(urlstring); + final String[] descrcomps = MultiProtocolURL.splitpattern.split(rentry.title().toLowerCase()); for (final String urlcomp : urlcomps) { int tc = topwords.get(urlcomp); if (tc > 0) r += Math.max(1, tc) << this.query.ranking.coeff_urlcompintoplist; @@ -1422,7 +1423,7 @@ public final class SearchEvent { String a = alt != null && alt.size() > c ? (String) SetTools.nth(alt, c) : ""; if (query.getQueryGoal().matches((String) i) || query.getQueryGoal().matches(a)) { try { - DigestURI imageUrl = new DigestURI((prt != null && prt.size() > c ? SetTools.nth(prt, c) : "http") + "://" + i); + DigestURL imageUrl = new DigestURL((prt != null && prt.size() > c ? 
SetTools.nth(prt, c) : "http") + "://" + i); Object heightO = SetTools.nth(doc.getFieldValues(CollectionSchema.images_height_val.getSolrFieldName()), c); Object widthO = SetTools.nth(doc.getFieldValues(CollectionSchema.images_width_val.getSolrFieldName()), c); String id = ASCII.String(imageUrl.hash()); @@ -1434,12 +1435,12 @@ public final class SearchEvent { c++; } } - if (MultiProtocolURI.isImage(MultiProtocolURI.getFileExtension(ms.url().getFileName()))) { + if (MultiProtocolURL.isImage(MultiProtocolURL.getFileExtension(ms.url().getFileName()))) { String id = ASCII.String(ms.hash()); if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), ms.url(), "", ms.title(), 0, 0, 0)); } if (img != null && img.size() > 0) { - DigestURI imageUrl = new DigestURI((prt != null && prt.size() > 0 ? SetTools.nth(prt, 0) : "http") + "://" + SetTools.nth(img, 0)); + DigestURL imageUrl = new DigestURL((prt != null && prt.size() > 0 ? SetTools.nth(prt, 0) : "http") + "://" + SetTools.nth(img, 0)); String imagetext = alt != null && alt.size() > 0 ? 
(String) SetTools.nth(alt, 0) : ""; String id = ASCII.String(imageUrl.hash()); if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), imageUrl, "", imagetext, 0, 0, 0)); @@ -1450,10 +1451,10 @@ public final class SearchEvent { } public class ImageResult { - public DigestURI imageUrl, sourceUrl; + public DigestURL imageUrl, sourceUrl; public String mimetype = "", imagetext = ""; public int width = 0, height = 0, fileSize = 0; - public ImageResult(DigestURI sourceUrl, DigestURI imageUrl, String mimetype, String imagetext, int width, int height, int fileSize) { + public ImageResult(DigestURL sourceUrl, DigestURL imageUrl, String mimetype, String imagetext, int width, int height, int fileSize) { this.sourceUrl = sourceUrl; this.imageUrl = imageUrl; this.mimetype = mimetype; @@ -1638,7 +1639,7 @@ public final class SearchEvent { protected void addTopics(final ResultEntry resultEntry) { // take out relevant information for reference computation if ((resultEntry.url() == null) || (resultEntry.title() == null)) return; - final String[] descrcomps = MultiProtocolURI.splitpattern.split(resultEntry.title().toLowerCase()); // words in the description + final String[] descrcomps = MultiProtocolURL.splitpattern.split(resultEntry.title().toLowerCase()); // words in the description // add references addTopic(descrcomps); diff --git a/source/net/yacy/search/ranking/BlockRank.java b/source/net/yacy/search/ranking/BlockRank.java index 3dd0f407e..c59e5d743 100644 --- a/source/net/yacy/search/ranking/BlockRank.java +++ b/source/net/yacy/search/ranking/BlockRank.java @@ -32,7 +32,7 @@ import java.util.Iterator; import java.util.List; import java.util.Map; -import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.order.Base64Order; import net.yacy.cora.sorting.OrderedScoreMap; import net.yacy.cora.sorting.ScoreMap; diff --git a/source/net/yacy/search/ranking/ReferenceOrder.java 
b/source/net/yacy/search/ranking/ReferenceOrder.java index d3c27e70a..10fd7373c 100644 --- a/source/net/yacy/search/ranking/ReferenceOrder.java +++ b/source/net/yacy/search/ranking/ReferenceOrder.java @@ -33,12 +33,12 @@ import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.Semaphore; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.sorting.ConcurrentScoreMap; import net.yacy.cora.util.ByteBuffer; import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.Condenser; import net.yacy.document.LargeNumberCache; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.data.word.WordReferenceRow; @@ -229,7 +229,7 @@ public class ReferenceOrder { final int maxmaxpos = this.max.maxposition(); final int minminpos = this.min.minposition(); final long r = - ((256 - DigestURI.domLengthNormalized(t.urlhash())) << this.ranking.coeff_domlength) + ((256 - DigestURL.domLengthNormalized(t.urlhash())) << this.ranking.coeff_domlength) + ((this.max.urlcomps() == this.min.urlcomps() ) ? 0 : (256 - (((t.urlcomps() - this.min.urlcomps() ) << 8) / (this.max.urlcomps() - this.min.urlcomps()) )) << this.ranking.coeff_urlcomps) + ((this.max.urllength() == this.min.urllength() ) ? 0 : (256 - (((t.urllength() - this.min.urllength() ) << 8) / (this.max.urllength() - this.min.urllength()) )) << this.ranking.coeff_urllength) + ((maxmaxpos == minminpos) ? 
0 : (256 - (((t.minposition() - minminpos) << 8) / (maxmaxpos - minminpos))) << this.ranking.coeff_posintext) @@ -270,7 +270,7 @@ public class ReferenceOrder { assert t != null; assert this.ranking != null; final long r = - ((256 - DigestURI.domLengthNormalized(t.hash())) << this.ranking.coeff_domlength) + ((256 - DigestURL.domLengthNormalized(t.hash())) << this.ranking.coeff_domlength) + ((256 - (t.urllength() << 8)) << this.ranking.coeff_urllength) + (t.virtualAge() << this.ranking.coeff_date) + (t.wordsintitle()<< this.ranking.coeff_wordsintitle) diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index 107c04686..9c2db013d 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -35,19 +35,20 @@ import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; +import java.util.LinkedHashMap; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; -import java.util.Properties; import java.util.Set; import java.util.TreeMap; import java.util.concurrent.BlockingQueue; import java.util.regex.Pattern; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.MultiProtocolURI; -import net.yacy.cora.document.UTF8; import net.yacy.cora.document.analysis.EnhancedTextProfileSignature; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.federate.solr.Ranking; import net.yacy.cora.federate.solr.SchemaConfiguration; import net.yacy.cora.federate.solr.FailType; @@ -72,7 +73,6 @@ import net.yacy.document.SentenceReader; import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.ImageEntry; import net.yacy.kelondro.data.citation.CitationReference; -import 
net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.index.RowHandleMap; import net.yacy.kelondro.rwi.IndexCell; @@ -204,7 +204,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri * @param doctype * @return the normalized url */ - public String addURIAttributes(final SolrInputDocument doc, final boolean allAttr, final DigestURI digestURI, final char doctype) { + public String addURIAttributes(final SolrInputDocument doc, final boolean allAttr, final DigestURL digestURI, final char doctype) { add(doc, CollectionSchema.id, ASCII.String(digestURI.hash())); String us = digestURI.toNormalform(true); add(doc, CollectionSchema.sku, us); @@ -228,7 +228,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri // path elements of link String filename = digestURI.getFileName(); - String extension = MultiProtocolURI.getFileExtension(filename); + String extension = MultiProtocolURL.getFileExtension(filename); if (allAttr || contains(CollectionSchema.url_chars_i)) add(doc, CollectionSchema.url_chars_i, us.length()); if (allAttr || contains(CollectionSchema.url_protocol_s)) add(doc, CollectionSchema.url_protocol_s, digestURI.getProtocol()); if (allAttr || contains(CollectionSchema.url_paths_sxt)) add(doc, CollectionSchema.url_paths_sxt, digestURI.getPaths()); @@ -357,12 +357,12 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri public SolrVector yacy2solr( final String id, final Map collections, final ResponseHeader responseHeader, - final Document document, final Condenser condenser, final DigestURI referrerURL, final String language, + final Document document, final Condenser condenser, final DigestURL referrerURL, final String language, final IndexCell citations, final WebgraphConfiguration webgraph) { // we use the SolrCell design as index schema SolrVector doc = new SolrVector(); - final DigestURI digestURI = 
document.dc_source(); + final DigestURL digestURI = document.dc_source(); boolean allAttr = this.isEmpty(); String url = addURIAttributes(doc, allAttr, digestURI, Response.docType(digestURI)); @@ -464,12 +464,11 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri } // get list of all links; they will be shrinked by urls that appear in other fields of the solr schema - Set inboundLinks = document.inboundLinks(); - Set outboundLinks = document.outboundLinks(); + LinkedHashMap inboundLinks = document.inboundLinks(); + LinkedHashMap outboundLinks = document.outboundLinks(); Subgraph subgraph = new Subgraph(inboundLinks.size(), outboundLinks.size()); - Map alllinks = document.getAnchors(); - Map images = new HashMap(); + List images = new ArrayList(); int c = 0; final Object parser = document.getParserObject(); boolean containsCanonical = false; @@ -582,18 +581,17 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri if (li.length > 0) add(doc, CollectionSchema.li_txt, li); // images - final Collection imagesc = images.values(); - final ArrayList imgprots = new ArrayList(imagesc.size()); - final Integer[] imgheights = new Integer[imagesc.size()]; - final Integer[] imgwidths = new Integer[imagesc.size()]; - final Integer[] imgpixels = new Integer[imagesc.size()]; - final String[] imgstubs = new String[imagesc.size()]; - final String[] imgalts = new String[imagesc.size()]; + final ArrayList imgprots = new ArrayList(images.size()); + final Integer[] imgheights = new Integer[images.size()]; + final Integer[] imgwidths = new Integer[images.size()]; + final Integer[] imgpixels = new Integer[images.size()]; + final String[] imgstubs = new String[images.size()]; + final String[] imgalts = new String[images.size()]; int withalt = 0; int i = 0; LinkedHashSet images_text_map = new LinkedHashSet(); - for (final ImageEntry ie: imagesc) { - final MultiProtocolURI uri = ie.url(); + for (final ImageEntry ie: images) { + 
final MultiProtocolURL uri = ie.url(); inboundLinks.remove(uri); outboundLinks.remove(uri); imgheights[i] = ie.height(); @@ -613,7 +611,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri } StringBuilder images_text = new StringBuilder(images_text_map.size() * 6 + 1); for (String s: images_text_map) images_text.append(s.trim()).append(' '); - if (allAttr || contains(CollectionSchema.imagescount_i)) add(doc, CollectionSchema.imagescount_i, imagesc.size()); + if (allAttr || contains(CollectionSchema.imagescount_i)) add(doc, CollectionSchema.imagescount_i, images.size()); if (allAttr || contains(CollectionSchema.images_protocol_sxt)) add(doc, CollectionSchema.images_protocol_sxt, protocolList2indexedList(imgprots)); if (allAttr || contains(CollectionSchema.images_urlstub_sxt)) add(doc, CollectionSchema.images_urlstub_sxt, imgstubs); if (allAttr || contains(CollectionSchema.images_alt_sxt)) add(doc, CollectionSchema.images_alt_sxt, imgalts); @@ -625,11 +623,11 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri // style sheets if (allAttr || contains(CollectionSchema.css_tag_sxt)) { - final Map csss = html.getCSS(); + final Map csss = html.getCSS(); final String[] css_tag = new String[csss.size()]; final String[] css_url = new String[csss.size()]; c = 0; - for (final Map.Entry entry: csss.entrySet()) { + for (final Map.Entry entry: csss.entrySet()) { final String cssurl = entry.getKey().toNormalform(false); inboundLinks.remove(cssurl); outboundLinks.remove(cssurl); @@ -646,10 +644,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri // Scripts if (allAttr || contains(CollectionSchema.scripts_sxt)) { - final Set scriptss = html.getScript(); + final Set scriptss = html.getScript(); final String[] scripts = new String[scriptss.size()]; c = 0; - for (final DigestURI u: scriptss) { + for (final DigestURL u: scriptss) { inboundLinks.remove(u); outboundLinks.remove(u); 
scripts[c++] = u.toNormalform(false); @@ -660,10 +658,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri // Frames if (allAttr || contains(CollectionSchema.frames_sxt)) { - final Set framess = html.getFrames(); + final Set framess = html.getFrames(); final String[] frames = new String[framess.size()]; c = 0; - for (final DigestURI u: framess) { + for (final DigestURL u: framess) { inboundLinks.remove(u); outboundLinks.remove(u); frames[c++] = u.toNormalform(false); @@ -677,10 +675,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri // IFrames if (allAttr || contains(CollectionSchema.iframes_sxt)) { - final Set iframess = html.getIFrames(); + final Set iframess = html.getIFrames(); final String[] iframes = new String[iframess.size()]; c = 0; - for (final DigestURI u: iframess) { + for (final DigestURL u: iframess) { inboundLinks.remove(u); outboundLinks.remove(u); iframes[c++] = u.toNormalform(false); @@ -694,7 +692,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri // canonical tag if (allAttr || contains(CollectionSchema.canonical_s)) { - final DigestURI canonical = html.getCanonical(); + final DigestURL canonical = html.getCanonical(); if (canonical != null && !ASCII.String(canonical.hash()).equals(id)) { containsCanonical = true; inboundLinks.remove(canonical); @@ -711,9 +709,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri if (allAttr || contains(CollectionSchema.refresh_s)) { String refresh = html.getRefreshPath(); if (refresh != null && refresh.length() > 0) { - MultiProtocolURI refreshURL; + MultiProtocolURL refreshURL; try { - refreshURL = refresh.startsWith("http") ? new MultiProtocolURI(html.getRefreshPath()) : new MultiProtocolURI(digestURI, html.getRefreshPath()); + refreshURL = refresh.startsWith("http") ? 
new MultiProtocolURL(html.getRefreshPath()) : new MultiProtocolURL(digestURI, html.getRefreshPath()); if (refreshURL != null) { inboundLinks.remove(refreshURL); outboundLinks.remove(refreshURL); @@ -727,8 +725,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri // flash embedded if (allAttr || contains(CollectionSchema.flash_b)) { - MultiProtocolURI[] flashURLs = html.getFlash(); - for (MultiProtocolURI u: flashURLs) { + MultiProtocolURL[] flashURLs = html.getFlash(); + for (MultiProtocolURL u: flashURLs) { // remove all flash links from ibound/outbound links inboundLinks.remove(u); outboundLinks.remove(u); @@ -755,7 +753,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri final String[] ccs = new String[html.getHreflang().size()]; final String[] urls = new String[html.getHreflang().size()]; c = 0; - for (Map.Entry e: html.getHreflang().entrySet()) { + for (Map.Entry e: html.getHreflang().entrySet()) { ccs[c] = e.getKey(); urls[c] = e.getValue().toNormalform(true); c++; @@ -769,7 +767,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri final String[] navs = new String[html.getNavigation().size()]; final String[] urls = new String[html.getNavigation().size()]; c = 0; - for (Map.Entry e: html.getNavigation().entrySet()) { + for (Map.Entry e: html.getNavigation().entrySet()) { navs[c] = e.getKey(); urls[c] = e.getValue().toNormalform(true); c++; @@ -790,7 +788,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri content = digestURI.toTokens(); } - if ((allAttr || contains(CollectionSchema.images_text_t)) && MultiProtocolURI.isImage(MultiProtocolURI.getFileExtension(digestURI.getFileName()))) { + if ((allAttr || contains(CollectionSchema.images_text_t)) && MultiProtocolURL.isImage(MultiProtocolURL.getFileExtension(digestURI.getFileName()))) { add(doc, CollectionSchema.images_text_t, content); // the content may contain the exif data 
from the image parser content = digestURI.toTokens(); // remove all other entry but the url tokens } @@ -816,8 +814,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri // create a subgraph if (!containsCanonical) { // a document with canonical tag should not get a webgraph relation, because that belongs to the canonical document - webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, alllinks, images, true, inboundLinks, citations); - webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, alllinks, images, false, outboundLinks, citations); + webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, images, true, document.getAnchors(), citations); } // list all links @@ -919,7 +916,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri Collection proctags = doc.getFieldValues(CollectionSchema.process_sxt.getSolrFieldName()); try { - DigestURI url = new DigestURI((String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()), ASCII.getBytes((String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()))); + DigestURL url = new DigestURL((String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()), ASCII.getBytes((String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()))); byte[] id = url.hash(); SolrInputDocument sid = this.toSolrInputDocument(doc); @@ -1206,7 +1203,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri * @param httpstatus * @throws IOException */ - public SolrInputDocument err(final DigestURI digestURI, final Map collections, final String failReason, final FailType failType, final int httpstatus) throws IOException { + public SolrInputDocument err(final DigestURL digestURI, final Map collections, final String failReason, final FailType failType, final int httpstatus) throws IOException { boolean allAttr = this.isEmpty(); assert allAttr || 
contains(CollectionSchema.failreason_s); diff --git a/source/net/yacy/search/schema/WebgraphConfiguration.java b/source/net/yacy/search/schema/WebgraphConfiguration.java index e3bf994ef..ad9a1c241 100644 --- a/source/net/yacy/search/schema/WebgraphConfiguration.java +++ b/source/net/yacy/search/schema/WebgraphConfiguration.java @@ -42,8 +42,10 @@ import java.util.regex.Pattern; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrInputDocument; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.id.AnchorURL; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.federate.solr.ProcessType; import net.yacy.cora.federate.solr.SchemaConfiguration; import net.yacy.cora.federate.solr.SchemaDeclaration; @@ -55,7 +57,6 @@ import net.yacy.cora.util.CommonPattern; import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.parser.html.ImageEntry; import net.yacy.kelondro.data.citation.CitationReference; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.rwi.IndexCell; import net.yacy.search.index.Segment; @@ -115,16 +116,15 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial public void addEdges( final Subgraph subgraph, - final DigestURI source, final ResponseHeader responseHeader, Map collections, int clickdepth_source, - final Map alllinks, final Map images, - final boolean inbound, final Set links, + final DigestURL source, final ResponseHeader responseHeader, Map collections, int clickdepth_source, + final List images, final boolean inbound, final Collection links, final IndexCell citations) { boolean allAttr = this.isEmpty(); - for (final DigestURI target_url: links) { + for (final AnchorURL target_url: links) { Set processTypes = new LinkedHashSet(); - final Properties p = alllinks.get(target_url); + final 
Properties p = target_url.getProperties(); if (p == null) continue; final String name = p.getProperty("name", ""); // the name attribute final String text = p.getProperty("text", ""); // the text between the tag @@ -186,7 +186,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial } if (allAttr || contains(WebgraphSchema.source_file_ext_s) || contains(WebgraphSchema.source_file_name_s)) { String source_file_name = source.getFileName(); - String source_file_ext = MultiProtocolURI.getFileExtension(source_file_name); + String source_file_ext = MultiProtocolURL.getFileExtension(source_file_name); add(edge, WebgraphSchema.source_file_name_s, source_file_name.toLowerCase().endsWith("." + source_file_ext) ? source_file_name.substring(0, source_file_name.length() - source_file_ext.length() - 1) : source_file_name); add(edge, WebgraphSchema.source_file_ext_s, source_file_ext); } @@ -209,7 +209,11 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial if (allAttr || contains(WebgraphSchema.target_linktext_t)) add(edge, WebgraphSchema.target_linktext_t, text.length() > 0 ? text : ""); if (allAttr || contains(WebgraphSchema.target_linktext_charcount_i)) add(edge, WebgraphSchema.target_linktext_charcount_i, text.length()); if (allAttr || contains(WebgraphSchema.target_linktext_wordcount_i)) add(edge, WebgraphSchema.target_linktext_wordcount_i, text.length() > 0 ? CommonPattern.SPACE.split(text).length : 0); - ImageEntry ientry = images.get(target_url); + + ImageEntry ientry = null; + for (ImageEntry ie: images) { + if (ie.linkurl() != null && ie.linkurl().equals(target_url)) {ientry = ie; break;} + } String alttext = ientry == null ? 
"" : ientry.alt(); if (allAttr || contains(WebgraphSchema.target_alt_t)) add(edge, WebgraphSchema.target_alt_t, alttext); if (allAttr || contains(WebgraphSchema.target_alt_charcount_i)) add(edge, WebgraphSchema.target_alt_charcount_i, alttext.length()); @@ -248,7 +252,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial } if (allAttr || contains(WebgraphSchema.target_file_ext_s) || contains(WebgraphSchema.target_file_name_s)) { String target_file_name = target_url.getFileName(); - String target_file_ext = MultiProtocolURI.getFileExtension(target_file_name); + String target_file_ext = MultiProtocolURL.getFileExtension(target_file_name); add(edge, WebgraphSchema.target_file_name_s, target_file_name.toLowerCase().endsWith("." + target_file_ext) ? target_file_name.substring(0, target_file_name.length() - target_file_ext.length() - 1) : target_file_name); add(edge, WebgraphSchema.target_file_ext_s, target_file_ext); } @@ -295,7 +299,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial SolrDocument doc; String protocol, urlstub, id; - DigestURI url; + DigestURL url; int proccount = 0, proccount_clickdepthchange = 0; try { while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { @@ -313,14 +317,14 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial protocol = (String) doc.getFieldValue(WebgraphSchema.source_protocol_s.getSolrFieldName()); urlstub = (String) doc.getFieldValue(WebgraphSchema.source_urlstub_s.getSolrFieldName()); id = (String) doc.getFieldValue(WebgraphSchema.source_id_s.getSolrFieldName()); - url = new DigestURI(protocol + "://" + urlstub, ASCII.getBytes(id)); + url = new DigestURL(protocol + "://" + urlstub, ASCII.getBytes(id)); if (postprocessing_clickdepth(segment, doc, sid, url, WebgraphSchema.source_clickdepth_i)) proccount_clickdepthchange++; } if (this.contains(WebgraphSchema.target_protocol_s) && 
this.contains(WebgraphSchema.target_urlstub_s) && this.contains(WebgraphSchema.target_id_s)) { protocol = (String) doc.getFieldValue(WebgraphSchema.target_protocol_s.getSolrFieldName()); urlstub = (String) doc.getFieldValue(WebgraphSchema.target_urlstub_s.getSolrFieldName()); id = (String) doc.getFieldValue(WebgraphSchema.target_id_s.getSolrFieldName()); - url = new DigestURI(protocol + "://" + urlstub, ASCII.getBytes(id)); + url = new DigestURL(protocol + "://" + urlstub, ASCII.getBytes(id)); if (postprocessing_clickdepth(segment, doc, sid, url, WebgraphSchema.target_clickdepth_i)) proccount_clickdepthchange++; } } diff --git a/source/net/yacy/search/snippet/MediaSnippet.java b/source/net/yacy/search/snippet/MediaSnippet.java index 988c37e6a..0ac454cde 100644 --- a/source/net/yacy/search/snippet/MediaSnippet.java +++ b/source/net/yacy/search/snippet/MediaSnippet.java @@ -35,9 +35,10 @@ import java.util.SortedMap; import java.util.SortedSet; import java.util.TreeSet; -import net.yacy.cora.document.ASCII; import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.document.analysis.Classification.ContentDomain; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.order.Base64Order; import net.yacy.cora.protocol.ClientIdentification; @@ -52,7 +53,6 @@ import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.WordTokenizer; import net.yacy.document.parser.html.ImageEntry; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.index.RowHandleSet; import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.search.Switchboard; @@ -60,13 +60,13 @@ import net.yacy.search.Switchboard; public class MediaSnippet implements Comparable, Comparator { public ContentDomain type; - public DigestURI href, source; + public DigestURL href, source; public String name, attr, mime; public long ranking; 
public int width, height; public long fileSize; - public MediaSnippet(final ContentDomain type, final DigestURI href, final String mime, final String name, final long fileSize, final String attr, final long ranking, final DigestURI source) { + public MediaSnippet(final ContentDomain type, final DigestURL href, final String mime, final String name, final long fileSize, final String attr, final long ranking, final DigestURL source) { this.type = type; this.href = href; this.mime = mime; @@ -86,7 +86,7 @@ public class MediaSnippet implements Comparable, Comparator, Comparator retrieveMediaSnippets(final DigestURI url, final HandleSet queryhashes, final Classification.ContentDomain mediatype, final CacheStrategy cacheStrategy, final boolean reindexing) { + public static List retrieveMediaSnippets(final DigestURL url, final HandleSet queryhashes, final Classification.ContentDomain mediatype, final CacheStrategy cacheStrategy, final boolean reindexing) { if (queryhashes.isEmpty()) { ConcurrentLog.fine("snippet fetch", "no query hashes given for url " + url); return new ArrayList(); @@ -161,18 +161,18 @@ public class MediaSnippet implements Comparable, Comparator computeMediaSnippets(final DigestURI source, final Document document, final HandleSet queryhashes, final ContentDomain mediatype) { + public static List computeMediaSnippets(final DigestURL source, final Document document, final HandleSet queryhashes, final ContentDomain mediatype) { if (document == null) return new ArrayList(); - Map media = null; + Map media = null; if (mediatype == ContentDomain.AUDIO) media = document.getAudiolinks(); else if (mediatype == ContentDomain.VIDEO) media = document.getVideolinks(); else if (mediatype == ContentDomain.APP) media = document.getApplinks(); if (media == null) return null; - final Iterator> i = media.entrySet().iterator(); - Map.Entry entry; - DigestURI url; + final Iterator> i = media.entrySet().iterator(); + Map.Entry entry; + DigestURL url; String desc; final List 
result = new ArrayList(); while (i.hasNext()) { @@ -189,7 +189,7 @@ public class MediaSnippet implements Comparable, Comparator computeImageSnippets(final DigestURI source, final Document document, final HandleSet queryhashes) { + public static List computeImageSnippets(final DigestURL source, final Document document, final HandleSet queryhashes) { final SortedSet images = new TreeSet(); images.addAll(document.getImages().values()); // iterates images in descending size order! @@ -197,7 +197,7 @@ public class MediaSnippet implements Comparable, Comparator i = images.iterator(); ImageEntry ientry; - DigestURI url; + DigestURL url; String desc; final List result = new ArrayList(); while (i.hasNext()) { @@ -253,7 +253,7 @@ public class MediaSnippet implements Comparable, Comparator, Comparator, Comparator, Comparator conProp, final DigestURI url, final RequestHeader requestHeader, final ResponseHeader cachedResponseHeader, final OutputStream respond, final ClientIdentification.Agent agent) { + private static void fulfillRequestFromWeb(final HashMap conProp, final DigestURL url, final RequestHeader requestHeader, final ResponseHeader cachedResponseHeader, final OutputStream respond, final ClientIdentification.Agent agent) { try { final boolean proxyAugmentation = sb.getConfigBool("proxyAugmentation", false); final int reqID = requestHeader.hashCode(); @@ -695,7 +695,7 @@ public final class HTTPDProxyHandler { private static void fulfillRequestFromCache( final HashMap conProp, - final DigestURI url, + final DigestURL url, final RequestHeader requestHeader, final ResponseHeader cachedResponseHeader, final byte[] cacheEntry, @@ -764,7 +764,7 @@ public final class HTTPDProxyHandler { public static void doHead(final HashMap conProp, final RequestHeader requestHeader, OutputStream respond, final ClientIdentification.Agent agent) { // ResponseContainer res = null; - DigestURI url = null; + DigestURL url = null; try { final int reqID = requestHeader.hashCode(); // remembering 
the starting time of the request @@ -790,7 +790,7 @@ public final class HTTPDProxyHandler { } try { - url = new DigestURI("http", host, port, (args == null) ? path : path + "?" + args); + url = new DigestURL("http", host, port, (args == null) ? path : path + "?" + args); } catch (final MalformedURLException e) { final String errorMsg = "ERROR: internal error with url generation: host=" + host + ", port=" + port + ", path=" + path + ", args=" + args; @@ -881,7 +881,7 @@ public final class HTTPDProxyHandler { assert conProp != null : "precondition violated: conProp != null"; assert requestHeader != null : "precondition violated: requestHeader != null"; assert body != null : "precondition violated: body != null"; - DigestURI url = null; + DigestURL url = null; ByteCountOutputStream countedRespond = null; try { final int reqID = requestHeader.hashCode(); @@ -908,7 +908,7 @@ public final class HTTPDProxyHandler { } try { - url = new DigestURI("http", host, port, (args == null) ? path : path + "?" + args); + url = new DigestURL("http", host, port, (args == null) ? path : path + "?" + args); } catch (final MalformedURLException e) { final String errorMsg = "ERROR: internal error with url generation: host=" + host + ", port=" + port + ", path=" + path + ", args=" + args; @@ -1358,7 +1358,7 @@ public final class HTTPDProxyHandler { } } - private static void handleProxyException(final Exception e, final HashMap conProp, final OutputStream respond, final DigestURI url) { + private static void handleProxyException(final Exception e, final HashMap conProp, final OutputStream respond, final DigestURL url) { // this may happen if // - the targeted host does not exist // - anything with the remote server was wrong. 
diff --git a/source/net/yacy/server/http/HTTPDemon.java b/source/net/yacy/server/http/HTTPDemon.java index 451fd14de..de5274283 100644 --- a/source/net/yacy/server/http/HTTPDemon.java +++ b/source/net/yacy/server/http/HTTPDemon.java @@ -52,8 +52,9 @@ import java.util.concurrent.ConcurrentMap; import java.util.regex.Pattern; import java.util.zip.GZIPInputStream; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.order.Base64Order; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.Domains; @@ -65,7 +66,6 @@ import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.NumberTools; import net.yacy.data.UserDB; import net.yacy.document.parser.html.CharacterCoding; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.MemoryControl; import net.yacy.search.Switchboard; @@ -1071,7 +1071,7 @@ public final class HTTPDemon implements serverHandler, Cloneable { String urlString; try { - urlString = (new DigestURI((method.equals(HeaderFramework.METHOD_CONNECT)?"https":"http"), host, port, (args == null) ? path : path + "?" + args)).toString(); + urlString = (new DigestURL((method.equals(HeaderFramework.METHOD_CONNECT)?"https":"http"), host, port, (args == null) ? path : path + "?" 
+ args)).toString(); } catch (final MalformedURLException e) { urlString = "invalid URL"; } diff --git a/source/net/yacy/server/http/ServerSideIncludes.java b/source/net/yacy/server/http/ServerSideIncludes.java index b452fa59e..e2a848f33 100644 --- a/source/net/yacy/server/http/ServerSideIncludes.java +++ b/source/net/yacy/server/http/ServerSideIncludes.java @@ -31,7 +31,7 @@ import java.io.IOException; import java.io.OutputStream; import java.util.HashMap; -import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.ByteBuffer; diff --git a/source/net/yacy/server/http/TemplateEngine.java b/source/net/yacy/server/http/TemplateEngine.java index 4d9bb0a87..41074eb83 100644 --- a/source/net/yacy/server/http/TemplateEngine.java +++ b/source/net/yacy/server/http/TemplateEngine.java @@ -56,8 +56,8 @@ import java.io.InputStreamReader; import java.io.OutputStream; import java.io.PushbackInputStream; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.util.ByteBuffer; import net.yacy.cora.util.ConcurrentLog; import net.yacy.kelondro.util.FileUtils; diff --git a/source/net/yacy/server/serverCore.java b/source/net/yacy/server/serverCore.java index 2053094b8..2f1d962f7 100644 --- a/source/net/yacy/server/serverCore.java +++ b/source/net/yacy/server/serverCore.java @@ -58,7 +58,7 @@ import javax.net.ssl.SSLContext; import javax.net.ssl.SSLSocket; import javax.net.ssl.SSLSocketFactory; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.protocol.Domains; import net.yacy.cora.util.ByteBuffer; import net.yacy.cora.util.ConcurrentLog; diff --git a/source/net/yacy/server/serverObjects.java b/source/net/yacy/server/serverObjects.java index 
f844d8f28..62310a801 100644 --- a/source/net/yacy/server/serverObjects.java +++ b/source/net/yacy/server/serverObjects.java @@ -59,8 +59,8 @@ import java.util.Map; import java.util.Set; import java.util.regex.Pattern; -import net.yacy.cora.document.MultiProtocolURI; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader.FileType; import net.yacy.document.parser.html.CharacterCoding; @@ -531,9 +531,9 @@ public class serverObjects implements Serializable, Cloneable { if (this.map.getMap().isEmpty()) return ""; final StringBuilder param = new StringBuilder(this.map.getMap().size() * 40); for (final Map.Entry entry: entrySet()) { - param.append(MultiProtocolURI.escape(entry.getKey())) + param.append(MultiProtocolURL.escape(entry.getKey())) .append('=') - .append(MultiProtocolURI.escape(entry.getValue())) + .append(MultiProtocolURL.escape(entry.getValue())) .append('&'); } param.setLength(param.length() - 1); diff --git a/source/net/yacy/utils/bitfield.java b/source/net/yacy/utils/bitfield.java index 81c1478ce..ea541245f 100644 --- a/source/net/yacy/utils/bitfield.java +++ b/source/net/yacy/utils/bitfield.java @@ -20,7 +20,7 @@ package net.yacy.utils; -import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.encoding.ASCII; public class bitfield { diff --git a/source/net/yacy/utils/cryptbig.java b/source/net/yacy/utils/cryptbig.java index 3f356df25..cb077d29f 100644 --- a/source/net/yacy/utils/cryptbig.java +++ b/source/net/yacy/utils/cryptbig.java @@ -55,7 +55,7 @@ import javax.crypto.SecretKeyFactory; import javax.crypto.spec.PBEKeySpec; import javax.crypto.spec.PBEParameterSpec; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.Digest; import net.yacy.cora.util.CommonPattern; diff --git 
a/source/net/yacy/utils/gzip.java b/source/net/yacy/utils/gzip.java index 3fefbbec0..11b01fef6 100644 --- a/source/net/yacy/utils/gzip.java +++ b/source/net/yacy/utils/gzip.java @@ -38,7 +38,7 @@ import java.io.OutputStream; import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; -import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.util.ConcurrentLog; diff --git a/source/net/yacy/utils/loaderThreads.java b/source/net/yacy/utils/loaderThreads.java index 574e7210f..0deaa9960 100644 --- a/source/net/yacy/utils/loaderThreads.java +++ b/source/net/yacy/utils/loaderThreads.java @@ -27,9 +27,9 @@ package net.yacy.utils; import java.util.HashMap; import java.util.Map; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.http.ProxySettings; -import net.yacy.kelondro.data.meta.DigestURI; public class loaderThreads { @@ -60,7 +60,7 @@ public class loaderThreads { this.failed = 0; } - public void newThread(final String name, final DigestURI url, final loaderProcess process, final ClientIdentification.Agent agent) { + public void newThread(final String name, final DigestURL url, final loaderProcess process, final ClientIdentification.Agent agent) { final Thread t = new loaderThread(url, process, agent); this.threads.put(name, t); t.start(); @@ -103,14 +103,14 @@ public class loaderThreads { } protected class loaderThread extends Thread { - private final DigestURI url; + private final DigestURL url; private Exception error; private final loaderProcess process; private byte[] page; private boolean loaded; final ClientIdentification.Agent agent; - public loaderThread(final DigestURI url, final loaderProcess process, final ClientIdentification.Agent agent) { + public loaderThread(final DigestURL url, final loaderProcess process, final ClientIdentification.Agent agent) { this.url = url; this.process = process; this.error = null;