diff --git a/source/de/anomic/document/parser/swfParser.java b/source/de/anomic/document/parser/swfParser.java index b64cef0a1..c216a944d 100644 --- a/source/de/anomic/document/parser/swfParser.java +++ b/source/de/anomic/document/parser/swfParser.java @@ -27,6 +27,7 @@ package de.anomic.document.parser; +import java.io.IOException; import java.io.InputStream; import java.util.HashMap; import java.util.HashSet; @@ -80,9 +81,14 @@ public class swfParser extends AbstractParser implements Idiom { contents = swf2html.convertSWFToHTML(source); } catch (NegativeArraySizeException e) { // seen in log + return null; + } catch (IOException e) { + e.printStackTrace(); + return null; } catch (Exception e) { // we have seen a lot of OOM errors in the parser... e.printStackTrace(); + return null; } String url = null; String urlnr = null; diff --git a/source/de/anomic/kelondro/table/SplitTable.java b/source/de/anomic/kelondro/table/SplitTable.java index 69565809e..c65384ff2 100644 --- a/source/de/anomic/kelondro/table/SplitTable.java +++ b/source/de/anomic/kelondro/table/SplitTable.java @@ -283,6 +283,7 @@ public class SplitTable implements ObjectIndex { private ObjectIndex checkTable(ObjectIndex table) { // check size and age of given table; in case it is too large or too old // create a new table + assert table != null; String name = new File(table.filename()).getName(); long d; try { diff --git a/source/de/anomic/kelondro/text/DocumentIndex.java b/source/de/anomic/kelondro/text/DocumentIndex.java new file mode 100644 index 000000000..84526e665 --- /dev/null +++ b/source/de/anomic/kelondro/text/DocumentIndex.java @@ -0,0 +1,192 @@ +// DocumentIndex.java +// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany +// first published 14.09.2009 on http://yacy.net; +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2009-05-28 01:51:34 +0200 (Do, 28 Mai 2009) $ +// $LastChangedRevision: 5988 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +package de.anomic.kelondro.text; + +import java.io.File; +import java.io.IOException; +import java.util.Date; + +import de.anomic.document.Condenser; +import de.anomic.document.Document; +import de.anomic.document.Parser; +import de.anomic.document.ParserException; +import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; +import de.anomic.search.QueryParams; +import de.anomic.search.RankingProfile; +import de.anomic.search.ResultEntry; +import de.anomic.search.SearchEvent; +import de.anomic.search.SearchEventCache; +import de.anomic.yacy.yacyURL; +import de.anomic.yacy.logging.Log; + +/** + * convenience class to access the yacycore library from outside of yacy to put files into the index + * @author Michael Christen + * + */ +public class DocumentIndex extends Segment { + + private RankingProfile textRankingDefault = new RankingProfile(QueryParams.CONTENTDOM_TEXT); + //private Bitfield zeroConstraint = new Bitfield(4); + + public DocumentIndex(Log log, final File segmentPath) throws IOException { + super(log, segmentPath, 100000, targetFileSize * 4 - 1, false, false); + } + + public DocumentIndex(final File segmentPath) throws IOException { + this(new Log("DocumentIndex"), segmentPath); + } + + /** + * put a single file into the index + * @param file + * @return a metadata object that has been generated to identify the file + * @throws IOException in case that the file does not exist or cannot be parsed + */ + public URLMetadataRow add(File file) throws IOException { + if (file == null) throw new IOException("file = null"); + if (file.isDirectory()) throw new IOException("file should be a document, not a path"); + if (!file.canRead()) throw new IOException("cannot read file"); + yacyURL url = new yacyURL("file:" + file.getAbsolutePath()); + Document document; + try { + document = Parser.parseSource(url, null, null, file); + } catch (InterruptedException e) { + throw new IOException("cannot parse " + file.toString() + ": " + e.getMessage()); + } catch (ParserException e) { + throw new IOException("cannot parse " + file.toString() + ": " + e.getMessage()); + } + final Condenser condenser = new Condenser(document, true, true); + return super.storeDocument( + url, + null, + new Date(file.lastModified()), + file.length(), + document, + condenser + ); + } + + /** + * add a file or a directory of files to the index + * If the given file is a path to a directory, the complete sub-tree is indexed + * @param start + */ + public void addAll(File start) { + assert (start != null); + assert (start.canRead()); + if (!start.isDirectory()) { + try { + add(start); + } catch (IOException e) { + e.printStackTrace(); + } + return; + } + String[] s = start.list(); + File w; + for (String t: s) { + w = new File(start, t); + if (w.canRead() && ! w.isHidden()) { + if (w.isDirectory()) { + addAll(w); + } else { + try { + add(w); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + } + } + + /** + * do a full-text search of a given string and return a specific number of results + * @param querystring + * @param pos + * @param count + * @return a list of files that contain the given string + */ + public File[] find(String querystring, int pos, int count) { + QueryParams query = new QueryParams(querystring, 100, textRankingDefault, null); + SearchEvent se = SearchEventCache.getEvent(query, this, null, null, null, false); + File[] result = new File[count]; + ResultEntry re; + for (int i = 0; i < count; i++) { + re = se.oneResult(pos + i); + result[i] = (re == null) ? null : re.url().getLocalFile(); + } + return result; + } + + /** + * find the given string and return 20 hits + * @param querystring + * @return a list of files that contain the word + */ + public File[] find(String querystring) { + return find(querystring, 0, 20); + } + + public static void main(String[] args) { + // first argument: path to segment + // second argument: either 'add' or 'search' + // third and more arguments exists only in case that second argument is 'search': these are then the search words + // + // example: + // DocumentIndex yacyindex add test/parsertest + // DocumentIndex yacyindex search steht + System.setProperty("java.awt.headless", "true"); + if (args.length < 3) return; + File segmentPath = new File(args[0]); + System.out.println("using index files at " + segmentPath.getAbsolutePath()); + try { + if (args[1].equals("add")) { + File f = new File(args[2]); + DocumentIndex di = new DocumentIndex(segmentPath); + di.addAll(f); + di.close(); + } else { + String query = ""; + for (int i = 2; i < args.length; i++) query += args[i]; + query.trim(); + DocumentIndex di = new DocumentIndex(segmentPath); + File[] results = di.find(query); + for (File f: results) { + if (f != null) System.out.println(f.toString()); + } + di.close(); + } + } catch (IOException e) { + e.printStackTrace(); + } + //System.exit(0); + } + +} diff --git a/source/de/anomic/kelondro/text/IODispatcher.java b/source/de/anomic/kelondro/text/IODispatcher.java index 39a481b2f..4d24458a0 100644 --- a/source/de/anomic/kelondro/text/IODispatcher.java +++ b/source/de/anomic/kelondro/text/IODispatcher.java @@ -82,7 +82,7 @@ public class IODispatcher extends Thread { public synchronized void dump(ReferenceContainerCache cache, File file, ReferenceContainerArray array) { if (dumpQueue == null || controlQueue == null || !this.isAlive()) { Log.logWarning("IODispatcher", "emergency dump of file " + file.getName()); - cache.dump(file, (int) Math.min(MemoryControl.available() / 3, writeBufferSize)); + if (cache.size() > 0) cache.dump(file, (int) Math.min(MemoryControl.available() / 3, writeBufferSize)); } else { DumpJob job = (DumpJob)new DumpJob(cache, file, array); try { @@ -204,7 +204,7 @@ public class IODispatcher extends Thread { } public void dump() { try { - cache.dump(file, (int) Math.min(MemoryControl.available() / 3, writeBufferSize)); + if (cache.size() > 0) cache.dump(file, (int) Math.min(MemoryControl.available() / 3, writeBufferSize)); array.mountBLOBFile(file); } catch (IOException e) { e.printStackTrace(); diff --git a/source/de/anomic/kelondro/text/IndexCell.java b/source/de/anomic/kelondro/text/IndexCell.java index 04f2924cc..451cb4ae3 100644 --- a/source/de/anomic/kelondro/text/IndexCell.java +++ b/source/de/anomic/kelondro/text/IndexCell.java @@ -283,7 +283,7 @@ public final class IndexCell extends AbstractBu * and is composed of the current date and the cell salt */ public synchronized void close() { - this.ram.dump(this.array.newContainerBLOBFile(), (int) Math.min(MemoryControl.available() / 3, writeBufferSize)); + if (this.ram.size() > 0) this.ram.dump(this.array.newContainerBLOBFile(), (int) Math.min(MemoryControl.available() / 3, writeBufferSize)); // close all this.ram.close(); this.array.close(); diff --git a/source/de/anomic/kelondro/text/Segment.java b/source/de/anomic/kelondro/text/Segment.java index e509b946b..7aefb8bca 100644 --- a/source/de/anomic/kelondro/text/Segment.java +++ b/source/de/anomic/kelondro/text/Segment.java @@ -1,5 +1,5 @@ // Segment.java -// (C) 2005-209 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany +// (C) 2005-2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany // first published 2005 on http://yacy.net; full redesign for segments 28.5.2009 // // This is a part of YaCy, a peer-to-peer based web search engine @@ -45,7 +45,6 @@ import de.anomic.kelondro.order.ByteOrder; import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; import de.anomic.kelondro.text.navigationPrototype.NavigationReference; import de.anomic.kelondro.text.navigationPrototype.NavigationReferenceFactory; -import de.anomic.kelondro.text.navigationPrototype.NavigationReferenceRow; import de.anomic.kelondro.text.referencePrototype.WordReference; import de.anomic.kelondro.text.referencePrototype.WordReferenceFactory; import de.anomic.kelondro.text.referencePrototype.WordReferenceRow; @@ -54,7 +53,7 @@ import de.anomic.tools.iso639; import de.anomic.yacy.yacyURL; import de.anomic.yacy.logging.Log; -public final class Segment { +public class Segment { // environment constants public static final long wCacheMaxAge = 1000 * 60 * 30; // milliseconds; 30 minutes @@ -70,7 +69,7 @@ public final class Segment { private final Log log; protected final IndexCell termIndex; - private final IndexCell authorNavIndex; + //private final IndexCell authorNavIndex; protected final MetadataRepository urlMetadata; private final File segmentPath; private final IODispatcher merger; @@ -100,7 +99,7 @@ public final class Segment { maxFileSize, this.merger, writeBufferSize); - + /* this.authorNavIndex = new IndexCell( new File(new File(segmentPath, "nav_author"), "idx"), navigationReferenceFactory, @@ -111,7 +110,7 @@ public final class Segment { maxFileSize, this.merger, writeBufferSize); - + */ File metadatadir = new File(segmentPath, "METADATA"); if (!metadatadir.exists()) metadatadir.mkdirs(); @@ -221,14 +220,14 @@ public final class Segment { if (language == null) { // no statistics available, we take either the metadata (if given) or the TLD language = (bymetadata == null) ? url.language() : bymetadata; - System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + url + " FAILED, taking " + ((bymetadata == null) ? "TLD" : "metadata") + ": " + language); + if (log.isFine()) log.logFine("LANGUAGE-BY-STATISTICS: " + url + " FAILED, taking " + ((bymetadata == null) ? "TLD" : "metadata") + ": " + language); } else { if (bymetadata == null) { // two possible results: compare and report conflicts if (language.equals(url.language())) - System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + url + " CONFIRMED - TLD IDENTICAL: " + language); + if (log.isFine()) log.logFine("LANGUAGE-BY-STATISTICS: " + url + " CONFIRMED - TLD IDENTICAL: " + language); else { - String error = "*** DEBUG LANGUAGE-BY-STATISTICS: " + url + " CONFLICTING: " + language + " (the language given by the TLD is " + url.language() + ")"; + String error = "LANGUAGE-BY-STATISTICS: " + url + " CONFLICTING: " + language + " (the language given by the TLD is " + url.language() + ")"; // see if we have a hint in the url that the statistic was right String u = url.toNormalform(true, false).toLowerCase(); if (!u.contains("/" + language + "/") && !u.contains("/" + iso639.country(language).toLowerCase() + "/")) { @@ -243,14 +242,14 @@ public final class Segment { } else { // here we have three results: we can do a voting if (language.equals(bymetadata)) { - //System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - METADATA IDENTICAL: " + language); + //if (log.isFine()) log.logFine("LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - METADATA IDENTICAL: " + language); } else if (language.equals(url.language())) { - //System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - TLD IS IDENTICAL: " + language); + //if (log.isFine()) log.logFine("LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - TLD IS IDENTICAL: " + language); } else if (bymetadata.equals(url.language())) { - //System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: " + language + " BUT METADATA AND TLD ARE IDENTICAL: " + bymetadata + ")"); + //if (log.isFine()) log.logFine("LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: " + language + " BUT METADATA AND TLD ARE IDENTICAL: " + bymetadata + ")"); language = bymetadata; } else { - //System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: ALL DIFFERENT! statistic: " + language + ", metadata: " + bymetadata + ", TLD: + " + entry.url().language() + ". taking metadata."); + //if (log.isFine()) log.logFine("LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: ALL DIFFERENT! statistic: " + language + ", metadata: " + bymetadata + ", TLD: + " + entry.url().language() + ". taking metadata."); language = bymetadata; } } diff --git a/source/de/anomic/search/ResultEntry.java b/source/de/anomic/search/ResultEntry.java index 4197276e2..2de572de9 100644 --- a/source/de/anomic/search/ResultEntry.java +++ b/source/de/anomic/search/ResultEntry.java @@ -69,7 +69,7 @@ public class ResultEntry { this.dbRetrievalTime = dbRetrievalTime; this.snippetComputationTime = snippetComputationTime; final String host = urlcomps.url().getHost(); - if (host.endsWith(".yacyh")) { + if (host != null && host.endsWith(".yacyh")) { // translate host into current IP int p = host.indexOf("."); final String hash = yacySeed.hexHash2b64Hash(host.substring(p + 1, host.length() - 6)); diff --git a/source/de/anomic/search/SearchEventCache.java b/source/de/anomic/search/SearchEventCache.java index 3d1c1d53a..ee0947eda 100644 --- a/source/de/anomic/search/SearchEventCache.java +++ b/source/de/anomic/search/SearchEventCache.java @@ -75,7 +75,7 @@ public class SearchEventCache { String id = query.id(false); SearchEvent event = SearchEventCache.lastEvents.get(id); - if (Switchboard.getSwitchboard().crawlQueues.noticeURL.size() > 0 && event != null && System.currentTimeMillis() - event.getEventTime() > 60000) { + if (Switchboard.getSwitchboard() != null && Switchboard.getSwitchboard().crawlQueues.noticeURL.size() > 0 && event != null && System.currentTimeMillis() - event.getEventTime() > 60000) { // if a local crawl is ongoing, don't use the result from the cache to use possibly more results that come from the current crawl // to prevent that this happens during a person switches between the different result pages, a re-search happens no more than // once a minute diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index 6a8be4eb8..214208fdc 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -208,9 +208,9 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi public static long lastPPMUpdate = System.currentTimeMillis()- 30000; // colored list management - public static TreeSet badwords = null; + public static TreeSet badwords = new TreeSet(); + public static TreeSet stopwords = new TreeSet(); public static TreeSet blueList = null; - public static TreeSet stopwords = null; public static TreeSet badwordHashes = null; public static TreeSet blueListHashes = null; public static TreeSet stopwordHashes = null; diff --git a/source/de/anomic/server/serverDomains.java b/source/de/anomic/server/serverDomains.java index 3b036b668..7d876b133 100644 --- a/source/de/anomic/server/serverDomains.java +++ b/source/de/anomic/server/serverDomains.java @@ -530,6 +530,7 @@ public class serverDomains { } public static int getDomainID(final String host) { + if (host == null) return TLD_Local_ID; final int p = host.lastIndexOf('.'); String tld = ""; if (p > 0) { diff --git a/source/de/anomic/tools/Punycode.java b/source/de/anomic/tools/Punycode.java index 8b3916729..60227b23e 100644 --- a/source/de/anomic/tools/Punycode.java +++ b/source/de/anomic/tools/Punycode.java @@ -223,6 +223,7 @@ public class Punycode // the following method has been added by Michael Christen public static boolean isBasic(final String input) { + if (input == null) return true; for (int j = 0; j < input.length(); j++) { if (!isBasic(input.charAt(j))) return false; } diff --git a/source/de/anomic/yacy/yacyURL.java b/source/de/anomic/yacy/yacyURL.java index fa8421326..00dde31c6 100644 --- a/source/de/anomic/yacy/yacyURL.java +++ b/source/de/anomic/yacy/yacyURL.java @@ -67,8 +67,12 @@ public class yacyURL implements Serializable { return (url == null) ? null : url.hash().substring(6); } + public yacyURL(final File file) throws MalformedURLException { + this("file", "", -1, file.getAbsolutePath()); + } + public yacyURL(final String url) throws MalformedURLException { - this(url, null); + this(url, null); } public yacyURL(final String url, final String hash) throws MalformedURLException { @@ -89,7 +93,7 @@ public class yacyURL implements Serializable { } this.protocol = url.substring(0, p).toLowerCase().trim(); if (url.length() < p + 4) throw new MalformedURLException("URL not parseable: '" + url + "'"); - if (url.substring(p + 1, p + 3).equals("//")) { + if (!this.protocol.equals("file") && url.substring(p + 1, p + 3).equals("//")) { // identify host, userInfo and file for http and ftp protocol final int q = url.indexOf('/', p + 3); int r; @@ -112,7 +116,7 @@ public class yacyURL implements Serializable { } path = url.substring(q); } - if (host.length() < 4) throw new MalformedURLException("host too short: '" + host + "'"); + if (host.length() < 4 && !protocol.equals("file")) throw new MalformedURLException("host too short: '" + host + "'"); if (host.indexOf('&') >= 0) throw new MalformedURLException("invalid '&' in host"); path = resolveBackpath(path); identPort(url, (protocol.equals("http") ? 80 : ((protocol.equals("https")) ? 443 : ((protocol.equals("ftp")) ? 21 : -1)))); @@ -133,6 +137,46 @@ public class yacyURL implements Serializable { port = -1; quest = null; ref = null; + } if (protocol.equals("file")) { + // parse file url + String h = url.substring(p + 1); + if (h.startsWith("//")) { + // host may be given, but may be also empty + final int q = h.indexOf('/', 2); + if (q <= 0) { + // no host given + host = null; + path = h.substring(2); + } else { + host = h.substring(2, q); + if (host.length() == 0 || host.equals("localhost")) host = null; + h = h.substring(q); + char c = h.charAt(2); + if (c == ':' || c == '|') + path = h.substring(1); + else + path = h; + } + } else { + host = null; + if (h.startsWith("/")) { + char c = h.charAt(2); + if (c == ':' || c == '|') + path = h.substring(1); + else + path = h; + } else { + char c = h.charAt(1); + if (c == ':' || c == '|') + path = h; + else + path = "/" + h; + } + } + userInfo = null; + port = -1; + quest = null; + ref = null; } else { throw new MalformedURLException("unknown protocol: " + url); } @@ -144,24 +188,20 @@ public class yacyURL implements Serializable { StringBuilder buffer = new StringBuilder(); // encode each domainpart seperately for(int i=0; i= 0) && (host != null)) ? host + ":" + port : ((host != null) ? host : ""); } @@ -562,7 +616,7 @@ public class yacyURL implements Serializable { } public void removeRef() { - ref = null; + ref = null; } public String getUserInfo() { @@ -596,13 +650,16 @@ public class yacyURL implements Serializable { if (this.port < 0 || this.port == 21) { defaultPort = true; } } else if (this.protocol.equals("https")) { if (this.port < 0 || this.port == 443) { defaultPort = true; } + } else if (this.protocol.equals("file")) { + defaultPort = true; } final String path = this.getFile(includeReference); if (defaultPort) { - return this.protocol + "://" + - ((this.userInfo != null) ? (this.userInfo + "@") : ("")) + - this.getHost().toLowerCase() + path; + return + this.protocol + ":" + + ((this.getHost() == null) ? "" : "//" + ((this.userInfo != null) ? (this.userInfo + "@") : ("")) + this.getHost().toLowerCase()) + + path; } return this.protocol + "://" + ((this.userInfo != null) ? (this.userInfo + "@") : ("")) + @@ -610,78 +667,78 @@ public class yacyURL implements Serializable { } /* (non-Javadoc) - * @see java.lang.Object#hashCode() - */ - @Override - public int hashCode() { - final int prime = 31; - int result = 1; - result = prime * result + ((host == null) ? 0 : host.hashCode()); - result = prime * result + ((path == null) ? 0 : path.hashCode()); - result = prime * result + port; - result = prime * result - + ((protocol == null) ? 0 : protocol.hashCode()); - result = prime * result + ((quest == null) ? 0 : quest.hashCode()); - result = prime * result + ((ref == null) ? 0 : ref.hashCode()); - result = prime * result - + ((userInfo == null) ? 0 : userInfo.hashCode()); - return result; - } - - /* (non-Javadoc) - * @see java.lang.Object#equals(java.lang.Object) - */ - @Override - public boolean equals(Object obj) { - if (this == obj) - return true; - if (obj == null) - return false; - if (!(obj instanceof yacyURL)) - return false; - yacyURL other = (yacyURL) obj; - if (host == null) { - if (other.host != null) - return false; - } else if (!host.equals(other.host)) - return false; - if (path == null) { - if (other.path != null) - return false; - } else if (!path.equals(other.path)) - return false; - if (port != other.port) - return false; - if (protocol == null) { - if (other.protocol != null) - return false; - } else if (!protocol.equals(other.protocol)) - return false; - if (quest == null) { - if (other.quest != null) - return false; - } else if (!quest.equals(other.quest)) - return false; - if (ref == null) { - if (other.ref != null) - return false; - } else if (!ref.equals(other.ref)) - return false; - if (userInfo == null) { - if (other.userInfo != null) - return false; - } else if (!userInfo.equals(other.userInfo)) - return false; - return true; - } - - public int compareTo(final Object h) { + * @see java.lang.Object#hashCode() + */ + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + ((host == null) ? 0 : host.hashCode()); + result = prime * result + ((path == null) ? 0 : path.hashCode()); + result = prime * result + port; + result = prime * result + + ((protocol == null) ? 0 : protocol.hashCode()); + result = prime * result + ((quest == null) ? 0 : quest.hashCode()); + result = prime * result + ((ref == null) ? 0 : ref.hashCode()); + result = prime * result + + ((userInfo == null) ? 0 : userInfo.hashCode()); + return result; + } + + /* (non-Javadoc) + * @see java.lang.Object#equals(java.lang.Object) + */ + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (!(obj instanceof yacyURL)) + return false; + yacyURL other = (yacyURL) obj; + if (host == null) { + if (other.host != null) + return false; + } else if (!host.equals(other.host)) + return false; + if (path == null) { + if (other.path != null) + return false; + } else if (!path.equals(other.path)) + return false; + if (port != other.port) + return false; + if (protocol == null) { + if (other.protocol != null) + return false; + } else if (!protocol.equals(other.protocol)) + return false; + if (quest == null) { + if (other.quest != null) + return false; + } else if (!quest.equals(other.quest)) + return false; + if (ref == null) { + if (other.ref != null) + return false; + } else if (!ref.equals(other.ref)) + return false; + if (userInfo == null) { + if (other.userInfo != null) + return false; + } else if (!userInfo.equals(other.userInfo)) + return false; + return true; + } + + public int compareTo(final Object h) { assert (h instanceof yacyURL); return this.toString().compareTo(((yacyURL) h).toString()); } public boolean isPOST() { - return (this.quest != null) && (this.quest.length() > 0); + return (this.quest != null) && (this.quest.length() > 0); } public final boolean isCGI() { @@ -741,7 +798,7 @@ public class yacyURL implements Serializable { final int id = serverDomains.getDomainID(this.host); // id=7: tld is local final boolean isHTTP = this.protocol.equals("http"); - int p = this.host.lastIndexOf('.'); + int p = (host == null) ? -1 : this.host.lastIndexOf('.'); String dom = (p > 0) ? dom = host.substring(0, p) : ""; p = dom.lastIndexOf('.'); // locate subdomain String subdom = ""; @@ -797,7 +854,7 @@ public class yacyURL implements Serializable { } private static final String hosthash5(final String protocol, final String host, final int port) { - return Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(protocol + ":" + host + ":" + port)).substring(0, 5); + return Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(protocol + ((host == null) ? "" : (":" + host + ":" + port)))).substring(0, 5); } /** @@ -902,7 +959,7 @@ public class yacyURL implements Serializable { if (this.hash == null) { if (this.host.startsWith("127.") || this.host.equals("localhost") || this.host.startsWith("0:0:0:0:0:0:0:1")) return true; synchronized (this) { - if (this.hash == null) this.hash = urlHashComputation(); + if (this.hash == null) this.hash = urlHashComputation(); } } //if (domDomain(this.hash) != 7) System.out.println("*** DEBUG - not local: " + this.toNormalform(true, false)); @@ -916,6 +973,7 @@ public class yacyURL implements Serializable { // language calculation public final String language() { String language = "en"; + if (host == null) return language; final int pos = host.lastIndexOf("."); if (pos > 0 && host.length() - pos == 3) language = host.substring(pos + 1).toLowerCase(); if (language.equals("uk")) language = "en"; @@ -924,36 +982,42 @@ public class yacyURL implements Serializable { public static void main(final String[] args) { final String[][] test = new String[][]{ + new String[]{null, "file://C:WINDOWS\\CMD0.EXE"}, + new String[]{null, "file:/bin/yacy1"}, // file:/// may have many '/' if the host is omitted and the path starts with '/' + new String[]{null, "file:///bin/yacy2"}, // file:/// may have many '/' if the host is omitted and the path starts with '/' + new String[]{null, "file:C:WINDOWS\\CMD.EXE"}, + new String[]{null, "file:///C:WINDOWS\\CMD1.EXE"}, + new String[]{null, "file:///C|WINDOWS\\CMD2.EXE"}, new String[]{null, "http://www.anomic.de/test/"}, new String[]{null, "http://www.anomic.de/"}, new String[]{null, "http://www.anomic.de"}, new String[]{null, "http://www.anomic.de/home/test?x=1#home"}, new String[]{null, "http://www.anomic.de/home/test?x=1"}, - new String[]{null, "http://www.anomic.de/home/test#home"}, - new String[]{null, "ftp://ftp.anomic.de/home/test#home"}, - new String[]{null, "http://www.anomic.de/home/../abc/"}, - new String[]{null, "mailto:abcdefg@nomailnomail.com"}, - new String[]{"http://www.anomic.de/home", "test"}, - new String[]{"http://www.anomic.de/home", "test/"}, - new String[]{"http://www.anomic.de/home/", "test"}, - new String[]{"http://www.anomic.de/home/", "test/"}, - new String[]{"http://www.anomic.de/home/index.html", "test.htm"}, - new String[]{"http://www.anomic.de/home/index.html", "http://www.yacy.net/test"}, - new String[]{"http://www.anomic.de/home/index.html", "ftp://ftp.yacy.net/test"}, - new String[]{"http://www.anomic.de/home/index.html", "../test"}, - new String[]{"http://www.anomic.de/home/index.html", "mailto:abcdefg@nomailnomail.com"}, - new String[]{null, "news:de.test"}, - new String[]{"http://www.anomic.de/home", "news:de.test"}, - new String[]{null, "mailto:bob@web.com"}, - new String[]{"http://www.anomic.de/home", "mailto:bob@web.com"}, - new String[]{"http://www.anomic.de/home", "ftp://ftp.anomic.de/src"}, - new String[]{null, "ftp://ftp.delegate.org/"}, - new String[]{"http://www.anomic.de/home", "ftp://ftp.delegate.org/"}, - new String[]{"http://www.anomic.de","mailto:yacy@weltherrschaft.org"}, - new String[]{"http://www.anomic.de","javascipt:temp"}, - new String[]{null,"http://yacy-websuche.de/wiki/index.php?title=De:IntroInformationFreedom&action=history"}, - new String[]{null, "http://diskusjion.no/index.php?s=5bad5f431a106d9a8355429b81bb0ca5&showuser=23585"}, - new String[]{null, "http://diskusjion.no/index.php?s=5bad5f431a106d9a8355429b81bb0ca5&showuser=23585"} + new String[]{null, "http://www.anomic.de/home/test#home"}, + new String[]{null, "ftp://ftp.anomic.de/home/test#home"}, + new String[]{null, "http://www.anomic.de/home/../abc/"}, + new String[]{null, "mailto:abcdefg@nomailnomail.com"}, + new String[]{"http://www.anomic.de/home", "test"}, + new String[]{"http://www.anomic.de/home", "test/"}, + new String[]{"http://www.anomic.de/home/", "test"}, + new String[]{"http://www.anomic.de/home/", "test/"}, + new String[]{"http://www.anomic.de/home/index.html", "test.htm"}, + new String[]{"http://www.anomic.de/home/index.html", "http://www.yacy.net/test"}, + new String[]{"http://www.anomic.de/home/index.html", "ftp://ftp.yacy.net/test"}, + new String[]{"http://www.anomic.de/home/index.html", "../test"}, + new String[]{"http://www.anomic.de/home/index.html", "mailto:abcdefg@nomailnomail.com"}, + new String[]{null, "news:de.test"}, + new String[]{"http://www.anomic.de/home", "news:de.test"}, + new String[]{null, "mailto:bob@web.com"}, + new String[]{"http://www.anomic.de/home", "mailto:bob@web.com"}, + new String[]{"http://www.anomic.de/home", "ftp://ftp.anomic.de/src"}, + new String[]{null, "ftp://ftp.delegate.org/"}, + new String[]{"http://www.anomic.de/home", "ftp://ftp.delegate.org/"}, + new String[]{"http://www.anomic.de","mailto:yacy@weltherrschaft.org"}, + new String[]{"http://www.anomic.de","javascipt:temp"}, + new String[]{null,"http://yacy-websuche.de/wiki/index.php?title=De:IntroInformationFreedom&action=history"}, + new String[]{null, "http://diskusjion.no/index.php?s=5bad5f431a106d9a8355429b81bb0ca5&showuser=23585"}, + new String[]{null, "http://diskusjion.no/index.php?s=5bad5f431a106d9a8355429b81bb0ca5&showuser=23585"} }; String environment, url; yacyURL aURL, aURL1; @@ -961,7 +1025,7 @@ public class yacyURL implements Serializable { for (int i = 0; i < test.length; i++) { environment = test[i][0]; url = test[i][1]; - try {aURL = yacyURL.newURL(environment, url);} catch (final MalformedURLException e) {aURL = null;} + try {aURL = yacyURL.newURL(environment, url);} catch (final MalformedURLException e) {e.printStackTrace(); aURL = null;} if (aURL != null) System.out.println("normalized: " + aURL.toNormalform(true, true)); if (environment == null) { try {jURL = new java.net.URL(url);} catch (final MalformedURLException e) {jURL = null;}