added a convenience class to add files into a YaCy index

To make this possible, the yacyURL class must be able to process file:// URLs, which has also been implemented. Testing of the new class resulted in some bugfixes in other classes. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6313 6c8d7289-2bf4-0310-a012-ef5d649a1542
16 years ago · 68465c37af
parent 2e41e10ffd
commit 68465c37af
12 changed files with 403 additions and 139 deletions
--- a/source/de/anomic/document/parser/swfParser.java
+++ b/source/de/anomic/document/parser/swfParser.java
@ -27,6 +27,7 @@

 package de.anomic.document.parser;

+import java.io.IOException;
 import java.io.InputStream;
 import java.util.HashMap;
 import java.util.HashSet;
@ -80,9 +81,14 @@ public class swfParser extends AbstractParser implements Idiom {
            	contents = swf2html.convertSWFToHTML(source);
            } catch (NegativeArraySizeException e) {
                // seen in log
+                return null;
+            } catch (IOException e) {
+                e.printStackTrace();
+                return null;
            } catch (Exception e) {
            	// we have seen a lot of OOM errors in the parser...
            	e.printStackTrace();
+            	return null;
            }
            String url = null;
            String urlnr = null;
--- a/source/de/anomic/kelondro/table/SplitTable.java
+++ b/source/de/anomic/kelondro/table/SplitTable.java
@ -283,6 +283,7 @@ public class SplitTable implements ObjectIndex {
    private ObjectIndex checkTable(ObjectIndex table) {
        // check size and age of given table; in case it is too large or too old
        // create a new table
+        assert table != null;
        String name = new File(table.filename()).getName();
        long d;
        try {
--- a/source/de/anomic/kelondro/text/DocumentIndex.java
+++ b/source/de/anomic/kelondro/text/DocumentIndex.java
@ -0,0 +1,192 @@
+// DocumentIndex.java
+// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
+// first published 14.09.2009 on http://yacy.net;
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// $LastChangedDate: 2009-05-28 01:51:34 +0200 (Do, 28 Mai 2009) $
+// $LastChangedRevision: 5988 $
+// $LastChangedBy: orbiter $
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+
+package de.anomic.kelondro.text;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Date;
+
+import de.anomic.document.Condenser;
+import de.anomic.document.Document;
+import de.anomic.document.Parser;
+import de.anomic.document.ParserException;
+import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
+import de.anomic.search.QueryParams;
+import de.anomic.search.RankingProfile;
+import de.anomic.search.ResultEntry;
+import de.anomic.search.SearchEvent;
+import de.anomic.search.SearchEventCache;
+import de.anomic.yacy.yacyURL;
+import de.anomic.yacy.logging.Log;
+
+/**
+ * Convenience class to access the yacycore library from outside of YaCy:
+ * it puts local files into a search index and runs full-text searches on that index.
+ *
+ * @author Michael Christen
+ */
+public class DocumentIndex extends Segment {
+
+    // ranking profile that is handed to every query created in find()
+    private final RankingProfile textRankingDefault = new RankingProfile(QueryParams.CONTENTDOM_TEXT);
+    //private Bitfield zeroConstraint = new Bitfield(4);
+
+    /**
+     * open or create an index at the given location
+     * @param log the logger that is passed on to the Segment
+     * @param segmentPath the path that is passed on to the Segment
+     * @throws IOException if the Segment constructor fails
+     */
+    public DocumentIndex(Log log, final File segmentPath) throws IOException {
+        super(log, segmentPath, 100000, targetFileSize * 4 - 1, false, false);
+    }
+
+    /**
+     * open or create an index at the given location, using a new logger named "DocumentIndex"
+     * @param segmentPath the path that is passed on to the Segment
+     * @throws IOException if the Segment constructor fails
+     */
+    public DocumentIndex(final File segmentPath) throws IOException {
+        this(new Log("DocumentIndex"), segmentPath);
+    }
+
+    /**
+     * put a single file into the index
+     * @param file the document to be parsed and stored
+     * @return a metadata object that has been generated to identify the file
+     * @throws IOException in case that the file is null, is a directory, cannot be read or cannot be parsed;
+     *         a parser failure is attached as the cause of the exception
+     */
+    public URLMetadataRow add(File file) throws IOException {
+        if (file == null) throw new IOException("file = null");
+        if (file.isDirectory()) throw new IOException("file should be a document, not a path");
+        if (!file.canRead()) throw new IOException("cannot read file");
+        final yacyURL url = new yacyURL("file:" + file.getAbsolutePath());
+        Document document;
+        try {
+            document = Parser.parseSource(url, null, null, file);
+        } catch (InterruptedException e) {
+            // keep the interrupt visible for the caller instead of silently consuming it
+            Thread.currentThread().interrupt();
+            throw parseFailure(file, e);
+        } catch (ParserException e) {
+            throw parseFailure(file, e);
+        }
+        final Condenser condenser = new Condenser(document, true, true);
+        return super.storeDocument(
+                url,
+                null,
+                new Date(file.lastModified()),
+                file.length(),
+                document,
+                condenser
+                );
+    }
+
+    /**
+     * create the exception that reports a parser failure; the original exception is kept as cause
+     * (initCause is used because it is also available on older Java versions)
+     */
+    private static IOException parseFailure(final File file, final Exception cause) {
+        final IOException failure = new IOException("cannot parse " + file.toString() + ": " + cause.getMessage());
+        failure.initCause(cause);
+        return failure;
+    }
+
+    /**
+     * add a file or a directory of files to the index
+     * If the given file is a path to a directory, the complete sub-tree is indexed.
+     * Unreadable and hidden entries are skipped; files that fail to be added are
+     * reported with a stack trace and do not stop the traversal.
+     * @param start the file or directory where indexing starts
+     */
+    public void addAll(File start) {
+        assert (start != null);
+        assert (start.canRead());
+        if (!start.isDirectory()) {
+            addAndReport(start);
+            return;
+        }
+        final String[] names = start.list();
+        // File.list() returns null if the directory cannot be listed (i.e. I/O error)
+        if (names == null) return;
+        for (String name: names) {
+            final File child = new File(start, name);
+            if (!child.canRead() || child.isHidden()) continue;
+            if (child.isDirectory()) {
+                addAll(child);
+            } else {
+                addAndReport(child);
+            }
+        }
+    }
+
+    /**
+     * add a single file; a failure is printed, not thrown, so that a bulk import can continue
+     */
+    private void addAndReport(final File file) {
+        try {
+            add(file);
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+    }
+
+    /**
+     * do a full-text search of a given string and return a specific number of results
+     * @param querystring the search words
+     * @param pos the position of the first result that shall be returned
+     * @param count the number of results; this is also the length of the returned array
+     * @return an array of files that contain the given string;
+     *         an entry is null if the search event has no result at that position
+     */
+    public File[] find(String querystring, int pos, int count) {
+        QueryParams query = new QueryParams(querystring, 100, textRankingDefault, null);
+        SearchEvent se = SearchEventCache.getEvent(query, this, null, null, null, false);
+        File[] result = new File[count];
+        ResultEntry re;
+        for (int i = 0; i < count; i++) {
+            re = se.oneResult(pos + i);
+            result[i] = (re == null) ? null : re.url().getLocalFile();
+        }
+        return result;
+    }
+
+    /**
+     * find the given string and return 20 hits
+     * @param querystring the search words
+     * @return an array of 20 entries with files that contain the word; unused entries are null
+     */
+    public File[] find(String querystring) {
+        return find(querystring, 0, 20);
+    }
+
+    public static void main(String[] args) {
+        // first argument: path to segment
+        // second argument: either 'add' or 'search'
+        // third argument for 'add': the file or directory that shall be indexed
+        // third and more arguments for 'search': these are then the search words
+        //
+        // example:
+        // DocumentIndex yacyindex add test/parsertest
+        // DocumentIndex yacyindex search steht
+        System.setProperty("java.awt.headless", "true");
+        if (args.length < 3) return;
+        File segmentPath = new File(args[0]);
+        System.out.println("using index files at " + segmentPath.getAbsolutePath());
+        try {
+            final DocumentIndex di = new DocumentIndex(segmentPath);
+            try {
+                if (args[1].equals("add")) {
+                    di.addAll(new File(args[2]));
+                } else {
+                    // join the search words with a blank; without a separator
+                    // "foo bar" would be searched as the single word "foobar"
+                    final StringBuilder query = new StringBuilder();
+                    for (int i = 2; i < args.length; i++) {
+                        if (query.length() > 0) query.append(' ');
+                        query.append(args[i]);
+                    }
+                    File[] results = di.find(query.toString().trim());
+                    for (File f: results) {
+                        if (f != null) System.out.println(f.toString());
+                    }
+                }
+            } finally {
+                // close the index also if indexing or searching failed
+                di.close();
+            }
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+        //System.exit(0);
+    }
+
+}
--- a/source/de/anomic/kelondro/text/IODispatcher.java
+++ b/source/de/anomic/kelondro/text/IODispatcher.java
@ -82,7 +82,7 @@ public class IODispatcher extends Thread {
    public synchronized void dump(ReferenceContainerCache<? extends Reference> cache, File file, ReferenceContainerArray<? extends Reference> array) {
        if (dumpQueue == null || controlQueue == null || !this.isAlive()) {
            Log.logWarning("IODispatcher", "emergency dump of file " + file.getName());
-            cache.dump(file, (int) Math.min(MemoryControl.available() / 3, writeBufferSize));
+             if (cache.size() > 0) cache.dump(file, (int) Math.min(MemoryControl.available() / 3, writeBufferSize));
        } else {
            DumpJob<? extends Reference> job = (DumpJob<? extends Reference>)new DumpJob(cache, file, array);
            try {
@ -204,7 +204,7 @@ public class IODispatcher extends Thread {
        }
        public void dump() {
            try {
-                cache.dump(file, (int) Math.min(MemoryControl.available() / 3, writeBufferSize));
+                if (cache.size() > 0) cache.dump(file, (int) Math.min(MemoryControl.available() / 3, writeBufferSize));
                array.mountBLOBFile(file);
            } catch (IOException e) {
                e.printStackTrace();
--- a/source/de/anomic/kelondro/text/IndexCell.java
+++ b/source/de/anomic/kelondro/text/IndexCell.java
@ -283,7 +283,7 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
     * and is composed of the current date and the cell salt
     */
    public synchronized void close() {
-        this.ram.dump(this.array.newContainerBLOBFile(), (int) Math.min(MemoryControl.available() / 3, writeBufferSize));
+        if (this.ram.size() > 0) this.ram.dump(this.array.newContainerBLOBFile(), (int) Math.min(MemoryControl.available() / 3, writeBufferSize));
        // close all
        this.ram.close();
        this.array.close();
--- a/source/de/anomic/kelondro/text/Segment.java
+++ b/source/de/anomic/kelondro/text/Segment.java
@ -1,5 +1,5 @@
 // Segment.java
-// (C) 2005-209 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
+// (C) 2005-2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
 // first published 2005 on http://yacy.net; full redesign for segments 28.5.2009
 //
 // This is a part of YaCy, a peer-to-peer based web search engine
@ -45,7 +45,6 @@ import de.anomic.kelondro.order.ByteOrder;
 import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
 import de.anomic.kelondro.text.navigationPrototype.NavigationReference;
 import de.anomic.kelondro.text.navigationPrototype.NavigationReferenceFactory;
-import de.anomic.kelondro.text.navigationPrototype.NavigationReferenceRow;
 import de.anomic.kelondro.text.referencePrototype.WordReference;
 import de.anomic.kelondro.text.referencePrototype.WordReferenceFactory;
 import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
@ -54,7 +53,7 @@ import de.anomic.tools.iso639;
 import de.anomic.yacy.yacyURL;
 import de.anomic.yacy.logging.Log;

-public final class Segment {
+public class Segment {

    // environment constants
    public static final long wCacheMaxAge    = 1000 * 60 * 30; // milliseconds; 30 minutes
@ -70,7 +69,7 @@ public final class Segment {
    
    private   final Log                            log;
    protected final IndexCell<WordReference>       termIndex;
-    private   final IndexCell<NavigationReference> authorNavIndex;
+    //private   final IndexCell<NavigationReference> authorNavIndex;
    protected final MetadataRepository             urlMetadata;
    private   final File                           segmentPath;
    private   final IODispatcher                   merger;
@ -100,7 +99,7 @@ public final class Segment {
                maxFileSize,
                this.merger,
                writeBufferSize);
-        
+        /*
        this.authorNavIndex = new IndexCell<NavigationReference>(
                new File(new File(segmentPath, "nav_author"), "idx"),
                navigationReferenceFactory,
@ -111,7 +110,7 @@ public final class Segment {
                maxFileSize,
                this.merger,
                writeBufferSize);
-        
+        */
        File metadatadir = new File(segmentPath, "METADATA");
        if (!metadatadir.exists()) metadatadir.mkdirs();
        
@ -221,14 +220,14 @@ public final class Segment {
        if (language == null) {
            // no statistics available, we take either the metadata (if given) or the TLD
            language = (bymetadata == null) ? url.language() : bymetadata;
-            System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + url + " FAILED, taking " + ((bymetadata == null) ? "TLD" : "metadata") + ": " + language);
+            if (log.isFine()) log.logFine("LANGUAGE-BY-STATISTICS: " + url + " FAILED, taking " + ((bymetadata == null) ? "TLD" : "metadata") + ": " + language);
        } else {
            if (bymetadata == null) {
                // two possible results: compare and report conflicts
                if (language.equals(url.language()))
-                    System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + url + " CONFIRMED - TLD IDENTICAL: " + language);
+                    if (log.isFine()) log.logFine("LANGUAGE-BY-STATISTICS: " + url + " CONFIRMED - TLD IDENTICAL: " + language);
                else {
-                    String error = "*** DEBUG LANGUAGE-BY-STATISTICS: " + url + " CONFLICTING: " + language + " (the language given by the TLD is " + url.language() + ")";
+                    String error = "LANGUAGE-BY-STATISTICS: " + url + " CONFLICTING: " + language + " (the language given by the TLD is " + url.language() + ")";
                    // see if we have a hint in the url that the statistic was right
                    String u = url.toNormalform(true, false).toLowerCase();
                    if (!u.contains("/" + language + "/") && !u.contains("/" + iso639.country(language).toLowerCase() + "/")) {
@ -243,14 +242,14 @@ public final class Segment {
            } else {
                // here we have three results: we can do a voting
                if (language.equals(bymetadata)) {
-                    //System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - METADATA IDENTICAL: " + language);
+                    //if (log.isFine()) log.logFine("LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - METADATA IDENTICAL: " + language);
                } else if (language.equals(url.language())) {
-                    //System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - TLD IS IDENTICAL: " + language);
+                    //if (log.isFine()) log.logFine("LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - TLD IS IDENTICAL: " + language);
                } else if (bymetadata.equals(url.language())) {
-                    //System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: " + language + " BUT METADATA AND TLD ARE IDENTICAL: " + bymetadata + ")");
+                    //if (log.isFine()) log.logFine("LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: " + language + " BUT METADATA AND TLD ARE IDENTICAL: " + bymetadata + ")");
                    language = bymetadata;
                } else {
-                    //System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: ALL DIFFERENT! statistic: " + language + ", metadata: " + bymetadata + ", TLD: + " + entry.url().language() + ". taking metadata.");
+                    //if (log.isFine()) log.logFine("LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: ALL DIFFERENT! statistic: " + language + ", metadata: " + bymetadata + ", TLD: + " + entry.url().language() + ". taking metadata.");
                    language = bymetadata;
                }
            }
--- a/source/de/anomic/search/ResultEntry.java
+++ b/source/de/anomic/search/ResultEntry.java
@ -69,7 +69,7 @@ public class ResultEntry {
        this.dbRetrievalTime = dbRetrievalTime;
        this.snippetComputationTime = snippetComputationTime;
        final String host = urlcomps.url().getHost();
-        if (host.endsWith(".yacyh")) {
+        if (host != null && host.endsWith(".yacyh")) {
            // translate host into current IP
            int p = host.indexOf(".");
            final String hash = yacySeed.hexHash2b64Hash(host.substring(p + 1, host.length() - 6));
--- a/source/de/anomic/search/SearchEventCache.java
+++ b/source/de/anomic/search/SearchEventCache.java
@ -75,7 +75,7 @@ public class SearchEventCache {
        
        String id = query.id(false);
        SearchEvent event = SearchEventCache.lastEvents.get(id);
-        if (Switchboard.getSwitchboard().crawlQueues.noticeURL.size() > 0 && event != null && System.currentTimeMillis() - event.getEventTime() > 60000) {
+        if (Switchboard.getSwitchboard() != null && Switchboard.getSwitchboard().crawlQueues.noticeURL.size() > 0 && event != null && System.currentTimeMillis() - event.getEventTime() > 60000) {
            // if a local crawl is ongoing, don't use the result from the cache to use possibly more results that come from the current crawl
            // to prevent that this happens during a person switches between the different result pages, a re-search happens no more than
            // once a minute
--- a/source/de/anomic/search/Switchboard.java
+++ b/source/de/anomic/search/Switchboard.java
@ -208,9 +208,9 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
    public  static long lastPPMUpdate        = System.currentTimeMillis()- 30000;

    // colored list management
-    public static TreeSet<String> badwords       = null;
+    public static TreeSet<String> badwords       = new TreeSet<String>();
+    public static TreeSet<String> stopwords      = new TreeSet<String>();    
    public static TreeSet<String> blueList       = null;
-    public static TreeSet<String> stopwords      = null;    
    public static TreeSet<byte[]> badwordHashes  = null;
    public static TreeSet<byte[]> blueListHashes = null;
    public static TreeSet<byte[]> stopwordHashes = null;    
--- a/source/de/anomic/server/serverDomains.java
+++ b/source/de/anomic/server/serverDomains.java
@ -530,6 +530,7 @@ public class serverDomains {
    }
    
    public static int getDomainID(final String host) {
+        if (host == null) return TLD_Local_ID;
        final int p = host.lastIndexOf('.');
        String tld = "";
        if (p > 0) {
--- a/source/de/anomic/tools/Punycode.java
+++ b/source/de/anomic/tools/Punycode.java
@ -223,6 +223,7 @@ public class Punycode

  // the following method has been added by Michael Christen
  public static boolean isBasic(final String input) {
+      if (input == null) return true;
      for (int j = 0; j < input.length(); j++) {
          if (!isBasic(input.charAt(j))) return false;
      }
--- a/source/de/anomic/yacy/yacyURL.java
+++ b/source/de/anomic/yacy/yacyURL.java
@ -67,8 +67,12 @@ public class yacyURL implements Serializable {
        return (url == null) ? null : url.hash().substring(6);
    }
    
+    /**
+     * create a url object for a local file
+     * This delegates to the four-argument constructor and passes "file", an empty string, -1
+     * and the absolute path of the given file.
+     * NOTE(review): the four arguments are presumably protocol, host, port and path - confirm against that constructor
+     * @param file the local file; must not be null because getAbsolutePath() is called on it
+     * @throws MalformedURLException presumably raised by the delegated constructor - TODO confirm
+     */
+    public yacyURL(final File file) throws MalformedURLException {
+        this("file", "", -1, file.getAbsolutePath());
+    }
+
    public yacyURL(final String url) throws MalformedURLException {
-	this(url, null);
+        this(url, null);
    }
    
    public yacyURL(final String url, final String hash) throws MalformedURLException {
@ -89,7 +93,7 @@ public class yacyURL implements Serializable {
        }
        this.protocol = url.substring(0, p).toLowerCase().trim();
        if (url.length() < p + 4) throw new MalformedURLException("URL not parseable: '" + url + "'");
-        if (url.substring(p + 1, p + 3).equals("//")) {
+        if (!this.protocol.equals("file") && url.substring(p + 1, p + 3).equals("//")) {
            // identify host, userInfo and file for http and ftp protocol
            final int q = url.indexOf('/', p + 3);
            int r;
@ -112,7 +116,7 @@ public class yacyURL implements Serializable {
                }
                path = url.substring(q);
            }
-            if (host.length() < 4) throw new MalformedURLException("host too short: '" + host + "'");
+            if (host.length() < 4 && !protocol.equals("file")) throw new MalformedURLException("host too short: '" + host + "'");
            if (host.indexOf('&') >= 0) throw new MalformedURLException("invalid '&' in host");
            path = resolveBackpath(path);
            identPort(url, (protocol.equals("http") ? 80 : ((protocol.equals("https")) ? 443 : ((protocol.equals("ftp")) ? 21 : -1))));
@ -133,6 +137,46 @@ public class yacyURL implements Serializable {
                port = -1;
                quest = null;
                ref = null;
+            } if (protocol.equals("file")) {
+                // parse file url
+                String h = url.substring(p + 1);
+                if (h.startsWith("//")) {
+                    // host may be given, but may be also empty
+                    final int q = h.indexOf('/', 2);
+                    if (q <= 0) {
+                        // no host given
+                        host = null;
+                        path = h.substring(2);
+                    } else {
+                        host = h.substring(2, q);
+                        if (host.length() == 0 || host.equals("localhost")) host = null;
+                        h = h.substring(q);
+                        char c = h.charAt(2);
+                        if (c == ':' || c == '|')
+                            path = h.substring(1);
+                        else
+                            path = h;
+                    }
+                } else {
+                    host = null;
+                    if (h.startsWith("/")) {
+                        char c = h.charAt(2);
+                        if (c == ':' || c == '|')
+                            path = h.substring(1);
+                        else
+                            path = h;
+                    } else {
+                        char c = h.charAt(1);
+                        if (c == ':' || c == '|')
+                            path = h;
+                        else
+                            path = "/" + h;
+                    }
+                }
+                userInfo = null;
+                port = -1;
+                quest = null;
+                ref = null;
            } else {
                throw new MalformedURLException("unknown protocol: " + url);
            }
@ -144,24 +188,20 @@ public class yacyURL implements Serializable {
            StringBuilder buffer = new StringBuilder();
            // encode each domainpart seperately
            for(int i=0; i<domainParts.length; i++) {
-        	final String part = domainParts[i];
-        	if(!Punycode.isBasic(part)) {
-        	    buffer.append("xn--" + Punycode.encode(part));
-        	} else {
-        	    buffer.append(part);
-        	}
-        	if(i != domainParts.length-1) {
-        	    buffer.append('.');
-        	}
+            final String part = domainParts[i];
+            if(!Punycode.isBasic(part)) {
+                buffer.append("xn--" + Punycode.encode(part));
+            } else {
+                buffer.append(part);
+            }
+            if(i != domainParts.length-1) {
+                buffer.append('.');
+            }
            }
            host = buffer.toString();
        } catch (final PunycodeException e) {}
    }

-    public yacyURL(final File file) throws MalformedURLException {
-        this("file", "", -1, file.getAbsolutePath());
-    }
-
    public static yacyURL newURL(final String baseURL, final String relPath) throws MalformedURLException {
        if ((baseURL == null) ||
            (relPath.startsWith("http://")) ||
@ -212,8 +252,8 @@ public class yacyURL implements Serializable {
                (relPath.startsWith("smb://"))) {
            this.path = baseURL.path;
        } else if (relPath.contains(":") && patternMail.matcher(relPath.toLowerCase()).find()) { // discards also any unknown protocol from previous if
-    		throw new MalformedURLException("relative path malformed: " + relPath);
-    	} else if (relPath.startsWith("/")) {
+            throw new MalformedURLException("relative path malformed: " + relPath);
+        } else if (relPath.startsWith("/")) {
            this.path = relPath;
        } else if (baseURL.path.endsWith("/")) {
            if (relPath.startsWith("#") || relPath.startsWith("?")) {
@ -315,7 +355,7 @@ public class yacyURL implements Serializable {
                qtmp.append('=');
                qtmp.append(escape(questp[i].substring(questp[i].indexOf('=') + 1)));
            } else {
-            	qtmp.append('&');
+                qtmp.append('&');
                qtmp.append(escape(questp[i]));
            }
        }
@ -541,6 +581,20 @@ public class yacyURL implements Serializable {
        return path;
    }

+    /**
+     * return the file object to a local file
+     * this patches also 'strange' windows file paths:
+     * "c:/dir", "c|/dir", "/c:/dir" and "/c|/dir" are all turned into "c:\dir".
+     * Any other path, including paths that are too short to carry a drive letter
+     * (like "/" or "/a"), is used unchanged.
+     * @return the file that is addressed by the path of this url
+     */
+    public File getLocalFile() {
+        final int length = path.length();
+        if (length > 1) {
+            // drive letter at the beginning of the path: "c:..." or "c|..."
+            final char second = path.charAt(1);
+            if (second == ':') return new File(path.replace('/', '\\'));
+            if (second == '|') return new File(path.charAt(0) + ":" + path.substring(2).replace('/', '\\'));
+        }
+        if (length > 2) {
+            // drive letter after a leading slash: "/c:..." or "/c|..."
+            final char third = path.charAt(2);
+            if (third == ':' || third == '|') return new File(path.charAt(1) + ":" + path.substring(3).replace('/', '\\'));
+        }
+        return new File(path);
+    }
+
    public String getAuthority() {
        return ((port >= 0) && (host != null)) ? host + ":" + port : ((host != null) ? host : "");
    }
@ -562,7 +616,7 @@ public class yacyURL implements Serializable {
    }

    public void removeRef() {
-    	ref = null;
+        ref = null;
    }
    
    public String getUserInfo() {
@ -596,13 +650,16 @@ public class yacyURL implements Serializable {
            if (this.port < 0 || this.port == 21)  { defaultPort = true; }
        } else if (this.protocol.equals("https")) {
            if (this.port < 0 || this.port == 443) { defaultPort = true; }
+        } else if (this.protocol.equals("file")) {
+            defaultPort = true;
        }
        final String path = this.getFile(includeReference);
        
        if (defaultPort) {
-            return this.protocol + "://" +
-                   ((this.userInfo != null) ? (this.userInfo + "@") : ("")) +
-                   this.getHost().toLowerCase() + path;
+            return
+              this.protocol + ":" +
+              ((this.getHost() == null) ? "" : "//" + ((this.userInfo != null) ? (this.userInfo + "@") : ("")) + this.getHost().toLowerCase()) +
+              path;
        }
        return this.protocol + "://" +
               ((this.userInfo != null) ? (this.userInfo + "@") : ("")) +
@ -610,78 +667,78 @@ public class yacyURL implements Serializable {
    }
    
    /* (non-Javadoc)
-	 * @see java.lang.Object#hashCode()
-	 */
-	@Override
-	public int hashCode() {
-		final int prime = 31;
-		int result = 1;
-		result = prime * result + ((host == null) ? 0 : host.hashCode());
-		result = prime * result + ((path == null) ? 0 : path.hashCode());
-		result = prime * result + port;
-		result = prime * result
-				+ ((protocol == null) ? 0 : protocol.hashCode());
-		result = prime * result + ((quest == null) ? 0 : quest.hashCode());
-		result = prime * result + ((ref == null) ? 0 : ref.hashCode());
-		result = prime * result
-				+ ((userInfo == null) ? 0 : userInfo.hashCode());
-		return result;
-	}
-
-	/* (non-Javadoc)
-	 * @see java.lang.Object#equals(java.lang.Object)
-	 */
-	@Override
-	public boolean equals(Object obj) {
-		if (this == obj)
-			return true;
-		if (obj == null)
-			return false;
-		if (!(obj instanceof yacyURL))
-			return false;
-		yacyURL other = (yacyURL) obj;
-		if (host == null) {
-			if (other.host != null)
-				return false;
-		} else if (!host.equals(other.host))
-			return false;
-		if (path == null) {
-			if (other.path != null)
-				return false;
-		} else if (!path.equals(other.path))
-			return false;
-		if (port != other.port)
-			return false;
-		if (protocol == null) {
-			if (other.protocol != null)
-				return false;
-		} else if (!protocol.equals(other.protocol))
-			return false;
-		if (quest == null) {
-			if (other.quest != null)
-				return false;
-		} else if (!quest.equals(other.quest))
-			return false;
-		if (ref == null) {
-			if (other.ref != null)
-				return false;
-		} else if (!ref.equals(other.ref))
-			return false;
-		if (userInfo == null) {
-			if (other.userInfo != null)
-				return false;
-		} else if (!userInfo.equals(other.userInfo))
-			return false;
-		return true;
-	}
-
-	public int compareTo(final Object h) {
+     * @see java.lang.Object#hashCode()
+     */
+    @Override
+    public int hashCode() {
+        // the fields are combined in the order host, path, port, protocol, quest, ref, userInfo;
+        // a field that is null contributes 0
+        final int[] parts = new int[] {
+            (host != null) ? host.hashCode() : 0,
+            (path != null) ? path.hashCode() : 0,
+            port,
+            (protocol != null) ? protocol.hashCode() : 0,
+            (quest != null) ? quest.hashCode() : 0,
+            (ref != null) ? ref.hashCode() : 0,
+            (userInfo != null) ? userInfo.hashCode() : 0
+        };
+        int code = 1;
+        for (int part: parts) {
+            code = 31 * code + part;
+        }
+        return code;
+    }
+
+    /**
+     * Two yacyURL objects are equal if host, path, port, protocol, quest, ref
+     * and userInfo are equal; a field that is null matches only a null field.
+     * @see java.lang.Object#equals(java.lang.Object)
+     */
+    @Override
+    public boolean equals(Object obj) {
+        if (obj == this) return true;
+        // instanceof is false for null, therefore a null argument is rejected here as well
+        if (!(obj instanceof yacyURL)) return false;
+        final yacyURL that = (yacyURL) obj;
+        return ((host == null) ? that.host == null : host.equals(that.host))
+            && ((path == null) ? that.path == null : path.equals(that.path))
+            && (port == that.port)
+            && ((protocol == null) ? that.protocol == null : protocol.equals(that.protocol))
+            && ((quest == null) ? that.quest == null : quest.equals(that.quest))
+            && ((ref == null) ? that.ref == null : ref.equals(that.ref))
+            && ((userInfo == null) ? that.userInfo == null : userInfo.equals(that.userInfo));
+    }
+
+    public int compareTo(final Object h) {
        assert (h instanceof yacyURL);
        return this.toString().compareTo(((yacyURL) h).toString());
    }
    
    public boolean isPOST() {
-    	return (this.quest != null) && (this.quest.length() > 0);
+        return (this.quest != null) && (this.quest.length() > 0);
    }

    public final boolean isCGI() {
@ -741,7 +798,7 @@ public class yacyURL implements Serializable {

        final int id = serverDomains.getDomainID(this.host); // id=7: tld is local
        final boolean isHTTP = this.protocol.equals("http");
-        int p = this.host.lastIndexOf('.');
+        int p = (host == null) ? -1 : this.host.lastIndexOf('.');
        String dom = (p > 0) ? dom = host.substring(0, p) : "";
        p = dom.lastIndexOf('.'); // locate subdomain
        String subdom = "";
@ -797,7 +854,7 @@ public class yacyURL implements Serializable {
    }

    /**
     * Derives a five-character hash for a protocol/host/port combination.
     * The input string is passed to Digest.encodeMD5Raw, the result is encoded
     * with Base64Order.enhancedCoder, and the first five characters are kept.
     * In the added line the input is the protocol alone when host is null,
     * otherwise "protocol:host:port"; the removed line always concatenated
     * host and port, even for a null host.
     *
     * @param protocol the protocol part of the URL
     * @param host the host name, may be null
     * @param port the port number; ignored when host is null
     * @return the first five characters of the encoded digest
     */
    private static final String hosthash5(final String protocol, final String host, final int port) {
-        return Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(protocol + ":" + host + ":" + port)).substring(0, 5);
+        return Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(protocol + ((host == null) ? "" : (":" + host + ":" + port)))).substring(0, 5);
    }
    
    /**
@ -902,7 +959,7 @@ public class yacyURL implements Serializable {
        if (this.hash == null) {
            if (this.host.startsWith("127.") || this.host.equals("localhost") || this.host.startsWith("0:0:0:0:0:0:0:1")) return true;
            synchronized (this) {
-            	if (this.hash == null) this.hash = urlHashComputation();
+                if (this.hash == null) this.hash = urlHashComputation();
            }
        }
        //if (domDomain(this.hash) != 7) System.out.println("*** DEBUG - not local: " + this.toNormalform(true, false));
@ -916,6 +973,7 @@ public class yacyURL implements Serializable {
    // language calculation
    public final String language() {
        String language = "en";
+        if (host == null) return language;
        final int pos = host.lastIndexOf(".");
        if (pos > 0 && host.length() - pos == 3) language = host.substring(pos + 1).toLowerCase();
        if (language.equals("uk")) language = "en";
@ -924,36 +982,42 @@ public class yacyURL implements Serializable {

    public static void main(final String[] args) {
        final String[][] test = new String[][]{
+          new String[]{null, "file://C:WINDOWS\\CMD0.EXE"},
+          new String[]{null, "file:/bin/yacy1"}, // file://<host>/<path> may have many '/' if the host is omitted and the path starts with '/'
+          new String[]{null, "file:///bin/yacy2"}, // file://<host>/<path> may have many '/' if the host is omitted and the path starts with '/'
+          new String[]{null, "file:C:WINDOWS\\CMD.EXE"},
+          new String[]{null, "file:///C:WINDOWS\\CMD1.EXE"},
+          new String[]{null, "file:///C|WINDOWS\\CMD2.EXE"},
          new String[]{null, "http://www.anomic.de/test/"},
          new String[]{null, "http://www.anomic.de/"},
          new String[]{null, "http://www.anomic.de"},
          new String[]{null, "http://www.anomic.de/home/test?x=1#home"},
          new String[]{null, "http://www.anomic.de/home/test?x=1"},
-	      new String[]{null, "http://www.anomic.de/home/test#home"},
-	      new String[]{null, "ftp://ftp.anomic.de/home/test#home"},
-	      new String[]{null, "http://www.anomic.de/home/../abc/"},
-	      new String[]{null, "mailto:abcdefg@nomailnomail.com"},
-	      new String[]{"http://www.anomic.de/home", "test"},
-	      new String[]{"http://www.anomic.de/home", "test/"},
-	      new String[]{"http://www.anomic.de/home/", "test"},
-	      new String[]{"http://www.anomic.de/home/", "test/"},
-	      new String[]{"http://www.anomic.de/home/index.html", "test.htm"},
-	      new String[]{"http://www.anomic.de/home/index.html", "http://www.yacy.net/test"},
-	      new String[]{"http://www.anomic.de/home/index.html", "ftp://ftp.yacy.net/test"},
-	      new String[]{"http://www.anomic.de/home/index.html", "../test"},
-	      new String[]{"http://www.anomic.de/home/index.html", "mailto:abcdefg@nomailnomail.com"},
-	      new String[]{null, "news:de.test"},
-	      new String[]{"http://www.anomic.de/home", "news:de.test"},
-	      new String[]{null, "mailto:bob@web.com"},
-	      new String[]{"http://www.anomic.de/home", "mailto:bob@web.com"},
-	      new String[]{"http://www.anomic.de/home", "ftp://ftp.anomic.de/src"},
-	      new String[]{null, "ftp://ftp.delegate.org/"},
-	      new String[]{"http://www.anomic.de/home", "ftp://ftp.delegate.org/"},
-	      new String[]{"http://www.anomic.de","mailto:yacy@weltherrschaft.org"},
-	      new String[]{"http://www.anomic.de","javascipt:temp"},
-	      new String[]{null,"http://yacy-websuche.de/wiki/index.php?title=De:IntroInformationFreedom&action=history"},
-	      new String[]{null, "http://diskusjion.no/index.php?s=5bad5f431a106d9a8355429b81bb0ca5&showuser=23585"},
-	      new String[]{null, "http://diskusjion.no/index.php?s=5bad5f431a106d9a8355429b81bb0ca5&amp;showuser=23585"}
+          new String[]{null, "http://www.anomic.de/home/test#home"},
+          new String[]{null, "ftp://ftp.anomic.de/home/test#home"},
+          new String[]{null, "http://www.anomic.de/home/../abc/"},
+          new String[]{null, "mailto:abcdefg@nomailnomail.com"},
+          new String[]{"http://www.anomic.de/home", "test"},
+          new String[]{"http://www.anomic.de/home", "test/"},
+          new String[]{"http://www.anomic.de/home/", "test"},
+          new String[]{"http://www.anomic.de/home/", "test/"},
+          new String[]{"http://www.anomic.de/home/index.html", "test.htm"},
+          new String[]{"http://www.anomic.de/home/index.html", "http://www.yacy.net/test"},
+          new String[]{"http://www.anomic.de/home/index.html", "ftp://ftp.yacy.net/test"},
+          new String[]{"http://www.anomic.de/home/index.html", "../test"},
+          new String[]{"http://www.anomic.de/home/index.html", "mailto:abcdefg@nomailnomail.com"},
+          new String[]{null, "news:de.test"},
+          new String[]{"http://www.anomic.de/home", "news:de.test"},
+          new String[]{null, "mailto:bob@web.com"},
+          new String[]{"http://www.anomic.de/home", "mailto:bob@web.com"},
+          new String[]{"http://www.anomic.de/home", "ftp://ftp.anomic.de/src"},
+          new String[]{null, "ftp://ftp.delegate.org/"},
+          new String[]{"http://www.anomic.de/home", "ftp://ftp.delegate.org/"},
+          new String[]{"http://www.anomic.de","mailto:yacy@weltherrschaft.org"},
+          new String[]{"http://www.anomic.de","javascipt:temp"},
+          new String[]{null,"http://yacy-websuche.de/wiki/index.php?title=De:IntroInformationFreedom&action=history"},
+          new String[]{null, "http://diskusjion.no/index.php?s=5bad5f431a106d9a8355429b81bb0ca5&showuser=23585"},
+          new String[]{null, "http://diskusjion.no/index.php?s=5bad5f431a106d9a8355429b81bb0ca5&amp;showuser=23585"}
          };
        String environment, url;
        yacyURL aURL, aURL1;
@ -961,7 +1025,7 @@ public class yacyURL implements Serializable {
        for (int i = 0; i < test.length; i++) {
            environment = test[i][0];
            url = test[i][1];
-            try {aURL = yacyURL.newURL(environment, url);} catch (final MalformedURLException e) {aURL = null;}
+            try {aURL = yacyURL.newURL(environment, url);} catch (final MalformedURLException e) {e.printStackTrace(); aURL = null;}
            if (aURL != null) System.out.println("normalized: " + aURL.toNormalform(true, true));
            if (environment == null) {
                try {jURL = new java.net.URL(url);} catch (final MalformedURLException e) {jURL = null;}