added a convenience class to add files into a YaCy index

to make this possible, the yacyURL must be able to process file:// urls, which has also been implemented
testing of the new class resulted in some bugfixes in other classes

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6313 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent 2e41e10ffd
commit 68465c37af

@ -27,6 +27,7 @@
package de.anomic.document.parser; package de.anomic.document.parser;
import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet; import java.util.HashSet;
@ -80,9 +81,14 @@ public class swfParser extends AbstractParser implements Idiom {
contents = swf2html.convertSWFToHTML(source); contents = swf2html.convertSWFToHTML(source);
} catch (NegativeArraySizeException e) { } catch (NegativeArraySizeException e) {
// seen in log // seen in log
return null;
} catch (IOException e) {
e.printStackTrace();
return null;
} catch (Exception e) { } catch (Exception e) {
// we have seen a lot of OOM errors in the parser... // we have seen a lot of OOM errors in the parser...
e.printStackTrace(); e.printStackTrace();
return null;
} }
String url = null; String url = null;
String urlnr = null; String urlnr = null;

@ -283,6 +283,7 @@ public class SplitTable implements ObjectIndex {
private ObjectIndex checkTable(ObjectIndex table) { private ObjectIndex checkTable(ObjectIndex table) {
// check size and age of given table; in case it is too large or too old // check size and age of given table; in case it is too large or too old
// create a new table // create a new table
assert table != null;
String name = new File(table.filename()).getName(); String name = new File(table.filename()).getName();
long d; long d;
try { try {

@ -0,0 +1,192 @@
// DocumentIndex.java
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 14.09.2009 on http://yacy.net;
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2009-05-28 01:51:34 +0200 (Do, 28 Mai 2009) $
// $LastChangedRevision: 5988 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.kelondro.text;
import java.io.File;
import java.io.IOException;
import java.util.Date;
import de.anomic.document.Condenser;
import de.anomic.document.Document;
import de.anomic.document.Parser;
import de.anomic.document.ParserException;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.search.QueryParams;
import de.anomic.search.RankingProfile;
import de.anomic.search.ResultEntry;
import de.anomic.search.SearchEvent;
import de.anomic.search.SearchEventCache;
import de.anomic.yacy.yacyURL;
import de.anomic.yacy.logging.Log;
/**
 * Convenience class to access the yacycore library from outside of YaCy
 * in order to put local files into an index and to search that index.
 *
 * @author Michael Christen
 */
public class DocumentIndex extends Segment {

    /** ranking profile used for every query issued through {@link #find(String, int, int)} */
    private final RankingProfile textRankingDefault = new RankingProfile(QueryParams.CONTENTDOM_TEXT);

    /**
     * open (or create) a document index at the given location
     * @param log the logger that is handed over to the underlying segment
     * @param segmentPath the directory where the index files are stored
     * @throws IOException if the underlying segment cannot be opened
     */
    public DocumentIndex(Log log, final File segmentPath) throws IOException {
        super(log, segmentPath, 100000, targetFileSize * 4 - 1, false, false);
    }

    /**
     * open (or create) a document index at the given location, logging to a logger named "DocumentIndex"
     * @param segmentPath the directory where the index files are stored
     * @throws IOException if the underlying segment cannot be opened
     */
    public DocumentIndex(final File segmentPath) throws IOException {
        this(new Log("DocumentIndex"), segmentPath);
    }

    /**
     * put a single file into the index
     * @param file the document to be indexed; must be a readable, non-directory file
     * @return a metadata object that has been generated to identify the file
     * @throws IOException in case that the file does not exist or cannot be parsed
     */
    public URLMetadataRow add(File file) throws IOException {
        if (file == null) throw new IOException("file = null");
        if (file.isDirectory()) throw new IOException("file should be a document, not a path");
        if (!file.canRead()) throw new IOException("cannot read file");
        final yacyURL url = new yacyURL("file:" + file.getAbsolutePath());
        final Document document;
        try {
            document = Parser.parseSource(url, null, null, file);
        } catch (InterruptedException e) {
            // restore the interrupt status so that callers further up can still react to it
            Thread.currentThread().interrupt();
            throw parseFailure(file, e);
        } catch (ParserException e) {
            throw parseFailure(file, e);
        }
        final Condenser condenser = new Condenser(document, true, true);
        return super.storeDocument(
                url,
                null,
                new Date(file.lastModified()),
                file.length(),
                document,
                condenser
                );
    }

    /**
     * build the IOException that reports a parsing problem; the original exception is kept as cause
     * @param file the file that could not be parsed
     * @param cause the exception thrown by the parser
     * @return the exception to be thrown by the caller
     */
    private static IOException parseFailure(final File file, final Exception cause) {
        final IOException failure = new IOException("cannot parse " + file.toString() + ": " + cause.getMessage());
        failure.initCause(cause);
        return failure;
    }

    /**
     * add a file or a directory of files to the index
     * If the given file is a path to a directory, the complete sub-tree is indexed.
     * Entries inside a directory that are hidden or not readable are skipped.
     * Files that cannot be indexed are reported with a stack trace and do not stop the traversal.
     * @param start a readable file or directory
     */
    public void addAll(File start) {
        assert (start != null);
        assert (start.canRead());
        if (!start.isDirectory()) {
            try {
                add(start);
            } catch (IOException e) {
                e.printStackTrace();
            }
            return;
        }
        final String[] children = start.list();
        // File.list() returns null if the directory cannot be listed (i.e. I/O error, vanished directory)
        if (children == null) return;
        for (final String child: children) {
            final File entry = new File(start, child);
            // the recursion handles plain files as well as sub-directories
            if (entry.canRead() && !entry.isHidden()) addAll(entry);
        }
    }

    /**
     * do a full-text search of a given string and return a specific number of results
     * @param querystring the search words
     * @param pos the position of the first result to be returned
     * @param count the number of results to be returned
     * @return an array of length count with the files that contain the given string;
     *         an entry is null if there is no result at the corresponding position
     */
    public File[] find(String querystring, int pos, int count) {
        final QueryParams query = new QueryParams(querystring, 100, textRankingDefault, null);
        final SearchEvent se = SearchEventCache.getEvent(query, this, null, null, null, false);
        final File[] result = new File[count];
        for (int i = 0; i < count; i++) {
            final ResultEntry re = se.oneResult(pos + i);
            result[i] = (re == null) ? null : re.url().getLocalFile();
        }
        return result;
    }

    /**
     * find the given string and return 20 hits
     * @param querystring the search words
     * @return a list of files that contain the word
     */
    public File[] find(String querystring) {
        return find(querystring, 0, 20);
    }

    /**
     * command line access to the index
     * @param args see the comment inside of the method
     */
    public static void main(String[] args) {
        // first argument: path to segment
        // second argument: either 'add' or 'search'
        // third and more arguments exists only in case that second argument is 'search': these are then the search words
        //
        // example:
        // DocumentIndex yacyindex add test/parsertest
        // DocumentIndex yacyindex search steht
        System.setProperty("java.awt.headless", "true");
        if (args.length < 3) {
            System.err.println("usage: DocumentIndex <segment-path> add <file-or-directory>");
            System.err.println("       DocumentIndex <segment-path> search <word> [<word> ...]");
            return;
        }
        final File segmentPath = new File(args[0]);
        System.out.println("using index files at " + segmentPath.getAbsolutePath());
        try {
            final DocumentIndex di = new DocumentIndex(segmentPath);
            try {
                if (args[1].equals("add")) {
                    di.addAll(new File(args[2]));
                } else {
                    // the search words must stay separated, otherwise they melt into one single word
                    final StringBuilder query = new StringBuilder();
                    for (int i = 2; i < args.length; i++) {
                        if (i > 2) query.append(' ');
                        query.append(args[i]);
                    }
                    // String.trim() returns a new string, the result must be used
                    final File[] results = di.find(query.toString().trim());
                    for (final File f: results) {
                        if (f != null) System.out.println(f.toString());
                    }
                }
            } finally {
                // close the index even if indexing or searching failed
                di.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        //System.exit(0);
    }
}

@ -82,7 +82,7 @@ public class IODispatcher extends Thread {
public synchronized void dump(ReferenceContainerCache<? extends Reference> cache, File file, ReferenceContainerArray<? extends Reference> array) { public synchronized void dump(ReferenceContainerCache<? extends Reference> cache, File file, ReferenceContainerArray<? extends Reference> array) {
if (dumpQueue == null || controlQueue == null || !this.isAlive()) { if (dumpQueue == null || controlQueue == null || !this.isAlive()) {
Log.logWarning("IODispatcher", "emergency dump of file " + file.getName()); Log.logWarning("IODispatcher", "emergency dump of file " + file.getName());
cache.dump(file, (int) Math.min(MemoryControl.available() / 3, writeBufferSize)); if (cache.size() > 0) cache.dump(file, (int) Math.min(MemoryControl.available() / 3, writeBufferSize));
} else { } else {
DumpJob<? extends Reference> job = (DumpJob<? extends Reference>)new DumpJob(cache, file, array); DumpJob<? extends Reference> job = (DumpJob<? extends Reference>)new DumpJob(cache, file, array);
try { try {
@ -204,7 +204,7 @@ public class IODispatcher extends Thread {
} }
public void dump() { public void dump() {
try { try {
cache.dump(file, (int) Math.min(MemoryControl.available() / 3, writeBufferSize)); if (cache.size() > 0) cache.dump(file, (int) Math.min(MemoryControl.available() / 3, writeBufferSize));
array.mountBLOBFile(file); array.mountBLOBFile(file);
} catch (IOException e) { } catch (IOException e) {
e.printStackTrace(); e.printStackTrace();

@ -283,7 +283,7 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
* and is composed of the current date and the cell salt * and is composed of the current date and the cell salt
*/ */
public synchronized void close() { public synchronized void close() {
this.ram.dump(this.array.newContainerBLOBFile(), (int) Math.min(MemoryControl.available() / 3, writeBufferSize)); if (this.ram.size() > 0) this.ram.dump(this.array.newContainerBLOBFile(), (int) Math.min(MemoryControl.available() / 3, writeBufferSize));
// close all // close all
this.ram.close(); this.ram.close();
this.array.close(); this.array.close();

@ -1,5 +1,5 @@
// Segment.java // Segment.java
// (C) 2005-209 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany // (C) 2005-2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 2005 on http://yacy.net; full redesign for segments 28.5.2009 // first published 2005 on http://yacy.net; full redesign for segments 28.5.2009
// //
// This is a part of YaCy, a peer-to-peer based web search engine // This is a part of YaCy, a peer-to-peer based web search engine
@ -45,7 +45,6 @@ import de.anomic.kelondro.order.ByteOrder;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.text.navigationPrototype.NavigationReference; import de.anomic.kelondro.text.navigationPrototype.NavigationReference;
import de.anomic.kelondro.text.navigationPrototype.NavigationReferenceFactory; import de.anomic.kelondro.text.navigationPrototype.NavigationReferenceFactory;
import de.anomic.kelondro.text.navigationPrototype.NavigationReferenceRow;
import de.anomic.kelondro.text.referencePrototype.WordReference; import de.anomic.kelondro.text.referencePrototype.WordReference;
import de.anomic.kelondro.text.referencePrototype.WordReferenceFactory; import de.anomic.kelondro.text.referencePrototype.WordReferenceFactory;
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow; import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
@ -54,7 +53,7 @@ import de.anomic.tools.iso639;
import de.anomic.yacy.yacyURL; import de.anomic.yacy.yacyURL;
import de.anomic.yacy.logging.Log; import de.anomic.yacy.logging.Log;
public final class Segment { public class Segment {
// environment constants // environment constants
public static final long wCacheMaxAge = 1000 * 60 * 30; // milliseconds; 30 minutes public static final long wCacheMaxAge = 1000 * 60 * 30; // milliseconds; 30 minutes
@ -70,7 +69,7 @@ public final class Segment {
private final Log log; private final Log log;
protected final IndexCell<WordReference> termIndex; protected final IndexCell<WordReference> termIndex;
private final IndexCell<NavigationReference> authorNavIndex; //private final IndexCell<NavigationReference> authorNavIndex;
protected final MetadataRepository urlMetadata; protected final MetadataRepository urlMetadata;
private final File segmentPath; private final File segmentPath;
private final IODispatcher merger; private final IODispatcher merger;
@ -100,7 +99,7 @@ public final class Segment {
maxFileSize, maxFileSize,
this.merger, this.merger,
writeBufferSize); writeBufferSize);
/*
this.authorNavIndex = new IndexCell<NavigationReference>( this.authorNavIndex = new IndexCell<NavigationReference>(
new File(new File(segmentPath, "nav_author"), "idx"), new File(new File(segmentPath, "nav_author"), "idx"),
navigationReferenceFactory, navigationReferenceFactory,
@ -111,7 +110,7 @@ public final class Segment {
maxFileSize, maxFileSize,
this.merger, this.merger,
writeBufferSize); writeBufferSize);
*/
File metadatadir = new File(segmentPath, "METADATA"); File metadatadir = new File(segmentPath, "METADATA");
if (!metadatadir.exists()) metadatadir.mkdirs(); if (!metadatadir.exists()) metadatadir.mkdirs();
@ -221,14 +220,14 @@ public final class Segment {
if (language == null) { if (language == null) {
// no statistics available, we take either the metadata (if given) or the TLD // no statistics available, we take either the metadata (if given) or the TLD
language = (bymetadata == null) ? url.language() : bymetadata; language = (bymetadata == null) ? url.language() : bymetadata;
System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + url + " FAILED, taking " + ((bymetadata == null) ? "TLD" : "metadata") + ": " + language); if (log.isFine()) log.logFine("LANGUAGE-BY-STATISTICS: " + url + " FAILED, taking " + ((bymetadata == null) ? "TLD" : "metadata") + ": " + language);
} else { } else {
if (bymetadata == null) { if (bymetadata == null) {
// two possible results: compare and report conflicts // two possible results: compare and report conflicts
if (language.equals(url.language())) if (language.equals(url.language()))
System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + url + " CONFIRMED - TLD IDENTICAL: " + language); if (log.isFine()) log.logFine("LANGUAGE-BY-STATISTICS: " + url + " CONFIRMED - TLD IDENTICAL: " + language);
else { else {
String error = "*** DEBUG LANGUAGE-BY-STATISTICS: " + url + " CONFLICTING: " + language + " (the language given by the TLD is " + url.language() + ")"; String error = "LANGUAGE-BY-STATISTICS: " + url + " CONFLICTING: " + language + " (the language given by the TLD is " + url.language() + ")";
// see if we have a hint in the url that the statistic was right // see if we have a hint in the url that the statistic was right
String u = url.toNormalform(true, false).toLowerCase(); String u = url.toNormalform(true, false).toLowerCase();
if (!u.contains("/" + language + "/") && !u.contains("/" + iso639.country(language).toLowerCase() + "/")) { if (!u.contains("/" + language + "/") && !u.contains("/" + iso639.country(language).toLowerCase() + "/")) {
@ -243,14 +242,14 @@ public final class Segment {
} else { } else {
// here we have three results: we can do a voting // here we have three results: we can do a voting
if (language.equals(bymetadata)) { if (language.equals(bymetadata)) {
//System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - METADATA IDENTICAL: " + language); //if (log.isFine()) log.logFine("LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - METADATA IDENTICAL: " + language);
} else if (language.equals(url.language())) { } else if (language.equals(url.language())) {
//System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - TLD IS IDENTICAL: " + language); //if (log.isFine()) log.logFine("LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - TLD IS IDENTICAL: " + language);
} else if (bymetadata.equals(url.language())) { } else if (bymetadata.equals(url.language())) {
//System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: " + language + " BUT METADATA AND TLD ARE IDENTICAL: " + bymetadata + ")"); //if (log.isFine()) log.logFine("LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: " + language + " BUT METADATA AND TLD ARE IDENTICAL: " + bymetadata + ")");
language = bymetadata; language = bymetadata;
} else { } else {
//System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: ALL DIFFERENT! statistic: " + language + ", metadata: " + bymetadata + ", TLD: + " + entry.url().language() + ". taking metadata."); //if (log.isFine()) log.logFine("LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: ALL DIFFERENT! statistic: " + language + ", metadata: " + bymetadata + ", TLD: + " + entry.url().language() + ". taking metadata.");
language = bymetadata; language = bymetadata;
} }
} }

@ -69,7 +69,7 @@ public class ResultEntry {
this.dbRetrievalTime = dbRetrievalTime; this.dbRetrievalTime = dbRetrievalTime;
this.snippetComputationTime = snippetComputationTime; this.snippetComputationTime = snippetComputationTime;
final String host = urlcomps.url().getHost(); final String host = urlcomps.url().getHost();
if (host.endsWith(".yacyh")) { if (host != null && host.endsWith(".yacyh")) {
// translate host into current IP // translate host into current IP
int p = host.indexOf("."); int p = host.indexOf(".");
final String hash = yacySeed.hexHash2b64Hash(host.substring(p + 1, host.length() - 6)); final String hash = yacySeed.hexHash2b64Hash(host.substring(p + 1, host.length() - 6));

@ -75,7 +75,7 @@ public class SearchEventCache {
String id = query.id(false); String id = query.id(false);
SearchEvent event = SearchEventCache.lastEvents.get(id); SearchEvent event = SearchEventCache.lastEvents.get(id);
if (Switchboard.getSwitchboard().crawlQueues.noticeURL.size() > 0 && event != null && System.currentTimeMillis() - event.getEventTime() > 60000) { if (Switchboard.getSwitchboard() != null && Switchboard.getSwitchboard().crawlQueues.noticeURL.size() > 0 && event != null && System.currentTimeMillis() - event.getEventTime() > 60000) {
// if a local crawl is ongoing, don't use the result from the cache to use possibly more results that come from the current crawl // if a local crawl is ongoing, don't use the result from the cache to use possibly more results that come from the current crawl
// to prevent that this happens during a person switches between the different result pages, a re-search happens no more than // to prevent that this happens during a person switches between the different result pages, a re-search happens no more than
// once a minute // once a minute

@ -208,9 +208,9 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
public static long lastPPMUpdate = System.currentTimeMillis()- 30000; public static long lastPPMUpdate = System.currentTimeMillis()- 30000;
// colored list management // colored list management
public static TreeSet<String> badwords = null; public static TreeSet<String> badwords = new TreeSet<String>();
public static TreeSet<String> stopwords = new TreeSet<String>();
public static TreeSet<String> blueList = null; public static TreeSet<String> blueList = null;
public static TreeSet<String> stopwords = null;
public static TreeSet<byte[]> badwordHashes = null; public static TreeSet<byte[]> badwordHashes = null;
public static TreeSet<byte[]> blueListHashes = null; public static TreeSet<byte[]> blueListHashes = null;
public static TreeSet<byte[]> stopwordHashes = null; public static TreeSet<byte[]> stopwordHashes = null;

@ -530,6 +530,7 @@ public class serverDomains {
} }
public static int getDomainID(final String host) { public static int getDomainID(final String host) {
if (host == null) return TLD_Local_ID;
final int p = host.lastIndexOf('.'); final int p = host.lastIndexOf('.');
String tld = ""; String tld = "";
if (p > 0) { if (p > 0) {

@ -223,6 +223,7 @@ public class Punycode
// the following method has been added by Michael Christen // the following method has been added by Michael Christen
public static boolean isBasic(final String input) { public static boolean isBasic(final String input) {
if (input == null) return true;
for (int j = 0; j < input.length(); j++) { for (int j = 0; j < input.length(); j++) {
if (!isBasic(input.charAt(j))) return false; if (!isBasic(input.charAt(j))) return false;
} }

@ -67,8 +67,12 @@ public class yacyURL implements Serializable {
return (url == null) ? null : url.hash().substring(6); return (url == null) ? null : url.hash().substring(6);
} }
/**
 * create a url object for a local file
 * This delegates to the four-argument constructor with the values
 * "file", "" , -1 and the absolute path of the given file
 * (presumably protocol, host, port and path - verify against that constructor).
 * @param file the local file; must not be null because its absolute path is read here
 * @throws MalformedURLException declared by this constructor; presumably raised by the delegated constructor
 */
public yacyURL(final File file) throws MalformedURLException {
this("file", "", -1, file.getAbsolutePath());
}
public yacyURL(final String url) throws MalformedURLException { public yacyURL(final String url) throws MalformedURLException {
this(url, null); this(url, null);
} }
public yacyURL(final String url, final String hash) throws MalformedURLException { public yacyURL(final String url, final String hash) throws MalformedURLException {
@ -89,7 +93,7 @@ public class yacyURL implements Serializable {
} }
this.protocol = url.substring(0, p).toLowerCase().trim(); this.protocol = url.substring(0, p).toLowerCase().trim();
if (url.length() < p + 4) throw new MalformedURLException("URL not parseable: '" + url + "'"); if (url.length() < p + 4) throw new MalformedURLException("URL not parseable: '" + url + "'");
if (url.substring(p + 1, p + 3).equals("//")) { if (!this.protocol.equals("file") && url.substring(p + 1, p + 3).equals("//")) {
// identify host, userInfo and file for http and ftp protocol // identify host, userInfo and file for http and ftp protocol
final int q = url.indexOf('/', p + 3); final int q = url.indexOf('/', p + 3);
int r; int r;
@ -112,7 +116,7 @@ public class yacyURL implements Serializable {
} }
path = url.substring(q); path = url.substring(q);
} }
if (host.length() < 4) throw new MalformedURLException("host too short: '" + host + "'"); if (host.length() < 4 && !protocol.equals("file")) throw new MalformedURLException("host too short: '" + host + "'");
if (host.indexOf('&') >= 0) throw new MalformedURLException("invalid '&' in host"); if (host.indexOf('&') >= 0) throw new MalformedURLException("invalid '&' in host");
path = resolveBackpath(path); path = resolveBackpath(path);
identPort(url, (protocol.equals("http") ? 80 : ((protocol.equals("https")) ? 443 : ((protocol.equals("ftp")) ? 21 : -1)))); identPort(url, (protocol.equals("http") ? 80 : ((protocol.equals("https")) ? 443 : ((protocol.equals("ftp")) ? 21 : -1))));
@ -133,6 +137,46 @@ public class yacyURL implements Serializable {
port = -1; port = -1;
quest = null; quest = null;
ref = null; ref = null;
} if (protocol.equals("file")) {
// parse file url
String h = url.substring(p + 1);
if (h.startsWith("//")) {
// host may be given, but may be also empty
final int q = h.indexOf('/', 2);
if (q <= 0) {
// no host given
host = null;
path = h.substring(2);
} else {
host = h.substring(2, q);
if (host.length() == 0 || host.equals("localhost")) host = null;
h = h.substring(q);
char c = h.charAt(2);
if (c == ':' || c == '|')
path = h.substring(1);
else
path = h;
}
} else {
host = null;
if (h.startsWith("/")) {
char c = h.charAt(2);
if (c == ':' || c == '|')
path = h.substring(1);
else
path = h;
} else {
char c = h.charAt(1);
if (c == ':' || c == '|')
path = h;
else
path = "/" + h;
}
}
userInfo = null;
port = -1;
quest = null;
ref = null;
} else { } else {
throw new MalformedURLException("unknown protocol: " + url); throw new MalformedURLException("unknown protocol: " + url);
} }
@ -144,24 +188,20 @@ public class yacyURL implements Serializable {
StringBuilder buffer = new StringBuilder(); StringBuilder buffer = new StringBuilder();
// encode each domainpart seperately // encode each domainpart seperately
for(int i=0; i<domainParts.length; i++) { for(int i=0; i<domainParts.length; i++) {
final String part = domainParts[i]; final String part = domainParts[i];
if(!Punycode.isBasic(part)) { if(!Punycode.isBasic(part)) {
buffer.append("xn--" + Punycode.encode(part)); buffer.append("xn--" + Punycode.encode(part));
} else { } else {
buffer.append(part); buffer.append(part);
} }
if(i != domainParts.length-1) { if(i != domainParts.length-1) {
buffer.append('.'); buffer.append('.');
} }
} }
host = buffer.toString(); host = buffer.toString();
} catch (final PunycodeException e) {} } catch (final PunycodeException e) {}
} }
public yacyURL(final File file) throws MalformedURLException {
this("file", "", -1, file.getAbsolutePath());
}
public static yacyURL newURL(final String baseURL, final String relPath) throws MalformedURLException { public static yacyURL newURL(final String baseURL, final String relPath) throws MalformedURLException {
if ((baseURL == null) || if ((baseURL == null) ||
(relPath.startsWith("http://")) || (relPath.startsWith("http://")) ||
@ -212,8 +252,8 @@ public class yacyURL implements Serializable {
(relPath.startsWith("smb://"))) { (relPath.startsWith("smb://"))) {
this.path = baseURL.path; this.path = baseURL.path;
} else if (relPath.contains(":") && patternMail.matcher(relPath.toLowerCase()).find()) { // discards also any unknown protocol from previous if } else if (relPath.contains(":") && patternMail.matcher(relPath.toLowerCase()).find()) { // discards also any unknown protocol from previous if
throw new MalformedURLException("relative path malformed: " + relPath); throw new MalformedURLException("relative path malformed: " + relPath);
} else if (relPath.startsWith("/")) { } else if (relPath.startsWith("/")) {
this.path = relPath; this.path = relPath;
} else if (baseURL.path.endsWith("/")) { } else if (baseURL.path.endsWith("/")) {
if (relPath.startsWith("#") || relPath.startsWith("?")) { if (relPath.startsWith("#") || relPath.startsWith("?")) {
@ -315,7 +355,7 @@ public class yacyURL implements Serializable {
qtmp.append('='); qtmp.append('=');
qtmp.append(escape(questp[i].substring(questp[i].indexOf('=') + 1))); qtmp.append(escape(questp[i].substring(questp[i].indexOf('=') + 1)));
} else { } else {
qtmp.append('&'); qtmp.append('&');
qtmp.append(escape(questp[i])); qtmp.append(escape(questp[i]));
} }
} }
@ -541,6 +581,20 @@ public class yacyURL implements Serializable {
return path; return path;
} }
/**
 * return the file object to a local file
 * this patches also 'strange' windows file paths:
 * a drive letter that is followed by ':' or '|', with or without a leading slash,
 * is rewritten to the form "X:" and all slashes are replaced by backslashes.
 * Paths that are too short to contain a drive letter are returned unchanged.
 * @return the file that is denoted by the path of this url
 */
public File getLocalFile() {
    final int length = path.length();
    if (length > 1) {
        // forms "C:/dir/file" and "C|/dir/file"
        final char second = path.charAt(1);
        if (second == ':') return new File(path.replace('/', '\\'));
        if (second == '|') return new File(path.charAt(0) + ":" + path.substring(2).replace('/', '\\'));
    }
    if (length > 2) {
        // forms "/C:/dir/file" and "/C|/dir/file"
        final char third = path.charAt(2);
        if (third == ':' || third == '|') return new File(path.charAt(1) + ":" + path.substring(3).replace('/', '\\'));
    }
    // short paths like "/" or "/a" end up here instead of causing a StringIndexOutOfBoundsException
    return new File(path);
}
public String getAuthority() { public String getAuthority() {
return ((port >= 0) && (host != null)) ? host + ":" + port : ((host != null) ? host : ""); return ((port >= 0) && (host != null)) ? host + ":" + port : ((host != null) ? host : "");
} }
@ -562,7 +616,7 @@ public class yacyURL implements Serializable {
} }
public void removeRef() { public void removeRef() {
ref = null; ref = null;
} }
public String getUserInfo() { public String getUserInfo() {
@ -596,13 +650,16 @@ public class yacyURL implements Serializable {
if (this.port < 0 || this.port == 21) { defaultPort = true; } if (this.port < 0 || this.port == 21) { defaultPort = true; }
} else if (this.protocol.equals("https")) { } else if (this.protocol.equals("https")) {
if (this.port < 0 || this.port == 443) { defaultPort = true; } if (this.port < 0 || this.port == 443) { defaultPort = true; }
} else if (this.protocol.equals("file")) {
defaultPort = true;
} }
final String path = this.getFile(includeReference); final String path = this.getFile(includeReference);
if (defaultPort) { if (defaultPort) {
return this.protocol + "://" + return
((this.userInfo != null) ? (this.userInfo + "@") : ("")) + this.protocol + ":" +
this.getHost().toLowerCase() + path; ((this.getHost() == null) ? "" : "//" + ((this.userInfo != null) ? (this.userInfo + "@") : ("")) + this.getHost().toLowerCase()) +
path;
} }
return this.protocol + "://" + return this.protocol + "://" +
((this.userInfo != null) ? (this.userInfo + "@") : ("")) + ((this.userInfo != null) ? (this.userInfo + "@") : ("")) +
@ -610,78 +667,78 @@ public class yacyURL implements Serializable {
} }
/* (non-Javadoc) /* (non-Javadoc)
* @see java.lang.Object#hashCode() * @see java.lang.Object#hashCode()
*/ */
@Override @Override
public int hashCode() { public int hashCode() {
final int prime = 31; final int prime = 31;
int result = 1; int result = 1;
result = prime * result + ((host == null) ? 0 : host.hashCode()); result = prime * result + ((host == null) ? 0 : host.hashCode());
result = prime * result + ((path == null) ? 0 : path.hashCode()); result = prime * result + ((path == null) ? 0 : path.hashCode());
result = prime * result + port; result = prime * result + port;
result = prime * result result = prime * result
+ ((protocol == null) ? 0 : protocol.hashCode()); + ((protocol == null) ? 0 : protocol.hashCode());
result = prime * result + ((quest == null) ? 0 : quest.hashCode()); result = prime * result + ((quest == null) ? 0 : quest.hashCode());
result = prime * result + ((ref == null) ? 0 : ref.hashCode()); result = prime * result + ((ref == null) ? 0 : ref.hashCode());
result = prime * result result = prime * result
+ ((userInfo == null) ? 0 : userInfo.hashCode()); + ((userInfo == null) ? 0 : userInfo.hashCode());
return result; return result;
} }
/* (non-Javadoc) /* (non-Javadoc)
* @see java.lang.Object#equals(java.lang.Object) * @see java.lang.Object#equals(java.lang.Object)
*/ */
@Override @Override
public boolean equals(Object obj) { public boolean equals(Object obj) {
if (this == obj) if (this == obj)
return true; return true;
if (obj == null) if (obj == null)
return false; return false;
if (!(obj instanceof yacyURL)) if (!(obj instanceof yacyURL))
return false; return false;
yacyURL other = (yacyURL) obj; yacyURL other = (yacyURL) obj;
if (host == null) { if (host == null) {
if (other.host != null) if (other.host != null)
return false; return false;
} else if (!host.equals(other.host)) } else if (!host.equals(other.host))
return false; return false;
if (path == null) { if (path == null) {
if (other.path != null) if (other.path != null)
return false; return false;
} else if (!path.equals(other.path)) } else if (!path.equals(other.path))
return false; return false;
if (port != other.port) if (port != other.port)
return false; return false;
if (protocol == null) { if (protocol == null) {
if (other.protocol != null) if (other.protocol != null)
return false; return false;
} else if (!protocol.equals(other.protocol)) } else if (!protocol.equals(other.protocol))
return false; return false;
if (quest == null) { if (quest == null) {
if (other.quest != null) if (other.quest != null)
return false; return false;
} else if (!quest.equals(other.quest)) } else if (!quest.equals(other.quest))
return false; return false;
if (ref == null) { if (ref == null) {
if (other.ref != null) if (other.ref != null)
return false; return false;
} else if (!ref.equals(other.ref)) } else if (!ref.equals(other.ref))
return false; return false;
if (userInfo == null) { if (userInfo == null) {
if (other.userInfo != null) if (other.userInfo != null)
return false; return false;
} else if (!userInfo.equals(other.userInfo)) } else if (!userInfo.equals(other.userInfo))
return false; return false;
return true; return true;
} }
public int compareTo(final Object h) { public int compareTo(final Object h) {
assert (h instanceof yacyURL); assert (h instanceof yacyURL);
return this.toString().compareTo(((yacyURL) h).toString()); return this.toString().compareTo(((yacyURL) h).toString());
} }
public boolean isPOST() { public boolean isPOST() {
return (this.quest != null) && (this.quest.length() > 0); return (this.quest != null) && (this.quest.length() > 0);
} }
public final boolean isCGI() { public final boolean isCGI() {
@ -741,7 +798,7 @@ public class yacyURL implements Serializable {
final int id = serverDomains.getDomainID(this.host); // id=7: tld is local final int id = serverDomains.getDomainID(this.host); // id=7: tld is local
final boolean isHTTP = this.protocol.equals("http"); final boolean isHTTP = this.protocol.equals("http");
int p = this.host.lastIndexOf('.'); int p = (host == null) ? -1 : this.host.lastIndexOf('.');
String dom = (p > 0) ? dom = host.substring(0, p) : ""; String dom = (p > 0) ? dom = host.substring(0, p) : "";
p = dom.lastIndexOf('.'); // locate subdomain p = dom.lastIndexOf('.'); // locate subdomain
String subdom = ""; String subdom = "";
@ -797,7 +854,7 @@ public class yacyURL implements Serializable {
} }
private static final String hosthash5(final String protocol, final String host, final int port) { private static final String hosthash5(final String protocol, final String host, final int port) {
return Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(protocol + ":" + host + ":" + port)).substring(0, 5); return Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(protocol + ((host == null) ? "" : (":" + host + ":" + port)))).substring(0, 5);
} }
/** /**
@ -902,7 +959,7 @@ public class yacyURL implements Serializable {
if (this.hash == null) { if (this.hash == null) {
if (this.host.startsWith("127.") || this.host.equals("localhost") || this.host.startsWith("0:0:0:0:0:0:0:1")) return true; if (this.host.startsWith("127.") || this.host.equals("localhost") || this.host.startsWith("0:0:0:0:0:0:0:1")) return true;
synchronized (this) { synchronized (this) {
if (this.hash == null) this.hash = urlHashComputation(); if (this.hash == null) this.hash = urlHashComputation();
} }
} }
//if (domDomain(this.hash) != 7) System.out.println("*** DEBUG - not local: " + this.toNormalform(true, false)); //if (domDomain(this.hash) != 7) System.out.println("*** DEBUG - not local: " + this.toNormalform(true, false));
@ -916,6 +973,7 @@ public class yacyURL implements Serializable {
// language calculation // language calculation
public final String language() { public final String language() {
String language = "en"; String language = "en";
if (host == null) return language;
final int pos = host.lastIndexOf("."); final int pos = host.lastIndexOf(".");
if (pos > 0 && host.length() - pos == 3) language = host.substring(pos + 1).toLowerCase(); if (pos > 0 && host.length() - pos == 3) language = host.substring(pos + 1).toLowerCase();
if (language.equals("uk")) language = "en"; if (language.equals("uk")) language = "en";
@ -924,36 +982,42 @@ public class yacyURL implements Serializable {
public static void main(final String[] args) { public static void main(final String[] args) {
final String[][] test = new String[][]{ final String[][] test = new String[][]{
new String[]{null, "file://C:WINDOWS\\CMD0.EXE"},
new String[]{null, "file:/bin/yacy1"}, // file://<host>/<path> may have many '/' if the host is omitted and the path starts with '/'
new String[]{null, "file:///bin/yacy2"}, // file://<host>/<path> may have many '/' if the host is omitted and the path starts with '/'
new String[]{null, "file:C:WINDOWS\\CMD.EXE"},
new String[]{null, "file:///C:WINDOWS\\CMD1.EXE"},
new String[]{null, "file:///C|WINDOWS\\CMD2.EXE"},
new String[]{null, "http://www.anomic.de/test/"}, new String[]{null, "http://www.anomic.de/test/"},
new String[]{null, "http://www.anomic.de/"}, new String[]{null, "http://www.anomic.de/"},
new String[]{null, "http://www.anomic.de"}, new String[]{null, "http://www.anomic.de"},
new String[]{null, "http://www.anomic.de/home/test?x=1#home"}, new String[]{null, "http://www.anomic.de/home/test?x=1#home"},
new String[]{null, "http://www.anomic.de/home/test?x=1"}, new String[]{null, "http://www.anomic.de/home/test?x=1"},
new String[]{null, "http://www.anomic.de/home/test#home"}, new String[]{null, "http://www.anomic.de/home/test#home"},
new String[]{null, "ftp://ftp.anomic.de/home/test#home"}, new String[]{null, "ftp://ftp.anomic.de/home/test#home"},
new String[]{null, "http://www.anomic.de/home/../abc/"}, new String[]{null, "http://www.anomic.de/home/../abc/"},
new String[]{null, "mailto:abcdefg@nomailnomail.com"}, new String[]{null, "mailto:abcdefg@nomailnomail.com"},
new String[]{"http://www.anomic.de/home", "test"}, new String[]{"http://www.anomic.de/home", "test"},
new String[]{"http://www.anomic.de/home", "test/"}, new String[]{"http://www.anomic.de/home", "test/"},
new String[]{"http://www.anomic.de/home/", "test"}, new String[]{"http://www.anomic.de/home/", "test"},
new String[]{"http://www.anomic.de/home/", "test/"}, new String[]{"http://www.anomic.de/home/", "test/"},
new String[]{"http://www.anomic.de/home/index.html", "test.htm"}, new String[]{"http://www.anomic.de/home/index.html", "test.htm"},
new String[]{"http://www.anomic.de/home/index.html", "http://www.yacy.net/test"}, new String[]{"http://www.anomic.de/home/index.html", "http://www.yacy.net/test"},
new String[]{"http://www.anomic.de/home/index.html", "ftp://ftp.yacy.net/test"}, new String[]{"http://www.anomic.de/home/index.html", "ftp://ftp.yacy.net/test"},
new String[]{"http://www.anomic.de/home/index.html", "../test"}, new String[]{"http://www.anomic.de/home/index.html", "../test"},
new String[]{"http://www.anomic.de/home/index.html", "mailto:abcdefg@nomailnomail.com"}, new String[]{"http://www.anomic.de/home/index.html", "mailto:abcdefg@nomailnomail.com"},
new String[]{null, "news:de.test"}, new String[]{null, "news:de.test"},
new String[]{"http://www.anomic.de/home", "news:de.test"}, new String[]{"http://www.anomic.de/home", "news:de.test"},
new String[]{null, "mailto:bob@web.com"}, new String[]{null, "mailto:bob@web.com"},
new String[]{"http://www.anomic.de/home", "mailto:bob@web.com"}, new String[]{"http://www.anomic.de/home", "mailto:bob@web.com"},
new String[]{"http://www.anomic.de/home", "ftp://ftp.anomic.de/src"}, new String[]{"http://www.anomic.de/home", "ftp://ftp.anomic.de/src"},
new String[]{null, "ftp://ftp.delegate.org/"}, new String[]{null, "ftp://ftp.delegate.org/"},
new String[]{"http://www.anomic.de/home", "ftp://ftp.delegate.org/"}, new String[]{"http://www.anomic.de/home", "ftp://ftp.delegate.org/"},
new String[]{"http://www.anomic.de","mailto:yacy@weltherrschaft.org"}, new String[]{"http://www.anomic.de","mailto:yacy@weltherrschaft.org"},
new String[]{"http://www.anomic.de","javascipt:temp"}, new String[]{"http://www.anomic.de","javascipt:temp"},
new String[]{null,"http://yacy-websuche.de/wiki/index.php?title=De:IntroInformationFreedom&action=history"}, new String[]{null,"http://yacy-websuche.de/wiki/index.php?title=De:IntroInformationFreedom&action=history"},
new String[]{null, "http://diskusjion.no/index.php?s=5bad5f431a106d9a8355429b81bb0ca5&showuser=23585"}, new String[]{null, "http://diskusjion.no/index.php?s=5bad5f431a106d9a8355429b81bb0ca5&showuser=23585"},
new String[]{null, "http://diskusjion.no/index.php?s=5bad5f431a106d9a8355429b81bb0ca5&amp;showuser=23585"} new String[]{null, "http://diskusjion.no/index.php?s=5bad5f431a106d9a8355429b81bb0ca5&amp;showuser=23585"}
}; };
String environment, url; String environment, url;
yacyURL aURL, aURL1; yacyURL aURL, aURL1;
@ -961,7 +1025,7 @@ public class yacyURL implements Serializable {
for (int i = 0; i < test.length; i++) { for (int i = 0; i < test.length; i++) {
environment = test[i][0]; environment = test[i][0];
url = test[i][1]; url = test[i][1];
try {aURL = yacyURL.newURL(environment, url);} catch (final MalformedURLException e) {aURL = null;} try {aURL = yacyURL.newURL(environment, url);} catch (final MalformedURLException e) {e.printStackTrace(); aURL = null;}
if (aURL != null) System.out.println("normalized: " + aURL.toNormalform(true, true)); if (aURL != null) System.out.println("normalized: " + aURL.toNormalform(true, true));
if (environment == null) { if (environment == null) {
try {jURL = new java.net.URL(url);} catch (final MalformedURLException e) {jURL = null;} try {jURL = new java.net.URL(url);} catch (final MalformedURLException e) {jURL = null;}

Loading…
Cancel
Save