added a convenience class to add files into a YaCy index

To make this possible, yacyURL must be able to process file:// URLs; this has also been implemented.
Testing the new class led to several bug fixes in other classes.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6313 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent 2e41e10ffd
commit 68465c37af

@ -27,6 +27,7 @@
package de.anomic.document.parser;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.HashSet;
@ -80,9 +81,14 @@ public class swfParser extends AbstractParser implements Idiom {
contents = swf2html.convertSWFToHTML(source);
} catch (NegativeArraySizeException e) {
// seen in log
return null;
} catch (IOException e) {
e.printStackTrace();
return null;
} catch (Exception e) {
// we have seen a lot of OOM errors in the parser...
e.printStackTrace();
return null;
}
String url = null;
String urlnr = null;

@ -283,6 +283,7 @@ public class SplitTable implements ObjectIndex {
private ObjectIndex checkTable(ObjectIndex table) {
// check size and age of given table; in case it is too large or too old
// create a new table
assert table != null;
String name = new File(table.filename()).getName();
long d;
try {

@ -0,0 +1,192 @@
// DocumentIndex.java
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 14.09.2009 on http://yacy.net;
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2009-05-28 01:51:34 +0200 (Do, 28 Mai 2009) $
// $LastChangedRevision: 5988 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.kelondro.text;
import java.io.File;
import java.io.IOException;
import java.util.Date;
import de.anomic.document.Condenser;
import de.anomic.document.Document;
import de.anomic.document.Parser;
import de.anomic.document.ParserException;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.search.QueryParams;
import de.anomic.search.RankingProfile;
import de.anomic.search.ResultEntry;
import de.anomic.search.SearchEvent;
import de.anomic.search.SearchEventCache;
import de.anomic.yacy.yacyURL;
import de.anomic.yacy.logging.Log;
/**
* convenience class to access the yacycore library from outside of yacy to put files into the index
* @author Michael Christen
*
*/
public class DocumentIndex extends Segment {

    // ranking profile used for every query issued through find()
    private final RankingProfile textRankingDefault = new RankingProfile(QueryParams.CONTENTDOM_TEXT);
    //private Bitfield zeroConstraint = new Bitfield(4);

    /**
     * open (or create) a document index at the given segment path
     * @param log the logger that is handed over to the underlying Segment
     * @param segmentPath the directory where the index files are stored
     * @throws IOException if the underlying Segment cannot be initialized
     */
    public DocumentIndex(Log log, final File segmentPath) throws IOException {
        super(log, segmentPath, 100000, targetFileSize * 4 - 1, false, false);
    }

    /**
     * open (or create) a document index at the given segment path
     * using a logger named "DocumentIndex"
     * @param segmentPath the directory where the index files are stored
     * @throws IOException if the underlying Segment cannot be initialized
     */
    public DocumentIndex(final File segmentPath) throws IOException {
        this(new Log("DocumentIndex"), segmentPath);
    }

    /**
     * put a single file into the index
     * @param file the document to be indexed; must be a readable, non-directory file
     * @return a metadata object that has been generated to identify the file
     * @throws IOException in case that the file is null, is a directory, cannot be read or cannot be parsed
     */
    public URLMetadataRow add(File file) throws IOException {
        if (file == null) throw new IOException("file = null");
        if (file.isDirectory()) throw new IOException("file should be a document, not a path");
        if (!file.canRead()) throw new IOException("cannot read file");
        yacyURL url = new yacyURL("file:" + file.getAbsolutePath());
        Document document;
        try {
            document = Parser.parseSource(url, null, null, file);
        } catch (InterruptedException e) {
            // keep the interruption visible to the caller's thread handling
            Thread.currentThread().interrupt();
            throw parseFailure(file, e);
        } catch (ParserException e) {
            throw parseFailure(file, e);
        }
        final Condenser condenser = new Condenser(document, true, true);
        return super.storeDocument(
                url,
                null,
                new Date(file.lastModified()),
                file.length(),
                document,
                condenser
        );
    }

    /**
     * build the IOException that reports a parser failure, keeping the original exception as cause
     */
    private static IOException parseFailure(File file, Exception cause) {
        IOException ioe = new IOException("cannot parse " + file.toString() + ": " + cause.getMessage());
        ioe.initCause(cause); // initCause is used because it is available on all Java versions
        return ioe;
    }

    /**
     * add a single file and report (but do not propagate) any failure,
     * so that bulk indexing continues with the next file
     */
    private void addAndReport(File file) {
        try {
            add(file);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * add a file or a directory of files to the index
     * If the given file is a path to a directory, the complete sub-tree is indexed.
     * Unreadable and hidden entries inside a directory are skipped; files that
     * cannot be indexed are reported on stderr and do not stop the traversal.
     * @param start a file or a directory
     */
    public void addAll(File start) {
        assert (start != null);
        assert (start.canRead());
        if (!start.isDirectory()) {
            addAndReport(start);
            return;
        }
        String[] s = start.list();
        if (s == null) return; // File.list() returns null if the directory cannot be listed
        File w;
        for (String t: s) {
            w = new File(start, t);
            if (w.canRead() && !w.isHidden()) {
                if (w.isDirectory()) {
                    addAll(w);
                } else {
                    addAndReport(w);
                }
            }
        }
    }

    /**
     * do a full-text search of a given string and return a specific number of results
     * @param querystring the search words
     * @param pos the position of the first result to be returned
     * @param count the number of result slots to be returned
     * @return an array of length count with the files that contain the given string;
     *         a slot is null if no result is available for that position
     */
    public File[] find(String querystring, int pos, int count) {
        QueryParams query = new QueryParams(querystring, 100, textRankingDefault, null);
        SearchEvent se = SearchEventCache.getEvent(query, this, null, null, null, false);
        File[] result = new File[count];
        ResultEntry re;
        for (int i = 0; i < count; i++) {
            re = se.oneResult(pos + i);
            result[i] = (re == null) ? null : re.url().getLocalFile();
        }
        return result;
    }

    /**
     * find the given string and return 20 hits
     * @param querystring the search words
     * @return an array of 20 slots with the files that contain the word; unused slots are null
     */
    public File[] find(String querystring) {
        return find(querystring, 0, 20);
    }

    /**
     * command line access to the index
     * @param args segment path, then either 'add' followed by a file or directory,
     *             or 'search' followed by one or more search words
     */
    public static void main(String[] args) {
        // first argument: path to segment
        // second argument: either 'add' or 'search'
        // third and more arguments exists only in case that second argument is 'search': these are then the search words
        //
        // example:
        // DocumentIndex yacyindex add test/parsertest
        // DocumentIndex yacyindex search steht
        System.setProperty("java.awt.headless", "true");
        if (args.length < 3) {
            System.err.println("usage: DocumentIndex <segment-path> add <file-or-directory>");
            System.err.println("       DocumentIndex <segment-path> search <word> [<word> ...]");
            return;
        }
        File segmentPath = new File(args[0]);
        System.out.println("using index files at " + segmentPath.getAbsolutePath());
        try {
            DocumentIndex di = new DocumentIndex(segmentPath);
            try {
                if (args[1].equals("add")) {
                    di.addAll(new File(args[2]));
                } else {
                    // join the search words with a blank; concatenating them directly
                    // would merge several words into one
                    StringBuilder query = new StringBuilder();
                    for (int i = 2; i < args.length; i++) {
                        if (query.length() > 0) query.append(' ');
                        query.append(args[i]);
                    }
                    File[] results = di.find(query.toString().trim());
                    for (File f: results) {
                        if (f != null) System.out.println(f.toString());
                    }
                }
            } finally {
                // close the index even if adding or searching failed
                di.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        //System.exit(0);
    }
}

@ -82,7 +82,7 @@ public class IODispatcher extends Thread {
public synchronized void dump(ReferenceContainerCache<? extends Reference> cache, File file, ReferenceContainerArray<? extends Reference> array) {
if (dumpQueue == null || controlQueue == null || !this.isAlive()) {
Log.logWarning("IODispatcher", "emergency dump of file " + file.getName());
cache.dump(file, (int) Math.min(MemoryControl.available() / 3, writeBufferSize));
if (cache.size() > 0) cache.dump(file, (int) Math.min(MemoryControl.available() / 3, writeBufferSize));
} else {
DumpJob<? extends Reference> job = (DumpJob<? extends Reference>)new DumpJob(cache, file, array);
try {
@ -204,7 +204,7 @@ public class IODispatcher extends Thread {
}
public void dump() {
try {
cache.dump(file, (int) Math.min(MemoryControl.available() / 3, writeBufferSize));
if (cache.size() > 0) cache.dump(file, (int) Math.min(MemoryControl.available() / 3, writeBufferSize));
array.mountBLOBFile(file);
} catch (IOException e) {
e.printStackTrace();

@ -283,7 +283,7 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
* and is composed of the current date and the cell salt
*/
public synchronized void close() {
this.ram.dump(this.array.newContainerBLOBFile(), (int) Math.min(MemoryControl.available() / 3, writeBufferSize));
if (this.ram.size() > 0) this.ram.dump(this.array.newContainerBLOBFile(), (int) Math.min(MemoryControl.available() / 3, writeBufferSize));
// close all
this.ram.close();
this.array.close();

@ -1,5 +1,5 @@
// Segment.java
// (C) 2005-209 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// (C) 2005-2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 2005 on http://yacy.net; full redesign for segments 28.5.2009
//
// This is a part of YaCy, a peer-to-peer based web search engine
@ -45,7 +45,6 @@ import de.anomic.kelondro.order.ByteOrder;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.text.navigationPrototype.NavigationReference;
import de.anomic.kelondro.text.navigationPrototype.NavigationReferenceFactory;
import de.anomic.kelondro.text.navigationPrototype.NavigationReferenceRow;
import de.anomic.kelondro.text.referencePrototype.WordReference;
import de.anomic.kelondro.text.referencePrototype.WordReferenceFactory;
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
@ -54,7 +53,7 @@ import de.anomic.tools.iso639;
import de.anomic.yacy.yacyURL;
import de.anomic.yacy.logging.Log;
public final class Segment {
public class Segment {
// environment constants
public static final long wCacheMaxAge = 1000 * 60 * 30; // milliseconds; 30 minutes
@ -70,7 +69,7 @@ public final class Segment {
private final Log log;
protected final IndexCell<WordReference> termIndex;
private final IndexCell<NavigationReference> authorNavIndex;
//private final IndexCell<NavigationReference> authorNavIndex;
protected final MetadataRepository urlMetadata;
private final File segmentPath;
private final IODispatcher merger;
@ -100,7 +99,7 @@ public final class Segment {
maxFileSize,
this.merger,
writeBufferSize);
/*
this.authorNavIndex = new IndexCell<NavigationReference>(
new File(new File(segmentPath, "nav_author"), "idx"),
navigationReferenceFactory,
@ -111,7 +110,7 @@ public final class Segment {
maxFileSize,
this.merger,
writeBufferSize);
*/
File metadatadir = new File(segmentPath, "METADATA");
if (!metadatadir.exists()) metadatadir.mkdirs();
@ -221,14 +220,14 @@ public final class Segment {
if (language == null) {
// no statistics available, we take either the metadata (if given) or the TLD
language = (bymetadata == null) ? url.language() : bymetadata;
System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + url + " FAILED, taking " + ((bymetadata == null) ? "TLD" : "metadata") + ": " + language);
if (log.isFine()) log.logFine("LANGUAGE-BY-STATISTICS: " + url + " FAILED, taking " + ((bymetadata == null) ? "TLD" : "metadata") + ": " + language);
} else {
if (bymetadata == null) {
// two possible results: compare and report conflicts
if (language.equals(url.language()))
System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + url + " CONFIRMED - TLD IDENTICAL: " + language);
if (log.isFine()) log.logFine("LANGUAGE-BY-STATISTICS: " + url + " CONFIRMED - TLD IDENTICAL: " + language);
else {
String error = "*** DEBUG LANGUAGE-BY-STATISTICS: " + url + " CONFLICTING: " + language + " (the language given by the TLD is " + url.language() + ")";
String error = "LANGUAGE-BY-STATISTICS: " + url + " CONFLICTING: " + language + " (the language given by the TLD is " + url.language() + ")";
// see if we have a hint in the url that the statistic was right
String u = url.toNormalform(true, false).toLowerCase();
if (!u.contains("/" + language + "/") && !u.contains("/" + iso639.country(language).toLowerCase() + "/")) {
@ -243,14 +242,14 @@ public final class Segment {
} else {
// here we have three results: we can do a voting
if (language.equals(bymetadata)) {
//System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - METADATA IDENTICAL: " + language);
//if (log.isFine()) log.logFine("LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - METADATA IDENTICAL: " + language);
} else if (language.equals(url.language())) {
//System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - TLD IS IDENTICAL: " + language);
//if (log.isFine()) log.logFine("LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - TLD IS IDENTICAL: " + language);
} else if (bymetadata.equals(url.language())) {
//System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: " + language + " BUT METADATA AND TLD ARE IDENTICAL: " + bymetadata + ")");
//if (log.isFine()) log.logFine("LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: " + language + " BUT METADATA AND TLD ARE IDENTICAL: " + bymetadata + ")");
language = bymetadata;
} else {
//System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: ALL DIFFERENT! statistic: " + language + ", metadata: " + bymetadata + ", TLD: + " + entry.url().language() + ". taking metadata.");
//if (log.isFine()) log.logFine("LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: ALL DIFFERENT! statistic: " + language + ", metadata: " + bymetadata + ", TLD: + " + entry.url().language() + ". taking metadata.");
language = bymetadata;
}
}

@ -69,7 +69,7 @@ public class ResultEntry {
this.dbRetrievalTime = dbRetrievalTime;
this.snippetComputationTime = snippetComputationTime;
final String host = urlcomps.url().getHost();
if (host.endsWith(".yacyh")) {
if (host != null && host.endsWith(".yacyh")) {
// translate host into current IP
int p = host.indexOf(".");
final String hash = yacySeed.hexHash2b64Hash(host.substring(p + 1, host.length() - 6));

@ -75,7 +75,7 @@ public class SearchEventCache {
String id = query.id(false);
SearchEvent event = SearchEventCache.lastEvents.get(id);
if (Switchboard.getSwitchboard().crawlQueues.noticeURL.size() > 0 && event != null && System.currentTimeMillis() - event.getEventTime() > 60000) {
if (Switchboard.getSwitchboard() != null && Switchboard.getSwitchboard().crawlQueues.noticeURL.size() > 0 && event != null && System.currentTimeMillis() - event.getEventTime() > 60000) {
// if a local crawl is ongoing, don't use the result from the cache to use possibly more results that come from the current crawl
// to prevent that this happens during a person switches between the different result pages, a re-search happens no more than
// once a minute

@ -208,9 +208,9 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
public static long lastPPMUpdate = System.currentTimeMillis()- 30000;
// colored list management
public static TreeSet<String> badwords = null;
public static TreeSet<String> badwords = new TreeSet<String>();
public static TreeSet<String> stopwords = new TreeSet<String>();
public static TreeSet<String> blueList = null;
public static TreeSet<String> stopwords = null;
public static TreeSet<byte[]> badwordHashes = null;
public static TreeSet<byte[]> blueListHashes = null;
public static TreeSet<byte[]> stopwordHashes = null;

@ -530,6 +530,7 @@ public class serverDomains {
}
public static int getDomainID(final String host) {
if (host == null) return TLD_Local_ID;
final int p = host.lastIndexOf('.');
String tld = "";
if (p > 0) {

@ -223,6 +223,7 @@ public class Punycode
// the following method has been added by Michael Christen
public static boolean isBasic(final String input) {
if (input == null) return true;
for (int j = 0; j < input.length(); j++) {
if (!isBasic(input.charAt(j))) return false;
}

@ -67,6 +67,10 @@ public class yacyURL implements Serializable {
return (url == null) ? null : url.hash().substring(6);
}
/**
 * Create a URL with the "file" protocol from the absolute path of the given file.
 * Delegates to the four-argument constructor; the argument order is presumably
 * (protocol, host, port, path) - TODO confirm against that constructor's declaration.
 * @param file the local file to address; must not be null, since getAbsolutePath() is called on it
 * @throws MalformedURLException declared by this constructor; presumably propagated from the delegate
 */
public yacyURL(final File file) throws MalformedURLException {
this("file", "", -1, file.getAbsolutePath());
}
public yacyURL(final String url) throws MalformedURLException {
this(url, null);
}
@ -89,7 +93,7 @@ public class yacyURL implements Serializable {
}
this.protocol = url.substring(0, p).toLowerCase().trim();
if (url.length() < p + 4) throw new MalformedURLException("URL not parseable: '" + url + "'");
if (url.substring(p + 1, p + 3).equals("//")) {
if (!this.protocol.equals("file") && url.substring(p + 1, p + 3).equals("//")) {
// identify host, userInfo and file for http and ftp protocol
final int q = url.indexOf('/', p + 3);
int r;
@ -112,7 +116,7 @@ public class yacyURL implements Serializable {
}
path = url.substring(q);
}
if (host.length() < 4) throw new MalformedURLException("host too short: '" + host + "'");
if (host.length() < 4 && !protocol.equals("file")) throw new MalformedURLException("host too short: '" + host + "'");
if (host.indexOf('&') >= 0) throw new MalformedURLException("invalid '&' in host");
path = resolveBackpath(path);
identPort(url, (protocol.equals("http") ? 80 : ((protocol.equals("https")) ? 443 : ((protocol.equals("ftp")) ? 21 : -1))));
@ -133,6 +137,46 @@ public class yacyURL implements Serializable {
port = -1;
quest = null;
ref = null;
} if (protocol.equals("file")) {
// parse file url
String h = url.substring(p + 1);
if (h.startsWith("//")) {
// host may be given, but may be also empty
final int q = h.indexOf('/', 2);
if (q <= 0) {
// no host given
host = null;
path = h.substring(2);
} else {
host = h.substring(2, q);
if (host.length() == 0 || host.equals("localhost")) host = null;
h = h.substring(q);
char c = h.charAt(2);
if (c == ':' || c == '|')
path = h.substring(1);
else
path = h;
}
} else {
host = null;
if (h.startsWith("/")) {
char c = h.charAt(2);
if (c == ':' || c == '|')
path = h.substring(1);
else
path = h;
} else {
char c = h.charAt(1);
if (c == ':' || c == '|')
path = h;
else
path = "/" + h;
}
}
userInfo = null;
port = -1;
quest = null;
ref = null;
} else {
throw new MalformedURLException("unknown protocol: " + url);
}
@ -158,10 +202,6 @@ public class yacyURL implements Serializable {
} catch (final PunycodeException e) {}
}
public yacyURL(final File file) throws MalformedURLException {
this("file", "", -1, file.getAbsolutePath());
}
public static yacyURL newURL(final String baseURL, final String relPath) throws MalformedURLException {
if ((baseURL == null) ||
(relPath.startsWith("http://")) ||
@ -541,6 +581,20 @@ public class yacyURL implements Serializable {
return path;
}
/**
 * return the file object to a local file
 * this patches also 'strange' windows file paths: a drive letter that is
 * followed by ':' or '|' at the beginning of the path (with or without a
 * leading '/') is rewritten to the form "X:..." and all '/' are replaced by '\'.
 * Paths that are too short to carry a drive letter are returned unchanged
 * instead of causing a StringIndexOutOfBoundsException.
 * @return the file that is addressed by the path of this url
 */
public File getLocalFile() {
    final int len = path.length();
    if (len > 1) {
        // forms "C:..." and "C|..."
        final char second = path.charAt(1);
        if (second == ':') return new File(path.replace('/', '\\'));
        if (second == '|') return new File(path.charAt(0) + ":" + path.substring(2).replace('/', '\\'));
    }
    if (len > 2) {
        // forms "/C:..." and "/C|...": drop the leading character, keep the drive letter
        final char third = path.charAt(2);
        if (third == ':' || third == '|') return new File(path.charAt(1) + ":" + path.substring(3).replace('/', '\\'));
    }
    return new File(path);
}
public String getAuthority() {
return ((port >= 0) && (host != null)) ? host + ":" + port : ((host != null) ? host : "");
}
@ -596,13 +650,16 @@ public class yacyURL implements Serializable {
if (this.port < 0 || this.port == 21) { defaultPort = true; }
} else if (this.protocol.equals("https")) {
if (this.port < 0 || this.port == 443) { defaultPort = true; }
} else if (this.protocol.equals("file")) {
defaultPort = true;
}
final String path = this.getFile(includeReference);
if (defaultPort) {
return this.protocol + "://" +
((this.userInfo != null) ? (this.userInfo + "@") : ("")) +
this.getHost().toLowerCase() + path;
return
this.protocol + ":" +
((this.getHost() == null) ? "" : "//" + ((this.userInfo != null) ? (this.userInfo + "@") : ("")) + this.getHost().toLowerCase()) +
path;
}
return this.protocol + "://" +
((this.userInfo != null) ? (this.userInfo + "@") : ("")) +
@ -741,7 +798,7 @@ public class yacyURL implements Serializable {
final int id = serverDomains.getDomainID(this.host); // id=7: tld is local
final boolean isHTTP = this.protocol.equals("http");
int p = this.host.lastIndexOf('.');
int p = (host == null) ? -1 : this.host.lastIndexOf('.');
String dom = (p > 0) ? dom = host.substring(0, p) : "";
p = dom.lastIndexOf('.'); // locate subdomain
String subdom = "";
@ -797,7 +854,7 @@ public class yacyURL implements Serializable {
}
private static final String hosthash5(final String protocol, final String host, final int port) {
return Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(protocol + ":" + host + ":" + port)).substring(0, 5);
return Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(protocol + ((host == null) ? "" : (":" + host + ":" + port)))).substring(0, 5);
}
/**
@ -916,6 +973,7 @@ public class yacyURL implements Serializable {
// language calculation
public final String language() {
String language = "en";
if (host == null) return language;
final int pos = host.lastIndexOf(".");
if (pos > 0 && host.length() - pos == 3) language = host.substring(pos + 1).toLowerCase();
if (language.equals("uk")) language = "en";
@ -924,6 +982,12 @@ public class yacyURL implements Serializable {
public static void main(final String[] args) {
final String[][] test = new String[][]{
new String[]{null, "file://C:WINDOWS\\CMD0.EXE"},
new String[]{null, "file:/bin/yacy1"}, // file://<host>/<path> may have many '/' if the host is omitted and the path starts with '/'
new String[]{null, "file:///bin/yacy2"}, // file://<host>/<path> may have many '/' if the host is omitted and the path starts with '/'
new String[]{null, "file:C:WINDOWS\\CMD.EXE"},
new String[]{null, "file:///C:WINDOWS\\CMD1.EXE"},
new String[]{null, "file:///C|WINDOWS\\CMD2.EXE"},
new String[]{null, "http://www.anomic.de/test/"},
new String[]{null, "http://www.anomic.de/"},
new String[]{null, "http://www.anomic.de"},
@ -961,7 +1025,7 @@ public class yacyURL implements Serializable {
for (int i = 0; i < test.length; i++) {
environment = test[i][0];
url = test[i][1];
try {aURL = yacyURL.newURL(environment, url);} catch (final MalformedURLException e) {aURL = null;}
try {aURL = yacyURL.newURL(environment, url);} catch (final MalformedURLException e) {e.printStackTrace(); aURL = null;}
if (aURL != null) System.out.println("normalized: " + aURL.toNormalform(true, true));
if (environment == null) {
try {jURL = new java.net.URL(url);} catch (final MalformedURLException e) {jURL = null;}

Loading…
Cancel
Save