// DocumentIndex.java
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 14.09.2009 on http://yacy.net;
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||
|
|
||
|
|
||
14 years ago
|
package net.yacy.search.index;
|
||
16 years ago
|
|
||
|
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

import net.yacy.cora.document.UTF8;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.document.LibraryProvider;
import net.yacy.document.TextParser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.meta.URIMetadataRow.Components;
import net.yacy.kelondro.logging.Log;
import net.yacy.search.query.QueryParams;
import net.yacy.search.query.SearchEvent;
import net.yacy.search.ranking.RankingProcess;
import net.yacy.search.ranking.RankingProfile;
import net.yacy.search.ranking.ReferenceOrder;
import net.yacy.search.snippet.ContentDomain;
|
||
16 years ago
|
|
||
16 years ago
|
|
||
|
/**
|
||
|
* convenience class to access the yacycore library from outside of yacy to put files into the index
|
||
|
* @author Michael Christen
|
||
|
*
|
||
|
*/
|
||
|
public class DocumentIndex extends Segment {
|
||
14 years ago
|
|
||
15 years ago
|
private static final RankingProfile textRankingDefault = new RankingProfile(ContentDomain.TEXT);
|
||
16 years ago
|
//private Bitfield zeroConstraint = new Bitfield(4);
|
||
14 years ago
|
|
||
15 years ago
|
private static DigestURI poison;
|
||
|
static {
|
||
|
try {
|
||
|
poison = new DigestURI("file://.");
|
||
14 years ago
|
} catch (final MalformedURLException e) {}
|
||
15 years ago
|
}
|
||
|
BlockingQueue<DigestURI> queue; // a queue of document ID's
|
||
14 years ago
|
private final Worker[] worker;
|
||
15 years ago
|
CallbackListener callback;
|
||
15 years ago
|
|
||
15 years ago
|
static final ThreadGroup workerThreadGroup = new ThreadGroup("workerThreadGroup");
|
||
14 years ago
|
|
||
|
|
||
|
public DocumentIndex(final File segmentPath, final CallbackListener callback, final int cachesize) throws IOException {
|
||
15 years ago
|
super(new Log("DocumentIndex"), segmentPath, cachesize, targetFileSize * 4 - 1, false, false);
|
||
14 years ago
|
final int cores = Runtime.getRuntime().availableProcessors() + 1;
|
||
16 years ago
|
this.callback = callback;
|
||
15 years ago
|
this.queue = new LinkedBlockingQueue<DigestURI>(cores * 300);
|
||
16 years ago
|
this.worker = new Worker[cores];
|
||
|
for (int i = 0; i < cores; i++) {
|
||
15 years ago
|
this.worker[i] = new Worker(i);
|
||
16 years ago
|
this.worker[i].start();
|
||
|
}
|
||
16 years ago
|
}
|
||
14 years ago
|
|
||
16 years ago
|
class Worker extends Thread {
|
||
14 years ago
|
public Worker(final int count) {
|
||
15 years ago
|
super(workerThreadGroup, "query-" + count);
|
||
|
}
|
||
14 years ago
|
|
||
14 years ago
|
@Override
|
||
16 years ago
|
public void run() {
|
||
15 years ago
|
DigestURI f;
|
||
14 years ago
|
URIMetadataRow[] resultRows;
|
||
16 years ago
|
try {
|
||
14 years ago
|
while ((f = DocumentIndex.this.queue.take()) != poison) try {
|
||
|
resultRows = add(f);
|
||
|
for (final URIMetadataRow resultRow: resultRows) {
|
||
|
if (DocumentIndex.this.callback != null) {
|
||
|
if (resultRow == null) {
|
||
|
DocumentIndex.this.callback.fail(f, "result is null");
|
||
|
} else {
|
||
|
DocumentIndex.this.callback.commit(f, resultRow);
|
||
|
}
|
||
16 years ago
|
}
|
||
|
}
|
||
14 years ago
|
} catch (final IOException e) {
|
||
15 years ago
|
if (e.getMessage().indexOf("cannot parse") < 0) Log.logException(e);
|
||
14 years ago
|
DocumentIndex.this.callback.fail(f, e.getMessage());
|
||
16 years ago
|
}
|
||
14 years ago
|
} catch (final InterruptedException e) {}
|
||
16 years ago
|
}
|
||
|
}
|
||
14 years ago
|
|
||
16 years ago
|
/**
|
||
|
* get the number of pending documents in the indexing queue
|
||
|
*/
|
||
|
public int pending() {
|
||
|
return this.queue.size();
|
||
|
}
|
||
14 years ago
|
|
||
16 years ago
|
public void clearQueue() {
|
||
|
this.queue.clear();
|
||
|
}
|
||
14 years ago
|
|
||
|
private URIMetadataRow[] add(final DigestURI url) throws IOException {
|
||
15 years ago
|
if (url == null) throw new IOException("file = null");
|
||
|
if (url.isDirectory()) throw new IOException("file should be a document, not a path");
|
||
|
if (!url.canRead()) throw new IOException("cannot read file");
|
||
15 years ago
|
Document[] documents;
|
||
15 years ago
|
long length;
|
||
16 years ago
|
try {
|
||
15 years ago
|
length = url.length();
|
||
14 years ago
|
} catch (final Exception e) {
|
||
15 years ago
|
length = -1;
|
||
|
}
|
||
|
try {
|
||
14 years ago
|
documents = TextParser.parseSource(url, null, null, length, url.getInputStream(null, -1), true);
|
||
|
} catch (final Exception e) {
|
||
15 years ago
|
throw new IOException("cannot parse " + url.toString() + ": " + e.getMessage());
|
||
16 years ago
|
}
|
||
14 years ago
|
//Document document = Document.mergeDocuments(url, null, documents);
|
||
|
final URIMetadataRow[] rows = new URIMetadataRow[documents.length];
|
||
|
int c = 0;
|
||
|
for (final Document document: documents) {
|
||
|
final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib);
|
||
|
rows[c++] = super.storeDocument(
|
||
16 years ago
|
url,
|
||
|
null,
|
||
15 years ago
|
new Date(url.lastModified()),
|
||
15 years ago
|
new Date(),
|
||
15 years ago
|
url.length(),
|
||
16 years ago
|
document,
|
||
15 years ago
|
condenser,
|
||
15 years ago
|
null,
|
||
|
DocumentIndex.class.getName() + ".add"
|
||
16 years ago
|
);
|
||
14 years ago
|
}
|
||
|
return rows;
|
||
16 years ago
|
}
|
||
14 years ago
|
|
||
16 years ago
|
/**
|
||
|
* add a file or a directory of files to the index
|
||
|
* If the given file is a path to a directory, the complete sub-tree is indexed
|
||
|
* @param start
|
||
|
*/
|
||
14 years ago
|
public void addConcurrent(final DigestURI start) throws IOException {
|
||
16 years ago
|
assert (start != null);
|
||
16 years ago
|
assert (start.canRead()) : start.toString();
|
||
16 years ago
|
if (!start.isDirectory()) {
|
||
|
try {
|
||
16 years ago
|
this.queue.put(start);
|
||
14 years ago
|
} catch (final InterruptedException e) {}
|
||
16 years ago
|
return;
|
||
|
}
|
||
14 years ago
|
final String[] s = start.list();
|
||
15 years ago
|
DigestURI w;
|
||
14 years ago
|
for (final String t: s) {
|
||
15 years ago
|
try {
|
||
|
w = new DigestURI(start, t);
|
||
|
if (w.canRead() && !w.isHidden()) {
|
||
|
if (w.isDirectory()) {
|
||
|
addConcurrent(w);
|
||
|
} else {
|
||
|
try {
|
||
|
this.queue.put(w);
|
||
14 years ago
|
} catch (final InterruptedException e) {}
|
||
15 years ago
|
}
|
||
16 years ago
|
}
|
||
14 years ago
|
} catch (final MalformedURLException e1) {
|
||
15 years ago
|
Log.logException(e1);
|
||
16 years ago
|
}
|
||
|
}
|
||
|
}
|
||
14 years ago
|
|
||
16 years ago
|
/**
|
||
|
* do a full-text search of a given string and return a specific number of results
|
||
|
* @param querystring
|
||
|
* @param count
|
||
|
* @return a list of files that contain the given string
|
||
14 years ago
|
*/
|
||
|
public ArrayList<DigestURI> find(final String querystring, int count) {
|
||
15 years ago
|
// make a query and start a search
|
||
14 years ago
|
final QueryParams query = new QueryParams(querystring, count, null, this, textRankingDefault, "DocumentIndex");
|
||
|
final ReferenceOrder order = new ReferenceOrder(query.ranking, UTF8.getBytes(query.targetlang));
|
||
|
final RankingProcess rankedCache = new RankingProcess(query, order, SearchEvent.max_results_preparation);
|
||
15 years ago
|
rankedCache.start();
|
||
14 years ago
|
|
||
15 years ago
|
// search is running; retrieve results
|
||
|
URIMetadataRow row;
|
||
14 years ago
|
final ArrayList<DigestURI> files = new ArrayList<DigestURI>();
|
||
15 years ago
|
Components metadata;
|
||
15 years ago
|
while ((row = rankedCache.takeURL(false, 1000)) != null) {
|
||
15 years ago
|
metadata = row.metadata();
|
||
|
if (metadata == null) continue;
|
||
15 years ago
|
files.add(metadata.url());
|
||
16 years ago
|
count--;
|
||
|
if (count == 0) break;
|
||
16 years ago
|
}
|
||
16 years ago
|
return files;
|
||
|
}
|
||
14 years ago
|
|
||
16 years ago
|
/**
|
||
|
* close the index.
|
||
|
* This terminates all worker threads and then closes the segment.
|
||
|
*/
|
||
14 years ago
|
@Override
|
||
16 years ago
|
public void close() {
|
||
16 years ago
|
// send termination signal to worker threads
|
||
14 years ago
|
for (@SuppressWarnings("unused") final Worker element : this.worker) {
|
||
16 years ago
|
try {
|
||
|
this.queue.put(poison);
|
||
14 years ago
|
} catch (final InterruptedException e) {}
|
||
16 years ago
|
}
|
||
16 years ago
|
// wait for termination
|
||
14 years ago
|
for (final Worker element : this.worker) {
|
||
16 years ago
|
try {
|
||
14 years ago
|
element.join();
|
||
|
} catch (final InterruptedException e) {}
|
||
16 years ago
|
}
|
||
|
// close the segment
|
||
|
super.close();
|
||
16 years ago
|
}
|
||
14 years ago
|
|
||
16 years ago
|
public interface CallbackListener {
|
||
15 years ago
|
public void commit(DigestURI f, URIMetadataRow resultRow);
|
||
|
public void fail(DigestURI f, String failReason);
|
||
16 years ago
|
}
|
||
14 years ago
|
|
||
|
public static void main(final String[] args) {
|
||
16 years ago
|
// first argument: path to segment
|
||
|
// second argument: either 'add' or 'search'
|
||
|
// third and more arguments exists only in case that second argument is 'search': these are then the search words
|
||
|
//
|
||
|
// example:
|
||
|
// DocumentIndex yacyindex add test/parsertest
|
||
|
// DocumentIndex yacyindex search steht
|
||
|
System.setProperty("java.awt.headless", "true");
|
||
|
if (args.length < 3) return;
|
||
14 years ago
|
final File segmentPath = new File(args[0]);
|
||
16 years ago
|
System.out.println("using index files at " + segmentPath.getAbsolutePath());
|
||
14 years ago
|
final CallbackListener callback = new CallbackListener() {
|
||
|
public void commit(final DigestURI f, final URIMetadataRow resultRow) {
|
||
16 years ago
|
System.out.println("indexed: " + f.toString());
|
||
|
}
|
||
14 years ago
|
public void fail(final DigestURI f, final String failReason) {
|
||
16 years ago
|
System.out.println("not indexed " + f.toString() + ": " + failReason);
|
||
|
}
|
||
16 years ago
|
};
|
||
16 years ago
|
try {
|
||
|
if (args[1].equals("add")) {
|
||
14 years ago
|
final DigestURI f = new DigestURI(args[2]);
|
||
|
final DocumentIndex di = new DocumentIndex(segmentPath, callback, 100000);
|
||
16 years ago
|
di.addConcurrent(f);
|
||
16 years ago
|
di.close();
|
||
|
} else {
|
||
|
String query = "";
|
||
|
for (int i = 2; i < args.length; i++) query += args[i];
|
||
|
query.trim();
|
||
14 years ago
|
final DocumentIndex di = new DocumentIndex(segmentPath, callback, 100000);
|
||
|
final ArrayList<DigestURI> results = di.find(query, 100);
|
||
|
for (final DigestURI f: results) {
|
||
16 years ago
|
if (f != null) System.out.println(f.toString());
|
||
|
}
|
||
|
di.close();
|
||
|
}
|
||
14 years ago
|
} catch (final IOException e) {
|
||
15 years ago
|
Log.logException(e);
|
||
16 years ago
|
}
|
||
|
//System.exit(0);
|
||
|
}
|
||
14 years ago
|
|
||
16 years ago
|
}
|