You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
193 lines
6.8 KiB
193 lines
6.8 KiB
16 years ago
|
// DocumentIndex.java
|
||
|
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
||
|
// first published 14.09.2009 on http://yacy.net;
|
||
|
//
|
||
|
// This is a part of YaCy, a peer-to-peer based web search engine
|
||
|
//
|
||
|
// $LastChangedDate: 2009-05-28 01:51:34 +0200 (Do, 28 Mai 2009) $
|
||
|
// $LastChangedRevision: 5988 $
|
||
|
// $LastChangedBy: orbiter $
|
||
|
//
|
||
|
// LICENSE
|
||
|
//
|
||
|
// This program is free software; you can redistribute it and/or modify
|
||
|
// it under the terms of the GNU General Public License as published by
|
||
|
// the Free Software Foundation; either version 2 of the License, or
|
||
|
// (at your option) any later version.
|
||
|
//
|
||
|
// This program is distributed in the hope that it will be useful,
|
||
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
|
// GNU General Public License for more details.
|
||
|
//
|
||
|
// You should have received a copy of the GNU General Public License
|
||
|
// along with this program; if not, write to the Free Software
|
||
|
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||
|
|
||
|
|
||
|
package de.anomic.kelondro.text;
|
||
|
|
||
|
import java.io.File;
|
||
|
import java.io.IOException;
|
||
|
import java.util.Date;
|
||
|
|
||
|
import de.anomic.document.Condenser;
|
||
|
import de.anomic.document.Document;
|
||
|
import de.anomic.document.Parser;
|
||
|
import de.anomic.document.ParserException;
|
||
|
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
|
||
|
import de.anomic.search.QueryParams;
|
||
|
import de.anomic.search.RankingProfile;
|
||
|
import de.anomic.search.ResultEntry;
|
||
|
import de.anomic.search.SearchEvent;
|
||
|
import de.anomic.search.SearchEventCache;
|
||
|
import de.anomic.yacy.yacyURL;
|
||
|
import de.anomic.yacy.logging.Log;
|
||
|
|
||
|
/**
|
||
|
* convenience class to access the yacycore library from outside of yacy to put files into the index
|
||
|
* @author Michael Christen
|
||
|
*
|
||
|
*/
|
||
|
public class DocumentIndex extends Segment {
|
||
|
|
||
|
private RankingProfile textRankingDefault = new RankingProfile(QueryParams.CONTENTDOM_TEXT);
|
||
|
//private Bitfield zeroConstraint = new Bitfield(4);
|
||
|
|
||
|
public DocumentIndex(Log log, final File segmentPath) throws IOException {
|
||
|
super(log, segmentPath, 100000, targetFileSize * 4 - 1, false, false);
|
||
|
}
|
||
|
|
||
|
public DocumentIndex(final File segmentPath) throws IOException {
|
||
|
this(new Log("DocumentIndex"), segmentPath);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* put a single file into the index
|
||
|
* @param file
|
||
|
* @return a metadata object that has been generated to identify the file
|
||
|
* @throws IOException in case that the file does not exist or cannot be parsed
|
||
|
*/
|
||
|
public URLMetadataRow add(File file) throws IOException {
|
||
|
if (file == null) throw new IOException("file = null");
|
||
|
if (file.isDirectory()) throw new IOException("file should be a document, not a path");
|
||
|
if (!file.canRead()) throw new IOException("cannot read file");
|
||
|
yacyURL url = new yacyURL("file:" + file.getAbsolutePath());
|
||
|
Document document;
|
||
|
try {
|
||
|
document = Parser.parseSource(url, null, null, file);
|
||
|
} catch (InterruptedException e) {
|
||
|
throw new IOException("cannot parse " + file.toString() + ": " + e.getMessage());
|
||
|
} catch (ParserException e) {
|
||
|
throw new IOException("cannot parse " + file.toString() + ": " + e.getMessage());
|
||
|
}
|
||
|
final Condenser condenser = new Condenser(document, true, true);
|
||
|
return super.storeDocument(
|
||
|
url,
|
||
|
null,
|
||
|
new Date(file.lastModified()),
|
||
|
file.length(),
|
||
|
document,
|
||
|
condenser
|
||
|
);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* add a file or a directory of files to the index
|
||
|
* If the given file is a path to a directory, the complete sub-tree is indexed
|
||
|
* @param start
|
||
|
*/
|
||
|
public void addAll(File start) {
|
||
|
assert (start != null);
|
||
|
assert (start.canRead());
|
||
|
if (!start.isDirectory()) {
|
||
|
try {
|
||
|
add(start);
|
||
|
} catch (IOException e) {
|
||
|
e.printStackTrace();
|
||
|
}
|
||
|
return;
|
||
|
}
|
||
|
String[] s = start.list();
|
||
|
File w;
|
||
|
for (String t: s) {
|
||
|
w = new File(start, t);
|
||
|
if (w.canRead() && ! w.isHidden()) {
|
||
|
if (w.isDirectory()) {
|
||
|
addAll(w);
|
||
|
} else {
|
||
|
try {
|
||
|
add(w);
|
||
|
} catch (IOException e) {
|
||
|
e.printStackTrace();
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* do a full-text search of a given string and return a specific number of results
|
||
|
* @param querystring
|
||
|
* @param pos
|
||
|
* @param count
|
||
|
* @return a list of files that contain the given string
|
||
|
*/
|
||
|
public File[] find(String querystring, int pos, int count) {
|
||
|
QueryParams query = new QueryParams(querystring, 100, textRankingDefault, null);
|
||
|
SearchEvent se = SearchEventCache.getEvent(query, this, null, null, null, false);
|
||
|
File[] result = new File[count];
|
||
|
ResultEntry re;
|
||
|
for (int i = 0; i < count; i++) {
|
||
|
re = se.oneResult(pos + i);
|
||
|
result[i] = (re == null) ? null : re.url().getLocalFile();
|
||
|
}
|
||
|
return result;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* find the given string and return 20 hits
|
||
|
* @param querystring
|
||
|
* @return a list of files that contain the word
|
||
|
*/
|
||
|
public File[] find(String querystring) {
|
||
|
return find(querystring, 0, 20);
|
||
|
}
|
||
|
|
||
|
public static void main(String[] args) {
|
||
|
// first argument: path to segment
|
||
|
// second argument: either 'add' or 'search'
|
||
|
// third and more arguments exists only in case that second argument is 'search': these are then the search words
|
||
|
//
|
||
|
// example:
|
||
|
// DocumentIndex yacyindex add test/parsertest
|
||
|
// DocumentIndex yacyindex search steht
|
||
|
System.setProperty("java.awt.headless", "true");
|
||
|
if (args.length < 3) return;
|
||
|
File segmentPath = new File(args[0]);
|
||
|
System.out.println("using index files at " + segmentPath.getAbsolutePath());
|
||
|
try {
|
||
|
if (args[1].equals("add")) {
|
||
|
File f = new File(args[2]);
|
||
|
DocumentIndex di = new DocumentIndex(segmentPath);
|
||
|
di.addAll(f);
|
||
|
di.close();
|
||
|
} else {
|
||
|
String query = "";
|
||
|
for (int i = 2; i < args.length; i++) query += args[i];
|
||
|
query.trim();
|
||
|
DocumentIndex di = new DocumentIndex(segmentPath);
|
||
|
File[] results = di.find(query);
|
||
|
for (File f: results) {
|
||
|
if (f != null) System.out.println(f.toString());
|
||
|
}
|
||
|
di.close();
|
||
|
}
|
||
|
} catch (IOException e) {
|
||
|
e.printStackTrace();
|
||
|
}
|
||
|
//System.exit(0);
|
||
|
}
|
||
|
|
||
|
}
|