// DocumentIndex.java // (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany // first published 14.09.2009 on http://yacy.net; // // This is a part of YaCy, a peer-to-peer based web search engine // // $LastChangedDate: 2009-05-28 01:51:34 +0200 (Do, 28 Mai 2009) $ // $LastChangedRevision: 5988 $ // $LastChangedBy: orbiter $ // // LICENSE // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA package de.anomic.kelondro.text; import java.io.File; import java.io.IOException; import java.util.Date; import de.anomic.document.Condenser; import de.anomic.document.Document; import de.anomic.document.Parser; import de.anomic.document.ParserException; import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; import de.anomic.search.QueryParams; import de.anomic.search.RankingProfile; import de.anomic.search.ResultEntry; import de.anomic.search.SearchEvent; import de.anomic.search.SearchEventCache; import de.anomic.yacy.yacyURL; import de.anomic.yacy.logging.Log; /** * convenience class to access the yacycore library from outside of yacy to put files into the index * @author Michael Christen * */ public class DocumentIndex extends Segment { private RankingProfile textRankingDefault = new RankingProfile(QueryParams.CONTENTDOM_TEXT); //private Bitfield zeroConstraint = new Bitfield(4); public DocumentIndex(Log log, final File segmentPath) throws IOException { super(log, segmentPath, 100000, targetFileSize * 4 - 1, false, false); } public DocumentIndex(final File segmentPath) throws IOException { this(new Log("DocumentIndex"), segmentPath); } /** * put a single file into the index * @param file * @return a metadata object that has been generated to identify the file * @throws IOException in case that the file does not exist or cannot be parsed */ public URLMetadataRow add(File file) throws IOException { if (file == null) throw new IOException("file = null"); if (file.isDirectory()) throw new IOException("file should be a document, not a path"); if (!file.canRead()) throw new IOException("cannot read file"); yacyURL url = new yacyURL("file:" + file.getAbsolutePath()); Document document; try { document = Parser.parseSource(url, null, null, file); } catch (InterruptedException e) { throw new IOException("cannot parse " + file.toString() + ": " + e.getMessage()); } catch (ParserException e) { throw new IOException("cannot parse " + file.toString() + ": " + e.getMessage()); } final Condenser condenser = new Condenser(document, true, true); return super.storeDocument( url, null, new Date(file.lastModified()), file.length(), document, condenser ); } /** * add a file or a directory of files to the index * If the given file is a path to a directory, the complete sub-tree is indexed * @param start */ public void addAll(File start) { assert (start != null); assert (start.canRead()); if (!start.isDirectory()) { try { add(start); } catch (IOException e) { e.printStackTrace(); } return; } String[] s = start.list(); File w; for (String t: s) { w = new File(start, t); if (w.canRead() && ! w.isHidden()) { if (w.isDirectory()) { addAll(w); } else { try { add(w); } catch (IOException e) { e.printStackTrace(); } } } } } /** * do a full-text search of a given string and return a specific number of results * @param querystring * @param pos * @param count * @return a list of files that contain the given string */ public File[] find(String querystring, int pos, int count) { QueryParams query = new QueryParams(querystring, 100, textRankingDefault, null); SearchEvent se = SearchEventCache.getEvent(query, this, null, null, null, false); File[] result = new File[count]; ResultEntry re; for (int i = 0; i < count; i++) { re = se.oneResult(pos + i); result[i] = (re == null) ? null : re.url().getLocalFile(); } return result; } /** * find the given string and return 20 hits * @param querystring * @return a list of files that contain the word */ public File[] find(String querystring) { return find(querystring, 0, 20); } public static void main(String[] args) { // first argument: path to segment // second argument: either 'add' or 'search' // third and more arguments exists only in case that second argument is 'search': these are then the search words // // example: // DocumentIndex yacyindex add test/parsertest // DocumentIndex yacyindex search steht System.setProperty("java.awt.headless", "true"); if (args.length < 3) return; File segmentPath = new File(args[0]); System.out.println("using index files at " + segmentPath.getAbsolutePath()); try { if (args[1].equals("add")) { File f = new File(args[2]); DocumentIndex di = new DocumentIndex(segmentPath); di.addAll(f); di.close(); } else { String query = ""; for (int i = 2; i < args.length; i++) query += args[i]; query.trim(); DocumentIndex di = new DocumentIndex(segmentPath); File[] results = di.find(query); for (File f: results) { if (f != null) System.out.println(f.toString()); } di.close(); } } catch (IOException e) { e.printStackTrace(); } //System.exit(0); } }