added missing files for last commit

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2756 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent a5dd0d41af
commit bdf4c7c51e

@ -0,0 +1,204 @@
// kelondroFlexSplitTable.java
// (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
// first published 12.10.2006 on http://www.anomic.de
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.kelondro;
import java.io.File;
import java.io.IOException;
import java.util.Calendar;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
public class kelondroFlexSplitTable implements kelondroIndex {
// this is a set of kelondroFlex tables
// the set is divided into FlexTables with different entry date
private HashMap tables;
private kelondroOrder objectOrder;
private kelondroRow rowdef;
private File path;
private String tablename;
private long buffersize;
public kelondroFlexSplitTable(File path, String tablename, long buffersize, long preloadTime, kelondroRow rowdef, kelondroOrder objectOrder) throws IOException {
this.path = path;
this.tablename = tablename;
this.objectOrder = objectOrder;
this.rowdef = rowdef;
// initialized tables map
this.tables = new HashMap();
String[] dir = path.list();
String date;
// first pass: count tables
int count = 0;
for (int i = 0; i < dir.length; i++) if (dir[i].startsWith(tablename)) count++;
// second pass: open tables
for (int i = 0; i < dir.length; i++) {
if ((dir[i].startsWith(tablename)) &&
(dir[i].charAt(tablename.length()) == '.') &&
(dir[i].length() == tablename.length() + 7)) {
// open table
date = dir[i].substring(tablename.length() + 1);
this.tables.put(date, new kelondroFlexTable(path, dir[i], buffersize / count, preloadTime, rowdef, objectOrder));
}
}
}
private static final Calendar thisCalendar = Calendar.getInstance();
public static final String dateSuffix(Date date) {
int month, year;
StringBuffer suffix = new StringBuffer(6);
synchronized (thisCalendar) {
thisCalendar.setTime(date);
month = thisCalendar.get(Calendar.MONTH);
year = thisCalendar.get(Calendar.YEAR);
}
if ((year < 1970) && (year >= 70)) suffix.append("19").append(Integer.toString(year));
else if (year < 1970) suffix.append("20").append(Integer.toString(year));
else if (year > 3000) return null;
else suffix.append(Integer.toString(year));
if (month < 10) suffix.append("0").append(Integer.toString(month)); else suffix.append(Integer.toString(month));
return new String(suffix);
}
public kelondroOrder order() {
return this.objectOrder;
}
public synchronized int size() throws IOException {
Iterator i = tables.values().iterator();
int s = 0;
while (i.hasNext()) {
s += ((kelondroFlexTable) i.next()).size();
}
return s;
}
public kelondroRow row() throws IOException {
return this.rowdef;
}
public synchronized kelondroRow.Entry get(byte[] key) throws IOException {
Iterator i = tables.values().iterator();
kelondroFlexTable table;
kelondroRow.Entry entry;
while (i.hasNext()) {
table = (kelondroFlexTable) i.next();
entry = table.get(key);
if (entry != null) return entry;
}
return null;
}
public synchronized kelondroRow.Entry put(kelondroRow.Entry row) throws IOException {
return put(row, new Date()); // entry for current date
}
public synchronized kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException {
kelondroRow.Entry r = remove(row.getColBytes(0));
String suffix = dateSuffix(entryDate);
if (suffix == null) return null;
kelondroFlexTable table = (kelondroFlexTable) tables.get(suffix);
if (table == null) {
// make new table
table = new kelondroFlexTable(path, tablename + "." + suffix, buffersize / (tables.size() + 1), -1, rowdef, objectOrder);
tables.put(suffix, table);
}
table.put(row);
return r;
}
public synchronized kelondroRow.Entry remove(byte[] key) throws IOException {
Iterator i = tables.values().iterator();
kelondroFlexTable table;
kelondroRow.Entry entry;
while (i.hasNext()) {
table = (kelondroFlexTable) i.next();
entry = table.remove(key);
if (entry != null) return entry;
}
return null;
}
public synchronized Iterator rows(boolean up, boolean rotating, byte[] firstKey) throws IOException {
return new rowIter();
}
public class rowIter implements Iterator {
Iterator t, tt;
public rowIter() {
t = tables.values().iterator();
tt = null;
}
public boolean hasNext() {
return ((t.hasNext()) || ((tt != null) && (tt.hasNext())));
}
public Object next() {
if (t.hasNext()) {
if ((tt == null) || (!(tt.hasNext()))) {
try {
tt = ((kelondroFlexTable) t.next()).rows(true, false, null);
} catch (IOException e) {
return null;
}
}
if (tt.hasNext()) {
return tt.next();
} else {
return null;
}
}
return null;
}
public void remove() {
if (tt != null) tt.remove();
}
}
public synchronized void close() throws IOException {
Iterator i = tables.values().iterator();
while (i.hasNext()) ((kelondroFlexTable) i.next()).close();
tables = null;
}
public static void main(String[] args) {
System.out.println(dateSuffix(new Date()));
}
}

@ -0,0 +1,52 @@
package de.anomic.plasma;
import java.io.IOException;
import java.util.Date;
import de.anomic.net.URL;
import de.anomic.kelondro.kelondroRow;
import de.anomic.index.indexEntry;
public interface plasmaCrawlLURLEntry {
public kelondroRow.Entry toRowEntry() throws IOException;
public String hash();
public URL url();
public String descr();
public Date moddate();
public Date loaddate();
public String referrerHash();
public char doctype();
public int copyCount();
public boolean local();
public int quality();
public String language();
public int size();
public int wordCount();
public String snippet();
public indexEntry word();
public boolean isOlder(plasmaCrawlLURLEntry other);
public String toString(String snippet);
public String toString();
public void print();
}

@ -0,0 +1,343 @@
package de.anomic.plasma;
import java.io.IOException;
import java.util.Date;
import java.util.Properties;
import de.anomic.http.httpc;
import de.anomic.index.indexEntry;
import de.anomic.index.indexURL;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroRow;
import de.anomic.net.URL;
import de.anomic.server.logging.serverLog;
import de.anomic.tools.crypt;
public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
public static final kelondroRow rowdef = new kelondroRow(
"String urlhash-" + indexURL.urlHashLength + ", " + // the url's hash
"String urlstring-" + indexURL.urlStringLength + ", " + // the url as string
"String urldescr-" + indexURL.urlDescrLength + ", " + // the description of the url
"Cardinal moddate-" + indexURL.urlDateLength + " {b64e}, " + // last-modified from the httpd
"Cardinal loaddate-" + indexURL.urlDateLength + " {b64e}, " + // time when the url was loaded
"String refhash-" + indexURL.urlHashLength + ", " + // the url's referrer hash
"Cardinal copycount-" + indexURL.urlCopyCountLength + " {b64e}, " + //
"byte[] flags-" + indexURL.urlFlagLength + ", " + // flags
"Cardinal quality-" + indexURL.urlQualityLength + " {b64e}, " + //
"String language-" + indexURL.urlLanguageLength + ", " + //
"byte[] doctype-" + indexURL.urlDoctypeLength + ", " + //
"Cardinal size-" + indexURL.urlSizeLength + " {b64e}, " + // size of file in bytes
"Cardinal wc-" + indexURL.urlWordCountLength + " {b64e}"); // word count
private URL url;
private String descr;
private Date moddate;
private Date loaddate;
private String urlHash;
private String referrerHash;
private int copyCount;
private String flags;
private int quality;
private String language;
private char doctype;
private int size;
private int wordCount;
private String snippet;
private indexEntry word; // this is only used if the url is transported via remote search requests
// more needed attributes:
// - author / copyright owner
// - keywords
// - phrasecount, total number of phrases
// - boolean: URL attributes (see Word-Entity definition)
// - boolean: appearance of bold and/or italics
// - ETag: for re-crawl decision upon HEAD request
// - int: # of outlinks to same domain
// - int: # of outlinks to outside domain
// - int: # of keywords
// - int: # der auf der Seite vorhandenen Links zu image, audio, video, applications
public plasmaCrawlLURLOldEntry(URL url, String descr, Date moddate,
Date loaddate, String referrerHash, int copyCount,
boolean localNeed, int quality, String language, char doctype,
int size, int wordCount) {
// create new entry and store it into database
this.urlHash = indexURL.urlHash(url);
this.url = url;
this.descr = (descr == null) ? this.url.toString() : descr;
this.moddate = moddate;
this.loaddate = loaddate;
this.referrerHash = (referrerHash == null) ? indexURL.dummyHash : referrerHash;
this.copyCount = copyCount; // the number of remote (global) copies of this object without this one
this.flags = (localNeed) ? "L " : " ";
this.quality = quality;
this.language = (language == null) ? "uk" : language;
this.doctype = doctype;
this.size = size;
this.wordCount = wordCount;
this.snippet = null;
this.word = null;
}
public plasmaCrawlLURLOldEntry(kelondroRow.Entry entry, indexEntry searchedWord) throws IOException {
try {
this.urlHash = entry.getColString(0, null);
this.url = new URL(entry.getColString(1, "UTF-8").trim());
this.descr = (entry.empty(2)) ? this.url.toString() : entry.getColString(2, "UTF-8").trim();
this.moddate = new Date(86400000 * entry.getColLong(3));
this.loaddate = new Date(86400000 * entry.getColLong(4));
this.referrerHash = (entry.empty(5)) ? indexURL.dummyHash : entry.getColString(5, "UTF-8");
this.copyCount = (int) entry.getColLong(6);
this.flags = entry.getColString(7, "UTF-8");
this.quality = (int) entry.getColLong(8);
this.language = entry.getColString(9, "UTF-8");
this.doctype = (char) entry.getColByte(10);
this.size = (int) entry.getColLong(11);
this.wordCount = (int) entry.getColLong(12);
this.snippet = null;
this.word = searchedWord;
return;
} catch (Exception e) {
serverLog.logSevere("PLASMA", "INTERNAL ERROR in plasmaLURL.entry/1: " + e.toString(), e);
throw new IOException("plasmaLURL.entry/1: " + e.toString());
}
}
public plasmaCrawlLURLOldEntry(Properties prop, boolean setGlobal) {
// generates an plasmaLURLEntry using the properties from the argument
// the property names must correspond to the one from toString
//System.out.println("DEBUG-ENTRY: prop=" + prop.toString());
this.urlHash = prop.getProperty("hash", indexURL.dummyHash);
try {
this.referrerHash = prop.getProperty("referrer", indexURL.dummyHash);
this.moddate = indexURL.shortDayFormatter.parse(prop.getProperty("mod", "20000101"));
//System.out.println("DEBUG: moddate = " + moddate + ", prop=" + prop.getProperty("mod"));
this.loaddate = indexURL.shortDayFormatter.parse(prop.getProperty("load", "20000101"));
this.copyCount = Integer.parseInt(prop.getProperty("cc", "0"));
this.flags = ((prop.getProperty("local", "true").equals("true")) ? "L " : " ");
if (setGlobal) this.flags = "G ";
this.url = new URL(crypt.simpleDecode(prop.getProperty("url", ""), null));
this.descr = crypt.simpleDecode(prop.getProperty("descr", ""), null);
if (this.descr == null) this.descr = this.url.toString();
this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(prop.getProperty("q", ""));
this.language = prop.getProperty("lang", "uk");
this.doctype = prop.getProperty("dt", "t").charAt(0);
this.size = Integer.parseInt(prop.getProperty("size", "0"));
this.wordCount = Integer.parseInt(prop.getProperty("wc", "0"));
this.snippet = prop.getProperty("snippet", "");
if (snippet.length() == 0) snippet = null;
else snippet = crypt.simpleDecode(snippet, null);
this.word = (prop.containsKey("word")) ? new indexURLEntry(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word", ""))) : null;
} catch (Exception e) {
serverLog.logSevere("PLASMA",
"INTERNAL ERROR in plasmaLURL.entry/2:"
+ "\nProperties: "
+ ((prop == null) ? null : prop.toString())
+ ((prop.containsKey("word")) ? "\nWord: "
+ kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word", "")) : "") + "\nErrorMsg: "
+ e.toString(), e);
}
}
public kelondroRow.Entry toRowEntry() throws IOException {
final String moddatestr = kelondroBase64Order.enhancedCoder.encodeLong(moddate.getTime() / 86400000, indexURL.urlDateLength);
final String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, indexURL.urlDateLength);
final byte[][] entry = new byte[][] {
urlHash.getBytes(),
url.toString().getBytes(),
descr.getBytes(), // null?
moddatestr.getBytes(),
loaddatestr.getBytes(),
referrerHash.getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(copyCount, indexURL.urlCopyCountLength).getBytes(),
flags.getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(quality, indexURL.urlQualityLength).getBytes(),
language.getBytes(),
new byte[] { (byte) doctype },
kelondroBase64Order.enhancedCoder.encodeLong(size, indexURL.urlSizeLength).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(wordCount, indexURL.urlWordCountLength).getBytes()};
return rowdef.newEntry(entry);
}
public String hash() {
// return a url-hash, based on the md5 algorithm
// the result is a String of 12 bytes within a 72-bit space
// (each byte has an 6-bit range)
// that should be enough for all web pages on the world
return this.urlHash;
}
public URL url() {
return url;
}
public String descr() {
return descr;
}
public Date moddate() {
return moddate;
}
public Date loaddate() {
return loaddate;
}
public String referrerHash() {
// return the creator's hash
return referrerHash;
}
public char doctype() {
return doctype;
}
public int copyCount() {
// return number of copies of this object in the global index
return copyCount;
}
public boolean local() {
// returns true if the url was created locally and is needed for own word index
if (flags == null) return false;
return flags.charAt(0) == 'L';
}
public int quality() {
return quality;
}
public String language() {
return language;
}
public int size() {
return size;
}
public int wordCount() {
return wordCount;
}
public String snippet() {
// the snippet may appear here if the url was transported in a remote search
// it will not be saved anywhere, but can only be requested here
return snippet;
}
public indexEntry word() {
return word;
}
public boolean isOlder(plasmaCrawlLURLEntry other) {
if (other == null) return false;
if (moddate.before(other.moddate())) return true;
if (moddate.equals(other.moddate())) {
if (loaddate.before(other.loaddate())) return true;
if (loaddate.equals(other.loaddate())) {
if (quality < other.quality()) return true;
}
}
return false;
}
private StringBuffer corePropList() {
// generate a parseable string; this is a simple property-list
final StringBuffer corePropStr = new StringBuffer(300);
try {
corePropStr.append("hash=").append(urlHash).append(",referrer=")
.append(referrerHash).append(",mod=").append(
indexURL.shortDayFormatter.format(moddate)).append(
",load=").append(
indexURL.shortDayFormatter.format(loaddate))
.append(",size=").append(size).append(",wc=").append(
wordCount).append(",cc=").append(copyCount).append(
",local=").append(((local()) ? "true" : "false"))
.append(",q=").append(
kelondroBase64Order.enhancedCoder.encodeLong(
quality, indexURL.urlQualityLength))
.append(",dt=").append(doctype).append(",lang=").append(
language).append(",url=").append(
crypt.simpleEncode(url.toString())).append(
",descr=").append(crypt.simpleEncode(descr));
if (this.word != null) {
// append also word properties
corePropStr.append(",word=").append(kelondroBase64Order.enhancedCoder.encodeString(word.toPropertyForm(false)));
}
return corePropStr;
} catch (Exception e) {
// serverLog.logFailure("plasmaLURL.corePropList", e.getMessage());
// if (moddate == null) serverLog.logFailure("plasmaLURL.corePropList", "moddate=null");
// if (loaddate == null) serverLog.logFailure("plasmaLURL.corePropList", "loaddate=null");
// e.printStackTrace();
return null;
}
}
/*
public String toString(int posintext, int posinphrase, int posofphrase) {
// add information needed for remote transport
final StringBuffer core = corePropList();
if (core == null) return null;
core.ensureCapacity(core.length() + 200);
core.insert(0,"{")
.append(",posintext=").append(posintext)
.append(",posinphrase=").append(posinphrase)
.append(",posofphraseint=").append(posofphrase)
.append("}");
return core.toString();
}
*/
public String toString(String snippet) {
// add information needed for remote transport
final StringBuffer core = corePropList();
if (core == null)
return null;
core.ensureCapacity(core.length() + snippet.length() * 2);
core.insert(0, "{");
core.append(",snippet=").append(crypt.simpleEncode(snippet));
core.append("}");
return core.toString();
//return "{" + core + ",snippet=" + crypt.simpleEncode(snippet) + "}";
}
/**
* Returns this object as String.<br>
* This e.g. looks like this:
* <pre>{hash=jmqfMk7Y3NKw,referrer=------------,mod=20050610,load=20051003,size=51666,wc=1392,cc=0,local=true,q=AEn,dt=h,lang=uk,url=b|aHR0cDovL3d3dy50cmFuc3BhcmVuY3kub3JnL3N1cnZleXMv,descr=b|S25vd2xlZGdlIENlbnRyZTogQ29ycnVwdGlvbiBTdXJ2ZXlzIGFuZCBJbmRpY2Vz}</pre>
*/
public String toString() {
final StringBuffer core = corePropList();
if (core == null) return null;
core.insert(0, "{");
core.append("}");
return core.toString();
//return "{" + core + "}";
}
public void print() {
System.out.println("URL : " + url);
System.out.println("Description : " + descr);
System.out.println("Modified : " + httpc.dateString(moddate));
System.out.println("Loaded : " + httpc.dateString(loaddate));
System.out.println("Size : " + size + " bytes, " + wordCount
+ " words");
System.out.println("Referrer Hash : " + referrerHash);
System.out.println("Quality : " + quality);
System.out.println("Language : " + language);
System.out.println("DocType : " + doctype);
System.out.println();
}
}
Loading…
Cancel
Save