diff --git a/htroot/IndexImport_p.java b/htroot/IndexImport_p.java index 8cecc9a31..25bb87950 100644 --- a/htroot/IndexImport_p.java +++ b/htroot/IndexImport_p.java @@ -98,7 +98,7 @@ public final class IndexImport_p { if (startImport) { dbImporter importerThread = switchboard.dbImportManager.getNewImporter(importType); if (importerThread != null) { - importerThread.init(new File(importPath), switchboard.indexPublicTextPath, cacheSize, 100); + importerThread.init(new File(importPath), switchboard.indexPath, cacheSize, 100); importerThread.startIt(); } prop.put("LOCATION",""); diff --git a/htroot/htdocsdefault/dir.java b/htroot/htdocsdefault/dir.java index cf29c69e7..3a64ea1b6 100644 --- a/htroot/htdocsdefault/dir.java +++ b/htroot/htdocsdefault/dir.java @@ -73,6 +73,7 @@ import de.anomic.server.serverMemory; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.server.logging.serverLog; +import de.anomic.tools.bitfield; import de.anomic.tools.dirlistComparator; import de.anomic.tools.md5DirFileFilter; import de.anomic.yacy.yacyCore; @@ -174,14 +175,15 @@ public class dir { final byte[] binary = (byte[]) post.get("file$file", new byte[0]); try { serverFileUtils.write(binary, newfile); - String md5s = serverCodings.encodeMD5Hex(newfile); + byte[] md5 = serverCodings.encodeMD5Raw(newfile); + String md5s = serverCodings.encodeHex(md5); serverFileUtils.write((md5s + "\n" + description).getBytes("UTF-8"), newfilemd5); // generate md5 // index file info if (post.get("indexing", "").equals("on")) { final String urlstring = yacyhURL(yacyCore.seedDB.mySeed, filename, md5s); final String phrase = filename.replace('.', ' ').replace('_', ' ').replace('-', ' '); - indexPhrase(switchboard, urlstring, phrase, description); + indexPhrase(switchboard, urlstring, phrase, description, md5); } } catch (IOException e) {} } else if (action.equals("newdir") && (uploadAuthorization || adminAuthorization)) { @@ -354,20 +356,27 @@ public class dir { return "http://share." + seed.getHexHash() + ".yacyh/" + filename + "?md5=" + md5; } - public static void indexPhrase(plasmaSwitchboard switchboard, String urlstring, String phrase, String descr) { + public static void indexPhrase(plasmaSwitchboard switchboard, String urlstring, String phrase, String descr, byte[] md5) { try { final URL url = new URL(urlstring); final plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(("yacyshare. " + phrase + ". " + descr).getBytes())); final plasmaCrawlLURLEntry newEntry = switchboard.urlPool.loadedURL.newEntry( - url.toNormalform(), "YaCyShare: " + descr, new Date(), new Date(), - "AAAAAAAAAAAA", /*referrer*/ - 0, /*copycount*/ - false, /*localneed*/ - condenser.RESULT_WORD_ENTROPHY, - "**", /*language*/ - indexEntryAttribute.DT_SHARE, /*doctype*/ - phrase.length(), /*size*/ - condenser.RESULT_NUMB_WORDS + url, + "YaCyShare: " + descr, + yacyCore.seedDB.mySeed.getName(), + "", // tags + "", // ETag + new Date(), // modification + new Date(), // loadtime + new Date(), // freshtime + "AAAAAAAAAAAA", // referrer + md5, // md5 + (long) phrase.length(), // size + condenser.RESULT_NUMB_WORDS, // word count + indexEntryAttribute.DT_SHARE, // doctype + new bitfield(4), + "**", // language + 0,0,0,0,0,0 ); switchboard.urlPool.loadedURL.store(newEntry); switchboard.urlPool.loadedURL.stack( diff --git a/htroot/yacy/crawlReceipt.java b/htroot/yacy/crawlReceipt.java index 57d295c29..1973adbb9 100644 --- a/htroot/yacy/crawlReceipt.java +++ b/htroot/yacy/crawlReceipt.java @@ -124,7 +124,7 @@ public final class crawlReceipt { prop.put("delay", "3600"); } else if (result.equals("fill")) { // generating a new loaded URL entry - plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.newEntry(propStr, true); + plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.newEntry(propStr); if (entry == null) { log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (entry null) for hash " + receivedUrlhash + " from peer " + iam + "\n\tURL properties: "+ propStr); diff --git a/htroot/yacy/transferURL.java b/htroot/yacy/transferURL.java index ac551bc81..a09388ff7 100644 --- a/htroot/yacy/transferURL.java +++ b/htroot/yacy/transferURL.java @@ -97,7 +97,7 @@ public final class transferURL { if (urls == null) { yacyCore.log.logFine("transferURL: got null URL-string from peer " + otherPeerName); } else { - lEntry = sb.urlPool.loadedURL.newEntry(urls, true); + lEntry = sb.urlPool.loadedURL.newEntry(urls); if (lEntry == null) { yacyCore.log.logWarning("transferURL: received invalid URL (entry null) from peer " + otherPeerName + "\n\tURL Property: " + urls); // TODO: should we send back an error message??? diff --git a/source/dbtest.java b/source/dbtest.java index c943be5af..debcb37e8 100644 --- a/source/dbtest.java +++ b/source/dbtest.java @@ -550,6 +550,14 @@ final class dbTable implements kelondroIndex { } } + public synchronized void addUnique(kelondroRow.Entry row) throws IOException { + throw new UnsupportedOperationException(); + } + + public synchronized void addUnique(kelondroRow.Entry row, Date entryDate) throws IOException { + throw new UnsupportedOperationException(); + } + public kelondroRow.Entry remove(byte[] key) throws IOException { try { diff --git a/source/de/anomic/index/indexContainer.java b/source/de/anomic/index/indexContainer.java index 60a9f6158..3a3e10846 100644 --- a/source/de/anomic/index/indexContainer.java +++ b/source/de/anomic/index/indexContainer.java @@ -78,7 +78,7 @@ public class indexContainer extends kelondroRowSet { } public int add(indexEntry entry) { - this.add(entry.toKelondroEntry()); + this.addUnique(entry.toKelondroEntry()); return 1; } diff --git a/source/de/anomic/index/indexURL.java b/source/de/anomic/index/indexURL.java index 41cbfda73..b2f0122e5 100644 --- a/source/de/anomic/index/indexURL.java +++ b/source/de/anomic/index/indexURL.java @@ -425,16 +425,6 @@ public class indexURL { } } - public boolean remove(String hash) { - if (hash == null) return false; - try { - urlIndexFile.remove(hash.getBytes()); - return true; - } catch (IOException e) { - return false; - } - } - public void close() throws IOException { if (urlIndexFile != null) { urlIndexFile.close(); diff --git a/source/de/anomic/kelondro/kelondroAttrSeq.java b/source/de/anomic/kelondro/kelondroAttrSeq.java index 0da55212d..1d358b718 100644 --- a/source/de/anomic/kelondro/kelondroAttrSeq.java +++ b/source/de/anomic/kelondro/kelondroAttrSeq.java @@ -411,7 +411,7 @@ public class kelondroAttrSeq { kelondroRowCollection collection = new kelondroRowCollection(structure.seqrow, seq.size()); Iterator i = seq.iterator(); while (i.hasNext()) { - collection.add(structure.seqrow.newEntry(((String) i.next()).getBytes())); + collection.addUnique(structure.seqrow.newEntry(((String) i.next()).getBytes())); } return collection; } diff --git a/source/de/anomic/kelondro/kelondroBufferedIndex.java b/source/de/anomic/kelondro/kelondroBufferedIndex.java index bfda75832..bdc44ce83 100644 --- a/source/de/anomic/kelondro/kelondroBufferedIndex.java +++ b/source/de/anomic/kelondro/kelondroBufferedIndex.java @@ -90,26 +90,21 @@ public class kelondroBufferedIndex implements kelondroIndex { } public synchronized kelondroRow.Entry get(byte[] key) throws IOException { - long handle = index.profile().startRead(); + long handle = (index instanceof kelondroFlexSplitTable) ? -1 : index.profile().startRead(); kelondroRow.Entry entry = null; entry = (kelondroRow.Entry) buffer.get(key); if (entry == null) entry = index.get(key); - index.profile().stopRead(handle); + if (handle >= 0) index.profile().stopRead(handle); return entry; } - public synchronized void add(kelondroRow.Entry newentry) throws IOException { - assert (index instanceof kelondroRowSet); - ((kelondroRowSet) index).add(newentry); + public synchronized kelondroRow.Entry put(kelondroRow.Entry row) throws IOException { + return put(row, null); } public synchronized kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException { - return put(row); - } - - public synchronized kelondroRow.Entry put(kelondroRow.Entry newentry) throws IOException { - long handle = index.profile().startWrite(); - byte[] key = newentry.getColBytes(index.primarykey()); + long handle = (index instanceof kelondroFlexSplitTable) ? -1 : index.profile().startWrite(); + byte[] key = row.getColBytes(index.primarykey()); kelondroRow.Entry oldentry = null; oldentry = (kelondroRow.Entry) buffer.get(key); if (oldentry == null) { @@ -117,45 +112,67 @@ public class kelondroBufferedIndex implements kelondroIndex { oldentry = index.get(key); if (oldentry == null) { // this was not anywhere - buffer.put(key, newentry); - if (((buffer.size() > bufferFlushMinimum) && (serverMemory.available() > memBlockLimit)) - || (buffer.size() > bufferFlushLimit)) - flush(); + if (entryDate == null) { + buffer.put(key, row); + if (((buffer.size() > bufferFlushMinimum) && (serverMemory.available() > memBlockLimit)) + || (buffer.size() > bufferFlushLimit)) + flush(); + } else { + index.put(row, entryDate); + } } else { // replace old entry - index.put(newentry); + if (entryDate == null) { + index.put(row); + } else { + index.put(row, entryDate); + } } } else { // the entry is already in buffer // simply replace old entry - buffer.put(key, newentry); + if (entryDate == null) { + buffer.put(key, row); + } else { + buffer.remove(key); + index.put(row, entryDate); + } } - index.profile().stopWrite(handle); + if (handle >= 0) index.profile().stopWrite(handle); return oldentry; } + public synchronized void addUnique(kelondroRow.Entry row) throws IOException { + assert (index instanceof kelondroRowSet); + ((kelondroRowSet) index).addUnique(row); + } + + public synchronized void addUnique(kelondroRow.Entry row, Date entryDate) throws IOException { + addUnique(row); + } + public synchronized kelondroRow.Entry remove(byte[] key) throws IOException { - long handle = index.profile().startDelete(); + long handle = (index instanceof kelondroFlexSplitTable) ? -1 : index.profile().startDelete(); kelondroRow.Entry oldentry = null; oldentry = (kelondroRow.Entry) buffer.remove(key); if (oldentry == null) { // try the collection return index.remove(key); } - index.profile().stopDelete(handle); + if (handle >= 0) index.profile().stopDelete(handle); return oldentry; } public synchronized kelondroRow.Entry removeOne() throws IOException { - long handle = index.profile().startDelete(); + long handle = (index instanceof kelondroFlexSplitTable) ? -1 : index.profile().startDelete(); if (buffer.size() > 0) { byte[] key = (byte[]) buffer.keySet().iterator().next(); kelondroRow.Entry entry = (kelondroRow.Entry) buffer.remove(key); - index.profile().stopDelete(handle); + if (handle >= 0) index.profile().stopDelete(handle); return entry; } else { kelondroRow.Entry entry = index.removeOne(); - index.profile().stopDelete(handle); + if (handle >= 0) index.profile().stopDelete(handle); return entry; } } diff --git a/source/de/anomic/kelondro/kelondroBytesIntMap.java b/source/de/anomic/kelondro/kelondroBytesIntMap.java index 3fa5837d4..7e920d0e3 100644 --- a/source/de/anomic/kelondro/kelondroBytesIntMap.java +++ b/source/de/anomic/kelondro/kelondroBytesIntMap.java @@ -52,6 +52,13 @@ public class kelondroBytesIntMap { return (int) oldentry.getColLong(1); } + public synchronized void addi(byte[] key, int i) throws IOException { + kelondroRow.Entry newentry = ki.row().newEntry(); + newentry.setCol(0, key); + newentry.setCol(1, i); + ki.addUnique(newentry); + } + public synchronized int removei(byte[] key) throws IOException { // returns the integer index of the key, if the key can be found and was removed // and -1 if the key was not found. diff --git a/source/de/anomic/kelondro/kelondroCollectionIndex.java b/source/de/anomic/kelondro/kelondroCollectionIndex.java index 7a73b35eb..3cac11a2e 100644 --- a/source/de/anomic/kelondro/kelondroCollectionIndex.java +++ b/source/de/anomic/kelondro/kelondroCollectionIndex.java @@ -159,7 +159,7 @@ public class kelondroCollectionIndex { ientry.setCol(idx_col_lastread, t); ientry.setCol(idx_col_lastwrote, t); if (index instanceof kelondroBufferedIndex) - ((kelondroBufferedIndex) index).add(ientry); + ((kelondroBufferedIndex) index).addUnique(ientry); else index.put(ientry); @@ -540,13 +540,13 @@ public class kelondroCollectionIndex { // fill index with values kelondroRowSet collection = new kelondroRowSet(rowdef); - collection.add(rowdef.newEntry(new byte[][]{"abc".getBytes(), "efg".getBytes()})); + collection.addUnique(rowdef.newEntry(new byte[][]{"abc".getBytes(), "efg".getBytes()})); collectionIndex.put("erstes".getBytes(), collection); for (int i = 0; i <= 17; i++) { collection = new kelondroRowSet(rowdef); for (int j = 0; j < i; j++) { - collection.add(rowdef.newEntry(new byte[][]{("abc" + j).getBytes(), "xxx".getBytes()})); + collection.addUnique(rowdef.newEntry(new byte[][]{("abc" + j).getBytes(), "xxx".getBytes()})); } System.out.println("put key-" + i + ": " + collection.toString()); collectionIndex.put(("key-" + i).getBytes(), collection); @@ -556,7 +556,7 @@ public class kelondroCollectionIndex { for (int i = 0; i <= 17; i++) { collection = new kelondroRowSet(rowdef); for (int j = 0; j < i; j++) { - collection.add(rowdef.newEntry(new byte[][]{("def" + j).getBytes(), "xxx".getBytes()})); + collection.addUnique(rowdef.newEntry(new byte[][]{("def" + j).getBytes(), "xxx".getBytes()})); } collectionIndex.merge(("key-" + i).getBytes(), collection); } diff --git a/source/de/anomic/kelondro/kelondroFlexSplitTable.java b/source/de/anomic/kelondro/kelondroFlexSplitTable.java index 2a7442191..d7faace93 100644 --- a/source/de/anomic/kelondro/kelondroFlexSplitTable.java +++ b/source/de/anomic/kelondro/kelondroFlexSplitTable.java @@ -53,6 +53,7 @@ public class kelondroFlexSplitTable implements kelondroIndex { // initialized tables map this.tables = new HashMap(); + if (!(path.exists())) path.mkdirs(); String[] dir = path.list(); String date; @@ -78,7 +79,7 @@ public class kelondroFlexSplitTable implements kelondroIndex { StringBuffer suffix = new StringBuffer(6); synchronized (thisCalendar) { thisCalendar.setTime(date); - month = thisCalendar.get(Calendar.MONTH); + month = thisCalendar.get(Calendar.MONTH) + 1; year = thisCalendar.get(Calendar.YEAR); } if ((year < 1970) && (year >= 70)) suffix.append("19").append(Integer.toString(year)); @@ -136,7 +137,6 @@ public class kelondroFlexSplitTable implements kelondroIndex { public synchronized kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException { kelondroRow.Entry r = remove(row.getColBytes(0)); - String suffix = dateSuffix(entryDate); if (suffix == null) return null; kelondroFlexTable table = (kelondroFlexTable) tables.get(suffix); @@ -150,6 +150,22 @@ public class kelondroFlexSplitTable implements kelondroIndex { return r; } + public synchronized void addUnique(kelondroRow.Entry row) throws IOException { + addUnique(row, new Date()); + } + + public synchronized void addUnique(kelondroRow.Entry row, Date entryDate) throws IOException { + String suffix = dateSuffix(entryDate); + if (suffix == null) return; + kelondroFlexTable table = (kelondroFlexTable) tables.get(suffix); + if (table == null) { + // make new table + table = new kelondroFlexTable(path, tablename + "." + suffix, buffersize / (tables.size() + 1), -1, rowdef, objectOrder); + tables.put(suffix, table); + } + table.addUnique(row, entryDate); + } + public synchronized kelondroRow.Entry remove(byte[] key) throws IOException { Iterator i = tables.values().iterator(); kelondroFlexTable table; diff --git a/source/de/anomic/kelondro/kelondroFlexTable.java b/source/de/anomic/kelondro/kelondroFlexTable.java index 6712eddef..04aa47c61 100644 --- a/source/de/anomic/kelondro/kelondroFlexTable.java +++ b/source/de/anomic/kelondro/kelondroFlexTable.java @@ -93,7 +93,7 @@ public class kelondroFlexTable extends kelondroFlexWidthArray implements kelondr indexentry = ri.row().newEntry(); indexentry.setCol(0, node.getValueRow()); indexentry.setCol(1, i); - ri.add(indexentry); + ri.addUnique(indexentry); if ((i % 10000) == 0) { System.out.print('.'); System.out.flush(); @@ -139,7 +139,7 @@ public class kelondroFlexTable extends kelondroFlexWidthArray implements kelondr return super.get(i); } - public kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException { + public synchronized kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException { return put(row); } @@ -152,6 +152,14 @@ public class kelondroFlexTable extends kelondroFlexWidthArray implements kelondr return super.set(i, row); } + public synchronized void addUnique(kelondroRow.Entry row, Date entryDate) throws IOException { + addUnique(row); + } + + public synchronized void addUnique(kelondroRow.Entry row) throws IOException { + index.addi(row.getColBytes(0), super.add(row)); + } + public synchronized kelondroRow.Entry remove(byte[] key) throws IOException { int i = index.removei(key); if (i < 0) return null; diff --git a/source/de/anomic/kelondro/kelondroIndex.java b/source/de/anomic/kelondro/kelondroIndex.java index ab4fc76c2..bc77629a8 100644 --- a/source/de/anomic/kelondro/kelondroIndex.java +++ b/source/de/anomic/kelondro/kelondroIndex.java @@ -64,6 +64,8 @@ public interface kelondroIndex { public kelondroRow.Entry get(byte[] key) throws IOException; public kelondroRow.Entry put(kelondroRow.Entry row) throws IOException; public kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException; + public void addUnique(kelondroRow.Entry row) throws IOException; // no double-check + public void addUnique(kelondroRow.Entry row, Date entryDate) throws IOException; // no double-check public kelondroRow.Entry remove(byte[] key) throws IOException; public kelondroRow.Entry removeOne() throws IOException; public Iterator rows(boolean up, boolean rotating, byte[] firstKey) throws IOException; diff --git a/source/de/anomic/kelondro/kelondroIntBytesMap.java b/source/de/anomic/kelondro/kelondroIntBytesMap.java index 99c5a9137..39f5ad330 100644 --- a/source/de/anomic/kelondro/kelondroIntBytesMap.java +++ b/source/de/anomic/kelondro/kelondroIntBytesMap.java @@ -58,7 +58,7 @@ public class kelondroIntBytesMap { newentry = index.row().newEntry(); newentry.setCol(0, (long) ii); newentry.setCol(1, value); - index.add(newentry); + index.addUnique(newentry); } catch (IOException e) {} } diff --git a/source/de/anomic/kelondro/kelondroRAMIndex.java b/source/de/anomic/kelondro/kelondroRAMIndex.java index 9e85aa4f8..ab090c136 100644 --- a/source/de/anomic/kelondro/kelondroRAMIndex.java +++ b/source/de/anomic/kelondro/kelondroRAMIndex.java @@ -75,6 +75,14 @@ public class kelondroRAMIndex implements kelondroIndex { return (kelondroRow.Entry) index.put(row.getColBytes(0), row); } + public synchronized void addUnique(kelondroRow.Entry row) throws IOException { + throw new UnsupportedOperationException(); + } + + public synchronized void addUnique(kelondroRow.Entry row, Date entryDate) throws IOException { + throw new UnsupportedOperationException(); + } + public synchronized Entry remove(byte[] key) { return (kelondroRow.Entry) index.remove(key); } diff --git a/source/de/anomic/kelondro/kelondroRecords.java b/source/de/anomic/kelondro/kelondroRecords.java index 963dc0314..6afc714f5 100644 --- a/source/de/anomic/kelondro/kelondroRecords.java +++ b/source/de/anomic/kelondro/kelondroRecords.java @@ -148,7 +148,7 @@ public class kelondroRecords { protected int readHit, readMiss, writeUnique, writeDouble, cacheDelete, cacheFlush; // optional logger - protected Logger theLogger = null; + protected Logger theLogger = Logger.getLogger("KELONDRO"); // default logger // tracking of file cration protected boolean fileExisted; @@ -1046,19 +1046,38 @@ public class kelondroRecords { synchronized (USAGE) { if (USAGE.FREEC != 0) { Handle h = USAGE.FREEH; + long repair_position = POS_FREEH; + int iter = 0; while (h.index != NUL) { - //System.out.println("handle=0x" + Integer.toHexString(h.index)); + // check handle + seekp = seekpos(h); + if (seekp > entryFile.length()) { + // repair last hande store position + this.theLogger.severe("KELONDRO WARNING " + this.filename + ": seek position " + seekp + "/" + h.index + " out of file size " + entryFile.length() + "/" + ((entryFile.length() - POS_NODES) / recordsize) + " after " + iter + " iterations"); + entryFile.writeInt(repair_position, NUL); + return markedDeleted; + } + + // handle seems to be corrent. store handle + markedDeleted.add(h); + + // move to next handle + repair_position = seekp; + h = new Handle(entryFile.readInt(seekp)); + + // double-check for already stored handles: detect loops if (markedDeleted.contains(h)) { // loop detection this.theLogger.severe("KELONDRO WARNING " + this.filename + ": FREE-Queue contains loops"); - return markedDeleted; // TODO: automatic fix + entryFile.writeInt(repair_position, NUL); + return markedDeleted; } - markedDeleted.add(h); - seekp = seekpos(h); - if (seekp > entryFile.length()) throw new kelondroException("deletedHandles: seek position " + seekp + "/" + h.index + " out of file size " + entryFile.length() + "/" + ((entryFile.length() - POS_NODES) / recordsize)); - h = new Handle(entryFile.readInt(seekp)); + + // this appears to be correct. go on. + iter++; if (System.currentTimeMillis() > timeLimit) throw new kelondroException(filename, "time limit of " + maxTime + " exceeded; > " + markedDeleted.size() + " deleted entries"); } + System.out.println("\nDEBUG: " + iter + " deleted entries in " + entryFile.name()); } } return markedDeleted; diff --git a/source/de/anomic/kelondro/kelondroRow.java b/source/de/anomic/kelondro/kelondroRow.java index f172aebcf..030c1df5a 100644 --- a/source/de/anomic/kelondro/kelondroRow.java +++ b/source/de/anomic/kelondro/kelondroRow.java @@ -243,6 +243,13 @@ public class kelondroRow { return rowinstance[colstart[column]] == 0; } + public void setCol(String nickname, char c) { + if (nickref == null) genNickRef(); + Object[] ref = (Object[]) nickref.get(nickname); + if (ref == null) return; + rowinstance[((Integer) ref[1]).intValue()] = (byte) c; + } + public void setCol(String nickname, byte[] cell) { if (nickref == null) genNickRef(); Object[] ref = (Object[]) nickref.get(nickname); diff --git a/source/de/anomic/kelondro/kelondroRowCollection.java b/source/de/anomic/kelondro/kelondroRowCollection.java index 94b4df676..c221dbbc9 100644 --- a/source/de/anomic/kelondro/kelondroRowCollection.java +++ b/source/de/anomic/kelondro/kelondroRowCollection.java @@ -24,6 +24,7 @@ package de.anomic.kelondro; +import java.util.Date; import java.util.Iterator; import java.util.Set; @@ -198,10 +199,14 @@ public class kelondroRowCollection { this.lastTimeWrote = System.currentTimeMillis(); } - public void add(kelondroRow.Entry a) { - add(a.bytes(), 0, a.bytes().length); + public void addUnique(kelondroRow.Entry row) { + add(row.bytes(), 0, row.bytes().length); } + public void addUnique(kelondroRow.Entry row, Date entryDate) { + addUnique(row); + } + public void add(byte[] a) { add(a, 0, a.length); } @@ -225,7 +230,7 @@ public class kelondroRowCollection { kelondroRow.Entry entry; while (i.hasNext()) { entry = (kelondroRow.Entry) i.next(); - add(entry); + addUnique(entry); } } diff --git a/source/de/anomic/kelondro/kelondroRowSet.java b/source/de/anomic/kelondro/kelondroRowSet.java index 93f746daa..cd6758df4 100644 --- a/source/de/anomic/kelondro/kelondroRowSet.java +++ b/source/de/anomic/kelondro/kelondroRowSet.java @@ -97,7 +97,7 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd set(index, entry); removeMarker.remove(new Integer(index)); } else if (index < 0) { - add(entry); + addUnique(entry); } else { oldentry = get(index); set(index, entry); diff --git a/source/de/anomic/kelondro/kelondroSplittedTree.java b/source/de/anomic/kelondro/kelondroSplittedTree.java index 06b5e3673..02eebdaa2 100644 --- a/source/de/anomic/kelondro/kelondroSplittedTree.java +++ b/source/de/anomic/kelondro/kelondroSplittedTree.java @@ -114,6 +114,14 @@ public class kelondroSplittedTree implements kelondroIndex { return put(row); } + public synchronized void addUnique(kelondroRow.Entry row) throws IOException { + throw new UnsupportedOperationException(); + } + + public synchronized void addUnique(kelondroRow.Entry row, Date entryDate) throws IOException { + throw new UnsupportedOperationException(); + } + public kelondroRow.Entry put(kelondroRow.Entry row) throws IOException { return ktfs[partition(row.getColBytes(0))].put(row); } diff --git a/source/de/anomic/kelondro/kelondroTree.java b/source/de/anomic/kelondro/kelondroTree.java index 0216136b9..bcaedd7e5 100644 --- a/source/de/anomic/kelondro/kelondroTree.java +++ b/source/de/anomic/kelondro/kelondroTree.java @@ -553,6 +553,14 @@ public class kelondroTree extends kelondroRecords implements kelondroIndex { return result; } + public synchronized void addUnique(kelondroRow.Entry row) throws IOException { + throw new UnsupportedOperationException(); + } + + public synchronized void addUnique(kelondroRow.Entry row, Date entryDate) throws IOException { + throw new UnsupportedOperationException(); + } + private void assignChild(Node parentNode, Node childNode, int childType) throws IOException { parentNode.setOHHandle(childType, childNode.handle()); childNode.setOHHandle(parent, parentNode.handle()); diff --git a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java index fa8fccde6..ec3668459 100644 --- a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java +++ b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java @@ -75,9 +75,9 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter { } this.log.logFine("Initializing source word index db."); - this.importWordIndex = new plasmaWordIndex(this.importPath, this.indexPath, (this.cacheSize/2)/1024, preloadTime / 2, this.log, sb.getConfigBool("useCollectionIndex", false)); + this.importWordIndex = new plasmaWordIndex(this.importPath, this.indexPath, true, (this.cacheSize/2)/1024, preloadTime / 2, this.log, sb.getConfigBool("useCollectionIndex", false)); this.log.logFine("Initializing import URL db."); - this.importUrlDB = new plasmaCrawlLURL(this.importPath, (this.cacheSize/2)/1024, preloadTime / 2, false); + this.importUrlDB = new plasmaCrawlLURL(this.importPath, this.indexPath, (this.cacheSize/2)/1024, preloadTime / 2, false); this.importStartSize = this.importWordIndex.size(); } diff --git a/source/de/anomic/plasma/plasmaCrawlEURL.java b/source/de/anomic/plasma/plasmaCrawlEURL.java index b41ceefce..838f2452e 100644 --- a/source/de/anomic/plasma/plasmaCrawlEURL.java +++ b/source/de/anomic/plasma/plasmaCrawlEURL.java @@ -171,6 +171,16 @@ public class plasmaCrawlEURL extends indexURL { return new Entry(url, referrer, initiator, executor, name, failreason, flags); } + public boolean remove(String hash) { + if (hash == null) return false; + try { + urlIndexFile.remove(hash.getBytes()); + return true; + } catch (IOException e) { + return false; + } + } + public synchronized void stackPushEntry(Entry e) { rejectedStack.add(e.hash); } diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java index 52376ec02..a6ec43a8c 100644 --- a/source/de/anomic/plasma/plasmaCrawlLURL.java +++ b/source/de/anomic/plasma/plasmaCrawlLURL.java @@ -67,6 +67,8 @@ import de.anomic.http.httpc.response; import de.anomic.index.indexEntry; import de.anomic.index.indexURL; import de.anomic.kelondro.kelondroBufferedIndex; +import de.anomic.kelondro.kelondroFlexSplitTable; +import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroTree; import de.anomic.net.URL; @@ -74,6 +76,7 @@ import de.anomic.plasma.urlPattern.plasmaURLPattern; import de.anomic.server.serverCodings; import de.anomic.server.serverObjects; import de.anomic.server.logging.serverLog; +import de.anomic.tools.bitfield; import de.anomic.tools.nxTools; import de.anomic.yacy.yacyCore; import de.anomic.yacy.yacySeed; @@ -90,14 +93,22 @@ public final class plasmaCrawlLURL extends indexURL { private final LinkedList lcrawlResultStack; // 5 - local index: result of local crawling private final LinkedList gcrawlResultStack; // 6 - local index: triggered external - public plasmaCrawlLURL(File cachePath, int bufferkb, long preloadTime, boolean newdb) { + private boolean newdb; + + public plasmaCrawlLURL(File plasmaPath, File indexPath, int bufferkb, long preloadTime, boolean newdb) { super(); - - File cacheFile = new File(cachePath, "urlHash.db"); + this.newdb = newdb; - cacheFile.getParentFile().mkdirs(); try { - urlIndexFile = new kelondroBufferedIndex(new kelondroTree(cacheFile, bufferkb * 0x400, preloadTime, kelondroTree.defaultObjectCachePercent, plasmaCrawlLURLOldEntry.rowdef)); + if (newdb) { + urlIndexFile = new kelondroBufferedIndex( + new kelondroFlexSplitTable(new File(indexPath, "PUBLIC/TEXT"), "urls", bufferkb * 0x400, preloadTime, plasmaCrawlLURLNewEntry.rowdef, kelondroBase64Order.enhancedCoder)); + } else { + File oldLURLDB = new File(plasmaPath, "urlHash.db"); + oldLURLDB.getParentFile().mkdirs(); + urlIndexFile = new kelondroBufferedIndex( + new kelondroTree(oldLURLDB, bufferkb * 0x400, preloadTime, kelondroTree.defaultObjectCachePercent, plasmaCrawlLURLOldEntry.rowdef)); + } } catch (IOException e) { e.printStackTrace(); System.exit(-1); @@ -133,21 +144,21 @@ public final class plasmaCrawlLURL extends indexURL { } } - public void notifyGCrawl(String urlHash, String initiatorHash, String executorHash) { + public synchronized void notifyGCrawl(String urlHash, String initiatorHash, String executorHash) { gcrawlResultStack.add(urlHash + initiatorHash + executorHash); } - public void flushCacheSome() { + public synchronized void flushCacheSome() { try { ((kelondroBufferedIndex) urlIndexFile).flushSome(); } catch (IOException e) {} } - public int writeCacheSize() { + public synchronized int writeCacheSize() { return ((kelondroBufferedIndex) urlIndexFile).writeBufferSize(); } - public plasmaCrawlLURLEntry load(String urlHash, indexEntry searchedWord) { + public synchronized plasmaCrawlLURLEntry load(String urlHash, indexEntry searchedWord) { // generates an plasmaLURLEntry using the url hash // to speed up the access, the url-hashes are buffered // in the hash cache. @@ -158,13 +169,16 @@ public final class plasmaCrawlLURL extends indexURL { try { kelondroRow.Entry entry = urlIndexFile.get(urlHash.getBytes()); if (entry == null) return null; - return new plasmaCrawlLURLOldEntry(entry, searchedWord); + if (newdb) + return new plasmaCrawlLURLNewEntry(entry, searchedWord); + else + return new plasmaCrawlLURLOldEntry(entry, searchedWord); } catch (IOException e) { return null; } } - public void store(plasmaCrawlLURLEntry entry) throws IOException { + public synchronized void store(plasmaCrawlLURLEntry entry) throws IOException { // Check if there is a more recent Entry already in the DB plasmaCrawlLURLEntry oldEntry; try { @@ -187,23 +201,48 @@ public final class plasmaCrawlLURL extends indexURL { urlIndexFile.put(entry.toRowEntry(), entry.loaddate()); } - public synchronized plasmaCrawlLURLEntry newEntry(String propStr, boolean setGlobal) { + public synchronized plasmaCrawlLURLEntry newEntry(String propStr) { if (propStr.startsWith("{") && propStr.endsWith("}")) { - return new plasmaCrawlLURLOldEntry(serverCodings.s2p(propStr.substring(1, propStr.length() - 1)), setGlobal); + if (newdb) + return new plasmaCrawlLURLNewEntry(serverCodings.s2p(propStr.substring(1, propStr.length() - 1))); + else + return new plasmaCrawlLURLOldEntry(serverCodings.s2p(propStr.substring(1, propStr.length() - 1))); } else { return null; } } - public synchronized plasmaCrawlLURLEntry newEntry(String url, String descr, Date moddate, Date loaddate, - String referrerHash, int copyCount, boolean localNeed, - int quality, String language, char doctype, - int size, int wordCount) { - plasmaCrawlLURLEntry e = new plasmaCrawlLURLOldEntry(url, descr, moddate, loaddate, referrerHash, copyCount, localNeed, quality, language, doctype, size, wordCount); - return e; + public synchronized plasmaCrawlLURLEntry newEntry( + URL url, + String descr, + String author, + String tags, + String ETag, + Date mod, + Date load, + Date fresh, + String referrer, + byte[] md5, + long size, + int wc, + char dt, + bitfield flags, + String lang, + int llocal, + int lother, + int laudio, + int limage, + int lvideo, + int lapp) { + if (newdb) + return new plasmaCrawlLURLNewEntry(url, descr, author, tags, ETag, mod, load, fresh, referrer, md5, + size, wc, dt, flags, lang, llocal, lother, laudio, limage, lvideo, lapp); + else + return new plasmaCrawlLURLOldEntry(url, descr, author, tags, ETag, mod, load, fresh, referrer, md5, + size, wc, dt, flags, lang, llocal, lother, laudio, limage, lvideo, lapp); } - public int getStackSize(int stack) { + public synchronized int getStackSize(int stack) { switch (stack) { case 1: return externResultStack.size(); case 2: return searchResultStack.size(); @@ -215,7 +254,7 @@ public final class plasmaCrawlLURL extends indexURL { return -1; } - public String getUrlHash(int stack, int pos) { + public synchronized String getUrlHash(int stack, int pos) { switch (stack) { case 1: return ((String) externResultStack.get(pos)).substring(0, urlHashLength); case 2: return ((String) searchResultStack.get(pos)).substring(0, urlHashLength); @@ -227,7 +266,7 @@ public final class plasmaCrawlLURL extends indexURL { return null; } - public String getInitiatorHash(int stack, int pos) { + public synchronized String getInitiatorHash(int stack, int pos) { switch (stack) { case 1: return ((String) externResultStack.get(pos)).substring(urlHashLength, urlHashLength * 2); case 2: return ((String) searchResultStack.get(pos)).substring(urlHashLength, urlHashLength * 2); @@ -239,7 +278,7 @@ public final class plasmaCrawlLURL extends indexURL { return null; } - public String getExecutorHash(int stack, int pos) { + public synchronized String getExecutorHash(int stack, int pos) { switch (stack) { case 1: return ((String) externResultStack.get(pos)).substring(urlHashLength * 2, urlHashLength * 3); case 2: return ((String) searchResultStack.get(pos)).substring(urlHashLength * 2, urlHashLength * 3); @@ -251,7 +290,7 @@ public final class plasmaCrawlLURL extends indexURL { return null; } - public boolean removeStack(int stack, int pos) { + public synchronized boolean removeStack(int stack, int pos) { Object prevElement = null; switch (stack) { case 1: prevElement = externResultStack.remove(pos); break; @@ -264,7 +303,7 @@ public final class plasmaCrawlLURL extends indexURL { return prevElement != null; } - public void clearStack(int stack) { + public synchronized void clearStack(int stack) { switch (stack) { case 1: externResultStack.clear(); break; case 2: searchResultStack.clear(); break; @@ -275,29 +314,31 @@ public final class plasmaCrawlLURL extends indexURL { } } - public boolean remove(String urlHash) { - if (!super.remove(urlHash)) return false; - for (int stack = 1; stack <= 6; stack++) { - for (int i = getStackSize(stack) - 1; i >= 0; i--) { - if (getUrlHash(stack,i).equals(urlHash)) { - removeStack(stack,i); - return true; + public synchronized boolean remove(String urlHash) { + if (urlHash == null) return false; + try { + kelondroRow.Entry r = urlIndexFile.remove(urlHash.getBytes()); + if (r == null) return false; + for (int stack = 1; stack <= 6; stack++) { + for (int i = getStackSize(stack) - 1; i >= 0; i--) { + if (getUrlHash(stack, i).equals(urlHash)) { + removeStack(stack, i); + return true; + } } } + return true; + } catch (IOException e) { + return false; } - return false; } - public boolean exists(String urlHash) { - try { - if (urlIndexFile.get(urlHash.getBytes()) != null) { - return true; - } else { - return false; - } - } catch (IOException e) { - return false; - } + public synchronized boolean exists(String urlHash) { + try { + return (urlIndexFile.get(urlHash.getBytes()) != null); + } catch (IOException e) { + return false; + } } private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US); @@ -402,7 +443,10 @@ public final class plasmaCrawlLURL extends indexURL { kelondroRow.Entry e = (kelondroRow.Entry) i.next(); if (e == null) return null; try { - return new plasmaCrawlLURLOldEntry(e, null); + if (newdb) + return new plasmaCrawlLURLNewEntry(e, null); + else + return new plasmaCrawlLURLOldEntry(e, null); } catch (IOException ex) { throw new RuntimeException("error '" + ex.getMessage() + "' for hash " + e.getColString(0, null)); } @@ -602,7 +646,7 @@ public final class plasmaCrawlLURL extends indexURL { } catch (MalformedURLException e) {} if (args[0].equals("-l")) try { // arg 1 is path to URLCache - final plasmaCrawlLURL urls = new plasmaCrawlLURL(new File(args[1]), 1, 0, false); + final plasmaCrawlLURL urls = new plasmaCrawlLURL(new File(args[1]), new File(args[2]), 1, 0, false); final Iterator enu = urls.entries(true, false, null); while (enu.hasNext()) { System.out.println(((plasmaCrawlLURLEntry) enu.next()).toString()); diff --git a/source/de/anomic/plasma/plasmaCrawlLURLEntry.java b/source/de/anomic/plasma/plasmaCrawlLURLEntry.java index 18c859a6b..fd079efb3 100644 --- a/source/de/anomic/plasma/plasmaCrawlLURLEntry.java +++ b/source/de/anomic/plasma/plasmaCrawlLURLEntry.java @@ -37,33 +37,20 @@ import de.anomic.index.indexEntry; public interface plasmaCrawlLURLEntry { public kelondroRow.Entry toRowEntry() throws IOException; - public String hash(); - public Components comp(); - public Date moddate(); - public Date loaddate(); - + public Date freshdate(); public String referrerHash(); - public char doctype(); - public String language(); - public int size(); - public int wordCount(); - public String snippet(); - public indexEntry word(); - public boolean isOlder(plasmaCrawlLURLEntry other); - public String toString(String snippet); - public String toString(); public class Components { @@ -81,6 +68,13 @@ public interface plasmaCrawlLURLEntry { this.tags = tags; this.ETag = ETag; } + public Components(URL url, String descr, String author, String tags, String ETag) { + this.url = url; + this.descr = descr; + this.author = author; + this.tags = tags; + this.ETag = ETag; + } public URL url() { return this.url; } public String descr() { return this.descr; } public String author() { return this.author; } diff --git a/source/de/anomic/plasma/plasmaCrawlLURLNewEntry.java b/source/de/anomic/plasma/plasmaCrawlLURLNewEntry.java index bd00fe8d2..f1e384868 100644 --- a/source/de/anomic/plasma/plasmaCrawlLURLNewEntry.java +++ b/source/de/anomic/plasma/plasmaCrawlLURLNewEntry.java @@ -14,7 +14,7 @@ import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroRow; import de.anomic.net.URL; -import de.anomic.server.serverByteBuffer; +import de.anomic.server.serverCharBuffer; import de.anomic.server.serverCodings; import de.anomic.tools.crypt; import de.anomic.tools.bitfield; @@ -27,8 +27,9 @@ public class plasmaCrawlLURLNewEntry implements plasmaCrawlLURLEntry { "String comp-360, " + // components: the url, description, author and tags. As 5th element, an ETag is possible "Cardinal mod-4 {b256}, " + // last-modified from the httpd "Cardinal load-4 {b256}, " + // time when the url was loaded + "Cardinal fresh-4 {b256}, " + // time until this url is fresh "String referrer-12, " + // (one of) the url's referrer hash(es) - "byte[] md5-8" + // the md5 of the url content (to identify changes) + "byte[] md5-8, " + // the md5 of the url content (to identify changes) "Cardinal size-6 {b256}, " + // size of file in bytes "Cardinal wc-3 {b256}, " + // size of file by number of words; for video and audio: seconds "byte[] dt-1, " + // doctype, taken from extension or any other heuristic @@ -53,11 +54,12 @@ public class plasmaCrawlLURLNewEntry implements plasmaCrawlLURLEntry { String ETag, Date mod, Date load, + Date fresh, String referrer, byte[] md5, long size, int wc, - byte dt, + char dt, bitfield flags, String lang, int llocal, @@ -72,6 +74,7 @@ public class plasmaCrawlLURLNewEntry implements plasmaCrawlLURLEntry { this.entry.setCol("comp", encodeComp(url, descr, author, tags, ETag)); this.entry.setCol("mod", encodeDate(mod)); this.entry.setCol("load", encodeDate(load)); + this.entry.setCol("fresh", encodeDate(fresh)); this.entry.setCol("referrer", referrer.getBytes()); this.entry.setCol("md5", md5); this.entry.setCol("size", size); @@ -89,17 +92,18 @@ public class plasmaCrawlLURLNewEntry implements plasmaCrawlLURLEntry { this.word = null; } - byte[] encodeDate(Date d) { + public static byte[] encodeDate(Date d) { return kelondroNaturalOrder.encodeLong(d.getTime() / 86400000, 4); } - byte[] encodeComp(URL url, String descr, String author, String tags, String ETag) { - serverByteBuffer s = new serverByteBuffer(200); + public static byte[] encodeComp(URL url, String descr, String author, String tags, String ETag) { + serverCharBuffer s = new serverCharBuffer(200); s.append(url.toNormalform()).append((char) 10); + s.append(descr).append((char) 10); s.append(author).append((char) 10); s.append(tags).append((char) 10); s.append(ETag).append((char) 10); - return s.getBytes(); + return s.toString().getBytes(); } public plasmaCrawlLURLNewEntry(kelondroRow.Entry entry, indexEntry searchedWord) throws IOException { @@ -108,7 +112,7 @@ public class plasmaCrawlLURLNewEntry implements plasmaCrawlLURLEntry { this.word = searchedWord; } - public plasmaCrawlLURLNewEntry(Properties prop, boolean setGlobal) throws IOException { + public plasmaCrawlLURLNewEntry(Properties prop){ // generates an plasmaLURLEntry using the properties from the argument // the property names must correspond to the one from toString //System.out.println("DEBUG-ENTRY: prop=" + prop.toString()); @@ -116,7 +120,7 @@ public class plasmaCrawlLURLNewEntry implements plasmaCrawlLURLEntry { try { url = new URL(crypt.simpleDecode(prop.getProperty("url", ""), null)); } catch (MalformedURLException e) { - throw new IOException("URL is not proper: " + crypt.simpleDecode(prop.getProperty("url", ""), null)); + url = null; } String descr = crypt.simpleDecode(prop.getProperty("descr", ""), null); if (descr == null) descr = ""; String author = crypt.simpleDecode(prop.getProperty("author", ""), null); if (author == null) author = ""; @@ -136,8 +140,13 @@ public class plasmaCrawlLURLNewEntry implements plasmaCrawlLURLEntry { } catch (ParseException e) { this.entry.setCol("load", encodeDate(new Date())); } + try { + this.entry.setCol("fresh", encodeDate(indexURL.shortDayFormatter.parse(prop.getProperty("fresh", "20000101")))); + } catch (ParseException e) { + this.entry.setCol("fresh", encodeDate(new Date())); + } this.entry.setCol("referrer", prop.getProperty("referrer", indexURL.dummyHash).getBytes()); - this.entry.setCol("md5", serverCodings.decodeHex(prop.getProperty("md5", indexURL.dummyHash))); + this.entry.setCol("md5", serverCodings.decodeHex(prop.getProperty("md5", ""))); this.entry.setCol("size", Integer.parseInt(prop.getProperty("size", "0"))); this.entry.setCol("wc", Integer.parseInt(prop.getProperty("wc", "0"))); this.entry.setCol("dt", prop.getProperty("dt", "t").charAt(0)); @@ -166,6 +175,7 @@ public class plasmaCrawlLURLNewEntry implements plasmaCrawlLURLEntry { s.append(",ETag=").append(crypt.simpleEncode(comp.ETag())); s.append(",mod=").append(indexURL.shortDayFormatter.format(moddate())); s.append(",load=").append(indexURL.shortDayFormatter.format(loaddate())); + s.append(",fresh=").append(indexURL.shortDayFormatter.format(freshdate())); s.append(",referrer=").append(referrerHash()); s.append(",md5=").append(md5()); s.append(",size=").append(size()); @@ -207,7 +217,7 @@ public class plasmaCrawlLURLNewEntry implements plasmaCrawlLURLEntry { return this.entry.getColString("hash", "", null); } - public de.anomic.plasma.plasmaCrawlLURLEntry.Components comp() { + public plasmaCrawlLURLEntry.Components comp() { ArrayList cl = nxTools.strings(this.entry.getCol("comp", null), "UTF-8"); return new de.anomic.plasma.plasmaCrawlLURLEntry.Components( (cl.size() > 0) ? (String) cl.get(0) : "", @@ -225,6 +235,10 @@ public class plasmaCrawlLURLNewEntry implements plasmaCrawlLURLEntry { return new Date(86400000 * entry.getColLong("load", 0)); } + public Date freshdate() { + return new Date(86400000 * entry.getColLong("fresh", 0)); + } + public String referrerHash() { // return the creator's hash return entry.getColString("referrer", indexURL.dummyHash, null); diff --git a/source/de/anomic/plasma/plasmaCrawlLURLOldEntry.java b/source/de/anomic/plasma/plasmaCrawlLURLOldEntry.java index 570711e98..6755c6861 100644 --- a/source/de/anomic/plasma/plasmaCrawlLURLOldEntry.java +++ b/source/de/anomic/plasma/plasmaCrawlLURLOldEntry.java @@ -36,7 +36,9 @@ import de.anomic.index.indexURL; import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroRow; +import de.anomic.net.URL; import de.anomic.server.logging.serverLog; +import de.anomic.tools.bitfield; import de.anomic.tools.crypt; public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry { @@ -56,7 +58,7 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry { "Cardinal size-" + indexURL.urlSizeLength + " {b64e}, " + // size of file in bytes "Cardinal wc-" + indexURL.urlWordCountLength + " {b64e}"); // word count - private String url; + private URL url; private String descr; private Date moddate; private Date loaddate; @@ -72,24 +74,42 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry { private String snippet; private indexEntry word; // this is only used if the url is transported via remote search requests - public plasmaCrawlLURLOldEntry(String url, String descr, Date moddate, - Date loaddate, String referrerHash, int copyCount, - boolean localNeed, int quality, String language, char doctype, - int size, int wordCount) { + public plasmaCrawlLURLOldEntry( + URL url, + String descr, + String author, + String tags, + String ETag, + Date mod, + Date load, + Date fresh, + String referrer, + byte[] md5, + long size, + int wc, + char dt, + bitfield flags, + String lang, + int llocal, + int lother, + int laudio, + int limage, + int lvideo, + int lapp) { // create new entry and store it into database this.urlHash = indexURL.urlHash(url); this.url = url; this.descr = (descr == null) ? this.url.toString() : descr; - this.moddate = moddate; - this.loaddate = loaddate; + this.moddate = mod; + this.loaddate = load; this.referrerHash = (referrerHash == null) ? indexURL.dummyHash : referrerHash; - this.copyCount = copyCount; // the number of remote (global) copies of this object without this one - this.flags = (localNeed) ? "L " : " "; - this.quality = quality; + this.copyCount = 0; // the number of remote (global) copies of this object without this one + this.flags = " "; + this.quality = 0; this.language = (language == null) ? "uk" : language; - this.doctype = doctype; - this.size = size; - this.wordCount = wordCount; + this.doctype = dt; + this.size = (int) size; + this.wordCount = wc; this.snippet = null; this.word = null; } @@ -97,7 +117,7 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry { public plasmaCrawlLURLOldEntry(kelondroRow.Entry entry, indexEntry searchedWord) throws IOException { try { this.urlHash = entry.getColString(0, null); - this.url = entry.getColString(1, "UTF-8").trim(); + this.url = new URL(entry.getColString(1, "UTF-8").trim()); this.descr = (entry.empty(2)) ? this.url.toString() : entry.getColString(2, "UTF-8").trim(); this.moddate = new Date(86400000 * entry.getColLong(3)); this.loaddate = new Date(86400000 * entry.getColLong(4)); @@ -118,7 +138,7 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry { } } - public plasmaCrawlLURLOldEntry(Properties prop, boolean setGlobal) { + public plasmaCrawlLURLOldEntry(Properties prop) { // generates an plasmaLURLEntry using the properties from the argument // the property names must correspond to the one from toString //System.out.println("DEBUG-ENTRY: prop=" + prop.toString()); @@ -130,8 +150,7 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry { this.loaddate = indexURL.shortDayFormatter.parse(prop.getProperty("load", "20000101")); this.copyCount = Integer.parseInt(prop.getProperty("cc", "0")); this.flags = ((prop.getProperty("local", "true").equals("true")) ? "L " : " "); - if (setGlobal) this.flags = "G "; - this.url = crypt.simpleDecode(prop.getProperty("url", ""), null); + this.url = new URL(crypt.simpleDecode(prop.getProperty("url", ""), null)); this.descr = crypt.simpleDecode(prop.getProperty("descr", ""), null); if (this.descr == null) this.descr = this.url.toString(); this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(prop.getProperty("q", "")); @@ -154,6 +173,10 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry { } } + public static kelondroRow rowdef() { + return rowdef; + } + public kelondroRow.Entry toRowEntry() throws IOException { final String moddatestr = kelondroBase64Order.enhancedCoder.encodeLong(moddate.getTime() / 86400000, indexURL.urlDateLength); final String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, indexURL.urlDateLength); @@ -195,6 +218,10 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry { return loaddate; } + public Date freshdate() { + return loaddate; + } + public String referrerHash() { // return the creator's hash return referrerHash; diff --git a/source/de/anomic/plasma/plasmaCrawlNURL.java b/source/de/anomic/plasma/plasmaCrawlNURL.java index 1920f6fc5..9f13f8af0 100644 --- a/source/de/anomic/plasma/plasmaCrawlNURL.java +++ b/source/de/anomic/plasma/plasmaCrawlNURL.java @@ -239,6 +239,16 @@ public class plasmaCrawlNURL extends indexURL { } } + public boolean remove(String hash) { + if (hash == null) return false; + try { + urlIndexFile.remove(hash.getBytes()); + return true; + } catch (IOException e) { + return false; + } + } + private static String normalizeHandle(int h) { String d = Integer.toHexString(h); while (d.length() < urlHandleLength) d = "0" + d; diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 16ac55d03..690cde2e6 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -199,7 +199,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // storage management public File htCachePath; private File plasmaPath; - public File indexPublicTextPath; + public File indexPath; public File listsPath; public File htDocsPath; public File rankingPath; @@ -279,8 +279,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // load values from configs this.plasmaPath = new File(rootPath, getConfig("dbPath", "DATA/PLASMADB")); this.log.logConfig("Plasma DB Path: " + this.plasmaPath.toString()); - this.indexPublicTextPath = new File(rootPath, getConfig("indexPublicTextPath", "DATA/INDEX/PUBLIC/TEXT")); - this.log.logConfig("Index Path: " + this.indexPublicTextPath.toString()); + this.indexPath = new File(rootPath, getConfig("indexPath", "DATA/INDEX")); + this.log.logConfig("Index Path: " + this.indexPath.toString()); this.listsPath = new File(rootPath, getConfig("listsPath", "DATA/LISTS")); this.log.logConfig("Lists Path: " + this.listsPath.toString()); this.htDocsPath = new File(rootPath, getConfig("htDocsPath", "DATA/HTDOCS")); @@ -418,12 +418,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // start indexing management log.logConfig("Starting Indexing Management"); - urlPool = new plasmaURLPool(plasmaPath, + urlPool = new plasmaURLPool(plasmaPath, indexPath, ramLURL, getConfigBool("useFlexTableForLURL", false), ramNURL, getConfigBool("useFlexTableForNURL", false), ramEURL, getConfigBool("useFlexTableForEURL", true), ramLURL_time); - wordIndex = new plasmaWordIndex(plasmaPath, indexPublicTextPath, ramRWI, ramRWI_time, log, getConfigBool("useCollectionIndex", false)); + wordIndex = new plasmaWordIndex(plasmaPath, indexPath, true, ramRWI, ramRWI_time, log, getConfigBool("useCollectionIndex", false)); // set a high maximum cache size to current size; this is adopted later automatically int wordCacheMaxCount = Math.max((int) getConfigLong("wordCacheInitCount", 30000), @@ -1559,20 +1559,23 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // create a new loaded URL db entry plasmaCrawlLURLEntry newEntry = urlPool.loadedURL.newEntry( - entry.url().toNormalform(), // URL - docDescription, // document description - docDate, // modification date - new Date(), // loaded date - referrerUrlHash, // referer hash - 0, // copy count - true, // local need - condenser.RESULT_WORD_ENTROPHY, // quality - indexEntryAttribute.language(entry.url()), // language - indexEntryAttribute.docType(document.getMimeType()), // doctype - (int) entry.size(), // size - condenser.RESULT_NUMB_WORDS // word count + entry.url(), // URL + docDescription, // document description + "", // author + "", // tags + "", // ETag + docDate, // modification date + new Date(), // loaded date + new Date(), // freshdate + referrerUrlHash, // referer hash + new byte[0], // md5 + (int) entry.size(), // size + condenser.RESULT_NUMB_WORDS, // word count + indexEntryAttribute.docType(document.getMimeType()), // doctype + new bitfield(4), // flags + indexEntryAttribute.language(entry.url()), // language + 0,0,0,0,0,0 ); - /* ======================================================================== * STORE URL TO LOADED-URL-DB * ======================================================================== */ @@ -1968,7 +1971,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser String lurl = (String) page.get("lurl"); if ((lurl != null) && (lurl.length() != 0)) { String propStr = crypt.simpleDecode(lurl, (String) page.get("key")); - plasmaCrawlLURLEntry entry = urlPool.loadedURL.newEntry(propStr, true); + plasmaCrawlLURLEntry entry = urlPool.loadedURL.newEntry(propStr); urlPool.loadedURL.store(entry); urlPool.loadedURL.stack(entry, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1); // *** ueberfluessig/doppelt? urlPool.noticeURL.remove(entry.hash()); diff --git a/source/de/anomic/plasma/plasmaURLPool.java b/source/de/anomic/plasma/plasmaURLPool.java index e02af682f..f861d748a 100644 --- a/source/de/anomic/plasma/plasmaURLPool.java +++ b/source/de/anomic/plasma/plasmaURLPool.java @@ -57,12 +57,12 @@ public class plasmaURLPool { public final plasmaCrawlNURL noticeURL; public final plasmaCrawlEURL errorURL; - public plasmaURLPool(File plasmaPath, + public plasmaURLPool(File plasmaPath, File indexPath, int ramLURL, boolean newLURL, int ramNURL, boolean newNURL, int ramEURL, boolean newEURL, long preloadTime) { - loadedURL = new plasmaCrawlLURL(plasmaPath, ramLURL, preloadTime, newLURL); + loadedURL = new plasmaCrawlLURL(plasmaPath, indexPath, ramLURL, preloadTime, newLURL); noticeURL = new plasmaCrawlNURL(plasmaPath, ramNURL, -1, newNURL); errorURL = new plasmaCrawlEURL(plasmaPath, ramEURL, -1, newEURL); } diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index f6150c8fc..7154942a3 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -72,7 +72,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { public boolean useCollectionIndex; // flag for usage of new collectionIndex db private int idleDivisor, busyDivisor; - public plasmaWordIndex(File oldDatabaseRoot, File newIndexRoot, int bufferkb, long preloadTime, serverLog log, boolean useCollectionIndex) { + public plasmaWordIndex(File oldDatabaseRoot, File newIndexRoot, boolean dummy, int bufferkb, long preloadTime, serverLog log, boolean useCollectionIndex) { this.oldDatabaseRoot = oldDatabaseRoot; this.backend = new plasmaWordIndexFileCluster(oldDatabaseRoot, log); this.dhtOutCache = new indexRAMCacheRI(oldDatabaseRoot, (useCollectionIndex) ? 1024 : 64, "indexDump1.array", log); @@ -83,9 +83,10 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { this.assortmentBufferSize = bufferkb; // create collections storage path - if (!(newIndexRoot.exists())) newIndexRoot.mkdirs(); + File textindexpath = new File(newIndexRoot, "PUBLIC/TEXT"); + if (!(textindexpath.exists())) textindexpath.mkdirs(); if (useCollectionIndex) { - this.collections = new indexCollectionRI(newIndexRoot, "test_generation1", bufferkb * 1024, preloadTime); + this.collections = new indexCollectionRI(textindexpath, "test_generation1", bufferkb * 1024, preloadTime); if (assortmentClusterPath.exists()) this.assortmentCluster = new plasmaWordIndexAssortmentCluster(assortmentClusterPath, assortmentCount, assortmentBufferSize, preloadTime, log); else @@ -855,8 +856,8 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { // System.out.println(kelondroMSetTools.fastStringComparator(true).compare("RwGeoUdyDQ0Y", "rwGeoUdyDQ0Y")); // System.out.println(new Date(reverseMicroDateDays(microDateDays(System.currentTimeMillis())))); File plasmadb = new File("D:\\dev\\proxy\\DATA\\PLASMADB"); - File indexdb = new File("D:\\dev\\proxy\\DATA\\INDEX\\PRIVATE\\TEXT"); - plasmaWordIndex index = new plasmaWordIndex(plasmadb, indexdb, 555, 1000, new serverLog("TESTAPP"), false); + File indexdb = new File("D:\\dev\\proxy\\DATA\\INDEX"); + plasmaWordIndex index = new plasmaWordIndex(plasmadb, indexdb, true, 555, 1000, new serverLog("TESTAPP"), false); try { Iterator containerIter = index.wordContainers("5A8yhZMh_Kmv", plasmaWordIndex.RL_WORDFILES, true); while (containerIter.hasNext()) { diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index 5d400530b..19708774a 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -501,7 +501,7 @@ public final class yacyClient { String[] urls = new String[results]; for (int n = 0; n < results; n++) { // get one single search result - urlEntry = urlManager.newEntry((String) result.get("resource" + n), true); + urlEntry = urlManager.newEntry((String) result.get("resource" + n)); if (urlEntry == null) continue; plasmaCrawlLURLEntry.Components comp = urlEntry.comp(); if (blacklist.isListed(plasmaURLPattern.BLACKLIST_SEARCH, comp.url())) continue; // block with backlist diff --git a/source/yacy.java b/source/yacy.java index 01a4d055e..90f545f1b 100644 --- a/source/yacy.java +++ b/source/yacy.java @@ -75,10 +75,10 @@ import de.anomic.index.indexEntry; import de.anomic.index.indexEntryAttribute; import de.anomic.index.indexURL; import de.anomic.kelondro.kelondroDyn; -import de.anomic.kelondro.kelondroFlexSplitTable; import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.kelondro.kelondroMap; -import de.anomic.kelondro.kelondroNaturalOrder; +import de.anomic.kelondro.kelondroRow; +import de.anomic.kelondro.kelondroTree; import de.anomic.net.URL; import de.anomic.plasma.plasmaCrawlEURL; import de.anomic.plasma.plasmaCrawlLURL; @@ -98,6 +98,7 @@ import de.anomic.server.serverPlainSwitch; import de.anomic.server.serverSwitch; import de.anomic.server.serverSystem; import de.anomic.server.logging.serverLog; +import de.anomic.tools.bitfield; import de.anomic.tools.enumerateFiles; import de.anomic.yacy.yacyClient; import de.anomic.yacy.yacyCore; @@ -651,11 +652,11 @@ public final class yacy { final serverSwitch sps = new serverPlainSwitch(homePath, "yacy.init", "DATA/SETTINGS/httpProxy.conf"); try {serverLog.configureLogging(new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {} File dbroot = new File(new File(homePath), "DATA/PLASMADB"); - File indexRoot = new File(new File(homePath), "DATA/INDEX/PUBLIC/TEXT"); + File indexRoot = new File(new File(homePath), "DATA/INDEX"); serverLog log = new serverLog("WORDMIGRATION"); log.logInfo("STARTING MIGRATION"); boolean useCollectionIndex = sps.getConfigBool("useCollectionIndex", false); - plasmaWordIndex wordIndexCache = new plasmaWordIndex(dbroot, indexRoot, 20000, 10000, log, useCollectionIndex); + plasmaWordIndex wordIndexCache = new plasmaWordIndex(dbroot, indexRoot, true, 20000, 10000, log, useCollectionIndex); enumerateFiles words = new enumerateFiles(new File(dbroot, "WORDS"), true, false, true, true); String wordhash; File wordfile; @@ -696,8 +697,8 @@ public final class yacy { // run with "java -classpath classes yacy -minimizeUrlDB" final serverSwitch sps = new serverPlainSwitch(homePath, "yacy.init", "DATA/SETTINGS/httpProxy.conf"); try {serverLog.configureLogging(new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {} - File dbroot = new File(new File(homePath), "DATA/PLASMADB"); - File indexRoot = new File(new File(homePath), "DATA/INDEX/PUBLIC/TEXT"); + File plasmaroot = new File(new File(homePath), "DATA/PLASMADB"); + File indexRoot = new File(new File(homePath), "DATA/INDEX"); serverLog log = new serverLog("URL-CLEANUP"); try { log.logInfo("STARTING URL CLEANUP"); @@ -705,16 +706,16 @@ public final class yacy { // db containing all currently loades urls int cache = dbcache * 1024; // in KB log.logFine("URLDB-Caches: "+cache+" bytes"); - plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(dbroot, cache, 10000, false); + plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(plasmaroot, indexRoot, cache, 10000, false); // db used to hold all neede urls - plasmaCrawlLURL minimizedUrlDB = new plasmaCrawlLURL(new File(dbroot, "minimized"), cache, 10000, false); + plasmaCrawlLURL minimizedUrlDB = new plasmaCrawlLURL(new File(plasmaroot, "minimized"), indexRoot, cache, 10000, false); Runtime rt = Runtime.getRuntime(); int cacheMem = (int)((serverMemory.max-rt.totalMemory())/1024)-(2*cache + 8*1024); if (cacheMem < 2048) throw new OutOfMemoryError("Not enough memory available to start clean up."); - plasmaWordIndex wordIndex = new plasmaWordIndex(dbroot, indexRoot, cacheMem, 10000, log, sps.getConfigBool("useCollectionIndex", false)); + plasmaWordIndex wordIndex = new plasmaWordIndex(plasmaroot, indexRoot, true, cacheMem, 10000, log, sps.getConfigBool("useCollectionIndex", false)); Iterator indexContainerIterator = wordIndex.wordContainers("------------", plasmaWordIndex.RL_WORDFILES, false); long urlCounter = 0, wordCounter = 0; @@ -944,7 +945,7 @@ public final class yacy { File root = new File(homePath); try { - plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), 16000, false, 1000, false, 1000, false, 10000); + plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), new File(root, "DATA/INDEX"), 16000, false, 1000, false, 1000, false, 10000); HashMap doms = new HashMap(); System.out.println("Started domain list extraction from " + pool.loadedURL.size() + " url entries."); System.out.println("a dump will be written after double-check of all extracted domains."); @@ -1060,7 +1061,7 @@ public final class yacy { private static void urllist(String homePath, String source, boolean html, String targetName) { File root = new File(homePath); try { - plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), 16000, false, 1000, false, 1000, false, 10000); + plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), new File(root, "DATA/INDEX"), 16000, false, 1000, false, 1000, false, 10000); File file = new File(root, targetName); BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(file)); @@ -1120,23 +1121,104 @@ public final class yacy { } } + /* private static void migratelurls(String homePath) { File root = new File(homePath); try { - plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), 16000, false, 1000, false, 1000, false, 10000); - kelondroFlexSplitTable fsp = new kelondroFlexSplitTable(new File(root, "DATA//INDEX/PUBLIC/TEXT"), "urls", 1000, -1, plasmaCrawlLURLOldEntry.rowdef, kelondroNaturalOrder.naturalOrder); - + plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), new File(root, "DATA/INDEX"), 16000, false, 1000, false, 1000, false, 10000); + kelondroFlexSplitTable fsp = new kelondroFlexSplitTable(new File(root, "DATA/INDEX/PUBLIC/TEXT"), "urls", 1000, -1, plasmaCrawlLURLNewEntry.rowdef, kelondroBase64Order.enhancedCoder); + + long start = System.currentTimeMillis(); + long last = start; + int tc = pool.loadedURL.size(), c = 0; Iterator eiter = pool.loadedURL.entries(true, false, null); - plasmaCrawlLURLEntry entry; + plasmaCrawlLURLEntry oldentry; + kelondroRow.Entry newentry; while (eiter.hasNext()) { - entry = (plasmaCrawlLURLEntry) eiter.next(); - plasmaCrawlLURLEntry.Components comp = entry.comp(); - if ((entry != null) && (comp.url() != null)) { - fsp.put(entry.toRowEntry(), entry.loaddate()); + oldentry = (plasmaCrawlLURLEntry) eiter.next(); + if (oldentry != null) { + plasmaCrawlLURLEntry.Components comp = oldentry.comp(); + newentry = plasmaCrawlLURLNewEntry.rowdef.newEntry(); + newentry.setCol("hash", indexURL.urlHash(comp.url()), null); + newentry.setCol("comp", plasmaCrawlLURLNewEntry.encodeComp(comp.url(), comp.descr(), "", "", "")); + newentry.setCol("mod", plasmaCrawlLURLNewEntry.encodeDate(oldentry.moddate())); + newentry.setCol("load", plasmaCrawlLURLNewEntry.encodeDate(oldentry.loaddate())); + newentry.setCol("referrer", oldentry.referrerHash().getBytes()); + newentry.setCol("md5", new byte[0]); + newentry.setCol("size", oldentry.size()); + newentry.setCol("wc", oldentry.wordCount()); + newentry.setCol("dt", oldentry.doctype()); + newentry.setCol("flags", new bitfield(4).getBytes()); + newentry.setCol("lang", oldentry.language().getBytes()); + newentry.setCol("llocal", 0); + newentry.setCol("lother", 0); + newentry.setCol("limage", 0); + newentry.setCol("laudio", 0); + newentry.setCol("lvideo", 0); + newentry.setCol("lapp", 0); + fsp.put(newentry, oldentry.loaddate()); + } + c++; + if (System.currentTimeMillis() - last > 60000) { + System.out.println("Migrated " + c + " from " + tc + " urls. Estimated remaining time: " + ((System.currentTimeMillis() - start) * (tc - c) / c / 60000) + " minutes"); + last = System.currentTimeMillis(); } } + System.out.println("MIGRATION OF " + c + " URLs FINISHED"); + pool.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + */ + + private static void migratelurls(File root, File urlHash) { + try { + plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), new File(root, "DATA/INDEX"), 16000, true, 1000, true, 1000, true, 10000); + kelondroTree oldindex = new kelondroTree(urlHash, 1000, -1, kelondroTree.defaultObjectCachePercent, plasmaCrawlLURLOldEntry.rowdef); + long start = System.currentTimeMillis(); + long last = start; + int tc = oldindex.size(), c = 0; + Iterator eiter = oldindex.contentRows(-1); + kelondroRow.Entry oldrow; + plasmaCrawlLURLEntry oldentry; + plasmaCrawlLURLEntry newentry; + plasmaCrawlLURLEntry.Components comp; + byte[] dummymd5 = new byte[0]; + while (eiter.hasNext()) { + oldrow = (kelondroRow.Entry) eiter.next(); + if (oldrow != null) { + oldentry = new plasmaCrawlLURLOldEntry(oldrow, null); + comp = oldentry.comp(); + newentry = pool.loadedURL.newEntry( + comp.url(), + comp.descr(), + "", + "", + "", + oldentry.moddate(), + oldentry.loaddate(), + oldentry.freshdate(), + oldentry.referrerHash(), + dummymd5, + oldentry.size(), + oldentry.wordCount(), + oldentry.doctype(), + new bitfield(4), + oldentry.language(), + 0, 0, 0, 0, 0, 0); + pool.loadedURL.store(newentry); + } + c++; + if (System.currentTimeMillis() - last > 60000) { + System.out.println("Migrated " + c + " from " + tc + " urls. Estimated remaining time: " + ((System.currentTimeMillis() - start) * (tc - c) * Math.sqrt(Math.sqrt(tc - c)) / c / 60000) + " minutes"); + last = System.currentTimeMillis(); + } + } pool.close(); + oldindex.close(); + System.out.println("MIGRATION OF " + c + " URLs FINISHED"); } catch (IOException e) { e.printStackTrace(); } @@ -1157,11 +1239,12 @@ public final class yacy { */ private static void urldbcleanup(String homePath) { File root = new File(homePath); - File dbroot = new File(root, "DATA/PLASMADB"); + File plasmaroot = new File(root, "DATA/PLASMADB"); + File indexroot = new File(root, "DATA/INDEX"); serverLog log = new serverLog("URLDBCLEANUP"); try {serverLog.configureLogging(new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {} try { - plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(dbroot, 4194304, 10000, false); + plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(plasmaroot, indexroot, 4194304, 10000, false); currentUrlDB.urldbcleanup(); currentUrlDB.close(); } catch (IOException e) { @@ -1174,7 +1257,7 @@ public final class yacy { serverLog log = new serverLog("HASHLIST"); final serverSwitch sps = new serverPlainSwitch(homePath, "yacy.init", "DATA/SETTINGS/httpProxy.conf"); File homeDBroot = new File(new File(homePath), "DATA/PLASMADB"); - File indexRoot = new File(new File(homePath), "DATA/INDEX/PUBLIC/TEXT"); + File indexRoot = new File(new File(homePath), "DATA/INDEX"); String wordChunkStartHash = "------------"; try {serverLog.configureLogging(new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {} log.logInfo("STARTING CREATION OF RWI-HASHLIST"); @@ -1182,7 +1265,7 @@ public final class yacy { try { Iterator indexContainerIterator = null; if (resource.equals("all")) { - WordIndex = new plasmaWordIndex(homeDBroot, indexRoot, 8*1024*1024, 3000, log, sps.getConfigBool("useCollectionIndex", false)); + WordIndex = new plasmaWordIndex(homeDBroot, indexRoot, true, 8*1024*1024, 3000, log, sps.getConfigBool("useCollectionIndex", false)); indexContainerIterator = WordIndex.wordContainers(wordChunkStartHash, plasmaWordIndex.RL_WORDFILES, false); } else if (resource.equals("assortments")) { plasmaWordIndexAssortmentCluster assortmentCluster = new plasmaWordIndexAssortmentCluster(new File(homeDBroot, "ACLUSTER"), 64, 16*1024*1024, 3000, log); @@ -1394,7 +1477,8 @@ public final class yacy { String outfile = "urllist_" + source + "_" + System.currentTimeMillis() + ((html) ? ".html" : ".txt"); urllist(applicationRoot, source, html, outfile); } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-migratelurls"))) { - migratelurls(applicationRoot); + File root = new File(applicationRoot); + migratelurls(root, new File(root, "DATA/PLASMADB/urlHash.db")); } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-urldbcleanup"))) { // generate a url list and save it in a file if (args.length == 2) applicationRoot= args[1]; diff --git a/yacy.init b/yacy.init index 99b448e58..6927f7fa5 100644 --- a/yacy.init +++ b/yacy.init @@ -201,7 +201,7 @@ promoteSearchPageGreeting = dbPath=DATA/PLASMADB # the path to the public reverse word index for text files (web pages) -indexPublicTextPath=DATA/INDEX/PUBLIC/TEXT +indexPath=DATA/INDEX # the path to the LISTS files. Most lists are used to filter web content listsPath=DATA/LISTS @@ -819,6 +819,7 @@ currentSkin= useCollectionIndex=false useFlexTableForNURL=false useFlexTableForEURL=true +useFlexTableForLURL=false tableTypeForPreNURL=2 # flag to show surftipps on index.html page