fix for bug in -incell option of URLAnalysis

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5967 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent a7e392f31b
commit e005cfea37

@ -401,20 +401,20 @@ public class URLAnalysis {
plasmaWordIndex.wordReferenceFactory, plasmaWordIndex.wordReferenceFactory,
Base64Order.enhancedCoder, Base64Order.enhancedCoder,
WordReferenceRow.urlEntryRow); WordReferenceRow.urlEntryRow);
System.out.println("COLLECTION INDEX REFERENCE COLLECTION starting dump of statistics"); System.out.println("INDEX REFERENCE COLLECTION starting dump of statistics");
idx.dump(new File(statisticPath)); idx.dump(new File(statisticPath));
System.out.println("COLLECTION INDEX REFERENCE COLLECTION finished dump, wrote " + idx.size() + " entries to " + statisticPath); System.out.println("INDEX REFERENCE COLLECTION finished dump, wrote " + idx.size() + " entries to " + statisticPath);
} catch (IOException e) { } catch (IOException e) {
e.printStackTrace(); e.printStackTrace();
} }
} }
public static int diffurlcol(String metadataPath, String statisticFile, String diffFile) throws IOException { public static int diffurlcol(String metadataPath, String statisticFile, String diffFile) throws IOException {
System.out.println("COLLECTION INDEX DIFF URL-COL startup"); System.out.println("INDEX DIFF URL-COL startup");
IntegerHandleIndex idx = new IntegerHandleIndex(URLMetadataRow.rowdef.primaryKeyLength, URLMetadataRow.rowdef.objectOrder, new File(statisticFile), 0); IntegerHandleIndex idx = new IntegerHandleIndex(URLMetadataRow.rowdef.primaryKeyLength, URLMetadataRow.rowdef.objectOrder, new File(statisticFile), 0);
MetadataRepository mr = new MetadataRepository(new File(metadataPath)); MetadataRepository mr = new MetadataRepository(new File(metadataPath));
HandleSet hs = new HandleSet(URLMetadataRow.rowdef.primaryKeyLength, URLMetadataRow.rowdef.objectOrder, 0, 1000000); HandleSet hs = new HandleSet(URLMetadataRow.rowdef.primaryKeyLength, URLMetadataRow.rowdef.objectOrder, 0, 1000000);
System.out.println("COLLECTION INDEX DIFF URL-COL loaded dump, starting diff"); System.out.println("INDEX DIFF URL-COL loaded dump, starting diff");
long start = System.currentTimeMillis(); long start = System.currentTimeMillis();
long update = start - 7000; long update = start - 7000;
int c = 0; int c = 0;
@ -425,14 +425,14 @@ public class URLAnalysis {
} }
c++; c++;
if (System.currentTimeMillis() - update > 10000) { if (System.currentTimeMillis() - update > 10000) {
System.out.println("COLLECTION INDEX DIFF URL-COL running, checked " + c + ", found " + hs.size() + " missing references so far, " + (((System.currentTimeMillis() - start) * (mr.size() - c) / c) / 60000) + " minutes remaining"); System.out.println("INDEX DIFF URL-COL running, checked " + c + ", found " + hs.size() + " missing references so far, " + (((System.currentTimeMillis() - start) * (mr.size() - c) / c) / 60000) + " minutes remaining");
update = System.currentTimeMillis(); update = System.currentTimeMillis();
} }
} }
mr.close(); mr.close();
System.out.println("COLLECTION INDEX DIFF URL-COL finished diff, starting dump to " + diffFile); System.out.println("INDEX DIFF URL-COL finished diff, starting dump to " + diffFile);
c = hs.dump(new File(diffFile)); c = hs.dump(new File(diffFile));
System.out.println("COLLECTION INDEX DIFF URL-COL finished dump, wrote " + c + " references that occur in the URL-DB, but not in the collection-dump"); System.out.println("INDEX DIFF URL-COL finished dump, wrote " + c + " references that occur in the URL-DB, but not in the collection-dump");
return c; return c;
} }

@ -294,7 +294,7 @@ public final class ReferenceContainerArray<ReferenceType extends Reference> {
IntegerHandleIndex references = new IntegerHandleIndex(payloadrow.primaryKeyLength, termOrder, 0, 1000000); IntegerHandleIndex references = new IntegerHandleIndex(payloadrow.primaryKeyLength, termOrder, 0, 1000000);
String[] files = heapLocation.list(); String[] files = heapLocation.list();
for (String f: files) { for (String f: files) {
if (f.length() < 22 && !f.startsWith("index") && !f.endsWith(".blob")) continue; if (f.length() < 22 || !f.startsWith("index") || !f.endsWith(".blob")) continue;
File fl = new File(heapLocation, f); File fl = new File(heapLocation, f);
System.out.println("CELL REFERENCE COLLECTION opening blob " + fl); System.out.println("CELL REFERENCE COLLECTION opening blob " + fl);
CloneableIterator<ReferenceContainer<ReferenceType>> ei = new ReferenceContainerCache.blobFileEntries<ReferenceType>(fl, factory, payloadrow); CloneableIterator<ReferenceContainer<ReferenceType>> ei = new ReferenceContainerCache.blobFileEntries<ReferenceType>(fl, factory, payloadrow);

Loading…
Cancel
Save