some refactoring of search methods:
- moved localSearchContainers() and its helper getContainers() from plasmaWordIndex
  into the generic AbstractIndex, where they are now available as searchTerm() and
  searchConjunction()
- changed the index field of plasmaWordIndex and its index() accessor from
  BufferedIndex&lt;WordReference&gt; to IndexCell&lt;WordReference&gt;
- SurrogateReader and plasmaSwitchboard now use the poison object from DCEntry
  instead of a SurrogateReader-local one
- inlined the SQL string in PhpBB3Dao.size()

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5988 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 16 years ago
parent d793bb0d76
commit fec6f9054f

@@ -80,7 +80,7 @@ public final class timeline {
         //yacyCore.log.logInfo("INIT TIMELINE SEARCH: " + plasmaSearchQuery.anonymizedQueryHashes(query[0]) + " - " + count + " links");
 
         // get the index container with the result vector
-        HashMap<byte[], ReferenceContainer<WordReference>>[] localSearchContainerMaps = sb.webIndex.localSearchContainers(q, Word.words2hashes(query[1]), null);
+        HashMap<byte[], ReferenceContainer<WordReference>>[] localSearchContainerMaps = sb.webIndex.index().searchTerm(q, Word.words2hashes(query[1]), null);
         final ReferenceContainer<WordReference> index =
             ReferenceContainer.joinExcludeContainers(
                 plasmaWordIndex.wordReferenceFactory,

@@ -209,7 +209,7 @@ public final class search {
         yacyCore.log.logInfo("INIT HASH SEARCH (abstracts only): " + plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links");
 
         final long timer = System.currentTimeMillis();
-        final Map<byte[], ReferenceContainer<WordReference>>[] containers = sb.webIndex.localSearchContainers(theQuery.queryHashes, theQuery.excludeHashes, plasmaSearchQuery.hashes2StringSet(urls));
+        final Map<byte[], ReferenceContainer<WordReference>>[] containers = sb.webIndex.index().searchTerm(theQuery.queryHashes, theQuery.excludeHashes, plasmaSearchQuery.hashes2StringSet(urls));
         serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(theQuery.id(true), plasmaSearchEvent.COLLECTION, containers[0].size(), System.currentTimeMillis() - timer), false);
 
         if (containers != null) {

@@ -144,13 +144,11 @@ public class PhpBB3Dao implements Dao {
     }
 
     public int size() {
-        StringBuilder sql = new StringBuilder(256);
-        sql.append("select count(*) from phpbb_posts");
         Statement stmt = null;
         ResultSet rs = null;
         try {
             stmt = conn.createStatement();
-            rs = stmt.executeQuery(sql.toString());
+            rs = stmt.executeQuery("select count(*) from phpbb_posts");
             if (rs.next()) {
                 return rs.getInt(1);
             }
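
The change above drops the StringBuilder detour and passes the constant SQL string directly to executeQuery. For context, here is a minimal compilable sketch of the whole pattern, assuming a plain JDBC Connection field as the hunk suggests; the class name, the constructor, and the resource cleanup in the finally block are assumptions, since the hunk does not show them:

    import java.sql.Connection;
    import java.sql.ResultSet;
    import java.sql.SQLException;
    import java.sql.Statement;

    // Hypothetical wrapper class; only the JDBC calls mirror the hunk above.
    public class PostCounter {
        private final Connection conn;

        public PostCounter(final Connection conn) {
            this.conn = conn;
        }

        // count the rows of the phpbb_posts table, returning 0 on failure
        public int size() {
            Statement stmt = null;
            ResultSet rs = null;
            try {
                stmt = conn.createStatement();
                rs = stmt.executeQuery("select count(*) from phpbb_posts");
                if (rs.next()) return rs.getInt(1);
                return 0;
            } catch (SQLException e) {
                e.printStackTrace();
                return 0;
            } finally {
                // release JDBC resources; this cleanup is assumed, not shown in the hunk
                if (rs != null) try { rs.close(); } catch (SQLException e) { /* ignore */ }
                if (stmt != null) try { stmt.close(); } catch (SQLException e) { /* ignore */ }
            }
        }
    }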

@@ -44,8 +44,6 @@ import de.anomic.content.DCEntry;
 
 public class SurrogateReader extends DefaultHandler implements Runnable {
 
-    public static final DCEntry poison = new DCEntry();
-
     // class variables
     private final StringBuilder buffer;
     private boolean parsingValue;
@@ -83,7 +81,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
             e.printStackTrace();
         } finally {
             try {
-                this.surrogates.put(poison);
+                this.surrogates.put(DCEntry.poison);
             } catch (InterruptedException e1) {
                 e1.printStackTrace();
             }
@@ -170,7 +168,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
         t.start();
         DCEntry s;
         System.out.println("1");
-        while ((s = sr.take()) != SurrogateReader.poison) {
+        while ((s = sr.take()) != DCEntry.poison) {
             System.out.println("Title: " + s.title());
             System.out.println("Date: " + s.date());
             System.out.println("URL: " + s.url());

@@ -28,6 +28,7 @@
 package de.anomic.kelondro.text;
 
 import java.io.IOException;
+import java.util.HashMap;
 import java.util.Iterator;
 import java.util.Set;
 import java.util.TreeSet;
@@ -85,4 +86,69 @@ public abstract class AbstractIndex <ReferenceType extends Reference> implements
         }
         return containers; // this may return less containers as demanded
     }
+
+    // methods to search in the index
+
+    /**
+     * collect containers for given word hashes. This collection stops if a single container
+     * does not contain any references. In that case only an empty result is returned.
+     * @param wordHashes
+     * @param urlselection
+     * @return map of wordhash:indexContainer
+     */
+    public HashMap<byte[], ReferenceContainer<ReferenceType>> searchConjunction(final TreeSet<byte[]> wordHashes, final Set<String> urlselection) {
+        // first check if there is any entry that has no match; this uses only operations in RAM
+        /*
+        Iterator<byte[]> i = wordHashes.iterator();
+        while (i.hasNext()) {
+            if (!this.has(i.next())) return new HashMap<byte[], ReferenceContainer<ReferenceType>>(0);
+        }
+        */
+
+        // retrieve entities that belong to the hashes
+        final HashMap<byte[], ReferenceContainer<ReferenceType>> containers = new HashMap<byte[], ReferenceContainer<ReferenceType>>(wordHashes.size());
+        byte[] singleHash;
+        ReferenceContainer<ReferenceType> singleContainer;
+        final Iterator<byte[]> i = wordHashes.iterator();
+        while (i.hasNext()) {
+            // get next word hash
+            singleHash = i.next();
+
+            // retrieve index
+            try {
+                singleContainer = this.get(singleHash, urlselection);
+            } catch (IOException e) {
+                e.printStackTrace();
+                continue;
+            }
+
+            // check result
+            if (singleContainer == null || singleContainer.size() == 0) return new HashMap<byte[], ReferenceContainer<ReferenceType>>(0);
+            containers.put(singleHash, singleContainer);
+        }
+        return containers;
+    }
+
+    @SuppressWarnings("unchecked")
+    public HashMap<byte[], ReferenceContainer<ReferenceType>>[] searchTerm(
+            final TreeSet<byte[]> queryHashes,
+            final TreeSet<byte[]> excludeHashes,
+            final Set<String> urlselection) {
+        // search for the set of hashes and return a map of wordhash:indexContainer containing the search result
+
+        // retrieve entities that belong to the hashes
+        HashMap<byte[], ReferenceContainer<ReferenceType>> inclusionContainers =
+            (queryHashes.size() == 0) ?
+                new HashMap<byte[], ReferenceContainer<ReferenceType>>(0) :
+                this.searchConjunction(queryHashes, urlselection);
+        if ((inclusionContainers.size() != 0) && (inclusionContainers.size() < queryHashes.size())) inclusionContainers = new HashMap<byte[], ReferenceContainer<ReferenceType>>(0); // prevent that only a subset is returned
+        final HashMap<byte[], ReferenceContainer<ReferenceType>> exclusionContainers =
+            (inclusionContainers.size() == 0) ?
+                new HashMap<byte[], ReferenceContainer<ReferenceType>>(0) :
+                this.searchConjunction(excludeHashes, urlselection);
+        return new HashMap[]{inclusionContainers, exclusionContainers};
+    }
 }
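
searchTerm returns a two-element array: index 0 holds the inclusion containers (one per query word, or an empty map as soon as any query word has no matches), index 1 the exclusion containers. A short usage sketch, not compilable on its own since it assumes the YaCy types WordReference and ReferenceContainer plus an index instance in scope (names are illustrative):

    // 'index' is an AbstractIndex<WordReference>; queryHashes and excludeHashes
    // are TreeSet<byte[]> word hashes; null means no URL preselection
    HashMap<byte[], ReferenceContainer<WordReference>>[] maps =
            index.searchTerm(queryHashes, excludeHashes, null);
    HashMap<byte[], ReferenceContainer<WordReference>> inclusion = maps[0];
    HashMap<byte[], ReferenceContainer<WordReference>> exclusion = maps[1];
    if (inclusion.isEmpty()) {
        // at least one query word had no references, so the conjunction is empty
    }

The subset guard in searchTerm resets the result to an empty map whenever fewer containers than query words came back, which keeps callers such as plasmaSearchRankingProcess below from ranking a partial conjunction.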

@@ -119,7 +119,7 @@ public final class plasmaSearchRankingProcess {
     public void execQuery() {
 
         long timer = System.currentTimeMillis();
-        this.localSearchContainerMaps = wordIndex.localSearchContainers(query.queryHashes, query.excludeHashes, null);
+        this.localSearchContainerMaps = wordIndex.index().searchTerm(query.queryHashes, query.excludeHashes, null);
         serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.COLLECTION, this.localSearchContainerMaps[0].size(), System.currentTimeMillis() - timer), false);
 
         // join and exclude the local result

@@ -1233,7 +1233,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
         readerThread.start();
         DCEntry surrogate;
         QueueEntry queueentry;
-        while ((surrogate = reader.take()) != SurrogateReader.poison) {
+        while ((surrogate = reader.take()) != DCEntry.poison) {
             // check if url is in accepted domain
             final String urlRejectReason = crawlStacker.urlInAcceptedDomain(surrogate.url());
             if (urlRejectReason != null) {

@@ -96,7 +96,7 @@ public final class plasmaWordIndex {
     public static final ByteOrder wordOrder = Base64Order.enhancedCoder;
 
-    private final BufferedIndex<WordReference> index;
+    private final IndexCell<WordReference> index;
     private final Log log;
     private MetadataRepository metadata;
     private final yacySeedDB peers;
@@ -250,7 +250,7 @@ public final class plasmaWordIndex {
         return this.peers;
     }
 
-    public BufferedIndex<WordReference> index() {
+    public IndexCell<WordReference> index() {
         return this.index;
     }
@@ -561,60 +561,6 @@ public final class plasmaWordIndex {
         return newEntry;
     }
 
-    @SuppressWarnings("unchecked")
-    public HashMap<byte[], ReferenceContainer<WordReference>>[] localSearchContainers(
-            final TreeSet<byte[]> queryHashes,
-            final TreeSet<byte[]> excludeHashes,
-            final Set<String> urlselection) {
-        // search for the set of hashes and return a map of of wordhash:indexContainer containing the seach result
-
-        // retrieve entities that belong to the hashes
-        HashMap<byte[], ReferenceContainer<WordReference>> inclusionContainers =
-            (queryHashes.size() == 0) ?
-                new HashMap<byte[], ReferenceContainer<WordReference>>(0) :
-                getContainers(queryHashes, urlselection);
-        if ((inclusionContainers.size() != 0) && (inclusionContainers.size() < queryHashes.size())) inclusionContainers = new HashMap<byte[], ReferenceContainer<WordReference>>(0); // prevent that only a subset is returned
-        final HashMap<byte[], ReferenceContainer<WordReference>> exclusionContainers =
-            (inclusionContainers.size() == 0) ?
-                new HashMap<byte[], ReferenceContainer<WordReference>>(0) :
-                getContainers(excludeHashes, urlselection);
-        return new HashMap[]{inclusionContainers, exclusionContainers};
-    }
-
-    /**
-     * collect containers for given word hashes. This collection stops if a single container does not contain any references.
-     * In that case only a empty result is returned.
-     * @param wordHashes
-     * @param urlselection
-     * @return map of wordhash:indexContainer
-     */
-    private HashMap<byte[], ReferenceContainer<WordReference>> getContainers(final TreeSet<byte[]> wordHashes, final Set<String> urlselection) {
-        // retrieve entities that belong to the hashes
-        final HashMap<byte[], ReferenceContainer<WordReference>> containers = new HashMap<byte[], ReferenceContainer<WordReference>>(wordHashes.size());
-        byte[] singleHash;
-        ReferenceContainer<WordReference> singleContainer;
-        final Iterator<byte[]> i = wordHashes.iterator();
-        while (i.hasNext()) {
-            // get next word hash:
-            singleHash = i.next();
-
-            // retrieve index
-            try {
-                singleContainer = index.get(singleHash, urlselection);
-            } catch (IOException e) {
-                e.printStackTrace();
-                continue;
-            }
-
-            // check result
-            if ((singleContainer == null || singleContainer.size() == 0)) return new HashMap<byte[], ReferenceContainer<WordReference>>(0);
-            containers.put(singleHash, singleContainer);
-        }
-        return containers;
-    }
-
     // The Cleaner class was provided as "UrldbCleaner" by Hydrox
     public synchronized ReferenceCleaner getReferenceCleaner(final byte[] startHash) {
         return new ReferenceCleaner(startHash);
