- added fast site-operator

- refactored merge into BLOBArray

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5770 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 16 years ago
parent b4126432bc
commit 7ba078daa1
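The site operator added here restricts results to a single domain by comparing hashes instead of matching URL strings, which is what makes it fast. A minimal sketch of the flow (illustrative only; yacyURL.domhash and the URL-hash layout come from the diff below, the surrounding code does not appear in this commit):

    // how a query such as "yacy site:yacy.net" is narrowed to one domain
    String querystring = "yacy site:yacy.net";
    int site = querystring.indexOf("site:");
    if (site >= 0) {
        int ftb = querystring.indexOf(' ', site);
        if (ftb == -1) ftb = querystring.length();
        String domain = querystring.substring(site + 5, ftb);
        String sitehash = yacyURL.domhash(domain); // 6-character domain hash
        // during ranking, every candidate is checked with a cheap comparison:
        // if (!entry.urlHash().substring(6).equals(sitehash)) continue;
    }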

@@ -51,7 +51,8 @@ public class webstructure {
about = null;
}
}
if (about != null) {
if (url != null && about != null) {
plasmaWebStructure.structureEntry sentry = sb.webStructure.references(about);
if (sentry != null) {
reference(prop, 0, sentry, sb.webStructure);

@@ -87,6 +87,7 @@ public final class search {
final String prefer = post.get("prefer", "");
final String contentdom = post.get("contentdom", "text");
final String filter = post.get("filter", ".*");
String sitehash = post.get("sitehash", ""); if (sitehash.length() == 0) sitehash = null;
String language = post.get("language", "");
if (!iso639.exists(language)) {
// take language from the user agent
@@ -180,7 +181,29 @@ public final class search {
plasmaSearchEvent theSearch = null;
if ((query.length() == 0) && (abstractSet != null)) {
// this is _not_ a normal search, only a request for index abstracts
theQuery = new plasmaSearchQuery(null, abstractSet, new TreeSet<String>(Base64Order.enhancedComparator), null, rankingProfile, maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), language, false, count, 0, filter, plasmaSearchQuery.SEARCHDOM_LOCAL, null, -1, null, false, yacyURL.TLD_any_zone_filter, client, false);
theQuery = new plasmaSearchQuery(
null,
abstractSet,
new TreeSet<String>(Base64Order.enhancedComparator),
null,
rankingProfile,
maxdist,
prefer,
plasmaSearchQuery.contentdomParser(contentdom),
language,
false,
count,
0,
filter,
plasmaSearchQuery.SEARCHDOM_LOCAL,
null,
-1,
null,
false,
sitehash,
yacyURL.TLD_any_zone_filter,
client,
false);
theQuery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL;
yacyCore.log.logInfo("INIT HASH SEARCH (abstracts only): " + plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links");
@@ -207,7 +230,30 @@ public final class search {
} else {
// retrieve index containers from search request
theQuery = new plasmaSearchQuery(null, queryhashes, excludehashes, null, rankingProfile, maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), language, false, count, 0, filter, plasmaSearchQuery.SEARCHDOM_LOCAL, null, -1, constraint, false, yacyURL.TLD_any_zone_filter, client, false);
theQuery = new plasmaSearchQuery(
null,
queryhashes,
excludehashes,
null,
rankingProfile,
maxdist,
prefer,
plasmaSearchQuery.contentdomParser(contentdom),
language,
false,
count,
0,
filter,
plasmaSearchQuery.SEARCHDOM_LOCAL,
null,
-1,
constraint,
false,
sitehash,
yacyURL.TLD_any_zone_filter,
client,
false);
theQuery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL;
yacyCore.log.logInfo("INIT HASH SEARCH (query-" + abstracts + "): " + plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links");
RSSFeed.channels(RSSFeed.REMOTESEARCH).addMessage(new RSSMessage("Remote Search Request from " + ((remoteSeed == null) ? "unknown" : remoteSeed.getName()), plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes), ""));

@@ -270,13 +270,19 @@ public class yacysearch {
}
}
}
if (post.containsKey("tenant")) {
final String tenant = post.get("tenant");
if (urlmask == null) urlmask = ".*" + tenant + ".*"; else urlmask = ".*" + tenant + urlmask;
}
int site = querystring.indexOf("site:");
String sitehash = null;
if (site >= 0) {
int ftb = querystring.indexOf(' ', site);
if (ftb == -1) ftb = querystring.length();
String domain = querystring.substring(site + 5, ftb);
query[0].remove("site:" + domain.toLowerCase());
while(domain.startsWith(".")) domain = domain.substring(1);
sitehash = yacyURL.domhash(domain);
if (domain.indexOf(".") < 0) domain = "\\." + domain; // is tld
if (domain.length() > 0) {
if (urlmask == null) {
@@ -286,10 +292,6 @@ public class yacysearch {
}
}
}
if (post.containsKey("tenant")) {
final String tenant = post.get("tenant");
if (urlmask == null) urlmask = ".*" + tenant + ".*"; else urlmask = ".*" + tenant + urlmask;
}
if (urlmask == null || urlmask.length() == 0) urlmask = originalUrlMask; //if no urlmask was given
// read the language from the language-restrict option 'lr'
@@ -385,6 +387,7 @@ public class yacysearch {
20,
constraint,
true,
sitehash,
yacyURL.TLD_any_zone_filter,
client,
authenticated);

@@ -37,12 +37,16 @@ import java.util.List;
import java.util.TreeMap;
import java.util.concurrent.CopyOnWriteArrayList;
import de.anomic.kelondro.index.Row;
import de.anomic.kelondro.order.ByteOrder;
import de.anomic.kelondro.order.CloneableIterator;
import de.anomic.kelondro.order.NaturalOrder;
import de.anomic.kelondro.order.MergeIterator;
import de.anomic.kelondro.text.ReferenceContainer;
import de.anomic.kelondro.text.ReferenceContainerCache.blobFileEntries;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.kelondro.util.Log;
public class BLOBArray implements BLOB {
@@ -533,6 +537,125 @@ public class BLOBArray implements BLOB {
blobs = null;
}
public File mergeMount(File f1, File f2, Row payloadrow, File newFile) throws IOException {
Log.logInfo("BLOBArray", "merging " + f1.getName() + " with " + f2.getName());
File resultFile = mergeWorker(f1, f2, payloadrow, newFile);
if (resultFile == null) return null;
mountBLOB(resultFile);
Log.logInfo("BLOBArray", "merged " + f1.getName() + " with " + f2.getName() + " into " + resultFile);
return resultFile;
}
private File mergeWorker(File f1, File f2, Row payloadrow, File newFile) throws IOException {
// iterate both files and write a new one
CloneableIterator<ReferenceContainer> i1 = new blobFileEntries(f1, payloadrow);
CloneableIterator<ReferenceContainer> i2 = new blobFileEntries(f2, payloadrow);
if (!i1.hasNext()) {
if (i2.hasNext()) {
FileUtils.deletedelete(f1);
if (f2.renameTo(newFile)) return newFile;
return f2;
} else {
FileUtils.deletedelete(f1);
FileUtils.deletedelete(f2);
return null;
}
} else if (!i2.hasNext()) {
FileUtils.deletedelete(f2);
if (f1.renameTo(newFile)) return newFile;
return f1;
}
assert i1.hasNext();
assert i2.hasNext();
File tmpFile = new File(newFile.getParentFile(), newFile.getName() + ".tmp");
HeapWriter writer = new HeapWriter(tmpFile, newFile, this.keylength(), this.ordering());
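// note (assumption, not verified in this commit): HeapWriter writes into
// tmpFile and moves the result to newFile when close(true) succeeds, so an
// aborted merge never leaves a half-written BLOB under the final file name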
merge(i1, i2, this.ordering(), writer);
try {
writer.close(true);
// we don't need the old files any more
FileUtils.deletedelete(f1);
FileUtils.deletedelete(f2);
return newFile;
} catch (IOException e) {
FileUtils.deletedelete(tmpFile);
FileUtils.deletedelete(newFile);
e.printStackTrace();
return null;
}
}
private static void merge(CloneableIterator<ReferenceContainer> i1, CloneableIterator<ReferenceContainer> i2, ByteOrder ordering, HeapWriter writer) throws IOException {
assert i1.hasNext();
assert i2.hasNext();
ReferenceContainer c1, c2, c1o, c2o;
c1 = i1.next();
c2 = i2.next();
int e;
while (c1 != null && c2 != null) {
e = ordering.compare(c1.getWordHash().getBytes(), c2.getWordHash().getBytes());
if (e < 0) {
writer.add(c1.getWordHash().getBytes(), c1.exportCollection());
c1o = c1;
c1 = i1.hasNext() ? i1.next() : null;
assert c1 == null || ordering.compare(c1.getWordHash().getBytes(), c1o.getWordHash().getBytes()) > 0;
continue;
}
if (e > 0) {
writer.add(c2.getWordHash().getBytes(), c2.exportCollection());
c2o = c2;
c2 = i2.hasNext() ? i2.next() : null;
assert c2 == null || ordering.compare(c2.getWordHash().getBytes(), c2o.getWordHash().getBytes()) > 0;
continue;
}
assert e == 0;
// same word hash in both files: merge the two entries
writer.add(c1.getWordHash().getBytes(), (c1.merge(c2)).exportCollection());
c1 = i1.hasNext() ? i1.next() : null;
c2 = i2.hasNext() ? i2.next() : null;
}
// catch up remaining entries; the loop above leaves at most one side
// with a pending, still unwritten container
assert c1 == null || c2 == null;
while (c1 != null) {
//System.out.println("FLUSH REMAINING 1: " + c1.getWordHash());
writer.add(c1.getWordHash().getBytes(), c1.exportCollection());
c1 = i1.hasNext() ? i1.next() : null;
}
while (c2 != null) {
//System.out.println("FLUSH REMAINING 2: " + c2.getWordHash());
writer.add(c2.getWordHash().getBytes(), c2.exportCollection());
c2 = i2.hasNext() ? i2.next() : null;
}
// finished with writing
}
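The method above is the classic two-pointer merge of two key-sorted streams, with equal keys joined via ReferenceContainer.merge. The same pattern in compact generic form (an illustrative sketch, not part of the commit):

    import java.util.Comparator;
    import java.util.Iterator;
    import java.util.Map;
    import java.util.function.BiConsumer;
    import java.util.function.BinaryOperator;

    static <K, V> void mergeSorted(
            Iterator<Map.Entry<K, V>> a, Iterator<Map.Entry<K, V>> b,
            Comparator<K> cmp, BinaryOperator<V> join, BiConsumer<K, V> out) {
        Map.Entry<K, V> x = a.hasNext() ? a.next() : null;
        Map.Entry<K, V> y = b.hasNext() ? b.next() : null;
        while (x != null && y != null) {
            int e = cmp.compare(x.getKey(), y.getKey());
            if (e < 0) { out.accept(x.getKey(), x.getValue()); x = a.hasNext() ? a.next() : null; }
            else if (e > 0) { out.accept(y.getKey(), y.getValue()); y = b.hasNext() ? b.next() : null; }
            else { // same key on both sides: join the payloads
                out.accept(x.getKey(), join.apply(x.getValue(), y.getValue()));
                x = a.hasNext() ? a.next() : null;
                y = b.hasNext() ? b.next() : null;
            }
        }
        // flush whichever side still has entries
        while (x != null) { out.accept(x.getKey(), x.getValue()); x = a.hasNext() ? a.next() : null; }
        while (y != null) { out.accept(y.getKey(), y.getValue()); y = b.hasNext() ? b.next() : null; }
    }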
public static void main(final String[] args) {
final File f = new File("/Users/admin/blobarraytest");

@@ -1,4 +1,4 @@
// ReferenceContainerArray.java
// IODispatcher.java
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 20.03.2009 on http://yacy.net
//
@@ -29,15 +29,9 @@ import java.io.IOException;
import java.util.concurrent.ArrayBlockingQueue;
import de.anomic.kelondro.blob.BLOBArray;
import de.anomic.kelondro.blob.HeapWriter;
import de.anomic.kelondro.index.Row;
import de.anomic.kelondro.order.ByteOrder;
import de.anomic.kelondro.order.CloneableIterator;
import de.anomic.kelondro.text.ReferenceContainerCache.blobFileEntries;
import de.anomic.kelondro.util.FileUtils;
/**
* merger class for files from ReferenceContainerArray.
* this is a concurrent merger that can merge single files that are queued for merging.
* when several ReferenceContainerArray classes host their ReferenceContainer file arrays,
they may share a single ReferenceContainerMerger object which does the merging for all
@@ -104,7 +98,7 @@ public class IODispatcher extends Thread {
public synchronized void merge(File f1, File f2, BLOBArray array, Row payloadrow, File newFile) {
if (mergeQueue == null || !this.isAlive()) {
try {
mergeMount(f1, f2, array, payloadrow, newFile);
array.mergeMount(f1, f2, payloadrow, newFile);
} catch (IOException e) {
e.printStackTrace();
}
@@ -116,7 +110,7 @@
} catch (InterruptedException e) {
e.printStackTrace();
try {
mergeMount(f1, f2, array, payloadrow, newFile);
array.mergeMount(f1, f2, payloadrow, newFile);
} catch (IOException ee) {
ee.printStackTrace();
}
@@ -189,134 +183,12 @@
public File merge() {
try {
return mergeMount(f1, f2, array, payloadrow, newFile);
return array.mergeMount(f1, f2, payloadrow, newFile);
} catch (IOException e) {
e.printStackTrace();
}
return null;
}
}
public static File mergeMount(File f1, File f2, BLOBArray array, Row payloadrow, File newFile) throws IOException {
System.out.println("*** DEBUG mergeOldest: vvvvvvvvv array has " + array.entries() + " entries vvvvvvvvv");
System.out.println("*** DEBUG mergeOldest: unmounted " + f1.getName());
System.out.println("*** DEBUG mergeOldest: unmounted " + f2.getName());
File resultFile = mergeWorker(f1, f2, array, payloadrow, newFile);
if (resultFile == null) return null;
array.mountBLOB(resultFile);
System.out.println("*** DEBUG mergeOldest: mounted " + newFile.getName());
System.out.println("*** DEBUG mergeOldest: ^^^^^^^^^^^ array has " + array.entries() + " entries ^^^^^^^^^^^");
return resultFile;
}
private static File mergeWorker(File f1, File f2, BLOBArray array, Row payloadrow, File newFile) throws IOException {
// iterate both files and write a new one
CloneableIterator<ReferenceContainer> i1 = new blobFileEntries(f1, payloadrow);
CloneableIterator<ReferenceContainer> i2 = new blobFileEntries(f2, payloadrow);
if (!i1.hasNext()) {
if (i2.hasNext()) {
FileUtils.deletedelete(f1);
if (f2.renameTo(newFile)) return newFile;
return f2;
} else {
FileUtils.deletedelete(f1);
FileUtils.deletedelete(f2);
return null;
}
} else if (!i2.hasNext()) {
FileUtils.deletedelete(f2);
if (f1.renameTo(newFile)) return newFile;
return f1;
}
assert i1.hasNext();
assert i2.hasNext();
File tmpFile = new File(newFile.getParentFile(), newFile.getName() + ".tmp");
HeapWriter writer = new HeapWriter(tmpFile, newFile, array.keylength(), array.ordering());
merge(i1, i2, array.ordering(), writer);
try {
writer.close(true);
// we don't need the old files any more
FileUtils.deletedelete(f1);
FileUtils.deletedelete(f2);
return newFile;
} catch (IOException e) {
FileUtils.deletedelete(tmpFile);
FileUtils.deletedelete(newFile);
e.printStackTrace();
return null;
}
}
private static void merge(CloneableIterator<ReferenceContainer> i1, CloneableIterator<ReferenceContainer> i2, ByteOrder ordering, HeapWriter writer) throws IOException {
assert i1.hasNext();
assert i2.hasNext();
ReferenceContainer c1, c2, c1o, c2o;
c1 = i1.next();
c2 = i2.next();
int e;
while (true) {
assert c1 != null;
assert c2 != null;
e = ordering.compare(c1.getWordHash().getBytes(), c2.getWordHash().getBytes());
if (e < 0) {
writer.add(c1.getWordHash().getBytes(), c1.exportCollection());
if (i1.hasNext()) {
c1o = c1;
c1 = i1.next();
assert ordering.compare(c1.getWordHash().getBytes(), c1o.getWordHash().getBytes()) > 0;
continue;
}
break;
}
if (e > 0) {
writer.add(c2.getWordHash().getBytes(), c2.exportCollection());
if (i2.hasNext()) {
c2o = c2;
c2 = i2.next();
assert ordering.compare(c2.getWordHash().getBytes(), c2o.getWordHash().getBytes()) > 0;
continue;
}
break;
}
assert e == 0;
// merge the entries
writer.add(c1.getWordHash().getBytes(), (c1.merge(c2)).exportCollection());
if (i1.hasNext() && i2.hasNext()) {
c1 = i1.next();
c2 = i2.next();
continue;
}
if (i1.hasNext()) c1 = i1.next();
if (i2.hasNext()) c2 = i2.next();
break;
}
// catch up remaining entries
assert !(i1.hasNext() && i2.hasNext());
while (i1.hasNext()) {
//System.out.println("FLUSH REMAINING 1: " + c1.getWordHash());
writer.add(c1.getWordHash().getBytes(), c1.exportCollection());
if (i1.hasNext()) {
c1o = c1;
c1 = i1.next();
assert ordering.compare(c1.getWordHash().getBytes(), c1o.getWordHash().getBytes()) > 0;
continue;
}
break;
}
while (i2.hasNext()) {
//System.out.println("FLUSH REMAINING 2: " + c2.getWordHash());
writer.add(c2.getWordHash().getBytes(), c2.exportCollection());
if (i2.hasNext()) {
c2o = c2;
c2 = i2.next();
assert ordering.compare(c2.getWordHash().getBytes(), c2o.getWordHash().getBytes()) > 0;
continue;
}
break;
}
// finished with writing
}
}

@@ -70,7 +70,8 @@ public final class plasmaSearchQuery {
public boolean allofconstraint;
public boolean onlineSnippetFetch;
public plasmaSearchRankingProfile ranking;
public String host;
public String host; // this is the client host that starts the query, not a site operator
public String sitehash; // this is a domain hash, 6 bytes long or null
public yacySeed remotepeer;
public Long handle;
// values that are set after a search:
@@ -109,6 +110,7 @@ public final class plasmaSearchQuery {
this.allofconstraint = false;
this.onlineSnippetFetch = false;
this.host = null;
this.sitehash = null;
this.remotepeer = null;
this.handle = Long.valueOf(System.currentTimeMillis());
this.specialRights = false;
@@ -125,6 +127,7 @@ public final class plasmaSearchQuery {
final int lines, final int offset, final String urlMask,
final int domType, final String domGroupName, final int domMaxTargets,
final Bitfield constraint, final boolean allofconstraint,
final String site,
final int domainzone,
final String host,
final boolean specialRights) {
@@ -146,6 +149,7 @@ public final class plasmaSearchQuery {
this.domMaxTargets = domMaxTargets;
this.constraint = constraint;
this.allofconstraint = allofconstraint;
this.sitehash = site; assert site == null || site.length() == 6;
this.onlineSnippetFetch = onlineSnippetFetch;
this.host = host;
this.remotepeer = null;
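A caller that wants a site-restricted query therefore passes the 6-character domain hash as the new site argument, or null for no restriction. Hypothetical usage (names illustrative, not from this commit):

    String sitehash = yacyURL.domhash("yacy.net"); // 6 characters, or null if parsing fails
    // pass sitehash as the new `site` constructor argument, or on an existing query:
    query.sitehash = sitehash; // null disables the site filter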

@@ -74,7 +74,11 @@ public final class plasmaSearchRankingProcess {
private HashMap<String, ReferenceContainer>[] localSearchContainerMaps;
private final int[] domZones;
public plasmaSearchRankingProcess(final plasmaWordIndex wordIndex, final plasmaSearchQuery query, final int maxentries, final int concurrency) {
public plasmaSearchRankingProcess(
final plasmaWordIndex wordIndex,
final plasmaSearchQuery query,
final int maxentries,
final int concurrency) {
// we collect the urlhashes and construct a list with urlEntry objects
// attention: if minEntries is too high, this method will not terminate within the maxTime
// sortorder: 0 = hash, 1 = url, 2 = ranking
@@ -183,6 +187,11 @@ public final class plasmaSearchRankingProcess {
continue;
}
// check site constraints
if (query.sitehash != null && !iEntry.urlHash().substring(6).equals(query.sitehash)) {
// filter out all domains that do not match with the site constraint
continue;
}
// count domZones
/*
indexURLEntry uentry = wordIndex.loadedURL.load(iEntry.urlHash, iEntry, 0); // this eats up a lot of time!!!

@@ -579,13 +579,15 @@ public final class plasmaWordIndex {
// search for the set of hashes and return a map of wordhash:indexContainer containing the search result
// retrieve entities that belong to the hashes
HashMap<String, ReferenceContainer> inclusionContainers = (queryHashes.size() == 0) ? new HashMap<String, ReferenceContainer>(0) : getContainers(
queryHashes,
urlselection);
HashMap<String, ReferenceContainer> inclusionContainers =
(queryHashes.size() == 0) ?
new HashMap<String, ReferenceContainer>(0) :
getContainers(queryHashes, urlselection);
if ((inclusionContainers.size() != 0) && (inclusionContainers.size() < queryHashes.size())) inclusionContainers = new HashMap<String, ReferenceContainer>(0); // prevent that only a subset is returned
final HashMap<String, ReferenceContainer> exclusionContainers = (inclusionContainers.size() == 0) ? new HashMap<String, ReferenceContainer>(0) : getContainers(
excludeHashes,
urlselection);
final HashMap<String, ReferenceContainer> exclusionContainers =
(inclusionContainers.size() == 0) ?
new HashMap<String, ReferenceContainer>(0) :
getContainers(excludeHashes, urlselection);
return new HashMap[]{inclusionContainers, exclusionContainers};
}
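The subset check above implements the AND semantics of multi-word search: if even one query hash produced no container, the conjunction cannot match anything, so an empty map is returned rather than a misleading partial result. Schematically (illustrative sketch only):

    // all-or-nothing rule for conjunctive queries
    if (!containers.isEmpty() && containers.size() < queryHashes.size()) {
        containers = new HashMap<String, ReferenceContainer>(0); // one term had no hits: no results at all
    }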

@@ -42,9 +42,7 @@ import de.anomic.tools.Punycode;
import de.anomic.tools.Punycode.PunycodeException;
public class yacyURL implements Serializable {
/**
* generated with svn4751 on 2008-05-01
*/
private static final long serialVersionUID = -1173233022912141884L;
public static final int TLD_any_zone_filter = 255; // from TLD zones can be filtered during search; this is the catch-all filter
private static final Pattern backPathPattern = Pattern.compile("(/[^/]+(?<!/\\.{1,2})/)[.]{2}(?=/|$)|/\\.(?=/)|/(?=/)");
@@ -56,6 +54,18 @@ public class yacyURL implements Serializable {
private String protocol, host, userInfo, path, quest, ref, hash;
private int port;
public static String domhash(String host) {
if (!host.startsWith("http://")) host = "http://" + host;
yacyURL url = null;
try {
url = new yacyURL(host, null);
} catch (MalformedURLException e) {
e.printStackTrace();
return null;
}
return (url == null) ? null : url.hash().substring(6);
}
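Usage is then a one-liner. The layout assumption, consistent with the substring(6) calls elsewhere in this commit, is that a YaCy URL hash is 12 characters whose final 6 characters encode the host:

    String h = yacyURL.domhash("yacy.net"); // 6-character domain hash, or null on parse error
    assert h == null || h.length() == 6;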
public yacyURL(final String url, final String hash) throws MalformedURLException {
if (url == null) throw new MalformedURLException("url string is null");
