diff --git a/htroot/api/webstructure.java b/htroot/api/webstructure.java
index 9ce2f7a4a..ee13371c1 100644
--- a/htroot/api/webstructure.java
+++ b/htroot/api/webstructure.java
@@ -51,7 +51,8 @@ public class webstructure {
                 about = null;
             }
         }
-        if (about != null) {
+        if (url != null && about != null) {
+            plasmaWebStructure.structureEntry sentry = sb.webStructure.references(about);
             if (sentry != null) {
                 reference(prop, 0, sentry, sb.webStructure);
diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java
index 5102112f3..8b752124f 100644
--- a/htroot/yacy/search.java
+++ b/htroot/yacy/search.java
@@ -87,6 +87,7 @@ public final class search {
         final String prefer = post.get("prefer", "");
         final String contentdom = post.get("contentdom", "text");
         final String filter = post.get("filter", ".*");
+        String sitehash = post.get("sitehash", ""); if (sitehash.length() == 0) sitehash = null;
         String language = post.get("language", "");
         if (!iso639.exists(language)) {
             // take language from the user agent
@@ -180,7 +181,29 @@ public final class search {
 
         plasmaSearchEvent theSearch = null;
         if ((query.length() == 0) && (abstractSet != null)) {
             // this is _not_ a normal search, only a request for index abstracts
-            theQuery = new plasmaSearchQuery(null, abstractSet, new TreeSet(Base64Order.enhancedComparator), null, rankingProfile, maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), language, false, count, 0, filter, plasmaSearchQuery.SEARCHDOM_LOCAL, null, -1, null, false, yacyURL.TLD_any_zone_filter, client, false);
+            theQuery = new plasmaSearchQuery(
+                    null,
+                    abstractSet,
+                    new TreeSet(Base64Order.enhancedComparator),
+                    null,
+                    rankingProfile,
+                    maxdist,
+                    prefer,
+                    plasmaSearchQuery.contentdomParser(contentdom),
+                    language,
+                    false,
+                    count,
+                    0,
+                    filter,
+                    plasmaSearchQuery.SEARCHDOM_LOCAL,
+                    null,
+                    -1,
+                    null,
+                    false,
+                    sitehash,
+                    yacyURL.TLD_any_zone_filter,
+                    client,
+                    false);
             theQuery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL;
             yacyCore.log.logInfo("INIT HASH SEARCH (abstracts only): " + plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links");
@@ -207,7 +230,29 @@ public final class search {
 
         } else {
             // retrieve index containers from search request
-            theQuery = new plasmaSearchQuery(null, queryhashes, excludehashes, null, rankingProfile, maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), language, false, count, 0, filter, plasmaSearchQuery.SEARCHDOM_LOCAL, null, -1, constraint, false, yacyURL.TLD_any_zone_filter, client, false);
+            theQuery = new plasmaSearchQuery(
+                    null,
+                    queryhashes,
+                    excludehashes,
+                    null,
+                    rankingProfile,
+                    maxdist,
+                    prefer,
+                    plasmaSearchQuery.contentdomParser(contentdom),
+                    language,
+                    false,
+                    count,
+                    0,
+                    filter,
+                    plasmaSearchQuery.SEARCHDOM_LOCAL,
+                    null,
+                    -1,
+                    constraint,
+                    false,
+                    sitehash,
+                    yacyURL.TLD_any_zone_filter,
+                    client,
+                    false);
             theQuery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL;
             yacyCore.log.logInfo("INIT HASH SEARCH (query-" + abstracts + "): " + plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links");
             RSSFeed.channels(RSSFeed.REMOTESEARCH).addMessage(new RSSMessage("Remote Search Request from " + ((remoteSeed == null) ? "unknown" : remoteSeed.getName()), plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes), ""));
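The two hunks above thread a new "sitehash" CGI parameter from a requesting peer through the servlet and into both plasmaSearchQuery constructor calls. A minimal sketch of the parameter convention, outside the patch, with a plain Map standing in for YaCy's post object (names and the hash value are illustrative only):

    import java.util.HashMap;
    import java.util.Map;

    public final class SitehashParamSketch {
        // an absent or empty "sitehash" is normalized to null,
        // which downstream code reads as "no site constraint"
        static String parseSitehash(final Map<String, String> post) {
            final String sitehash = post.getOrDefault("sitehash", "");
            return (sitehash.length() == 0) ? null : sitehash;
        }

        public static void main(final String[] args) {
            final Map<String, String> post = new HashMap<String, String>();
            System.out.println(parseSitehash(post));  // null: no constraint requested
            post.put("sitehash", "DpHJBB");           // made-up 6-character domain hash
            System.out.println(parseSitehash(post));  // DpHJBB
        }
    }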
"unknown" : remoteSeed.getName()), plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes), "")); diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 1fb5a3290..cf33ef6fb 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -270,13 +270,19 @@ public class yacysearch { } } } + if (post.containsKey("tenant")) { + final String tenant = post.get("tenant"); + if (urlmask == null) urlmask = ".*" + tenant + ".*"; else urlmask = ".*" + tenant + urlmask; + } int site = querystring.indexOf("site:"); + String sitehash = null; if (site >= 0) { int ftb = querystring.indexOf(' ', site); if (ftb == -1) ftb = querystring.length(); String domain = querystring.substring(site + 5, ftb); query[0].remove("site:" + domain.toLowerCase()); while(domain.startsWith(".")) domain = domain.substring(1); + sitehash = yacyURL.domhash(domain); if (domain.indexOf(".") < 0) domain = "\\." + domain; // is tld if (domain.length() > 0) { if (urlmask == null) { @@ -286,10 +292,6 @@ public class yacysearch { } } } - if (post.containsKey("tenant")) { - final String tenant = post.get("tenant"); - if (urlmask == null) urlmask = ".*" + tenant + ".*"; else urlmask = ".*" + tenant + urlmask; - } if (urlmask == null || urlmask.length() == 0) urlmask = originalUrlMask; //if no urlmask was given // read the language from the language-restrict option 'lr' @@ -385,6 +387,7 @@ public class yacysearch { 20, constraint, true, + sitehash, yacyURL.TLD_any_zone_filter, client, authenticated); diff --git a/source/de/anomic/kelondro/blob/BLOBArray.java b/source/de/anomic/kelondro/blob/BLOBArray.java index 21a3c543f..104d6f4cc 100755 --- a/source/de/anomic/kelondro/blob/BLOBArray.java +++ b/source/de/anomic/kelondro/blob/BLOBArray.java @@ -37,12 +37,16 @@ import java.util.List; import java.util.TreeMap; import java.util.concurrent.CopyOnWriteArrayList; +import de.anomic.kelondro.index.Row; import de.anomic.kelondro.order.ByteOrder; import de.anomic.kelondro.order.CloneableIterator; import de.anomic.kelondro.order.NaturalOrder; import de.anomic.kelondro.order.MergeIterator; +import de.anomic.kelondro.text.ReferenceContainer; +import de.anomic.kelondro.text.ReferenceContainerCache.blobFileEntries; import de.anomic.kelondro.util.DateFormatter; import de.anomic.kelondro.util.FileUtils; +import de.anomic.kelondro.util.Log; public class BLOBArray implements BLOB { @@ -533,6 +537,125 @@ public class BLOBArray implements BLOB { blobs = null; } + public File mergeMount(File f1, File f2, Row payloadrow, File newFile) throws IOException { + Log.logInfo("BLOBArray", "merging " + f1.getName() + " with " + f2.getName()); + File resultFile = mergeWorker(f1, f2, payloadrow, newFile); + if (resultFile == null) return null; + mountBLOB(resultFile); + Log.logInfo("BLOBArray", "merged " + f1.getName() + " with " + f2.getName() + " into " + resultFile); + return resultFile; + } + + private File mergeWorker(File f1, File f2, Row payloadrow, File newFile) throws IOException { + // iterate both files and write a new one + + CloneableIterator i1 = new blobFileEntries(f1, payloadrow); + CloneableIterator i2 = new blobFileEntries(f2, payloadrow); + if (!i1.hasNext()) { + if (i2.hasNext()) { + FileUtils.deletedelete(f1); + if (f2.renameTo(newFile)) return newFile; + return f2; + } else { + FileUtils.deletedelete(f1); + FileUtils.deletedelete(f2); + return null; + } + } else if (!i2.hasNext()) { + FileUtils.deletedelete(f2); + if (f1.renameTo(newFile)) return newFile; + return f1; + } + assert i1.hasNext(); + assert i2.hasNext(); + File 
diff --git a/source/de/anomic/kelondro/blob/BLOBArray.java b/source/de/anomic/kelondro/blob/BLOBArray.java
index 21a3c543f..104d6f4cc 100755
--- a/source/de/anomic/kelondro/blob/BLOBArray.java
+++ b/source/de/anomic/kelondro/blob/BLOBArray.java
@@ -37,12 +37,16 @@ import java.util.List;
 import java.util.TreeMap;
 import java.util.concurrent.CopyOnWriteArrayList;
 
+import de.anomic.kelondro.index.Row;
 import de.anomic.kelondro.order.ByteOrder;
 import de.anomic.kelondro.order.CloneableIterator;
 import de.anomic.kelondro.order.NaturalOrder;
 import de.anomic.kelondro.order.MergeIterator;
+import de.anomic.kelondro.text.ReferenceContainer;
+import de.anomic.kelondro.text.ReferenceContainerCache.blobFileEntries;
 import de.anomic.kelondro.util.DateFormatter;
 import de.anomic.kelondro.util.FileUtils;
+import de.anomic.kelondro.util.Log;
 
 public class BLOBArray implements BLOB {
 
@@ -533,6 +537,125 @@ public class BLOBArray implements BLOB {
         blobs = null;
     }
 
+    public File mergeMount(File f1, File f2, Row payloadrow, File newFile) throws IOException {
+        Log.logInfo("BLOBArray", "merging " + f1.getName() + " with " + f2.getName());
+        File resultFile = mergeWorker(f1, f2, payloadrow, newFile);
+        if (resultFile == null) return null;
+        mountBLOB(resultFile);
+        Log.logInfo("BLOBArray", "merged " + f1.getName() + " with " + f2.getName() + " into " + resultFile);
+        return resultFile;
+    }
+
+    private File mergeWorker(File f1, File f2, Row payloadrow, File newFile) throws IOException {
+        // iterate both files and write a new one
+        CloneableIterator<ReferenceContainer> i1 = new blobFileEntries(f1, payloadrow);
+        CloneableIterator<ReferenceContainer> i2 = new blobFileEntries(f2, payloadrow);
+        if (!i1.hasNext()) {
+            if (i2.hasNext()) {
+                // f1 is empty: f2 becomes the merge result as it is
+                FileUtils.deletedelete(f1);
+                if (f2.renameTo(newFile)) return newFile;
+                return f2;
+            } else {
+                // both files are empty: there is no result
+                FileUtils.deletedelete(f1);
+                FileUtils.deletedelete(f2);
+                return null;
+            }
+        } else if (!i2.hasNext()) {
+            // f2 is empty: f1 becomes the merge result as it is
+            FileUtils.deletedelete(f2);
+            if (f1.renameTo(newFile)) return newFile;
+            return f1;
+        }
+        assert i1.hasNext();
+        assert i2.hasNext();
+        File tmpFile = new File(newFile.getParentFile(), newFile.getName() + ".tmp");
+        HeapWriter writer = new HeapWriter(tmpFile, newFile, this.keylength(), this.ordering());
+        merge(i1, i2, this.ordering(), writer);
+        try {
+            writer.close(true);
+            // we don't need the old files any more
+            FileUtils.deletedelete(f1);
+            FileUtils.deletedelete(f2);
+            return newFile;
+        } catch (IOException e) {
+            FileUtils.deletedelete(tmpFile);
+            FileUtils.deletedelete(newFile);
+            e.printStackTrace();
+            return null;
+        }
+    }
+
+    private static void merge(CloneableIterator<ReferenceContainer> i1, CloneableIterator<ReferenceContainer> i2, ByteOrder ordering, HeapWriter writer) throws IOException {
+        assert i1.hasNext();
+        assert i2.hasNext();
+        // c1 and c2 always hold the next container that has not been written yet,
+        // or null when the corresponding input is fully consumed; this guarantees
+        // that the last fetched container is not lost when an iterator runs dry
+        ReferenceContainer c1, c2, c1o, c2o;
+        c1 = i1.next();
+        c2 = i2.next();
+        int e;
+        while (c1 != null && c2 != null) {
+            e = ordering.compare(c1.getWordHash().getBytes(), c2.getWordHash().getBytes());
+            if (e < 0) {
+                writer.add(c1.getWordHash().getBytes(), c1.exportCollection());
+                if (i1.hasNext()) {
+                    c1o = c1;
+                    c1 = i1.next();
+                    assert ordering.compare(c1.getWordHash().getBytes(), c1o.getWordHash().getBytes()) > 0;
+                } else {
+                    c1 = null;
+                }
+                continue;
+            }
+            if (e > 0) {
+                writer.add(c2.getWordHash().getBytes(), c2.exportCollection());
+                if (i2.hasNext()) {
+                    c2o = c2;
+                    c2 = i2.next();
+                    assert ordering.compare(c2.getWordHash().getBytes(), c2o.getWordHash().getBytes()) > 0;
+                } else {
+                    c2 = null;
+                }
+                continue;
+            }
+            assert e == 0;
+            // equal word hash in both files: merge the entries into one container
+            writer.add(c1.getWordHash().getBytes(), (c1.merge(c2)).exportCollection());
+            c1 = (i1.hasNext()) ? i1.next() : null;
+            c2 = (i2.hasNext()) ? i2.next() : null;
+        }
+        // catch up remaining entries of the input that is not yet consumed
+        assert !(i1.hasNext() && i2.hasNext());
+        while (c1 != null) {
+            writer.add(c1.getWordHash().getBytes(), c1.exportCollection());
+            c1 = (i1.hasNext()) ? i1.next() : null;
+        }
+        while (c2 != null) {
+            writer.add(c2.getWordHash().getBytes(), c2.exportCollection());
+            c2 = (i2.hasNext()) ? i2.next() : null;
+        }
+        // finished with writing
+    }
+
 
     public static void main(final String[] args) {
         final File f = new File("/Users/admin/blobarraytest");
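The mergeMount/merge pair added above is a classic two-way merge: both BLOB files deliver their ReferenceContainers sorted by word hash, equal keys are combined into a single container, and whatever remains of the longer input is flushed at the end. A type-simplified sketch of that control flow, using String keys and a pluggable combiner instead of YaCy's ReferenceContainer and HeapWriter:

    import java.util.AbstractMap.SimpleImmutableEntry;
    import java.util.ArrayList;
    import java.util.Iterator;
    import java.util.List;
    import java.util.Map;
    import java.util.function.BinaryOperator;

    public final class TwoWayMergeSketch {
        static <V> List<Map.Entry<String, V>> merge(
                final Iterator<Map.Entry<String, V>> i1,
                final Iterator<Map.Entry<String, V>> i2,
                final BinaryOperator<V> combiner) {
            final List<Map.Entry<String, V>> out = new ArrayList<Map.Entry<String, V>>();
            Map.Entry<String, V> c1 = i1.hasNext() ? i1.next() : null;
            Map.Entry<String, V> c2 = i2.hasNext() ? i2.next() : null;
            while (c1 != null && c2 != null) {
                final int e = c1.getKey().compareTo(c2.getKey());
                if (e < 0) {
                    out.add(c1); c1 = i1.hasNext() ? i1.next() : null;
                } else if (e > 0) {
                    out.add(c2); c2 = i2.hasNext() ? i2.next() : null;
                } else {
                    // equal keys: combine both values into one entry
                    out.add(new SimpleImmutableEntry<String, V>(c1.getKey(),
                            combiner.apply(c1.getValue(), c2.getValue())));
                    c1 = i1.hasNext() ? i1.next() : null;
                    c2 = i2.hasNext() ? i2.next() : null;
                }
            }
            // flush whatever remains of the longer input
            while (c1 != null) { out.add(c1); c1 = i1.hasNext() ? i1.next() : null; }
            while (c2 != null) { out.add(c2); c2 = i2.hasNext() ? i2.next() : null; }
            return out;
        }
    }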
diff --git a/source/de/anomic/kelondro/text/IODispatcher.java b/source/de/anomic/kelondro/text/IODispatcher.java
index ac6c33bcd..91cba793a 100644
--- a/source/de/anomic/kelondro/text/IODispatcher.java
+++ b/source/de/anomic/kelondro/text/IODispatcher.java
@@ -1,4 +1,4 @@
-// ReferenceContainerArray.java
+// IODispatcher.java
 // (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
 // first published 20.03.2009 on http://yacy.net
 //
@@ -29,15 +29,9 @@ import java.io.IOException;
 import java.util.concurrent.ArrayBlockingQueue;
 
 import de.anomic.kelondro.blob.BLOBArray;
-import de.anomic.kelondro.blob.HeapWriter;
 import de.anomic.kelondro.index.Row;
-import de.anomic.kelondro.order.ByteOrder;
-import de.anomic.kelondro.order.CloneableIterator;
-import de.anomic.kelondro.text.ReferenceContainerCache.blobFileEntries;
-import de.anomic.kelondro.util.FileUtils;
 
 /**
- * merger class for files from ReferenceContainerArray.
  * this is a concurrent merger that can merge single files that are queued for merging.
  * when several ReferenceContainerArray classes host their ReferenceContainer file arrays,
  * they may share a single ReferenceContainerMerger object which does the sharing for all
@@ -104,7 +98,7 @@ public class IODispatcher extends Thread {
     public synchronized void merge(File f1, File f2, BLOBArray array, Row payloadrow, File newFile) {
         if (mergeQueue == null || !this.isAlive()) {
             try {
-                mergeMount(f1, f2, array, payloadrow, newFile);
+                array.mergeMount(f1, f2, payloadrow, newFile);
             } catch (IOException e) {
                 e.printStackTrace();
             }
@@ -116,7 +110,7 @@ public class IODispatcher extends Thread {
             } catch (InterruptedException e) {
                 e.printStackTrace();
                 try {
-                    mergeMount(f1, f2, array, payloadrow, newFile);
+                    array.mergeMount(f1, f2, payloadrow, newFile);
                 } catch (IOException ee) {
                     ee.printStackTrace();
                 }
@@ -189,134 +183,12 @@ public class IODispatcher extends Thread {
 
         public File merge() {
             try {
-                return mergeMount(f1, f2, array, payloadrow, newFile);
+                return array.mergeMount(f1, f2, payloadrow, newFile);
             } catch (IOException e) {
                 e.printStackTrace();
             }
             return null;
         }
     }
-
-    public static File mergeMount(File f1, File f2, BLOBArray array, Row payloadrow, File newFile) throws IOException {
-        System.out.println("*** DEBUG mergeOldest: vvvvvvvvv array has " + array.entries() + " entries vvvvvvvvv");
-        System.out.println("*** DEBUG mergeOldest: unmounted " + f1.getName());
-        System.out.println("*** DEBUG mergeOldest: unmounted " + f2.getName());
-        File resultFile = mergeWorker(f1, f2, array, payloadrow, newFile);
-        if (resultFile == null) return null;
-        array.mountBLOB(resultFile);
-        System.out.println("*** DEBUG mergeOldest: mounted " + newFile.getName());
-        System.out.println("*** DEBUG mergeOldest: ^^^^^^^^^^^ array has " + array.entries() + " entries ^^^^^^^^^^^");
-        return resultFile;
-    }
-
-    private static File mergeWorker(File f1, File f2, BLOBArray array, Row payloadrow, File newFile) throws IOException {
-        // iterate both files and write a new one
-
-        CloneableIterator<ReferenceContainer> i1 = new blobFileEntries(f1, payloadrow);
-        CloneableIterator<ReferenceContainer> i2 = new blobFileEntries(f2, payloadrow);
-        if (!i1.hasNext()) {
-            if (i2.hasNext()) {
-                FileUtils.deletedelete(f1);
-                if (f2.renameTo(newFile)) return newFile;
-                return f2;
-            } else {
-                FileUtils.deletedelete(f1);
-                FileUtils.deletedelete(f2);
-                return null;
-            }
-        } else if (!i2.hasNext()) {
-            FileUtils.deletedelete(f2);
-            if (f1.renameTo(newFile)) return newFile;
-            return f1;
-        }
-        assert i1.hasNext();
-        assert i2.hasNext();
-        File tmpFile = new File(newFile.getParentFile(), newFile.getName() + ".tmp");
-        HeapWriter writer = new HeapWriter(tmpFile, newFile, array.keylength(), array.ordering());
-        merge(i1, i2, array.ordering(), writer);
-        try {
-            writer.close(true);
-            // we don't need the old files any more
-            FileUtils.deletedelete(f1);
-            FileUtils.deletedelete(f2);
-            return newFile;
-        } catch (IOException e) {
-            FileUtils.deletedelete(tmpFile);
-            FileUtils.deletedelete(newFile);
-            e.printStackTrace();
-            return null;
-        }
-    }
-
-    private static void merge(CloneableIterator<ReferenceContainer> i1, CloneableIterator<ReferenceContainer> i2, ByteOrder ordering, HeapWriter writer) throws IOException {
-        assert i1.hasNext();
-        assert i2.hasNext();
-        ReferenceContainer c1, c2, c1o, c2o;
-        c1 = i1.next();
-        c2 = i2.next();
-        int e;
-        while (true) {
-            assert c1 != null;
-            assert c2 != null;
-            e = ordering.compare(c1.getWordHash().getBytes(), c2.getWordHash().getBytes());
-            if (e < 0) {
-                writer.add(c1.getWordHash().getBytes(), c1.exportCollection());
-                if (i1.hasNext()) {
-                    c1o = c1;
-                    c1 = i1.next();
-                    assert ordering.compare(c1.getWordHash().getBytes(), c1o.getWordHash().getBytes()) > 0;
-                    continue;
-                }
-                break;
-            }
-            if (e > 0) {
-                writer.add(c2.getWordHash().getBytes(), c2.exportCollection());
-                if (i2.hasNext()) {
-                    c2o = c2;
-                    c2 = i2.next();
-                    assert ordering.compare(c2.getWordHash().getBytes(), c2o.getWordHash().getBytes()) > 0;
-                    continue;
-                }
-                break;
-            }
-            assert e == 0;
-            // merge the entries
-            writer.add(c1.getWordHash().getBytes(), (c1.merge(c2)).exportCollection());
-            if (i1.hasNext() && i2.hasNext()) {
-                c1 = i1.next();
-                c2 = i2.next();
-                continue;
-            }
-            if (i1.hasNext()) c1 = i1.next();
-            if (i2.hasNext()) c2 = i2.next();
-            break;
-
-        }
-        // catch up remaining entries
-        assert !(i1.hasNext() && i2.hasNext());
-        while (i1.hasNext()) {
-            //System.out.println("FLUSH REMAINING 1: " + c1.getWordHash());
-            writer.add(c1.getWordHash().getBytes(), c1.exportCollection());
-            if (i1.hasNext()) {
-                c1o = c1;
-                c1 = i1.next();
-                assert ordering.compare(c1.getWordHash().getBytes(), c1o.getWordHash().getBytes()) > 0;
-                continue;
-            }
-            break;
-        }
-        while (i2.hasNext()) {
-            //System.out.println("FLUSH REMAINING 2: " + c2.getWordHash());
-            writer.add(c2.getWordHash().getBytes(), c2.exportCollection());
-            if (i2.hasNext()) {
-                c2o = c2;
-                c2 = i2.next();
-                assert ordering.compare(c2.getWordHash().getBytes(), c2o.getWordHash().getBytes()) > 0;
-                continue;
-            }
-            break;
-        }
-        // finished with writing
-    }
 }
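After this change IODispatcher no longer contains merge logic of its own: it queues merge jobs on an ArrayBlockingQueue, lets the dispatcher thread execute them one at a time, and falls back to a synchronous merge when the thread is not running or enqueueing is interrupted — exactly the structure visible in the merge(...) hunks above. A reduced sketch of that dispatch pattern, with Runnable jobs standing in for YaCy's job objects:

    import java.util.concurrent.ArrayBlockingQueue;

    public final class MergeDispatcherSketch extends Thread {
        private final ArrayBlockingQueue<Runnable> mergeQueue =
                new ArrayBlockingQueue<Runnable>(100);

        public synchronized void dispatch(final Runnable mergeJob) {
            if (!this.isAlive()) { mergeJob.run(); return; } // fallback: run in caller's thread
            try {
                mergeQueue.put(mergeJob);                    // blocks while the queue is full
            } catch (final InterruptedException e) {
                mergeJob.run();                              // fallback on interruption
            }
        }

        @Override
        public void run() {
            try {
                while (true) mergeQueue.take().run();        // one merge at a time
            } catch (final InterruptedException e) {
                Thread.currentThread().interrupt();          // treat as shutdown signal
            }
        }
    }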
diff --git a/source/de/anomic/plasma/plasmaSearchQuery.java b/source/de/anomic/plasma/plasmaSearchQuery.java
index e735ce784..c0eebde90 100644
--- a/source/de/anomic/plasma/plasmaSearchQuery.java
+++ b/source/de/anomic/plasma/plasmaSearchQuery.java
@@ -70,7 +70,8 @@ public final class plasmaSearchQuery {
     public boolean allofconstraint;
     public boolean onlineSnippetFetch;
     public plasmaSearchRankingProfile ranking;
-    public String host;
+    public String host; // this is the client host that starts the query, not a site operator
+    public String sitehash; // this is a domain hash, 6 bytes long or null
     public yacySeed remotepeer;
     public Long handle;
     // values that are set after a search:
@@ -109,6 +110,7 @@ public final class plasmaSearchQuery {
         this.allofconstraint = false;
         this.onlineSnippetFetch = false;
         this.host = null;
+        this.sitehash = null;
         this.remotepeer = null;
         this.handle = Long.valueOf(System.currentTimeMillis());
         this.specialRights = false;
@@ -125,6 +127,7 @@ public final class plasmaSearchQuery {
                 final int lines, final int offset, final String urlMask,
                 final int domType, final String domGroupName, final int domMaxTargets,
                 final Bitfield constraint, final boolean allofconstraint,
+                final String site,
                 final int domainzone,
                 final String host,
                 final boolean specialRights) {
@@ -146,6 +149,7 @@ public final class plasmaSearchQuery {
         this.domMaxTargets = domMaxTargets;
         this.constraint = constraint;
         this.allofconstraint = allofconstraint;
+        this.sitehash = site; assert site == null || site.length() == 6;
         this.onlineSnippetFetch = onlineSnippetFetch;
         this.host = host;
         this.remotepeer = null;
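The sitehash field declared above is checked in the ranking process below. The check relies on the layout of YaCy URL hashes: a urlHash is 12 characters long and its last 6 characters are the hash of the host's domain, so urlHash.substring(6) can be compared directly against the 6-character sitehash. A small sketch of that predicate (hash values are made up for illustration):

    public final class SiteFilterSketch {
        static boolean matchesSite(final String urlHash, final String sitehash) {
            assert urlHash.length() == 12;
            assert sitehash == null || sitehash.length() == 6;
            // null means: no site constraint was given
            return sitehash == null || urlHash.substring(6).equals(sitehash);
        }

        public static void main(final String[] args) {
            System.out.println(matchesSite("AAAAAADpHJBB", "DpHJBB")); // true: same domain
            System.out.println(matchesSite("AAAAAAxxxxxx", "DpHJBB")); // false: filtered out
            System.out.println(matchesSite("AAAAAAxxxxxx", null));     // true: unconstrained
        }
    }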
diff --git a/source/de/anomic/plasma/plasmaSearchRankingProcess.java b/source/de/anomic/plasma/plasmaSearchRankingProcess.java
index b1514db82..3b5a3096e 100644
--- a/source/de/anomic/plasma/plasmaSearchRankingProcess.java
+++ b/source/de/anomic/plasma/plasmaSearchRankingProcess.java
@@ -74,7 +74,11 @@ public final class plasmaSearchRankingProcess {
     private HashMap[] localSearchContainerMaps;
     private final int[] domZones;
 
-    public plasmaSearchRankingProcess(final plasmaWordIndex wordIndex, final plasmaSearchQuery query, final int maxentries, final int concurrency) {
+    public plasmaSearchRankingProcess(
+            final plasmaWordIndex wordIndex,
+            final plasmaSearchQuery query,
+            final int maxentries,
+            final int concurrency) {
         // we collect the urlhashes and construct a list with urlEntry objects
         // attention: if minEntries is too high, this method will not terminate within the maxTime
         // sortorder: 0 = hash, 1 = url, 2 = ranking
@@ -183,6 +187,12 @@ public final class plasmaSearchRankingProcess {
                 continue;
             }
 
+            // check site constraints
+            if (query.sitehash != null && !iEntry.urlHash().substring(6).equals(query.sitehash)) {
+                // filter out all domains that do not match with the site constraint
+                continue;
+            }
+
             // count domZones
             /*
             indexURLEntry uentry = wordIndex.loadedURL.load(iEntry.urlHash, iEntry, 0); // this eats up a lot of time!!!
diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java
index 1b8af4914..186ece738 100644
--- a/source/de/anomic/plasma/plasmaWordIndex.java
+++ b/source/de/anomic/plasma/plasmaWordIndex.java
@@ -579,13 +579,15 @@ public final class plasmaWordIndex {
 
         // search for the set of hashes and return a map of wordhash:indexContainer containing the search result
 
         // retrieve entities that belong to the hashes
-        HashMap inclusionContainers = (queryHashes.size() == 0) ? new HashMap(0) : getContainers(
-                queryHashes,
-                urlselection);
+        HashMap inclusionContainers =
+            (queryHashes.size() == 0) ?
+                new HashMap(0) :
+                getContainers(queryHashes, urlselection);
         if ((inclusionContainers.size() != 0) && (inclusionContainers.size() < queryHashes.size())) inclusionContainers = new HashMap(0); // prevent that only a subset is returned
-        final HashMap exclusionContainers = (inclusionContainers.size() == 0) ? new HashMap(0) : getContainers(
-                excludeHashes,
-                urlselection);
+        final HashMap exclusionContainers =
+            (inclusionContainers.size() == 0) ?
+                new HashMap(0) :
+                getContainers(excludeHashes, urlselection);
         return new HashMap[]{inclusionContainers, exclusionContainers};
     }
diff --git a/source/de/anomic/yacy/yacyURL.java b/source/de/anomic/yacy/yacyURL.java
index 40e0a6cf6..34a87a593 100644
--- a/source/de/anomic/yacy/yacyURL.java
+++ b/source/de/anomic/yacy/yacyURL.java
@@ -42,9 +42,7 @@ import de.anomic.tools.Punycode;
 import de.anomic.tools.Punycode.PunycodeException;
 
 public class yacyURL implements Serializable {
-    /**
-     * generated with svn4751 on 2008-05-01
-     */
+
     private static final long serialVersionUID = -1173233022912141884L;
     public static final int TLD_any_zone_filter = 255; // from TLD zones can be filtered during search; this is the catch-all filter
     private static final Pattern backPathPattern = Pattern.compile("(/[^/]+(?