From f4ffa9aee571c141cf41f949c0989fdd6e1d71d4 Mon Sep 17 00:00:00 2001 From: orbiter Date: Fri, 20 Jan 2006 15:14:21 +0000 Subject: [PATCH] - implemented more attributes to index entries - implemented hand-over of new word index attributes during remote search - implemented word-distance computation during search git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1382 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/IndexControl_p.java | 12 +- htroot/ViewFile.java | 2 +- htroot/yacy/crawlOrder.java | 2 +- htroot/yacy/search.java | 86 ++++++++- source/de/anomic/plasma/plasmaCrawlLURL.java | 180 +++++++----------- source/de/anomic/plasma/plasmaDbImporter.java | 2 +- .../de/anomic/plasma/plasmaSearchEvent.java | 2 +- .../de/anomic/plasma/plasmaSwitchboard.java | 85 +-------- .../anomic/plasma/plasmaSwitchboardQueue.java | 2 +- source/de/anomic/plasma/plasmaURLPool.java | 2 +- source/de/anomic/plasma/plasmaWordIndex.java | 3 + .../plasma/plasmaWordIndexAssortment.java | 2 +- .../plasma/plasmaWordIndexDistribution.java | 4 +- .../anomic/plasma/plasmaWordIndexEntity.java | 26 ++- .../anomic/plasma/plasmaWordIndexEntry.java | 156 +++++++-------- source/de/anomic/server/serverObjects.java | 3 - source/de/anomic/yacy/yacyClient.java | 32 ++-- source/yacy.java | 4 +- 18 files changed, 294 insertions(+), 311 deletions(-) diff --git a/htroot/IndexControl_p.java b/htroot/IndexControl_p.java index 63307112f..f6cb81323 100644 --- a/htroot/IndexControl_p.java +++ b/htroot/IndexControl_p.java @@ -218,7 +218,7 @@ public class IndexControl_p { if (post.containsKey("urlhashdelete")) { try { - plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash); + plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash, null); URL url = entry.url(); urlstring = htmlFilterContentScraper.urlNormalform(url); prop.put("urlstring", ""); @@ -268,7 +268,7 @@ public class IndexControl_p { while (urlIter.hasNext()) { indexEntry = (plasmaWordIndexEntry) 
urlIter.next(); try { - lurl = switchboard.urlPool.loadedURL.getEntry(indexEntry.getUrlHash()); + lurl = switchboard.urlPool.loadedURL.getEntry(indexEntry.getUrlHash(), null); if (lurl.toString() == null) { switchboard.urlPool.loadedURL.remove(indexEntry.getUrlHash()); unknownURLEntries.add(indexEntry.getUrlHash()); @@ -321,7 +321,7 @@ public class IndexControl_p { URL url = new URL(urlstring); urlhash = plasmaURL.urlHash(url); prop.put("urlhash", urlhash); - plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash); + plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash, null); prop.put("result", genUrlProfile(switchboard, entry, urlhash)); } catch (MalformedURLException e) { prop.put("urlstring", "bad url: " + urlstring); @@ -334,7 +334,7 @@ public class IndexControl_p { if (post.containsKey("urlhashsearch")) { try { - plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash); + plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash, null); URL url = entry.url(); urlstring = url.toString(); prop.put("urlstring", urlstring); @@ -395,7 +395,7 @@ public class IndexControl_p { URL url = entry.url(); String referrer = null; try { - referrer = switchboard.urlPool.loadedURL.getEntry(entry.referrerHash()).url().toString(); + referrer = switchboard.urlPool.loadedURL.getEntry(entry.referrerHash(), null).url().toString(); } catch (IOException e) { referrer = ""; } @@ -452,7 +452,7 @@ public class IndexControl_p { xi = (plasmaWordIndexEntry) en.next(); uh = new String[]{xi.getUrlHash(), Integer.toString(xi.posintext())}; try { - us = switchboard.urlPool.loadedURL.getEntry(uh[0]).url().toString(); + us = switchboard.urlPool.loadedURL.getEntry(uh[0], null).url().toString(); tm.put(us, uh); } catch (IOException e) { tm.put(uh[0], uh); diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index 048ff9cfe..8ac1dcf9a 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -104,7 +104,7 
@@ public class ViewFile { // getting the urlEntry that belongs to the url hash Entry urlEntry = null; try { - urlEntry = sb.urlPool.loadedURL.getEntry(urlHash); + urlEntry = sb.urlPool.loadedURL.getEntry(urlHash, null); } catch (IOException e) { prop.put("error",2); prop.put("viewMode",VIEW_MODE_NO_TEXT); diff --git a/htroot/yacy/crawlOrder.java b/htroot/yacy/crawlOrder.java index 093c46398..176e34952 100644 --- a/htroot/yacy/crawlOrder.java +++ b/htroot/yacy/crawlOrder.java @@ -245,7 +245,7 @@ public final class crawlOrder { reason = reasonString; // send lurl-Entry as response try { - plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(plasmaURL.urlHash(url)); + plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(plasmaURL.urlHash(url), null); response = "double"; switchboard.urlPool.loadedURL.notifyGCrawl(entry.hash(), iam, youare); lurl = crypt.simpleEncode(entry.toString()); diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index 700837b89..93c2940cf 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -47,11 +47,17 @@ // javac -classpath .:../../Classes search.java // if the shell's current path is htroot/yacy +import java.io.IOException; import java.util.HashSet; import de.anomic.http.httpHeader; +import de.anomic.plasma.plasmaCrawlLURL; +import de.anomic.plasma.plasmaSearchEvent; +import de.anomic.plasma.plasmaSearchResult; +import de.anomic.plasma.plasmaSnippetCache; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaWordIndexEntry; import de.anomic.plasma.plasmaSearchQuery; +import de.anomic.server.serverCore; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.yacy.yacyCore; @@ -64,9 +70,7 @@ public final class search { // return variable that accumulates replacements final plasmaSwitchboard sb = (plasmaSwitchboard) ss; - serverObjects prop = new serverObjects(); - if (prop == null || sb == null) { return null; } - + 
//System.out.println("yacy: search received request = " + post.toString()); final String oseed = post.get("myseed", ""); // complete seed of the requesting peer @@ -79,12 +83,20 @@ public final class search { final int count = post.getInt("count", 10); // maximum number of wanted results // final boolean global = ((String) post.get("resource", "global")).equals("global"); // if true, then result may consist of answers from other peers // Date remoteTime = yacyCore.parseUniversalDate((String) post.get(yacySeed.MYTIME)); // read remote time + + + // tell all threads to do nothing for a specific time + sb.wordIndex.intermission(2 * duetime); + sb.intermissionAllThreads(2 * duetime); + + // store accessing peer if (yacyCore.seedDB == null) { yacyCore.log.logSevere("yacy.search: seed cache not initialized"); } else { yacyCore.peerActions.peerArrival(yacySeed.genRemoteSeed(oseed, key), true); } + // prepare search final HashSet keyhashes = new HashSet(query.length() / plasmaWordIndexEntry.wordHashLength); for (int i = 0; i < (query.length() / plasmaWordIndexEntry.wordHashLength); i++) { keyhashes.add(query.substring(i * plasmaWordIndexEntry.wordHashLength, (i + 1) * plasmaWordIndexEntry.wordHashLength)); @@ -92,9 +104,73 @@ public final class search { final long timestamp = System.currentTimeMillis(); plasmaSearchQuery squery = new plasmaSearchQuery(keyhashes, new String[]{plasmaSearchQuery.ORDER_YBR, plasmaSearchQuery.ORDER_DATE, plasmaSearchQuery.ORDER_QUALITY}, - count, duetime, ".*"); + count, duetime, ".*"); + squery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL; + + serverObjects prop = new serverObjects(); + + yacyCore.log.logInfo("INIT HASH SEARCH: " + squery.queryHashes + " - " + squery.wantedResults + " links"); + long timestamp1 = System.currentTimeMillis(); + plasmaSearchEvent theSearch = new plasmaSearchEvent(squery, yacyCore.log, sb.wordIndex, sb.urlPool.loadedURL, sb.snippetCache); + plasmaSearchResult acc = null; + int idxc = 0; + try { + idxc = 
theSearch.localSearch(); + acc = theSearch.order(); + } catch (IOException e) { + } + + // result is a List of urlEntry elements + if ((idxc == 0) || (acc == null)) { + prop.put("totalcount", "0"); + prop.put("linkcount", "0"); + prop.put("references", ""); + } else { + prop.put("totalcount", Integer.toString(acc.sizeOrdered())); + int i = 0; + StringBuffer links = new StringBuffer(); + String resource = ""; + //plasmaIndexEntry pie; + plasmaCrawlLURL.Entry urlentry; + plasmaSnippetCache.result snippet; + while ((acc.hasMoreElements()) && (i < squery.wantedResults)) { + urlentry = acc.nextElement(); + snippet = sb.snippetCache.retrieve(urlentry.url(), squery.queryHashes, false, 260); + if (snippet.source == plasmaSnippetCache.ERROR_NO_MATCH) { + // suppress line: there is no match in that resource + } else { + if (snippet.line == null) { + resource = urlentry.toString(); + } else { + resource = urlentry.toString(snippet.line); + } + if (resource != null) { + links.append("resource").append(i).append("=").append(resource).append(serverCore.crlfString); + i++; + } + } + } + prop.put("links", links.toString()); + prop.put("linkcount", Integer.toString(i)); + + // prepare reference hints + Object[] ws = acc.getReferences(16); + StringBuffer refstr = new StringBuffer(); + for (int j = 0; j < ws.length; j++) + refstr.append(",").append((String) ws[j]); + prop.put("references", (refstr.length() > 0) ? 
refstr.substring(1) : refstr.toString()); + + // add information about forward peers + prop.put("fwhop", ""); // hops (depth) of forwards that had been performed to construct this result + prop.put("fwsrc", ""); // peers that helped to construct this result + prop.put("fwrec", ""); // peers that would have helped to construct this result (recommendations) + + + } - prop = sb.searchFromRemote(squery); + // log + yacyCore.log.logInfo("EXIT HASH SEARCH: " + squery.queryHashes + " - " + idxc + " links found, " + prop.get("linkcount", "?") + " links selected, " + ((System.currentTimeMillis() - timestamp1) / 1000) + " seconds"); + prop.put("searchtime", Long.toString(System.currentTimeMillis() - timestamp)); final int links = Integer.parseInt(prop.get("linkcount","0")); diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java index f85ea0b75..462957c8a 100644 --- a/source/de/anomic/plasma/plasmaCrawlLURL.java +++ b/source/de/anomic/plasma/plasmaCrawlLURL.java @@ -151,8 +151,8 @@ public final class plasmaCrawlLURL extends plasmaURL { return e; } - public synchronized Entry addEntry(Entry e, String initiatorHash, String executorHash, int stackType) { - if (e == null) { return null; } + public synchronized void addEntry(Entry e, String initiatorHash, String executorHash, int stackType) { + if (e == null) { return; } try { if (initiatorHash == null) { initiatorHash = dummyHash; } if (executorHash == null) { executorHash = dummyHash; } @@ -165,10 +165,10 @@ public final class plasmaCrawlLURL extends plasmaURL { case 5: lcrawlResultStack.add(e.urlHash + initiatorHash + executorHash); break; case 6: gcrawlResultStack.add(e.urlHash + initiatorHash + executorHash); break; } - return e; + return; } catch (Exception ex) { System.out.println("INTERNAL ERROR in newEntry/2: " + ex.toString()); - return null; + return; } } @@ -176,27 +176,14 @@ public final class plasmaCrawlLURL extends plasmaURL { gcrawlResultStack.add(urlHash + 
initiatorHash + executorHash); } - public synchronized Entry getEntry(String hash) throws IOException { - return new Entry(hash); + public synchronized Entry getEntry(String hash, plasmaWordIndexEntry searchedWord) throws IOException { + return new Entry(hash, searchedWord); } public synchronized Entry newEntry(Entry oldEntry) { if (oldEntry == null) return null; - /* - * de.anomic.plasma.plasmaCrawlLURL.Entry.Entry(URL url, String descr, - * Date moddate, Date loaddate, - * String referrerHash, - * int copyCount, - * boolean localNeed, - * int quality, - * String language, - * char doctype, - * long size, - * int wordCount) - */ return new Entry( oldEntry.url(), - oldEntry.hash(), oldEntry.descr(), oldEntry.moddate(), oldEntry.loaddate(), @@ -209,7 +196,7 @@ public final class plasmaCrawlLURL extends plasmaURL { oldEntry.size(), oldEntry.wordCount()); } - + public synchronized Entry newEntry(String propStr, boolean setGlobal) { if (propStr.startsWith("{") && propStr.endsWith("}")) { return new Entry(serverCodings.s2p(propStr.substring(1, propStr.length() - 1)), setGlobal); @@ -356,7 +343,7 @@ public final class plasmaCrawlLURL extends plasmaURL { urlHash = getUrlHash(tabletype, i); // serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urlHash=" + urlHash); try { - urle = getEntry(urlHash); + urle = getEntry(urlHash, null); // serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urle=" + urle.toString()); initiatorSeed = yacyCore.seedDB.getConnected(initiatorHash); executorSeed = yacyCore.seedDB.getConnected(executorHash); @@ -397,72 +384,44 @@ public final class plasmaCrawlLURL extends plasmaURL { public class Entry { - private URL url; - private String descr; - private Date moddate; - private Date loaddate; - private String urlHash; - private String referrerHash; - private int copyCount; - private String flags; - private int quality; - private String language; - private char doctype; - private long size; - private int wordCount; - private String 
snippet; - - public Entry( - URL url, - String descr, - Date moddate, - Date loaddate, - String referrerHash, - int copyCount, - boolean localNeed, - int quality, - String language, - char doctype, - long size, - int wordCount - ) { - this(url,null,descr,moddate,loaddate,referrerHash,copyCount,localNeed,quality,language,doctype,size,wordCount); - } - - Entry( - URL url, - String theUrlHash, - String descr, - Date moddate, - Date loaddate, - String referrerHash, - int copyCount, - boolean localNeed, - int quality, - String language, - char doctype, - long size, - int wordCount - ) { - // create new entry and store it into database - this.urlHash = (theUrlHash == null) ? urlHash(url) : theUrlHash; - this.url = url; - this.descr = (descr==null)?this.url.toString():descr; - this.moddate = moddate; - this.loaddate = loaddate; - this.referrerHash = (referrerHash == null) ? dummyHash : referrerHash; - this.copyCount = copyCount; // the number of remote (global) copies of this object without this one - this.flags = (localNeed) ? 
"L " : " "; - this.quality = quality; - this.language = (language==null)?"uk":language; - this.doctype = doctype; - this.size = size; - this.wordCount = wordCount; - this.snippet = null; - store(); - } - - public Entry(String urlHash) throws IOException { + private URL url; + + private String descr; + private Date moddate; + private Date loaddate; + private String urlHash; + private String referrerHash; + private int copyCount; + private String flags; + private int quality; + private String language; + private char doctype; + private long size; + private int wordCount; + private String snippet; + private plasmaWordIndexEntry word; + + public Entry(URL url, String descr, Date moddate, Date loaddate, String referrerHash, int copyCount, boolean localNeed, int quality, String language, char doctype, long size, int wordCount) { + // create new entry and store it into database + this.urlHash = urlHash(url); + this.url = url; + this.descr = (descr == null) ? this.url.toString() : descr; + this.moddate = moddate; + this.loaddate = loaddate; + this.referrerHash = (referrerHash == null) ? dummyHash : referrerHash; + this.copyCount = copyCount; // the number of remote (global) copies of this object without this one + this.flags = (localNeed) ? "L " : " "; + this.quality = quality; + this.language = (language == null) ? "uk" : language; + this.doctype = doctype; + this.size = size; + this.wordCount = wordCount; + this.snippet = null; + this.word = null; + store(); + } + + public Entry(String urlHash, plasmaWordIndexEntry searchedWord) throws IOException { // generates an plasmaLURLEntry using the url hash // to speed up the access, the url-hashes are buffered // in the hash cache. 
@@ -488,6 +447,7 @@ public final class plasmaCrawlLURL extends plasmaURL { this.size = kelondroBase64Order.enhancedCoder.decodeLong(new String(entry[11], "UTF-8")); this.wordCount = (int) kelondroBase64Order.enhancedCoder.decodeLong(new String(entry[12], "UTF-8")); this.snippet = null; + this.word = searchedWord; return; } } catch (Exception e) { @@ -519,8 +479,9 @@ public final class plasmaCrawlLURL extends plasmaURL { this.doctype = prop.getProperty("dt", "t").charAt(0); this.size = Long.parseLong(prop.getProperty("size", "0")); this.wordCount = Integer.parseInt(prop.getProperty("wc", "0")); - this.snippet = prop.getProperty("snippet", ""); - if (snippet.length() == 0) snippet = null; else snippet = crypt.simpleDecode(snippet, null); + this.snippet = prop.getProperty("snippet", ""); + if (snippet.length() == 0) snippet = null; else snippet = crypt.simpleDecode(snippet, null); + this.word = (prop.containsKey("word")) ? new plasmaWordIndexEntry(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word",""))) : null; store(); //} } catch (Exception e) { @@ -623,6 +584,10 @@ public final class plasmaCrawlLURL extends plasmaURL { return snippet; } + public plasmaWordIndexEntry word() { + return word; + } + private StringBuffer corePropList() { // generate a parseable string; this is a simple property-list final StringBuffer corePropStr = new StringBuffer(300); @@ -640,23 +605,14 @@ public final class plasmaCrawlLURL extends plasmaURL { .append(",dt=") .append(doctype) .append(",lang=") .append(language) .append(",url=") .append(crypt.simpleEncode(url.toString())) - .append(",descr=") .append(crypt.simpleEncode(descr)); + .append(",descr=") .append(crypt.simpleEncode(descr)); + + if (this.word != null) { + // append also word properties + corePropStr.append(",word=").append(kelondroBase64Order.enhancedCoder.encodeString(word.toExternalForm())); + } return corePropStr; -// return -// "hash=" + urlHash + -// ",referrer=" + referrerHash + -// ",mod=" + 
shortDayFormatter.format(moddate) + -// ",load=" + shortDayFormatter.format(loaddate) + -// ",size=" + size + -// ",wc=" + wordCount + -// ",cc=" + copyCount + -// ",local=" + ((local()) ? "true" : "false") + -// ",q=" + serverCodings.enhancedCoder.encodeBase64Long(quality, urlQualityLength) + -// ",dt=" + doctype + -// ",lang=" + language + -// ",url=" + crypt.simpleEncode(url.toString()) + -// ",descr=" + crypt.simpleEncode(descr); } catch (Exception e) { // serverLog.logFailure("plasmaLURL.corePropList", e.getMessage()); // if (moddate == null) serverLog.logFailure("plasmaLURL.corePropList", "moddate=null"); @@ -666,6 +622,7 @@ public final class plasmaCrawlLURL extends plasmaURL { } } + /* public String toString(int posintext, int posinphrase, int posofphrase) { // add information needed for remote transport final StringBuffer core = corePropList(); @@ -678,15 +635,9 @@ public final class plasmaCrawlLURL extends plasmaURL { .append(",posofphraseint=").append(posofphrase) .append("}"); return core.toString(); - -// return -// "{" + core + -// ",posintext=" + posintext + -// ",posinphrase=" + posinphrase + -// ",posofphraseint=" + posofphrase + -// "}"; } - + */ + public String toString(String snippet) { // add information needed for remote transport final StringBuffer core = corePropList(); @@ -694,7 +645,8 @@ public final class plasmaCrawlLURL extends plasmaURL { core.ensureCapacity(core.length() + snippet.length()*2); core.insert(0,"{"); - core.append(",snippet=").append(crypt.simpleEncode(snippet)).append("}"); + core.append(",snippet=").append(crypt.simpleEncode(snippet)); + core.append("}"); return core.toString(); //return "{" + core + ",snippet=" + crypt.simpleEncode(snippet) + "}"; @@ -751,7 +703,7 @@ public final class plasmaCrawlLURL extends plasmaURL { String hash = null; try { hash = new String(e); - return new Entry(hash); + return new Entry(hash, null); } catch (IOException ex) { throw new RuntimeException("error '" + ex.getMessage() + "' for hash 
" + hash); } diff --git a/source/de/anomic/plasma/plasmaDbImporter.java b/source/de/anomic/plasma/plasmaDbImporter.java index 797151093..f570c4a8b 100644 --- a/source/de/anomic/plasma/plasmaDbImporter.java +++ b/source/de/anomic/plasma/plasmaDbImporter.java @@ -253,7 +253,7 @@ public class plasmaDbImporter extends Thread { String urlHash = importWordIdxEntry.getUrlHash(); if ((this.importUrlDB.exists(urlHash)) && (!this.homeUrlDB.exists(urlHash))) try { // importing the new url - plasmaCrawlLURL.Entry urlEntry = this.importUrlDB.getEntry(urlHash); + plasmaCrawlLURL.Entry urlEntry = this.importUrlDB.getEntry(urlHash, importWordIdxEntry); urlCounter++; this.homeUrlDB.newEntry(urlEntry); diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index c3d795466..9439bf120 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -236,7 +236,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable { entry = preorder.next(); // find the url entry try { - page = urlStore.getEntry(entry.getUrlHash()); + page = urlStore.getEntry(entry.getUrlHash(), entry); // add a result acc.addResult(entry, page); } catch (IOException e) { diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 35bf5282a..5541e5791 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -134,7 +134,6 @@ import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.kelondro.kelondroTables; import de.anomic.server.serverAbstractSwitch; import de.anomic.server.serverCodings; -import de.anomic.server.serverCore; import de.anomic.server.serverDate; import de.anomic.server.serverInstantThread; import de.anomic.server.serverObjects; @@ -1322,9 +1321,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser plasmaWordIndexEntity 
wordIdxEntity = new plasmaWordIndexEntity(wordHash); plasmaWordIndexEntry wordIdxEntry = new plasmaWordIndexEntry(urlHash, condenser.wordCount(word), + condenser.RESULT_SIMI_WORDS, + condenser.RESULT_SIMI_SENTENCES, condenser.wordPositionInText(word), condenser.wordPositionInPhrase(word), condenser.wordNumberOfPhrase(word), + 0, docDate.getTime(), quality, language, doctype, true); wordIdxEntity.addEntry(wordIdxEntry); @@ -1575,7 +1577,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser String lurl = (String) page.get("lurl"); if ((lurl != null) && (lurl.length() != 0)) { String propStr = crypt.simpleDecode(lurl, (String) page.get("key")); - plasmaCrawlLURL.Entry entry = urlPool.loadedURL.addEntry(urlPool.loadedURL.newEntry(propStr, true), yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1); + plasmaCrawlLURL.Entry entry = urlPool.loadedURL.newEntry(propStr, true); + urlPool.loadedURL.addEntry(entry, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1); // *** überflüssig/doppelt? urlPool.noticeURL.remove(entry.hash()); log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " SUPERFLUOUS. CAUSE: " + page.get("reason") + " (URL=" + urlEntry.url().toString() + "). 
URL IS CONSIDERED AS 'LOADED!'"); return true; @@ -1760,85 +1763,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } } - public serverObjects searchFromRemote(plasmaSearchQuery query) { - - // tell all threads to do nothing for a specific time - wordIndex.intermission(2 * query.maximumTime); - intermissionAllThreads(2 * query.maximumTime); - - query.domType = plasmaSearchQuery.SEARCHDOM_LOCAL; - - serverObjects prop = new serverObjects(); - try { - log.logInfo("INIT HASH SEARCH: " + query.queryHashes + " - " + query.wantedResults + " links"); - long timestamp = System.currentTimeMillis(); - plasmaSearchEvent theSearch = new plasmaSearchEvent(query, log, wordIndex, urlPool.loadedURL, snippetCache); - int idxc = theSearch.localSearch(); - plasmaSearchResult acc = theSearch.order(); - - // result is a List of urlEntry elements - if (acc == null) { - prop.put("totalcount", "0"); - prop.put("linkcount", "0"); - prop.put("references", ""); - } else { - prop.put("totalcount", Integer.toString(acc.sizeOrdered())); - int i = 0; - StringBuffer links = new StringBuffer(); - String resource = ""; - //plasmaIndexEntry pie; - plasmaCrawlLURL.Entry urlentry; - plasmaSnippetCache.result snippet; - while ((acc.hasMoreElements()) && (i < query.wantedResults)) { - urlentry = acc.nextElement(); - snippet = snippetCache.retrieve(urlentry.url(), query.queryHashes, false, 260); - if (snippet.source == plasmaSnippetCache.ERROR_NO_MATCH) { - // suppress line: there is no match in that resource - } else { - if (snippet.line == null) { - resource = urlentry.toString(); - } else { - resource = urlentry.toString(snippet.line); - } - if (resource != null) { - links.append("resource").append(i).append("=").append(resource).append(serverCore.crlfString); - i++; - } - } - } - prop.put("links", links.toString()); - prop.put("linkcount", Integer.toString(i)); - - // prepare reference hints - Object[] ws = acc.getReferences(16); - StringBuffer refstr = new 
StringBuffer(); - for (int j = 0; j < ws.length; j++) refstr.append(",").append((String) ws[j]); - prop.put("references", (refstr.length() > 0)?refstr.substring(1):refstr.toString()); - } - - // add information about forward peers - prop.put("fwhop", ""); // hops (depth) of forwards that had been performed to construct this result - prop.put("fwsrc", ""); // peers that helped to construct this result - prop.put("fwrec", ""); // peers that would have helped to construct this result (recommendations) - - // log - log.logInfo("EXIT HASH SEARCH: " + query.queryHashes + " - " + idxc + " links found, " + - prop.get("linkcount", "?") + " links selected, " + - ((System.currentTimeMillis() - timestamp) / 1000) + " seconds"); - return prop; - } catch (IOException e) { - return null; - } - } - - public serverObjects action(String actionName, serverObjects actionInput) { - // perform an action. (not used) - + // perform an action. (not used) return null; } - public String toString() { // it is possible to use this method in the cgi pages. 
// actually it is used there for testing purpose @@ -1856,7 +1785,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // determine the url string try { - plasmaCrawlLURL.Entry entry = urlPool.loadedURL.getEntry(urlhash); + plasmaCrawlLURL.Entry entry = urlPool.loadedURL.getEntry(urlhash, null); URL url = entry.url(); if (url == null) return 0; diff --git a/source/de/anomic/plasma/plasmaSwitchboardQueue.java b/source/de/anomic/plasma/plasmaSwitchboardQueue.java index 7c5dd713e..0af5869aa 100644 --- a/source/de/anomic/plasma/plasmaSwitchboardQueue.java +++ b/source/de/anomic/plasma/plasmaSwitchboardQueue.java @@ -290,7 +290,7 @@ public class plasmaSwitchboardQueue { if (referrerURL == null) { if ((referrerHash == null) || (referrerHash.equals(plasmaURL.dummyHash))) return null; try { - referrerURL = lurls.getEntry(referrerHash).url(); + referrerURL = lurls.getEntry(referrerHash, null).url(); } catch (IOException e) { referrerURL = null; return null; diff --git a/source/de/anomic/plasma/plasmaURLPool.java b/source/de/anomic/plasma/plasmaURLPool.java index dbd5e9a31..276c14140 100644 --- a/source/de/anomic/plasma/plasmaURLPool.java +++ b/source/de/anomic/plasma/plasmaURLPool.java @@ -76,7 +76,7 @@ public class plasmaURLPool { plasmaCrawlNURL.Entry ne = noticeURL.getEntry(urlhash); if (ne != null) return ne.url(); try { - plasmaCrawlLURL.Entry le = loadedURL.getEntry(urlhash); + plasmaCrawlLURL.Entry le = loadedURL.getEntry(urlhash, null); if (le != null) return le.url(); } catch (IOException e) {} plasmaCrawlEURL.Entry ee = errorURL.getEntry(urlhash); diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index d29d2be13..65519f94c 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -160,9 +160,12 @@ public final class plasmaWordIndex { wordHash = plasmaWordIndexEntry.word2hash(word); entry = new plasmaWordIndexEntry(urlHash, 
condenser.wordCount(word), + condenser.RESULT_SIMI_WORDS, + condenser.RESULT_SIMI_SENTENCES, condenser.wordPositionInText(word), condenser.wordPositionInPhrase(word), condenser.wordNumberOfPhrase(word), + 0, urlModified.getTime(), quality, language, doctype, true); addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), entry), false); } diff --git a/source/de/anomic/plasma/plasmaWordIndexAssortment.java b/source/de/anomic/plasma/plasmaWordIndexAssortment.java index 3b17634c3..bf020242a 100644 --- a/source/de/anomic/plasma/plasmaWordIndexAssortment.java +++ b/source/de/anomic/plasma/plasmaWordIndexAssortment.java @@ -70,7 +70,7 @@ public final class plasmaWordIndexAssortment { 4, // occurrence counter 8, // timestamp of last access plasmaWordIndexEntry.urlHashLength, // corresponding URL hash - plasmaWordIndexEntry.attrSpaceLong // URL attributes + plasmaWordIndexEntry.attrSpace // URL attributes }; // class variables diff --git a/source/de/anomic/plasma/plasmaWordIndexDistribution.java b/source/de/anomic/plasma/plasmaWordIndexDistribution.java index dc0ca29e7..d42373630 100644 --- a/source/de/anomic/plasma/plasmaWordIndexDistribution.java +++ b/source/de/anomic/plasma/plasmaWordIndexDistribution.java @@ -359,7 +359,7 @@ public final class plasmaWordIndexDistribution { while (urlIter.hasNext()) { indexEntry = (plasmaWordIndexEntry) urlIter.next(); try { - lurl = this.urlPool.loadedURL.getEntry(indexEntry.getUrlHash()); + lurl = this.urlPool.loadedURL.getEntry(indexEntry.getUrlHash(), indexEntry); if ((lurl == null) || (lurl.url() == null)) { unknownURLEntries.add(indexEntry.getUrlHash()); } else { @@ -399,7 +399,7 @@ public final class plasmaWordIndexDistribution { while ((urlIter.hasNext()) && (count > 0)) { indexEntry = (plasmaWordIndexEntry) urlIter.next(); try { - lurl = this.urlPool.loadedURL.getEntry(indexEntry.getUrlHash()); + lurl = this.urlPool.loadedURL.getEntry(indexEntry.getUrlHash(), indexEntry); if ((lurl == 
null) || (lurl.url()==null)) { unknownURLEntries.add(indexEntry.getUrlHash()); } else { diff --git a/source/de/anomic/plasma/plasmaWordIndexEntity.java b/source/de/anomic/plasma/plasmaWordIndexEntity.java index b339f0910..94ee53522 100644 --- a/source/de/anomic/plasma/plasmaWordIndexEntity.java +++ b/source/de/anomic/plasma/plasmaWordIndexEntity.java @@ -95,10 +95,10 @@ public final class plasmaWordIndexEntity { kt = new kelondroTree(theLocation, cacheSize); } catch (IOException e) { theLocation.delete(); - kt = new kelondroTree(theLocation, cacheSize, plasmaURL.urlHashLength, plasmaWordIndexEntry.attrSpaceLong, false); + kt = new kelondroTree(theLocation, cacheSize, plasmaURL.urlHashLength, plasmaWordIndexEntry.attrSpace, false); } else { // create new index file - kt = new kelondroTree(theLocation, cacheSize, plasmaURL.urlHashLength, plasmaWordIndexEntry.attrSpaceLong, false); + kt = new kelondroTree(theLocation, cacheSize, plasmaURL.urlHashLength, plasmaWordIndexEntry.attrSpace, false); } return kt; // everyone who get this should close it when finished! 
} @@ -157,6 +157,16 @@ public final class plasmaWordIndexEntity { } catch (IOException e) {} } + public plasmaWordIndexEntry getEntry(String urlhash) throws IOException { + if (theTmpMap == null) { + byte[][] n = theIndex.get(urlhash.getBytes()); + if (n == null) return null; + return new plasmaWordIndexEntry(new String(n[0]), new String(n[1])); + } else { + return (plasmaWordIndexEntry) theTmpMap.get(urlhash); + } + } + public boolean contains(String urlhash) throws IOException { if (theTmpMap == null) return (theIndex.get(urlhash.getBytes()) != null); else return (theTmpMap.containsKey(urlhash)); } @@ -390,12 +400,17 @@ public final class plasmaWordIndexEntity { System.out.println("DEBUG: JOIN METHOD BY TEST"); plasmaWordIndexEntity conj = new plasmaWordIndexEntity(null); // start with empty search result Iterator se = small.elements(true); - plasmaWordIndexEntry ie; + plasmaWordIndexEntry ie0, ie1; long stamp = System.currentTimeMillis(); try { while ((se.hasNext()) && ((System.currentTimeMillis() - stamp) < time)) { - ie = (plasmaWordIndexEntry) se.next(); - if (large.contains(ie)) conj.addEntry(ie); + ie0 = (plasmaWordIndexEntry) se.next(); + ie1 = large.getEntry(ie0.getUrlHash()); + if (ie1 != null) { + // this is a hit. Calculate word distance: + ie0.combineDistance(ie1); + conj.addEntry(ie0); + } } } catch (kelondroException e) { //serverLog.logSevere("PLASMA", "joinConstructiveByTest: Database corrupt (" + e.getMessage() + "), deleting index"); @@ -449,6 +464,7 @@ public final class plasmaWordIndexEntity { } } else { // we have found the same urls in different searches! 
+ ie1.combineDistance(ie2); conj.addEntry(ie1); try { if (e1.hasNext()) ie1 = (plasmaWordIndexEntry) e1.next(); else break; diff --git a/source/de/anomic/plasma/plasmaWordIndexEntry.java b/source/de/anomic/plasma/plasmaWordIndexEntry.java index 32a9840e2..57d15a55f 100644 --- a/source/de/anomic/plasma/plasmaWordIndexEntry.java +++ b/source/de/anomic/plasma/plasmaWordIndexEntry.java @@ -67,24 +67,26 @@ public final class plasmaWordIndexEntry { public static final int urlHashLength = yacySeedDB.commonHashLength; // 12 // the size of the index entry attributes - //public static final int attrSpaceShort = 12; - public static final int attrSpaceLong = 18; + public static final int attrSpace = 24; // the associated hash private final String urlHash; // discrete values - private int count; // words in file + private int hitcount; // words in file + private int wordcount; + private int phrasecount; private int posintext; // first position of the word in text as number of word; 0=unknown or irrelevant position private int posinphrase; // position within a phrase of the word private int posofphrase; // position of the phrase in the text as count of sentences; 0=unknown; 1=path; 2=keywords; 3=headline; >4: in text + private int worddistance; private long lastModified;// calculated by using last-modified private int quality; // result of a heuristic on the source file private byte[] language; // essentially the country code (the TLD as heuristic), two letters lowercase only private char doctype; // type of source private char localflag; // indicates if the index was created locally - // some doctypes: + // doctypes: public static final char DT_PDFPS = 'p'; public static final char DT_TEXT = 't'; public static final char DT_HTML = 'h'; @@ -97,6 +99,19 @@ public final class plasmaWordIndexEntry { public static final char DT_BINARY = 'b'; public static final char DT_UNKNOWN = 'u'; + // appearance locations: (used for flags) + public static final int AP_TITLE = 0; // title tag 
from html header + public static final int AP_H1 = 1; // h0-tag + public static final int AP_H2 = 2; + public static final int AP_H3 = 3; + public static final int AP_H4 = 4; + public static final int AP_H5 = 5; + public static final int AP_H6 = 6; + public static final int AP_ANCHOR = 7; // anchor description + public static final int AP_URL = 8; // word inside an url + public static final int AP_IMG = 9; // tag inside image references + public static final int AP_TAG = 10; // for tagged indexeing (i.e. using mp3 tags) + // local flag attributes public static final char LT_LOCAL = 'L'; public static final char LT_GLOBAL = 'G'; @@ -187,23 +202,22 @@ public final class plasmaWordIndexEntry { // the class instantiation can only be done by a plasmaStore method // therefore they are all public public plasmaWordIndexEntry(String urlHash, - int count, // how often appears this word in the text - int posintext, - int posinphrase, - int posofphrase, - long time, - int quality, - String language, - char doctype, + int hitcount, // how often appears this word in the text + int wordcount, // total number of words + int phrasecount, // total number of phrases + int posintext, // position of word in all words + int posinphrase, // position of word in its phrase + int posofphrase, // number of the phrase where word appears + int distance, // word distance; this is 0 by default, and set to the difference of posintext from two indexes if these are combined (simultanous search). 
If stored, this shows that the result was obtained by remote search + long time, // last-modified time of the document where word appears + int quality, // + String language, // + char doctype, // boolean local) { // more needed attributes: - // - int: length of text / total number of words - // - int: length of text / total number of sentences // - long: update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short - // - int: word distance; this is 0 by default, and set to the difference of posintext from two indexes if these are combined (simultanous search). If stored, this shows that the result was obtained by remote search - // - char: category of appearance (header, title, section, text, anchor-descr, image-tag etc) - // - boolean: appears in title, appears in header, appears in .... + // - boolean: appears in title, appears in header, anchor-descr, image-tag etc // - int: url-length (shorter are better) // - int: url-number of components / length of path // - int: length of description tag / title tag (longer are better) @@ -211,10 +225,13 @@ public final class plasmaWordIndexEntry { if ((language == null) || (language.length() != plasmaURL.urlLanguageLength)) language = "uk"; this.urlHash = urlHash; - this.count = count; + this.hitcount = hitcount; + this.wordcount = wordcount; + this.phrasecount = phrasecount; this.posintext = posintext; this.posinphrase = posinphrase; this.posofphrase = posofphrase; + this.worddistance = distance; this.lastModified = time; this.quality = quality; this.language = language.getBytes(); @@ -225,15 +242,18 @@ public final class plasmaWordIndexEntry { public plasmaWordIndexEntry(String urlHash, String code) { // the code is not parsed but used later on this.urlHash = urlHash; - this.count = (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(6, 8)); - this.posintext = (code.length() >= 14) ? 
(int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(12, 14)) : 0; - this.posinphrase = (code.length() >= 15) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(14, 16)) : 0; - this.posofphrase = (code.length() >= 16) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(16, 18)) : 0; + this.hitcount = (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(6, 8)); this.lastModified = plasmaWordIndex.reverseMicroDateDays((int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(3, 6))); this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(0, 3)); this.language = code.substring(8, 10).getBytes(); this.doctype = code.charAt(10); this.localflag = code.charAt(11); + this.posintext = (code.length() >= 14) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(12, 14)) : 0; + this.posinphrase = (code.length() >= 15) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(14, 16)) : 0; + this.posofphrase = (code.length() >= 17) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(16, 18)) : 0; + this.worddistance = (code.length() >= 19) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(18, 20)) : 0; + this.wordcount = (code.length() >= 21) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(20, 22)) : 0; + this.phrasecount = (code.length() >= 23) ? 
(int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(22, 24)) : 0; } public plasmaWordIndexEntry(String external) { @@ -246,10 +266,13 @@ public final class plasmaWordIndexEntry { } // set values this.urlHash = pr.getProperty("h", ""); - this.count = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("c", "A")); + this.hitcount = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("c", "A")); + this.wordcount = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("w", "__")); + this.phrasecount = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("p", "__")); this.posintext = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("t", "__")); this.posinphrase = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("r", "__")); this.posofphrase = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("o", "__")); + this.worddistance = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("i", "__")); this.lastModified = plasmaWordIndex.reverseMicroDateDays((int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("a", "A"))); this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("q", "__")); this.language = pr.getProperty("l", "uk").getBytes(); @@ -260,85 +283,64 @@ public final class plasmaWordIndexEntry { public String toEncodedForm() { // attention: this integrates NOT the URL hash into the encoding // if you need a complete dump, use toExternalForm() - StringBuffer buf = new StringBuffer(attrSpaceLong); + StringBuffer buf = new StringBuffer(attrSpace); buf.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.quality, plasmaURL.urlQualityLength)) .append(kelondroBase64Order.enhancedCoder.encodeLongSmart(plasmaWordIndex.microDateDays(this.lastModified), 3)) - .append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.count, 2)) + .append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.hitcount, 
2)) .append(new String(this.language)) .append(this.doctype) - .append(this.localflag); // 3 + 3 + 2 + 2 + 1 + 1 = 12 bytes - - - buf.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posintext, 2)) - .append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posinphrase, 2)) - .append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posofphrase, 2)); - + .append(this.localflag) + .append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posintext, 2)) + .append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posinphrase, 2)) + .append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posofphrase, 2)) + .append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.worddistance, 2)) + .append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.wordcount, 2)) + .append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.phrasecount, 2)); // 3+3+2+2+1+1+2+2+2+2+2+2= 24 bytes + return buf.toString(); - } + } - public String toExternalForm() { + public String toExternalForm() { StringBuffer str = new StringBuffer(61); str.append("{") - .append("h=").append(this.urlHash) + .append( "h=").append(this.urlHash) .append(",q=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.quality, plasmaURL.urlQualityLength)) .append(",a=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(plasmaWordIndex.microDateDays(this.lastModified), 3)) - .append(",c=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.count, 2)) + .append(",c=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.hitcount, 2)) .append(",l=").append(new String(this.language)) .append(",d=").append(this.doctype) .append(",f=").append(this.localflag) .append(",t=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posintext, 2)) .append(",r=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posinphrase, 2)) .append(",o=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posofphrase, 2)) + 
.append(",i=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.worddistance, 2)) + .append(",w=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.wordcount, 2)) + .append(",p=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.phrasecount, 2)) .append("}"); return str.toString(); - } - - public String getUrlHash() { - return urlHash; - } - - public int getQuality() { - return quality; - } - - public int getVirtualAge() { - return plasmaWordIndex.microDateDays(lastModified); } - public long getLastModified() { - return lastModified; + public void combineDistance(plasmaWordIndexEntry oe) { + this.worddistance = this.worddistance + oe.worddistance + Math.abs(this.posintext - oe.posintext); } - public int getCount() { - return count; - } - - public int posintext() { - return posintext; - } - - public int posinphrase() { - return posinphrase; - } - - public int posofphrase() { - return posofphrase; - } - - public String getLanguage() { - return new String(language); - } - - public char getType() { - return doctype; - } - - public boolean isLocal() { - return localflag == LT_LOCAL; - } + public String getUrlHash() { return urlHash; } + public int getQuality() { return quality; } + public int getVirtualAge() { return plasmaWordIndex.microDateDays(lastModified); } + public long getLastModified() { return lastModified; } + public int getCount() { return hitcount; } + public int posintext() { return posintext; } + public int posinphrase() { return posinphrase; } + public int posofphrase() { return posofphrase; } + public int worddistance() { return worddistance; } + public int wordcount() { return wordcount; } + public int phrasecount() { return phrasecount; } + public String getLanguage() { return new String(language); } + public char getType() { return doctype; } + public boolean isLocal() { return localflag == LT_LOCAL; } public static void main(String[] args) { // outputs the word hash to a given word diff --git 
a/source/de/anomic/server/serverObjects.java b/source/de/anomic/server/serverObjects.java index 1ce462555..507e66622 100644 --- a/source/de/anomic/server/serverObjects.java +++ b/source/de/anomic/server/serverObjects.java @@ -61,13 +61,10 @@ import java.io.BufferedOutputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; -import java.sql.Date; import java.util.ArrayList; import java.util.Enumeration; import java.util.Hashtable; -import java.util.Iterator; import java.util.Map; -import java.util.Vector; import de.anomic.http.httpHeader; diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index 1e4af3f96..7eb9831ef 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -59,7 +59,6 @@ import de.anomic.plasma.plasmaWordIndexEntity; import de.anomic.plasma.plasmaWordIndexEntry; import de.anomic.plasma.plasmaWordIndexEntryContainer; import de.anomic.plasma.plasmaURLPattern; -import de.anomic.plasma.plasmaWordIndex; import de.anomic.plasma.plasmaSearchProfile; import de.anomic.server.serverCore; import de.anomic.server.serverObjects; @@ -441,8 +440,7 @@ public final class yacyClient { //System.out.println("yacyClient: search result = " + result.toString()); // debug final int results = Integer.parseInt((String) result.get("count")); //System.out.println("***result count " + results); - plasmaCrawlLURL.Entry link; - + // create containers final int words = wordhashes.length() / plasmaWordIndexEntry.wordHashLength; plasmaWordIndexEntryContainer[] container = new plasmaWordIndexEntryContainer[words]; @@ -451,21 +449,31 @@ public final class yacyClient { } // insert results to containers - plasmaCrawlLURL.Entry lEntry; + plasmaCrawlLURL.Entry urlEntry; for (int n = 0; n < results; n++) { // get one single search result - lEntry = urlManager.newEntry((String) result.get("resource" + n), true); - if (lEntry != null && 
blacklist.isListed(lEntry.url().getHost().toLowerCase(), lEntry.url().getPath())) { continue; } // block with backlist - link = urlManager.addEntry(lEntry, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2); + urlEntry = urlManager.newEntry((String) result.get("resource" + n), true); + if (urlEntry != null && blacklist.isListed(urlEntry.url().getHost().toLowerCase(), urlEntry.url().getPath())) { continue; } // block with backlist + urlManager.addEntry(urlEntry, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2); // save the url entry - final plasmaWordIndexEntry entry = new plasmaWordIndexEntry(link.hash(), link.wordCount(), 0, 0, 0, - plasmaWordIndex.microDateDays(link.moddate()), link.quality(), - link.language(), link.doctype(), false); - if (link.snippet() != null) { + final plasmaWordIndexEntry entry; + if (urlEntry.word() == null) + entry = new plasmaWordIndexEntry( + urlEntry.hash(), + urlEntry.wordCount(), + 0, 0, 0, 0, 0, 0, + urlEntry.moddate().getTime(), + urlEntry.quality(), + urlEntry.language(), + urlEntry.doctype(), + false + ); + else entry = urlEntry.word(); + if (urlEntry.snippet() != null) { // we don't store the snippets along the url entry, because they are search-specific. // instead, they are placed in a snipped-search cache. 
//System.out.println("--- RECEIVED SNIPPET '" + link.snippet() + "'"); - snippets.storeToCache(wordhashes, link.hash(), link.snippet()); + snippets.storeToCache(wordhashes, urlEntry.hash(), urlEntry.snippet()); } // add the url entry to the word indexes for (int m = 0; m < words; m++) { diff --git a/source/yacy.java b/source/yacy.java index 87a4e9287..d0235267f 100644 --- a/source/yacy.java +++ b/source/yacy.java @@ -754,7 +754,7 @@ public final class yacy { String urlHash = importWordIdxEntry.getUrlHash(); if ((importUrlDB.exists(urlHash)) && (!homeUrlDB.exists(urlHash))) try { // importing the new url - plasmaCrawlLURL.Entry urlEntry = importUrlDB.getEntry(urlHash); + plasmaCrawlLURL.Entry urlEntry = importUrlDB.getEntry(urlHash, null); urlCounter++; homeUrlDB.newEntry(urlEntry); @@ -861,7 +861,7 @@ public final class yacy { wordIdxEntry = (plasmaWordIndexEntry) wordIdxEntries.next(); String urlHash = wordIdxEntry.getUrlHash(); if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try { - plasmaCrawlLURL.Entry urlEntry = currentUrlDB.getEntry(urlHash); + plasmaCrawlLURL.Entry urlEntry = currentUrlDB.getEntry(urlHash, null); urlCounter++; /*plasmaCrawlLURL.Entry newEntry =*/ minimizedUrlDB.newEntry(urlEntry); if (urlCounter % 500 == 0) {