diff --git a/htroot/htdocsdefault/dir.java b/htroot/htdocsdefault/dir.java index 02972d891..c50f10593 100644 --- a/htroot/htdocsdefault/dir.java +++ b/htroot/htdocsdefault/dir.java @@ -472,7 +472,7 @@ public class dir { ); final String urlHash = newEntry.hash(); - /*final int words =*/ switchboard.wordIndex.addPageIndex(url, urlHash, new Date(), condenser, "**", plasmaWordIndexEntry.DT_SHARE); + /*final int words =*/ switchboard.wordIndex.addPageIndex(url, urlHash, new Date(), phrase.length() + descr.length() + 13, condenser, "**", plasmaWordIndexEntry.DT_SHARE); } catch (IOException e) {} } diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java index 462957c8a..9bf2408a1 100644 --- a/source/de/anomic/plasma/plasmaCrawlLURL.java +++ b/source/de/anomic/plasma/plasmaCrawlLURL.java @@ -135,7 +135,7 @@ public final class plasmaCrawlLURL extends plasmaURL { String initiatorHash, String executorHash, String referrerHash, int copyCount, boolean localNeed, int quality, String language, char doctype, - long size, int wordCount, int stackType) { + int size, int wordCount, int stackType) { Entry e = new Entry(url, descr, moddate, loaddate, referrerHash, copyCount, localNeed, quality, language, doctype, size, wordCount); if (initiatorHash == null) { initiatorHash = dummyHash; } if (executorHash == null) { executorHash = dummyHash; } @@ -396,12 +396,12 @@ public final class plasmaCrawlLURL extends plasmaURL { private int quality; private String language; private char doctype; - private long size; + private int size; private int wordCount; private String snippet; private plasmaWordIndexEntry word; - public Entry(URL url, String descr, Date moddate, Date loaddate, String referrerHash, int copyCount, boolean localNeed, int quality, String language, char doctype, long size, int wordCount) { + public Entry(URL url, String descr, Date moddate, Date loaddate, String referrerHash, int copyCount, boolean localNeed, int quality, String language, char doctype, int size, int wordCount) { // create new entry and store it into database this.urlHash = urlHash(url); this.url = url; @@ -444,7 +444,7 @@ public final class plasmaCrawlLURL extends plasmaURL { this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(new String(entry[8], "UTF-8")); this.language = new String(entry[9], "UTF-8"); this.doctype = (char) entry[10][0]; - this.size = kelondroBase64Order.enhancedCoder.decodeLong(new String(entry[11], "UTF-8")); + this.size = (int) kelondroBase64Order.enhancedCoder.decodeLong(new String(entry[11], "UTF-8")); this.wordCount = (int) kelondroBase64Order.enhancedCoder.decodeLong(new String(entry[12], "UTF-8")); this.snippet = null; this.word = searchedWord; @@ -477,7 +477,7 @@ public final class plasmaCrawlLURL extends plasmaURL { this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(prop.getProperty("q", "")); this.language = prop.getProperty("lang", "uk"); this.doctype = prop.getProperty("dt", "t").charAt(0); - this.size = Long.parseLong(prop.getProperty("size", "0")); + this.size = Integer.parseInt(prop.getProperty("size", "0")); this.wordCount = Integer.parseInt(prop.getProperty("wc", "0")); this.snippet = prop.getProperty("snippet", ""); if (snippet.length() == 0) snippet = null; else snippet = crypt.simpleDecode(snippet, null); @@ -570,7 +570,7 @@ public final class plasmaCrawlLURL extends plasmaURL { return language; } - public long size() { + public int size() { return size; } diff --git a/source/de/anomic/plasma/plasmaSearchPreOrder.java b/source/de/anomic/plasma/plasmaSearchPreOrder.java index c7011ba08..b8fac651a 100644 --- a/source/de/anomic/plasma/plasmaSearchPreOrder.java +++ b/source/de/anomic/plasma/plasmaSearchPreOrder.java @@ -132,14 +132,14 @@ public final class plasmaSearchPreOrder { long factor = 4096L*4096L; for (int i = 0; i < 3; i++) { - if (query.order[i].equals(plasmaSearchQuery.ORDER_QUALITY)) ranking = factor * indexEntry.getQuality() / 64L; - else if (query.order[i].equals(plasmaSearchQuery.ORDER_DATE)) ranking = factor * indexEntry.getVirtualAge() / 64L; - else if (query.order[i].equals(plasmaSearchQuery.ORDER_YBR)) ranking = factor * ybr_p(indexEntry.getUrlHash()); + if (query.order[i].equals(plasmaSearchQuery.ORDER_QUALITY)) ranking += factor * indexEntry.getQuality() / 64L; + else if (query.order[i].equals(plasmaSearchQuery.ORDER_DATE)) ranking += factor * indexEntry.getVirtualAge() / 64L; + else if (query.order[i].equals(plasmaSearchQuery.ORDER_YBR)) ranking += factor * ybr_p(indexEntry.getUrlHash()); factor = factor / 4096L; } int wordpos = indexEntry.posintext(); if (wordpos == 0) wordpos = 1000; - ranking = ranking + 1000 - wordpos + indexEntry.hitcount(); + ranking = ranking + 4096L*4096L * (1000 - wordpos + indexEntry.hitcount() - 2 * indexEntry.worddistance()); pageAcc.put(serverCodings.encodeHex(ranking, 16) + indexEntry.getUrlHash(), indexEntry); } diff --git a/source/de/anomic/plasma/plasmaSearchResult.java b/source/de/anomic/plasma/plasmaSearchResult.java index 1e53c4502..164805f6d 100644 --- a/source/de/anomic/plasma/plasmaSearchResult.java +++ b/source/de/anomic/plasma/plasmaSearchResult.java @@ -159,7 +159,7 @@ public final class plasmaSearchResult { } int wordpos = indexEntry.posintext(); if (wordpos == 0) wordpos = 1000; - ranking = ranking + 1000 - wordpos + indexEntry.hitcount(); + ranking = ranking + 4096L*4096L * (1000 - wordpos + indexEntry.hitcount() - 2 * indexEntry.worddistance()); // apply 'common-sense' heuristic using references for (int j = 0; j < urlcomps.length; j++) if (commonSense.contains(urlcomps[j])) ranking += 10L*4096L*4096L / urlcomps.length; diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 1e6281443..31a04422b 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -820,25 +820,25 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser return false; } - if (wordIndex.wordCacheRAMSize() + 1000 > (int) getConfigLong("wordCacheMaxLow", 8000)) { - log.logFine("deQueue: word index ram cache too full (" + ((int) getConfigLong("wordCacheMaxLow", 8000) - wordIndex.wordCacheRAMSize()) + - " slots left); dismissed to omit ram flush lock"); - return false; - } - - int stackCrawlQueueSize; - if ((stackCrawlQueueSize = sbStackCrawlThread.size()) >= stackCrawlSlots) { - log.logFine("deQueue: too many processes in stack crawl thread queue, dismissed to protect emergency case (" + - "stackCrawlQueue=" + stackCrawlQueueSize + ")"); - return false; - } - - plasmaSwitchboardQueue.Entry nextentry; synchronized (sbQueue) { + if (sbQueue.size() == 0) { - //log.logDebug("DEQUEUE: queue is empty"); + // log.logDebug("DEQUEUE: queue is empty"); return false; // nothing to do } + + if (wordIndex.wordCacheRAMSize() + 1000 > (int) getConfigLong("wordCacheMaxLow", 8000)) { + log.logFine("deQueue: word index ram cache too full (" + ((int) getConfigLong("wordCacheMaxLow", 8000) - wordIndex.wordCacheRAMSize()) + " slots left); dismissed to omit ram flush lock"); + return false; + } + + int stackCrawlQueueSize; + if ((stackCrawlQueueSize = sbStackCrawlThread.size()) >= stackCrawlSlots) { + log.logFine("deQueue: too many processes in stack crawl thread queue, dismissed to protect emergency case (" + "stackCrawlQueue=" + stackCrawlQueueSize + ")"); + return false; + } + + plasmaSwitchboardQueue.Entry nextentry; // if we were interrupted we should return now if (Thread.currentThread().isInterrupted()) return false; @@ -856,13 +856,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser log.logSevere("IOError in plasmaSwitchboard.deQueue: " + e.getMessage(), e); return false; } - } - synchronized (this.indexingTasksInProcess) { - this.indexingTasksInProcess.put(nextentry.urlHash(),nextentry); + synchronized (this.indexingTasksInProcess) { + this.indexingTasksInProcess.put(nextentry.urlHash(), nextentry); + } + + processResourceStack(nextentry); } - - processResourceStack(nextentry); return true; } @@ -1288,7 +1288,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser condenser.RESULT_INFORMATION_VALUE, plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.docType(document.getMimeType()), - entry.size(), + (int) entry.size(), condenser.RESULT_NUMB_WORDS, processCase ); @@ -1309,7 +1309,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser if (((storagePeerHash = getConfig("storagePeerHash",null))== null) || (storagePeerHash.trim().length() == 0) || ((seed = yacyCore.seedDB.getConnected(storagePeerHash))==null)){ - words = wordIndex.addPageIndex(entry.url(), urlHash, docDate, condenser, plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.docType(document.getMimeType())); + words = wordIndex.addPageIndex(entry.url(), urlHash, docDate, (int) entry.size(), condenser, plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.docType(document.getMimeType())); } else { HashMap urlCache = new HashMap(1); urlCache.put(newEntry.hash(),newEntry); @@ -1341,7 +1341,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser wordStat.posInPhrase, wordStat.numOfPhrase, 0, + newEntry.size(), docDate.getTime(), + System.currentTimeMillis(), quality, language, doctype, true); wordIdxEntity.addEntry(wordIdxEntry); tmpEntities.add(wordIdxEntity); @@ -1354,7 +1356,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser String error = yacyClient.transferIndex(seed,(plasmaWordIndexEntity[])tmpEntities.toArray(new plasmaWordIndexEntity[tmpEntities.size()]),urlCache,true,120000); if (error != null) { - words = wordIndex.addPageIndex(entry.url(), urlHash, docDate, condenser, plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.docType(document.getMimeType())); + words = wordIndex.addPageIndex(entry.url(), urlHash, docDate, (int) entry.size(), condenser, plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.docType(document.getMimeType())); } // cleanup diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index 7fd086969..2b9953f77 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -49,20 +49,15 @@ package de.anomic.plasma; import java.io.File; import java.io.IOException; -import java.util.ArrayList; -import java.util.Comparator; import java.util.Iterator; import java.util.Map; -import java.util.TreeSet; import java.util.HashSet; import java.util.Set; import java.util.Date; import java.net.URL; import de.anomic.kelondro.kelondroBase64Order; -import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.server.logging.serverLog; -import de.anomic.yacy.yacySeedDB; public final class plasmaWordIndex { @@ -139,7 +134,7 @@ public final class plasmaWordIndex { return ((long) microDateDays) * ((long) day); } - public int addPageIndex(URL url, String urlHash, Date urlModified, plasmaCondenser condenser, String language, char doctype) { + public int addPageIndex(URL url, String urlHash, Date urlModified, int size, plasmaCondenser condenser, String language, char doctype) { // this is called by the switchboard to put in a new page into the index // use all the words in one condenser object to simultanous create index // entries @@ -172,7 +167,9 @@ public final class plasmaWordIndex { wprop.posInPhrase, wprop.numOfPhrase, 0, + size, urlModified.getTime(), + System.currentTimeMillis(), quality, language, doctype, true); addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), ientry), false); } diff --git a/source/de/anomic/plasma/plasmaWordIndexEntry.java b/source/de/anomic/plasma/plasmaWordIndexEntry.java index 989443b9d..e65d3c136 100644 --- a/source/de/anomic/plasma/plasmaWordIndexEntry.java +++ b/source/de/anomic/plasma/plasmaWordIndexEntry.java @@ -107,10 +107,16 @@ public final class plasmaWordIndexEntry { public static final int AP_H4 = 4; // h4-tag public static final int AP_H5 = 5; // h5-tag public static final int AP_H6 = 6; // h6-tag - public static final int AP_ANCHOR = 7; // anchor description + public static final int AP_TEXT = 7; // word appears in text (used to check validation of other appearances against spam) public static final int AP_URL = 8; // word inside an url public static final int AP_IMG = 9; // tag inside image references public static final int AP_TAG = 10; // for tagged indexeing (i.e. using mp3 tags) + public static final int AP_ANCHOR = 11; // anchor description + + // URL attributes + public static final int UA_LOCAL = 0; // URL was crawled locally + public static final int UA_TILDE = 1; // tilde appears in URL + public static final int UA_REDIRECT = 2; // The URL is a redirection // local flag attributes public static final char LT_LOCAL = 'L'; @@ -201,27 +207,35 @@ public final class plasmaWordIndexEntry { // the class instantiation can only be done by a plasmaStore method // therefore they are all public - public plasmaWordIndexEntry(String urlHash, - int hitcount, // how often appears this word in the text - int wordcount, // total number of words - int phrasecount, // total number of phrases - int posintext, // position of word in all words - int posinphrase, // position of word in its phrase - int posofphrase, // number of the phrase where word appears - int distance, // word distance; this is 0 by default, and set to the difference of posintext from two indexes if these are combined (simultanous search). If stored, this shows that the result was obtained by remote search - long lastmodified, // last-modified time of the document where word appears - int quality, // - String language, // - char doctype, // - boolean local) { + public plasmaWordIndexEntry(String urlHash, + int hitcount, //*how often appears this word in the text + int wordcount, //*total number of words + int phrasecount, //*total number of phrases + int posintext, //*position of word in all words + int posinphrase, //*position of word in its phrase + int posofphrase, //*number of the phrase where word appears + int distance, //*word distance; this is 0 by default, and set to the difference of posintext from two indexes if these are combined (simultanous search). If stored, this shows that the result was obtained by remote search + int sizeOfPage, // # of bytes of the page + long lastmodified, //*last-modified time of the document where word appears + long updatetime, // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short + int quality, //*the entropy value + String language, //*(guessed) language of document + char doctype, //*type of document + boolean local //*flag shows that this index was generated locally; othervise its from a remote peer + ) { // more needed attributes: - // - long: update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short - // - boolean: appears in title, appears in header, anchor-descr, image-tag etc + // - boolean: appearance attributes: title, appears in header, anchor-descr, image-tag etc + // - boolean: URL attributes // - int: url-length (shorter are better) // - int: url-number of components / length of path // - int: length of description tag / title tag (longer are better) // - int: number of chapters + // - int: # of outlinks to same domain + // - int: # of outlinks to outside domain + // - int: length of description + // - int: length of title + // - int: # of keywords if ((language == null) || (language.length() != plasmaURL.urlLanguageLength)) language = "uk"; this.urlHash = urlHash; diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index b4e2c6ffd..e6f6d05e2 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -469,7 +469,9 @@ public final class yacyClient { urlEntry.hash(), urlEntry.wordCount(), 0, 0, 0, 0, 0, 0, + urlEntry.size(), urlEntry.moddate().getTime(), + System.currentTimeMillis(), urlEntry.quality(), urlEntry.language(), urlEntry.doctype(),