diff --git a/htroot/htdocsdefault/dir.java b/htroot/htdocsdefault/dir.java index 5ce7bba72..e938ac0ae 100644 --- a/htroot/htdocsdefault/dir.java +++ b/htroot/htdocsdefault/dir.java @@ -472,7 +472,7 @@ public class dir { ); final String urlHash = newEntry.hash(); - /*final int words =*/ switchboard.wordIndex.addPageIndex(url, urlHash, new Date(), phrase.length() + descr.length() + 13, condenser, "**", plasmaWordIndexEntry.DT_SHARE); + /*final int words =*/ switchboard.wordIndex.addPageIndex(url, urlHash, new Date(), phrase.length() + descr.length() + 13, null, condenser, "**", plasmaWordIndexEntry.DT_SHARE, 0, 0); } catch (IOException e) {} } diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java index 6b44693a1..946696498 100644 --- a/source/de/anomic/plasma/plasmaCrawlLURL.java +++ b/source/de/anomic/plasma/plasmaCrawlLURL.java @@ -406,6 +406,7 @@ public final class plasmaCrawlLURL extends plasmaURL { // - keywords // - phrasecount, total number of phrases // - boolean: URL attributes + // - boolean: appearance of bold and/or italics // - int: # of outlinks to same domain // - int: # of outlinks to outside domain diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index 101753600..ab2d01417 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -183,10 +183,10 @@ public final class plasmaSearchEvent extends Thread implements Runnable { if (fetchpeers < 10) fetchpeers = 10; log.logFine("STARTING " + fetchpeers + " THREADS TO CATCH EACH " + profileGlobal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT) + " URLs WITHIN " + (profileGlobal.duetime() / 1000) + " SECONDS"); - + long timeout = System.currentTimeMillis() + profileGlobal.duetime() + 4000; - searchThreads = yacySearch.searchHashes(query.queryHashes, query.maxDistance, urlStore, rcGlobal, fetchpeers, plasmaSwitchboard.urlBlacklist, 
snippetCache, profileGlobal); - + searchThreads = yacySearch.searchHashes(query.queryHashes, query.maxDistance, urlStore, rcGlobal, fetchpeers, plasmaSwitchboard.urlBlacklist, snippetCache, profileGlobal, ranking); + // wait until wanted delay passed or wanted result appeared while (System.currentTimeMillis() < timeout) { // check if all threads have been finished or results so far are enough diff --git a/source/de/anomic/plasma/plasmaSearchRankingProfile.java b/source/de/anomic/plasma/plasmaSearchRankingProfile.java index 169df6865..e172bdb0f 100644 --- a/source/de/anomic/plasma/plasmaSearchRankingProfile.java +++ b/source/de/anomic/plasma/plasmaSearchRankingProfile.java @@ -118,9 +118,9 @@ public class plasmaSearchRankingProfile { this.order = order; // overwrite defaults with order attributes for (int i = 0; i < 3; i++) { - if (this.order[i].equals(plasmaSearchRankingProfile.ORDER_QUALITY)) coeff.put(ENTROPY, new Integer((4 * (3 - i)))); - else if (this.order[i].equals(plasmaSearchRankingProfile.ORDER_DATE)) coeff.put(DATE, new Integer((4 * (3 - i)))); - else if (this.order[i].equals(plasmaSearchRankingProfile.ORDER_YBR)) coeff.put(YBR, new Integer((4 * (3 - i)))); + if (this.order[i].equals(plasmaSearchRankingProfile.ORDER_QUALITY)) coeff.put(ENTROPY, new Integer((3 * (3 - i)))); + else if (this.order[i].equals(plasmaSearchRankingProfile.ORDER_DATE)) coeff.put(DATE, new Integer((3 * (3 - i)))); + else if (this.order[i].equals(plasmaSearchRankingProfile.ORDER_YBR)) coeff.put(YBR, new Integer((3 * (3 - i)))); } } diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 6f5da0e53..4068b12dd 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -1351,7 +1351,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(document.getText())); // generate 
citation reference - generateCitationReference(entry.urlHash(), docDate, document, condenser); + Integer[] ioLinks = generateCitationReference(entry.urlHash(), docDate, document, condenser); //log.logInfo("INDEXING HEADLINE:" + descr); try { @@ -1388,7 +1388,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser if (((storagePeerHash = getConfig("storagePeerHash",null))== null) || (storagePeerHash.trim().length() == 0) || ((seed = yacyCore.seedDB.getConnected(storagePeerHash))==null)){ - words = wordIndex.addPageIndex(entry.url(), urlHash, docDate, (int) entry.size(), condenser, plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.docType(document.getMimeType())); + words = wordIndex.addPageIndex(entry.url(), urlHash, docDate, (int) entry.size(), document, condenser, + plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.docType(document.getMimeType()), + ioLinks[0].intValue(), ioLinks[1].intValue()); } else { HashMap urlCache = new HashMap(1); urlCache.put(newEntry.hash(),newEntry); @@ -1397,7 +1399,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser char doctype = plasmaWordIndexEntry.docType(document.getMimeType()); int urlLength = newEntry.url().toString().length(); int urlComps = htmlFilterContentScraper.urlComps(newEntry.url().toString()).length; - + // iterate over all words Iterator i = condenser.words(); Map.Entry wentry; @@ -1411,6 +1413,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser plasmaWordIndexEntry wordIdxEntry = new plasmaWordIndexEntry(urlHash, urlLength, urlComps, + document.longTitle.length(), /* titleLength: the constructor takes it before hitcount */ wordStat.count, condenser.RESULT_SIMI_WORDS, condenser.RESULT_SIMI_SENTENCES, wordStat.posInText, @@ -1423,6 +1426,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser condenser.RESULT_WORD_ENTROPHY, language, doctype, + ioLinks[0].intValue(), + ioLinks[1].intValue(), true); 
wordIdxContainer.add(wordIdxEntry); tmpContainers.add(wordIdxContainer); @@ -1440,7 +1445,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser 120000); if (error != null) { - words = wordIndex.addPageIndex(entry.url(), urlHash, docDate, (int) entry.size(), condenser, plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.docType(document.getMimeType())); + words = wordIndex.addPageIndex(entry.url(), urlHash, docDate, (int) entry.size(), + document, condenser, + plasmaWordIndexEntry.language(entry.url()), + plasmaWordIndexEntry.docType(document.getMimeType()), + ioLinks[0].intValue(), ioLinks[1].intValue()); } tmpContainers = null; @@ -1510,7 +1519,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } } - private void generateCitationReference(String baseurlhash, Date docDate, plasmaParserDocument document, plasmaCondenser condenser) { + private Integer[] /*(outlinksSame, outlinksOther)*/ generateCitationReference(String baseurlhash, Date docDate, plasmaParserDocument document, plasmaCondenser condenser) { // generate citation reference Map hl = document.getHyperlinks(); Iterator it = hl.entrySet().iterator(); @@ -1561,6 +1570,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser flushCitationReference(crg, "crg"); crg = new StringBuffer(maxCRGDump); } + + return new Integer[] {new Integer(LCount), new Integer(GCount)}; } private void flushCitationReference(StringBuffer cr, String type) { diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index c9ec88d97..51abacbcf 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -211,7 +211,7 @@ public final class plasmaWordIndex { return ((long) microDateDays) * ((long) day); } - public synchronized int addPageIndex(URL url, String urlHash, Date urlModified, int size, plasmaCondenser condenser, String 
language, char doctype) { + public synchronized int addPageIndex(URL url, String urlHash, Date urlModified, int size, plasmaParserDocument document, plasmaCondenser condenser, String language, char doctype, int outlinksSame, int outlinksOther) { // this is called by the switchboard to put in a new page into the index // use all the words in one condenser object to simultanous create index entries @@ -232,7 +232,7 @@ public final class plasmaWordIndex { // if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ":" + c); wordHash = plasmaWordIndexEntry.word2hash(word); ientry = new plasmaWordIndexEntry(urlHash, - urlLength, urlComps, + urlLength, urlComps, (document == null) ? urlLength : document.longTitle.length(), wprop.count, condenser.RESULT_SIMI_WORDS, condenser.RESULT_SIMI_SENTENCES, @@ -246,6 +246,7 @@ public final class plasmaWordIndex { condenser.RESULT_WORD_ENTROPHY, language, doctype, + outlinksSame, outlinksOther, true); addEntry(wordHash, ientry, System.currentTimeMillis(), false); //addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), ientry), System.currentTimeMillis(), false); diff --git a/source/de/anomic/plasma/plasmaWordIndexEntry.java b/source/de/anomic/plasma/plasmaWordIndexEntry.java index cbad0d318..b340bb3a3 100644 --- a/source/de/anomic/plasma/plasmaWordIndexEntry.java +++ b/source/de/anomic/plasma/plasmaWordIndexEntry.java @@ -101,12 +101,12 @@ public final class plasmaWordIndexEntry implements Cloneable { // appearance locations: (used for flags) public static final int AP_TITLE = 0; // title tag from html header - public static final int AP_H1 = 1; // h1-tag - public static final int AP_H2 = 2; // h2-tag - public static final int AP_H3 = 3; // h3-tag - public static final int AP_H4 = 4; // h4-tag - public static final int AP_H5 = 5; // h5-tag - public static final int AP_H6 = 6; // h6-tag + public static final int AP_H1 = 1; // headline - top level + public static final int AP_H2 = 2; // 
headline, second level + public static final int AP_H3 = 3; // headline, 3rd level + public static final int AP_H4 = 4; // headline, 4th level + public static final int AP_H5 = 5; // headline, 5th level + public static final int AP_H6 = 6; // headline, 6th level public static final int AP_TEXT = 7; // word appears in text (used to check validation of other appearances against spam) public static final int AP_DOM = 8; // word inside an url: in Domain public static final int AP_PATH = 9; // word inside an url: in path @@ -218,6 +218,7 @@ public final class plasmaWordIndexEntry implements Cloneable { public plasmaWordIndexEntry(String urlHash, int urlLength, // byte-length of complete URL int urlComps, // number of path components + int titleLength, // length of description/length (longer are better?) int hitcount, //*how often appears this word in the text int wordcount, //*total number of words int phrasecount, //*total number of phrases @@ -231,15 +232,14 @@ public final class plasmaWordIndexEntry implements Cloneable { int quality, //*the entropy value String language, //*(guessed) language of document char doctype, //*type of document + int outlinksSame, // outlinks to same domain + int outlinksOther,// outlinks to other domain boolean local //*flag shows that this index was generated locally; othervise its from a remote peer ) { // more needed attributes: // - boolean: appearance attributes: title, appears in header, anchor-descr, image-tag etc // - boolean: URL attributes - // - int: length of description tag / title tag (longer are better) - // - int: # of outlinks to same domain - // - int: # of outlinks to outside domain // - int: # of keywords if ((language == null) || (language.length() != plasmaURL.urlLanguageLength)) language = "uk"; diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index d1bb435e2..cf0a3a57d 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -55,6 
+55,7 @@ import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.http.httpc; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.plasma.plasmaCrawlLURL; +import de.anomic.plasma.plasmaSearchRankingProfile; import de.anomic.plasma.plasmaSnippetCache; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaWordIndexEntry; @@ -357,7 +358,8 @@ public final class yacyClient { plasmaWordIndexEntryContainer containerCache, plasmaURLPattern blacklist, plasmaSnippetCache snippets, - plasmaSearchTimingProfile profile + plasmaSearchTimingProfile timingProfile, + plasmaSearchRankingProfile rankingProfile ) { // send a search request to peer with remote Hash // this mainly converts the words into word hashes @@ -395,17 +397,18 @@ public final class yacyClient { "&query=" + wordhashes; */ final serverObjects obj = new serverObjects(9); - long duetime = profile.duetime(); + long duetime = timingProfile.duetime(); obj.put("myseed", yacyCore.seedDB.mySeed.genSeedStr(key)); obj.put("youare", targetPeer.hash); obj.put("key", key); - obj.put("count", profile.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT)); + obj.put("count", timingProfile.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT)); obj.put("resource", ((global) ? 
"global" : "local")); obj.put("query", wordhashes); obj.put("ttl", "0"); obj.put("duetime", Long.toString(duetime)); - obj.put("profile", profile.targetToString()); // new duetimes splitted by specific search tasks + obj.put("profile", timingProfile.targetToString()); // new duetimes splitted by specific search tasks obj.put("maxdist", maxDistance); + obj.put("rankingProfile", rankingProfile.toExternalString()); obj.put(yacySeed.MYTIME, yacyCore.universalDateShortString(new Date())); //yacyCore.log.logDebug("yacyClient.search url=" + url); @@ -427,7 +430,7 @@ public final class yacyClient { // compute all computation times final long totalrequesttime = System.currentTimeMillis() - timestamp; String returnProfile = (String) result.get("profile"); - if (returnProfile != null) profile.putYield(returnProfile); + if (returnProfile != null) timingProfile.putYield(returnProfile); /* HashMap result = nxTools.table(httpc.wget(new URL(url), @@ -474,6 +477,7 @@ public final class yacyClient { entry = new plasmaWordIndexEntry( urlEntry.hash(), urlLength, urlComps, + urlEntry.descr().length(), urlEntry.wordCount(), 0, 0, 0, 0, 0, 0, urlEntry.size(), @@ -482,6 +486,7 @@ public final class yacyClient { urlEntry.quality(), urlEntry.language(), urlEntry.doctype(), + 0,0, false ); } else { diff --git a/source/de/anomic/yacy/yacySearch.java b/source/de/anomic/yacy/yacySearch.java index 79421bbd5..f1254ef5f 100644 --- a/source/de/anomic/yacy/yacySearch.java +++ b/source/de/anomic/yacy/yacySearch.java @@ -50,6 +50,7 @@ import java.util.HashMap; import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.plasma.plasmaCrawlLURL; +import de.anomic.plasma.plasmaSearchRankingProfile; import de.anomic.plasma.plasmaURLPattern; import de.anomic.plasma.plasmaSnippetCache; import de.anomic.plasma.plasmaSearchTimingProfile; @@ -67,10 +68,12 @@ public class yacySearch extends Thread { final private yacySeed targetPeer; private int links; private int maxDistance; - final private 
plasmaSearchTimingProfile profile; - + final private plasmaSearchTimingProfile timingProfile; + final private plasmaSearchRankingProfile rankingProfile; + public yacySearch(Set wordhashes, int maxDistance, boolean global, yacySeed targetPeer, - plasmaCrawlLURL urlManager, plasmaWordIndexEntryContainer containerCache, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, plasmaSearchTimingProfile profile) { + plasmaCrawlLURL urlManager, plasmaWordIndexEntryContainer containerCache, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, + plasmaSearchTimingProfile timingProfile, plasmaSearchRankingProfile rankingProfile) { super("yacySearch_" + targetPeer.getName()); this.wordhashes = wordhashes; this.global = global; @@ -81,11 +84,12 @@ public class yacySearch extends Thread { this.targetPeer = targetPeer; this.links = -1; this.maxDistance = maxDistance; - this.profile = (plasmaSearchTimingProfile) profile.clone(); + this.timingProfile = (plasmaSearchTimingProfile) timingProfile.clone(); + this.rankingProfile = rankingProfile; } public void run() { - this.links = yacyClient.search(set2string(wordhashes), maxDistance, global, targetPeer, urlManager, containerCache, blacklist, snippetCache, profile); + this.links = yacyClient.search(set2string(wordhashes), maxDistance, global, targetPeer, urlManager, containerCache, blacklist, snippetCache, timingProfile, rankingProfile); if (links != 0) { //yacyCore.log.logInfo("REMOTE SEARCH - remote peer " + targetPeer.hash + ":" + targetPeer.getName() + " contributed " + links + " links for word hash " + wordhashes); yacyCore.seedDB.mySeed.incRI(links); @@ -104,8 +108,8 @@ public class yacySearch extends Thread { return this.links; } - public plasmaSearchTimingProfile profile() { - return this.profile; + public plasmaSearchTimingProfile timingProfile() { + return this.timingProfile; } public yacySeed target() { @@ -175,7 +179,8 @@ public class yacySearch extends Thread { } public static yacySearch[] searchHashes(Set 
wordhashes, int maxDist, plasmaCrawlLURL urlManager, plasmaWordIndexEntryContainer containerCache, - int targets, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, plasmaSearchTimingProfile profile) { + int targets, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, + plasmaSearchTimingProfile timingProfile, plasmaSearchRankingProfile rankingProfile) { // check own peer status if (yacyCore.seedDB.mySeed == null || yacyCore.seedDB.mySeed.getAddress() == null) { return null; } @@ -186,9 +191,9 @@ public class yacySearch extends Thread { targets = targetPeers.length; if (targets == 0) return null; yacySearch[] searchThreads = new yacySearch[targets]; - for (int i = 0; i < targets; i++) { + for (int i = 0; i < targets; i++) { searchThreads[i]= new yacySearch(wordhashes, maxDist, true, targetPeers[i], - urlManager, containerCache, blacklist, snippetCache, profile); + urlManager, containerCache, blacklist, snippetCache, timingProfile, rankingProfile); searchThreads[i].start(); try {Thread.sleep(20);} catch (InterruptedException e) {}