* added more ranking attributes (not yet used for ranking; the evaluation will be added later)

* added transmission of the ranking coefficients to the remote peer (not yet evaluated on the server side; this will be added later)
* changed ranking coefficients slightly

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1770 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 4797b3e50a
commit eaffcfefe2

@ -472,7 +472,7 @@ public class dir {
);
final String urlHash = newEntry.hash();
/*final int words =*/ switchboard.wordIndex.addPageIndex(url, urlHash, new Date(), phrase.length() + descr.length() + 13, condenser, "**", plasmaWordIndexEntry.DT_SHARE);
/*final int words =*/ switchboard.wordIndex.addPageIndex(url, urlHash, new Date(), phrase.length() + descr.length() + 13, null, condenser, "**", plasmaWordIndexEntry.DT_SHARE, 0, 0);
} catch (IOException e) {}
}

@ -406,6 +406,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
// - keywords
// - phrasecount, total number of phrases
// - boolean: URL attributes
// - boolean: appearance of bold and/or italics
// - int: # of outlinks to same domain
// - int: # of outlinks to outside domain

@ -183,10 +183,10 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
if (fetchpeers < 10) fetchpeers = 10;
log.logFine("STARTING " + fetchpeers + " THREADS TO CATCH EACH " + profileGlobal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT) + " URLs WITHIN " + (profileGlobal.duetime() / 1000) + " SECONDS");
long timeout = System.currentTimeMillis() + profileGlobal.duetime() + 4000;
searchThreads = yacySearch.searchHashes(query.queryHashes, query.maxDistance, urlStore, rcGlobal, fetchpeers, plasmaSwitchboard.urlBlacklist, snippetCache, profileGlobal);
searchThreads = yacySearch.searchHashes(query.queryHashes, query.maxDistance, urlStore, rcGlobal, fetchpeers, plasmaSwitchboard.urlBlacklist, snippetCache, profileGlobal, ranking);
// wait until wanted delay passed or wanted result appeared
while (System.currentTimeMillis() < timeout) {
// check if all threads have been finished or results so far are enough

@ -118,9 +118,9 @@ public class plasmaSearchRankingProfile {
this.order = order;
// overwrite defaults with order attributes
for (int i = 0; i < 3; i++) {
if (this.order[i].equals(plasmaSearchRankingProfile.ORDER_QUALITY)) coeff.put(ENTROPY, new Integer((4 * (3 - i))));
else if (this.order[i].equals(plasmaSearchRankingProfile.ORDER_DATE)) coeff.put(DATE, new Integer((4 * (3 - i))));
else if (this.order[i].equals(plasmaSearchRankingProfile.ORDER_YBR)) coeff.put(YBR, new Integer((4 * (3 - i))));
if (this.order[i].equals(plasmaSearchRankingProfile.ORDER_QUALITY)) coeff.put(ENTROPY, new Integer((3 * (3 - i))));
else if (this.order[i].equals(plasmaSearchRankingProfile.ORDER_DATE)) coeff.put(DATE, new Integer((3 * (3 - i))));
else if (this.order[i].equals(plasmaSearchRankingProfile.ORDER_YBR)) coeff.put(YBR, new Integer((3 * (3 - i))));
}
}

@ -1351,7 +1351,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(document.getText()));
// generate citation reference
generateCitationReference(entry.urlHash(), docDate, document, condenser);
Integer[] ioLinks = generateCitationReference(entry.urlHash(), docDate, document, condenser);
//log.logInfo("INDEXING HEADLINE:" + descr);
try {
@ -1388,7 +1388,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (((storagePeerHash = getConfig("storagePeerHash",null))== null) ||
(storagePeerHash.trim().length() == 0) ||
((seed = yacyCore.seedDB.getConnected(storagePeerHash))==null)){
words = wordIndex.addPageIndex(entry.url(), urlHash, docDate, (int) entry.size(), condenser, plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.docType(document.getMimeType()));
words = wordIndex.addPageIndex(entry.url(), urlHash, docDate, (int) entry.size(), document, condenser,
plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.docType(document.getMimeType()),
ioLinks[0].intValue(), ioLinks[1].intValue());
} else {
HashMap urlCache = new HashMap(1);
urlCache.put(newEntry.hash(),newEntry);
@ -1397,7 +1399,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
char doctype = plasmaWordIndexEntry.docType(document.getMimeType());
int urlLength = newEntry.url().toString().length();
int urlComps = htmlFilterContentScraper.urlComps(newEntry.url().toString()).length;
// iterate over all words
Iterator i = condenser.words();
Map.Entry wentry;
@ -1411,6 +1413,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
plasmaWordIndexEntry wordIdxEntry = new plasmaWordIndexEntry(urlHash,
urlLength, urlComps,
wordStat.count,
document.longTitle.length(),
condenser.RESULT_SIMI_WORDS,
condenser.RESULT_SIMI_SENTENCES,
wordStat.posInText,
@ -1423,6 +1426,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
condenser.RESULT_WORD_ENTROPHY,
language,
doctype,
ioLinks[0].intValue(),
ioLinks[1].intValue(),
true);
wordIdxContainer.add(wordIdxEntry);
tmpContainers.add(wordIdxContainer);
@ -1440,7 +1445,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
120000);
if (error != null) {
words = wordIndex.addPageIndex(entry.url(), urlHash, docDate, (int) entry.size(), condenser, plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.docType(document.getMimeType()));
words = wordIndex.addPageIndex(entry.url(), urlHash, docDate, (int) entry.size(),
document, condenser,
plasmaWordIndexEntry.language(entry.url()),
plasmaWordIndexEntry.docType(document.getMimeType()),
ioLinks[0].intValue(), ioLinks[1].intValue());
}
tmpContainers = null;
@ -1510,7 +1519,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
}
private void generateCitationReference(String baseurlhash, Date docDate, plasmaParserDocument document, plasmaCondenser condenser) {
private Integer[] /*(outlinksSame, outlinksOther)*/ generateCitationReference(String baseurlhash, Date docDate, plasmaParserDocument document, plasmaCondenser condenser) {
// generate citation reference
Map hl = document.getHyperlinks();
Iterator it = hl.entrySet().iterator();
@ -1561,6 +1570,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
flushCitationReference(crg, "crg");
crg = new StringBuffer(maxCRGDump);
}
return new Integer[] {new Integer(LCount), new Integer(GCount)};
}
private void flushCitationReference(StringBuffer cr, String type) {

@ -211,7 +211,7 @@ public final class plasmaWordIndex {
return ((long) microDateDays) * ((long) day);
}
public synchronized int addPageIndex(URL url, String urlHash, Date urlModified, int size, plasmaCondenser condenser, String language, char doctype) {
public synchronized int addPageIndex(URL url, String urlHash, Date urlModified, int size, plasmaParserDocument document, plasmaCondenser condenser, String language, char doctype, int outlinksSame, int outlinksOther) {
// this is called by the switchboard to put in a new page into the index
// use all the words in one condenser object to simultanous create index entries
@ -232,7 +232,7 @@ public final class plasmaWordIndex {
// if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ":" + c);
wordHash = plasmaWordIndexEntry.word2hash(word);
ientry = new plasmaWordIndexEntry(urlHash,
urlLength, urlComps,
urlLength, urlComps, (document == null) ? urlLength : document.longTitle.length(),
wprop.count,
condenser.RESULT_SIMI_WORDS,
condenser.RESULT_SIMI_SENTENCES,
@ -246,6 +246,7 @@ public final class plasmaWordIndex {
condenser.RESULT_WORD_ENTROPHY,
language,
doctype,
outlinksSame, outlinksOther,
true);
addEntry(wordHash, ientry, System.currentTimeMillis(), false);
//addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), ientry), System.currentTimeMillis(), false);

@ -101,12 +101,12 @@ public final class plasmaWordIndexEntry implements Cloneable {
// appearance locations: (used for flags)
public static final int AP_TITLE = 0; // title tag from html header
public static final int AP_H1 = 1; // h1-tag
public static final int AP_H2 = 2; // h2-tag
public static final int AP_H3 = 3; // h3-tag
public static final int AP_H4 = 4; // h4-tag
public static final int AP_H5 = 5; // h5-tag
public static final int AP_H6 = 6; // h6-tag
public static final int AP_H1 = 1; // headline - top level
public static final int AP_H2 = 2; // headline, second level
public static final int AP_H3 = 3; // headline, 3rd level
public static final int AP_H4 = 4; // headline, 4th level
public static final int AP_H5 = 5; // headline, 5th level
public static final int AP_H6 = 6; // headline, 6th level
public static final int AP_TEXT = 7; // word appears in text (used to check validation of other appearances against spam)
public static final int AP_DOM = 8; // word inside an url: in Domain
public static final int AP_PATH = 9; // word inside an url: in path
@ -218,6 +218,7 @@ public final class plasmaWordIndexEntry implements Cloneable {
public plasmaWordIndexEntry(String urlHash,
int urlLength, // byte-length of complete URL
int urlComps, // number of path components
int titleLength, // length of description/length (longer are better?)
int hitcount, //*how often appears this word in the text
int wordcount, //*total number of words
int phrasecount, //*total number of phrases
@ -231,15 +232,14 @@ public final class plasmaWordIndexEntry implements Cloneable {
int quality, //*the entropy value
String language, //*(guessed) language of document
char doctype, //*type of document
int outlinksSame, // outlinks to same domain
int outlinksOther,// outlinks to other domain
boolean local //*flag shows that this index was generated locally; othervise its from a remote peer
) {
// more needed attributes:
// - boolean: appearance attributes: title, appears in header, anchor-descr, image-tag etc
// - boolean: URL attributes
// - int: length of description tag / title tag (longer are better)
// - int: # of outlinks to same domain
// - int: # of outlinks to outside domain
// - int: # of keywords
if ((language == null) || (language.length() != plasmaURL.urlLanguageLength)) language = "uk";

@ -55,6 +55,7 @@ import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpc;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaSearchRankingProfile;
import de.anomic.plasma.plasmaSnippetCache;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndexEntry;
@ -357,7 +358,8 @@ public final class yacyClient {
plasmaWordIndexEntryContainer containerCache,
plasmaURLPattern blacklist,
plasmaSnippetCache snippets,
plasmaSearchTimingProfile profile
plasmaSearchTimingProfile timingProfile,
plasmaSearchRankingProfile rankingProfile
) {
// send a search request to peer with remote Hash
// this mainly converts the words into word hashes
@ -395,17 +397,18 @@ public final class yacyClient {
"&query=" + wordhashes;
*/
final serverObjects obj = new serverObjects(9);
long duetime = profile.duetime();
long duetime = timingProfile.duetime();
obj.put("myseed", yacyCore.seedDB.mySeed.genSeedStr(key));
obj.put("youare", targetPeer.hash);
obj.put("key", key);
obj.put("count", profile.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT));
obj.put("count", timingProfile.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT));
obj.put("resource", ((global) ? "global" : "local"));
obj.put("query", wordhashes);
obj.put("ttl", "0");
obj.put("duetime", Long.toString(duetime));
obj.put("profile", profile.targetToString()); // new duetimes splitted by specific search tasks
obj.put("profile", timingProfile.targetToString()); // new duetimes splitted by specific search tasks
obj.put("maxdist", maxDistance);
obj.put("rankingProfile", rankingProfile.toExternalString());
obj.put(yacySeed.MYTIME, yacyCore.universalDateShortString(new Date()));
//yacyCore.log.logDebug("yacyClient.search url=" + url);
@ -427,7 +430,7 @@ public final class yacyClient {
// compute all computation times
final long totalrequesttime = System.currentTimeMillis() - timestamp;
String returnProfile = (String) result.get("profile");
if (returnProfile != null) profile.putYield(returnProfile);
if (returnProfile != null) timingProfile.putYield(returnProfile);
/*
HashMap result = nxTools.table(httpc.wget(new URL(url),
@ -474,6 +477,7 @@ public final class yacyClient {
entry = new plasmaWordIndexEntry(
urlEntry.hash(),
urlLength, urlComps,
urlEntry.descr().length(),
urlEntry.wordCount(),
0, 0, 0, 0, 0, 0,
urlEntry.size(),
@ -482,6 +486,7 @@ public final class yacyClient {
urlEntry.quality(),
urlEntry.language(),
urlEntry.doctype(),
0,0,
false
);
} else {

@ -50,6 +50,7 @@ import java.util.HashMap;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaSearchRankingProfile;
import de.anomic.plasma.plasmaURLPattern;
import de.anomic.plasma.plasmaSnippetCache;
import de.anomic.plasma.plasmaSearchTimingProfile;
@ -67,10 +68,12 @@ public class yacySearch extends Thread {
final private yacySeed targetPeer;
private int links;
private int maxDistance;
final private plasmaSearchTimingProfile profile;
final private plasmaSearchTimingProfile timingProfile;
final private plasmaSearchRankingProfile rankingProfile;
public yacySearch(Set wordhashes, int maxDistance, boolean global, yacySeed targetPeer,
plasmaCrawlLURL urlManager, plasmaWordIndexEntryContainer containerCache, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, plasmaSearchTimingProfile profile) {
plasmaCrawlLURL urlManager, plasmaWordIndexEntryContainer containerCache, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache,
plasmaSearchTimingProfile timingProfile, plasmaSearchRankingProfile rankingProfile) {
super("yacySearch_" + targetPeer.getName());
this.wordhashes = wordhashes;
this.global = global;
@ -81,11 +84,12 @@ public class yacySearch extends Thread {
this.targetPeer = targetPeer;
this.links = -1;
this.maxDistance = maxDistance;
this.profile = (plasmaSearchTimingProfile) profile.clone();
this.timingProfile = (plasmaSearchTimingProfile) timingProfile.clone();
this.rankingProfile = rankingProfile;
}
public void run() {
this.links = yacyClient.search(set2string(wordhashes), maxDistance, global, targetPeer, urlManager, containerCache, blacklist, snippetCache, profile);
this.links = yacyClient.search(set2string(wordhashes), maxDistance, global, targetPeer, urlManager, containerCache, blacklist, snippetCache, timingProfile, rankingProfile);
if (links != 0) {
//yacyCore.log.logInfo("REMOTE SEARCH - remote peer " + targetPeer.hash + ":" + targetPeer.getName() + " contributed " + links + " links for word hash " + wordhashes);
yacyCore.seedDB.mySeed.incRI(links);
@ -104,8 +108,8 @@ public class yacySearch extends Thread {
return this.links;
}
public plasmaSearchTimingProfile profile() {
return this.profile;
public plasmaSearchTimingProfile timingProfile() {
return this.timingProfile;
}
public yacySeed target() {
@ -175,7 +179,8 @@ public class yacySearch extends Thread {
}
public static yacySearch[] searchHashes(Set wordhashes, int maxDist, plasmaCrawlLURL urlManager, plasmaWordIndexEntryContainer containerCache,
int targets, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, plasmaSearchTimingProfile profile) {
int targets, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache,
plasmaSearchTimingProfile timingProfile, plasmaSearchRankingProfile rankingProfile) {
// check own peer status
if (yacyCore.seedDB.mySeed == null || yacyCore.seedDB.mySeed.getAddress() == null) { return null; }
@ -186,9 +191,9 @@ public class yacySearch extends Thread {
targets = targetPeers.length;
if (targets == 0) return null;
yacySearch[] searchThreads = new yacySearch[targets];
for (int i = 0; i < targets; i++) {
for (int i = 0; i < targets; i++) {
searchThreads[i]= new yacySearch(wordhashes, maxDist, true, targetPeers[i],
urlManager, containerCache, blacklist, snippetCache, profile);
urlManager, containerCache, blacklist, snippetCache, timingProfile, rankingProfile);
searchThreads[i].start();
try {Thread.sleep(20);} catch (InterruptedException e) {}

Loading…
Cancel
Save