From 85dc62c16fbe61f9c9c2dd6ace000ee53127898d Mon Sep 17 00:00:00 2001 From: orbiter Date: Tue, 22 Jan 2008 19:03:47 +0000 Subject: [PATCH] refactoring: more dublin core - compliant naming git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4354 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/IndexControlRWIs_p.java | 55 ++++++++++--------- source/de/anomic/index/indexRWIEntry.java | 11 ++-- .../de/anomic/index/indexRWIEntryOrder.java | 10 ++-- source/de/anomic/plasma/plasmaCondenser.java | 17 +++--- 4 files changed, 48 insertions(+), 45 deletions(-) diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java index 88c518e09..bbf12c3fe 100644 --- a/htroot/IndexControlRWIs_p.java +++ b/htroot/IndexControlRWIs_p.java @@ -39,6 +39,7 @@ import de.anomic.data.listManager; import de.anomic.http.httpHeader; import de.anomic.index.indexContainer; import de.anomic.index.indexRWIEntry; +import de.anomic.index.indexRWIRowEntry; import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBitfield; import de.anomic.plasma.plasmaCondenser; @@ -115,11 +116,11 @@ public class IndexControlRWIs_p { // generate an urlx array indexContainer index = null; index = sb.wordIndex.getContainer(keyhash, null); - Iterator en = index.entries(); + Iterator en = index.entries(); int i = 0; urlx = new String[index.size()]; while (en.hasNext()) { - urlx[i++] = ((indexRWIEntry) en.next()).urlHash(); + urlx[i++] = en.next().urlHash(); } index = null; } @@ -146,7 +147,7 @@ public class IndexControlRWIs_p { sb.urlRemove(urlx[i]); } } - Set urlHashes = new HashSet(); + Set urlHashes = new HashSet(); for (int i = 0; i < urlx.length; i++) urlHashes.add(urlx[i]); sb.wordIndex.removeEntries(keyhash, urlHashes); // this shall lead to a presentation of the list; so handle that the remaining program @@ -193,13 +194,13 @@ public class IndexControlRWIs_p { long starttime = System.currentTimeMillis(); index = sb.wordIndex.getContainer(keyhash, null); // built urlCache - Iterator urlIter = index.entries(); - HashMap knownURLs = new HashMap(); - HashSet unknownURLEntries = new HashSet(); + Iterator urlIter = index.entries(); + HashMap knownURLs = new HashMap(); + HashSet unknownURLEntries = new HashSet(); indexRWIEntry iEntry; indexURLEntry lurl; while (urlIter.hasNext()) { - iEntry = (indexRWIEntry) urlIter.next(); + iEntry = urlIter.next(); lurl = sb.wordIndex.loadedURL.load(iEntry.urlHash(), null, 0); if (lurl == null) { unknownURLEntries.add(iEntry.urlHash()); @@ -212,7 +213,7 @@ public class IndexControlRWIs_p { // transport to other peer String gzipBody = sb.getConfig("indexControl.gzipBody","false"); int timeout = (int) sb.getConfigLong("indexControl.timeout",60000); - HashMap resultObj = yacyClient.transferIndex( + HashMap resultObj = yacyClient.transferIndex( seed, new indexContainer[]{index}, knownURLs, @@ -225,7 +226,7 @@ public class IndexControlRWIs_p { // generate list if (post.containsKey("keyhashsimilar")) { - final Iterator containerIt = sb.wordIndex.indexContainerSet(keyhash, false, true, 256).iterator(); + final Iterator containerIt = sb.wordIndex.indexContainerSet(keyhash, false, true, 256).iterator(); indexContainer container; int i = 0; int rows = 0, cols = 0; @@ -248,7 +249,7 @@ public class IndexControlRWIs_p { if (post.containsKey("blacklist")) { String blacklist = post.get("blacklist", ""); - Set urlHashes = new HashSet(); + Set urlHashes = new HashSet(); if (post.containsKey("blacklisturls")) { PrintWriter pw; try { @@ -323,11 +324,11 @@ public class IndexControlRWIs_p { if (post.get("flags","").length() == 0) return null; return new kelondroBitfield(4, (String) post.get("flags")); } - if (post.get("reference", "").equals("on")) b.set(indexRWIEntry.flag_app_reference, true); - if (post.get("description", "").equals("on")) b.set(indexRWIEntry.flag_app_descr, true); - if (post.get("author", "").equals("on")) b.set(indexRWIEntry.flag_app_author, true); - if (post.get("tag", "").equals("on")) b.set(indexRWIEntry.flag_app_tags, true); - if (post.get("url", "").equals("on")) b.set(indexRWIEntry.flag_app_url, true); + if (post.get("reference", "").equals("on")) b.set(indexRWIEntry.flag_app_dc_description, true); + if (post.get("description", "").equals("on")) b.set(indexRWIEntry.flag_app_dc_title, true); + if (post.get("author", "").equals("on")) b.set(indexRWIEntry.flag_app_dc_creator, true); + if (post.get("tag", "").equals("on")) b.set(indexRWIEntry.flag_app_dc_subject, true); + if (post.get("url", "").equals("on")) b.set(indexRWIEntry.flag_app_dc_identifier, true); if (post.get("emphasized", "").equals("on")) b.set(indexRWIEntry.flag_app_emphasized, true); if (post.get("image", "").equals("on")) b.set(plasmaCondenser.flag_cat_hasimage, true); if (post.get("audio", "").equals("on")) b.set(plasmaCondenser.flag_cat_hasaudio, true); @@ -343,7 +344,7 @@ public class IndexControlRWIs_p { int hc = 0; prop.put("searchresult_keyhash", startHash); if (yacyCore.seedDB != null && yacyCore.seedDB.sizeConnected() > 0) { - Iterator e = yacyCore.dhtAgent.getAcceptRemoteIndexSeeds(startHash); + Iterator e = yacyCore.dhtAgent.getAcceptRemoteIndexSeeds(startHash); while (e.hasNext()) { seed = (yacySeed) e.next(); if (seed != null) { @@ -369,11 +370,11 @@ public class IndexControlRWIs_p { } else { prop.put("searchresult", 3); prop.put("searchresult_allurl", ranked.filteredCount()); - prop.put("searchresult_reference", ranked.flagCount()[indexRWIEntry.flag_app_reference]); - prop.put("searchresult_description", ranked.flagCount()[indexRWIEntry.flag_app_descr]); - prop.put("searchresult_author", ranked.flagCount()[indexRWIEntry.flag_app_author]); - prop.put("searchresult_tag", ranked.flagCount()[indexRWIEntry.flag_app_tags]); - prop.put("searchresult_url", ranked.flagCount()[indexRWIEntry.flag_app_url]); + prop.put("searchresult_reference", ranked.flagCount()[indexRWIEntry.flag_app_dc_description]); + prop.put("searchresult_description", ranked.flagCount()[indexRWIEntry.flag_app_dc_title]); + prop.put("searchresult_author", ranked.flagCount()[indexRWIEntry.flag_app_dc_creator]); + prop.put("searchresult_tag", ranked.flagCount()[indexRWIEntry.flag_app_dc_subject]); + prop.put("searchresult_url", ranked.flagCount()[indexRWIEntry.flag_app_dc_identifier]); prop.put("searchresult_emphasized", ranked.flagCount()[indexRWIEntry.flag_app_emphasized]); prop.put("searchresult_image", ranked.flagCount()[plasmaCondenser.flag_cat_hasimage]); prop.put("searchresult_audio", ranked.flagCount()[plasmaCondenser.flag_cat_hasaudio]); @@ -439,11 +440,11 @@ public class IndexControlRWIs_p { ((entry.word().flags().get(plasmaCondenser.flag_cat_hasaudio)) ? "contains audio, " : "") + ((entry.word().flags().get(plasmaCondenser.flag_cat_hasvideo)) ? "contains video, " : "") + ((entry.word().flags().get(plasmaCondenser.flag_cat_hasapp)) ? "contains applications, " : "") + - ((entry.word().flags().get(indexRWIEntry.flag_app_url)) ? "appears in url, " : "") + - ((entry.word().flags().get(indexRWIEntry.flag_app_descr)) ? "appears in description, " : "") + - ((entry.word().flags().get(indexRWIEntry.flag_app_author)) ? "appears in author, " : "") + - ((entry.word().flags().get(indexRWIEntry.flag_app_tags)) ? "appears in tags, " : "") + - ((entry.word().flags().get(indexRWIEntry.flag_app_reference)) ? "appears in reference, " : "") + + ((entry.word().flags().get(indexRWIEntry.flag_app_dc_identifier)) ? "appears in url, " : "") + + ((entry.word().flags().get(indexRWIEntry.flag_app_dc_title)) ? "appears in description, " : "") + + ((entry.word().flags().get(indexRWIEntry.flag_app_dc_creator)) ? "appears in author, " : "") + + ((entry.word().flags().get(indexRWIEntry.flag_app_dc_subject)) ? "appears in tags, " : "") + + ((entry.word().flags().get(indexRWIEntry.flag_app_dc_description)) ? "appears in reference, " : "") + ((entry.word().flags().get(indexRWIEntry.flag_app_emphasized)) ? "appears emphasized, " : "") + ((yacyURL.probablyRootURL(entry.word().urlHash())) ? "probably root url" : "") ); @@ -453,7 +454,7 @@ public class IndexControlRWIs_p { i++; if ((maxlines >= 0) && (i >= maxlines)) break; } - Iterator iter = ranked.miss(); // iterates url hash strings + Iterator iter = ranked.miss(); // iterates url hash strings while (iter.hasNext()) { us = (String) iter.next(); prop.put("genUrlList_urlList_"+i+"_urlExists", "0"); diff --git a/source/de/anomic/index/indexRWIEntry.java b/source/de/anomic/index/indexRWIEntry.java index 613a78dc9..b8429aa59 100644 --- a/source/de/anomic/index/indexRWIEntry.java +++ b/source/de/anomic/index/indexRWIEntry.java @@ -32,12 +32,13 @@ import de.anomic.kelondro.kelondroRow.Entry; public interface indexRWIEntry { // appearance flags, used in RWI entry + // some names are derived from the Dublin Core Metadata tag set // the flags 0..23 are identical to the category flags in plasmaCondenser - public static final int flag_app_reference = 24; // word appears in anchor description text (the reference to an url), or any alternative text field of a link - public static final int flag_app_descr = 25; // word appears in headline (or any description part) - public static final int flag_app_author = 26; // word appears in author - public static final int flag_app_tags = 27; // word appears in header tags - public static final int flag_app_url = 28; // word appears in url + public static final int flag_app_dc_description= 24; // word appears in anchor description text (the reference to an url), or any alternative text field of a link + public static final int flag_app_dc_title = 25; // word appears in title or headline or any description part + public static final int flag_app_dc_creator = 26; // word appears in author + public static final int flag_app_dc_subject = 27; // word appears in header tags or other descriptive part + public static final int flag_app_dc_identifier = 28; // word appears in url or document identifier public static final int flag_app_emphasized = 29; // word is emphasized in text (i.e. bold, italics, special size) public String toPropertyForm(); diff --git a/source/de/anomic/index/indexRWIEntryOrder.java b/source/de/anomic/index/indexRWIEntryOrder.java index 3176fa522..6f7ccbdd0 100644 --- a/source/de/anomic/index/indexRWIEntryOrder.java +++ b/source/de/anomic/index/indexRWIEntryOrder.java @@ -135,11 +135,11 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder imp + ( (((t.lother() - min.lother() ) << 8) / (1 + max.lother() - min.lother()) ) << ranking.coeff_lother) + ( (((t.hitcount() - min.hitcount() ) << 8) / (1 + max.hitcount() - min.hitcount()) ) << ranking.coeff_hitcount) + ( authority(t.urlHash()) << ranking.coeff_authority) - + (((flags.get(indexRWIEntry.flag_app_url)) ? 255 << ranking.coeff_appurl : 0)) - + (((flags.get(indexRWIEntry.flag_app_descr)) ? 255 << ranking.coeff_appdescr : 0)) - + (((flags.get(indexRWIEntry.flag_app_author)) ? 255 << ranking.coeff_appauthor : 0)) - + (((flags.get(indexRWIEntry.flag_app_tags)) ? 255 << ranking.coeff_apptags : 0)) - + (((flags.get(indexRWIEntry.flag_app_reference)) ? 255 << ranking.coeff_appref : 0)) + + (((flags.get(indexRWIEntry.flag_app_dc_identifier)) ? 255 << ranking.coeff_appurl : 0)) + + (((flags.get(indexRWIEntry.flag_app_dc_title)) ? 255 << ranking.coeff_appdescr : 0)) + + (((flags.get(indexRWIEntry.flag_app_dc_creator)) ? 255 << ranking.coeff_appauthor : 0)) + + (((flags.get(indexRWIEntry.flag_app_dc_subject)) ? 255 << ranking.coeff_apptags : 0)) + + (((flags.get(indexRWIEntry.flag_app_dc_description)) ? 255 << ranking.coeff_appref : 0)) + (((flags.get(indexRWIEntry.flag_app_emphasized)) ? 255 << ranking.coeff_appemph : 0)) + (((flags.get(plasmaCondenser.flag_cat_indexof)) ? 255 << ranking.coeff_catindexof : 0)) + (((flags.get(plasmaCondenser.flag_cat_hasimage)) ? 255 << ranking.coeff_cathasimage : 0)) diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java index 5b0a80cc5..01d0edb6f 100644 --- a/source/de/anomic/plasma/plasmaCondenser.java +++ b/source/de/anomic/plasma/plasmaCondenser.java @@ -93,12 +93,12 @@ public final class plasmaCondenser { public static final int flag_cat_entertainment = 10; // boulevard, entertainment, cultural content public static final int flag_cat_knowledge = 11; // science, school stuff, help for homework public static final int flag_cat_computer = 12; // any computer related stuff, networks, operation systems - public static final int flag_cat_p2p = 13; // p2p support, filesharing archives etc. + public static final int flag_cat_p2p = 13; // p2p support, file-sharing archives etc. public static final int flag_cat_sex = 14; // sexual content public static final int flag_cat_spam = 15; // pages that anybody would consider as not interesting public static final int flag_cat_linux = 16; // pages about linux software public static final int flag_cat_macos = 17; // pages about macintosh, apple computers and the mac os - public static final int flag_cat_windows = 18; // pages about windows os and softare + public static final int flag_cat_windows = 18; // pages about windows os and software public static final int flag_cat_osreserve = 19; // reserve public static final int flag_cat_hasimage = 20; // the page refers to (at least one) images public static final int flag_cat_hasaudio = 21; // the page refers to (at least one) audio file @@ -131,7 +131,7 @@ public final class plasmaCondenser { //System.out.println("DEBUG: condensing " + document.getMainLongTitle() + ", indexText=" + Boolean.toString(indexText) + ", indexMedia=" + Boolean.toString(indexMedia)); - insertTextToWords(document.dc_source().toNormalform(false, true), 0, indexRWIEntry.flag_app_url, RESULT_FLAGS); + insertTextToWords(document.dc_source().toNormalform(false, true), 0, indexRWIEntry.flag_app_dc_identifier, RESULT_FLAGS); Map.Entry entry; if (indexText) { @@ -148,9 +148,9 @@ public final class plasmaCondenser { // phrase 99 is taken from the media Link url and anchor description // phrase 100 and above are lines from the text - insertTextToWords(document.dc_title(), 1, indexRWIEntry.flag_app_descr, RESULT_FLAGS); - insertTextToWords(document.dc_description(), 3, indexRWIEntry.flag_app_descr, RESULT_FLAGS); - insertTextToWords(document.dc_creator(), 4, indexRWIEntry.flag_app_descr, RESULT_FLAGS); + insertTextToWords(document.dc_title(), 1, indexRWIEntry.flag_app_dc_title, RESULT_FLAGS); + insertTextToWords(document.dc_description(), 3, indexRWIEntry.flag_app_dc_description, RESULT_FLAGS); + insertTextToWords(document.dc_creator(), 4, indexRWIEntry.flag_app_dc_creator, RESULT_FLAGS); // missing: tags! String[] titles = document.getSectionTitles(); for (int i = 0; i < titles.length; i++) { @@ -161,8 +161,9 @@ public final class plasmaCondenser { Iterator> i = document.getAnchors().entrySet().iterator(); while (i.hasNext()) { entry = i.next(); - insertTextToWords(entry.getKey().toNormalform(false, false), 98, indexRWIEntry.flag_app_reference, RESULT_FLAGS); - insertTextToWords((String) entry.getValue(), 98, indexRWIEntry.flag_app_reference, RESULT_FLAGS); + if ((entry == null) || (entry.getKey() == null)) continue; + insertTextToWords(entry.getKey().toNormalform(false, false), 98, indexRWIEntry.flag_app_dc_identifier, RESULT_FLAGS); + insertTextToWords((String) entry.getValue(), 98, indexRWIEntry.flag_app_dc_description, RESULT_FLAGS); } } else { this.RESULT_NUMB_WORDS = 0;