- fixed 'appears in url' flag in index generation

- extended the index administration page; it now shows some properties of the web links

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4216 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 17 years ago
parent 19176e12e2
commit 3491531cea

@ -80,11 +80,36 @@
::
<p>#[count]# URL entries related to this word hash <span class="tt">#[keyHash]#</span></p>
<form action="IndexControl_p.html" method="post" enctype="multipart/form-data">
<p>#{urlList}# #(urlExists)#
<input type="checkbox" id="urlhx.#[urlhxCount]#" name="urlhx.#[urlhxCount]#" checked value="#[urlhxValue]#" align="top" /><label for="urlhx.#[urlhxCount]#" class="tt">#[urlhxValue]#&nbsp;&lt;unresolved URL Hash&gt;</span><br />
::<input type="checkbox" id="urlhx.#[urlhxCount]#" name="urlhx.#[urlhxCount]#" #(urlhxChecked)#::checked="checked" #(/urlhxChecked)#value="#[urlhxValue]#" align="top" />
<a href="/IndexControl_p.html?keystring=#[keyString]#&amp;keyhash=#[keyHash]#&amp;urlhash=#[urlhxValue]#&amp;urlstringsearch=&amp;urlstring=#[urlString]#" class="tt">#[urlhxValue]#</a><label for="urlhx.#[urlhxCount]#" class="tt">&nbsp;#[urlString]#, pos=#[pos]#</label><br />
#(/urlExists)# #{/urlList}#</p>
<p>
<table border="0" cellpadding="2" cellspacing="1">
<tr class="TableHeader"><td>&nbsp;</td><td>hash</td><td>url</td><td>pos</td><td>phrase</td><td>urlcomps</td><td>urllength</td><td width="60%">props</td></tr>
#{urlList}#
<tr class="TableCellLight">
#(urlExists)#
<td><input type="checkbox" id="urlhx.#[urlhxCount]#" name="urlhx.#[urlhxCount]#" checked value="#[urlhxValue]#" align="top" />
<label for="urlhx.#[urlhxCount]#" class="tt">#[urlhxValue]#</label></td>
<td>&lt;unresolved URL Hash&gt;</td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
</span>
::
<td><input type="checkbox" id="urlhx.#[urlhxCount]#" name="urlhx.#[urlhxCount]#" #(urlhxChecked)#::checked="checked" #(/urlhxChecked)#value="#[urlhxValue]#" align="top" />
<label for="urlhx.#[urlhxCount]#" class="tt"></label></td>
<td><a href="/IndexControl_p.html?keystring=#[keyString]#&amp;keyhash=#[keyHash]#&amp;urlhash=#[urlhxValue]#&amp;urlstringsearch=&amp;urlstring=#[urlString]#" class="tt">#[urlhxValue]#</a></td>
<td><a href="#[urlString]#">#[urlStringShort]#</a></td>
<td>#[pos]#</td>
<td>#[phrase]#</td>
<td>#[urlcomps]#</td>
<td>#[urllength]#</td>
<td>#[props]#</td>
#(/urlExists)#
</tr>
#{/urlList}#
</table>
<input type="hidden" name="keystring" value="#[keyString]#" />
<input type="hidden" name="keyhash" value="#[keyHash]#" />
<input type="hidden" name="urlstring" value="" />

@ -54,6 +54,7 @@ import java.net.MalformedURLException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
@ -502,42 +503,63 @@ public class IndexControl_p {
} else {
final Iterator en = index.entries();
prop.put("genUrlList", "2");
String us;
String uh[] = new String[2];
int i = 0;
// first generate a new map where the urls are sorted (not by hash but by the url text)
final TreeMap tm = new TreeMap();
indexRWIEntry xi;
indexURLEntry le;
Object[] wu;
while (en.hasNext()) {
xi = (indexRWIEntry) en.next();
uh = new String[]{xi.urlHash(), Integer.toString(xi.posintext())};
indexURLEntry le = switchboard.wordIndex.loadedURL.load(uh[0], null);
le = switchboard.wordIndex.loadedURL.load(xi.urlHash(), null);
wu = new Object[]{xi, le};
if (le == null) {
tm.put(uh[0], uh);
tm.put(xi.urlHash(), wu);
} else {
us = le.comp().url().toNormalform(false, true);
tm.put(us, uh);
tm.put(le.comp().url().toNormalform(false, true), wu);
}
}
yacyURL url;
final Iterator iter = tm.keySet().iterator();
final Iterator iter = tm.entrySet().iterator();
Map.Entry entry;
String us;
while (iter.hasNext()) {
us = iter.next().toString();
uh = (String[]) tm.get(us);
if (us.equals(uh[0])) {
entry = (Map.Entry) iter.next();
us = (String) entry.getKey();
wu = (Object[]) entry.getValue();
xi = (indexRWIEntry) wu[0];
le = (indexURLEntry) wu[1];
if (us.equals(xi.urlHash())) {
prop.put("genUrlList_urlList_"+i+"_urlExists", "0");
prop.put("genUrlList_urlList_"+i+"_urlExists_urlhxCount", i);
prop.putHTML("genUrlList_urlList_"+i+"_urlExists_urlhxValue", uh[0]);
prop.putHTML("genUrlList_urlList_"+i+"_urlExists_urlhxValue", xi.urlHash());
} else {
prop.put("genUrlList_urlList_"+i+"_urlExists", "1");
prop.put("genUrlList_urlList_"+i+"_urlExists_urlhxCount", i);
prop.putHTML("genUrlList_urlList_"+i+"_urlExists_urlhxValue", uh[0]);
prop.putHTML("genUrlList_urlList_"+i+"_urlExists_urlhxValue", xi.urlHash());
prop.putHTML("genUrlList_urlList_"+i+"_urlExists_keyString", keystring);
prop.put("genUrlList_urlList_"+i+"_urlExists_keyHash", keyhash);
prop.putHTML("genUrlList_urlList_"+i+"_urlExists_urlString", us);
prop.put("genUrlList_urlList_"+i+"_urlExists_pos", uh[1]);
prop.putHTML("genUrlList_urlList_"+i+"_urlExists_urlStringShort", (us.length() > 60) ? (us.substring(0, 60) + "...") : us);
prop.put("genUrlList_urlList_"+i+"_urlExists_pos", xi.posintext());
prop.put("genUrlList_urlList_"+i+"_urlExists_phrase", xi.posofphrase());
prop.put("genUrlList_urlList_"+i+"_urlExists_urlcomps", xi.urlcomps());
prop.put("genUrlList_urlList_"+i+"_urlExists_urllength", xi.urllength());
prop.put("genUrlList_urlList_"+i+"_urlExists_props",
((xi.flags().get(plasmaCondenser.flag_cat_hasimage)) ? "contains images, " : "") +
((xi.flags().get(plasmaCondenser.flag_cat_hasaudio)) ? "contains audio, " : "") +
((xi.flags().get(plasmaCondenser.flag_cat_hasvideo)) ? "contains video, " : "") +
((xi.flags().get(plasmaCondenser.flag_cat_hasapp)) ? "contains applications, " : "") +
((xi.flags().get(indexRWIEntry.flag_app_url)) ? "appears in url, " : "") +
((xi.flags().get(indexRWIEntry.flag_app_descr)) ? "appears in description, " : "") +
((xi.flags().get(indexRWIEntry.flag_app_author)) ? "appears in author, " : "") +
((xi.flags().get(indexRWIEntry.flag_app_tags)) ? "appears in tags, " : "") +
((xi.flags().get(indexRWIEntry.flag_app_reference)) ? "appears in reference, " : "") +
((xi.flags().get(indexRWIEntry.flag_app_emphasized)) ? "appears emphasized" : "")
);
prop.put("genUrlList_urlList_"+i+"_urlExists_phrase", xi.posofphrase());
prop.put("genUrlList_urlList_"+i+"_urlExists_phrase", xi.posofphrase());
url = new yacyURL(us, null);
if (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, url)) {
prop.put("genUrlList_urlList_"+i+"_urlExists_urlhxChecked", "1");

@ -33,11 +33,11 @@ public interface indexRWIEntry {
// appearance flags, used in RWI entry
// the flags 0..23 are identical to the category flags in plasmaCondenser
public static final int flag_app_url = 24; // word appears in url
public static final int flag_app_reference = 24; // word appears in anchor description text (the reference to an url), or any alternative text field of a link
public static final int flag_app_descr = 25; // word appears in headline (or any description part)
public static final int flag_app_author = 26; // word appears in author
public static final int flag_app_tags = 27; // word appears in header tags
public static final int flag_app_reference = 28; // word appears in anchor description text (the reference to an url), or any alternative text field of a link
public static final int flag_app_url = 28; // word appears in url
public static final int flag_app_emphasized = 29; // word is emphasized in text (i.e. bold, italics, special size)
public String toPropertyForm();

@ -87,16 +87,7 @@ public final class indexRWIRowEntry implements indexRWIEntry {
private static final int col_posofphrase = 17; // o 1 number of the phrase where word appears
private static final int col_worddistance = 18; // i 1 initial zero; may be used as reserve: is filled during search
private static final int col_reserve = 19; // k 1 reserve
// appearance flags, used in RWI entry
// the flags 0..23 are identical to the category flags in plasmaCondenser
public static final int flag_app_url = 24; // word appears in url
public static final int flag_app_descr = 25; // word appears in headline (or any description part)
public static final int flag_app_author = 26; // word appears in author
public static final int flag_app_tags = 27; // word appears in header tags
public static final int flag_app_reference = 28; // word appears in anchor description text (the reference to an url), or any alternative text field of a link
public static final int flag_app_emphasized = 29; // word is emphasized in text (i.e. bold, italics, special size)
private kelondroRow.Entry entry;
public indexRWIRowEntry(String urlHash,

@ -160,7 +160,7 @@ public class indexURLEntry {
this.entry.setCol(col_laudio, laudio);
this.entry.setCol(col_lvideo, lvideo);
this.entry.setCol(col_lapp, lapp);
System.out.println("===DEBUG=== " + load.toString() + ", " + decodeDate(col_load).toString());
//System.out.println("===DEBUG=== " + load.toString() + ", " + decodeDate(col_load).toString());
this.snippet = null;
this.word = null;
}

@ -126,20 +126,19 @@ public final class plasmaCondenser {
this.wordcut = 2;
this.words = new TreeMap();
this.sentences = new HashMap();
this.RESULT_FLAGS = new kelondroBitfield(4);
//System.out.println("DEBUG: condensing " + document.getMainLongTitle() + ", indexText=" + Boolean.toString(indexText) + ", indexMedia=" + Boolean.toString(indexMedia));
// construct flag set for document
if (document.getImages().size() > 0) RESULT_FLAGS.set(flag_cat_hasimage, true);
if (document.getAudiolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasaudio, true);
if (document.getVideolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasvideo, true);
if (document.getApplinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasapp, true);
Map.Entry entry;
if (indexText) {
createCondensement(document.getText(), document.getCharset());
kelondroBitfield wflags = (kelondroBitfield) RESULT_FLAGS.clone(); // the template for the word flags, only from position 0..19
// construct flag set for document
if (document.getImages().size() > 0) RESULT_FLAGS.set(flag_cat_hasimage, true);
if (document.getAudiolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasaudio, true);
if (document.getVideolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasvideo, true);
if (document.getApplinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasapp, true);
createCondensement(document.getText(), document.getCharset());
// the phrase counter:
// phrase 0 are words taken from the URL
// phrase 1 is the MainTitle
@ -152,22 +151,22 @@ public final class plasmaCondenser {
// phrase 99 is taken from the media Link url and anchor description
// phrase 100 and above are lines from the text
insertTextToWords(document.getTitle(), 1, indexRWIEntry.flag_app_descr, wflags);
//insertTextToWords(document.getTitle(), 2, indexRWIEntryNew.flag_app_descr, wflags);
insertTextToWords(document.getAbstract(), 3, indexRWIEntry.flag_app_descr, wflags);
insertTextToWords(document.getAuthor(), 4, indexRWIEntry.flag_app_descr, wflags);
insertTextToWords(document.getLocation().toNormalform(false, true), 0, indexRWIEntry.flag_app_url, RESULT_FLAGS);
insertTextToWords(document.getTitle(), 1, indexRWIEntry.flag_app_descr, RESULT_FLAGS);
insertTextToWords(document.getAbstract(), 3, indexRWIEntry.flag_app_descr, RESULT_FLAGS);
insertTextToWords(document.getAuthor(), 4, indexRWIEntry.flag_app_descr, RESULT_FLAGS);
// missing: tags!
String[] titles = document.getSectionTitles();
for (int i = 0; i < titles.length; i++) {
insertTextToWords(titles[i], i + 10, indexRWIEntry.flag_app_emphasized, wflags);
insertTextToWords(titles[i], i + 10, indexRWIEntry.flag_app_emphasized, RESULT_FLAGS);
}
// anchors
Iterator i = document.getAnchors().entrySet().iterator();
while (i.hasNext()) {
entry = (Map.Entry) i.next();
insertTextToWords((String) entry.getKey(), 98, indexRWIEntry.flag_app_url, wflags);
insertTextToWords((String) entry.getValue(), 98, indexRWIEntry.flag_app_url, wflags);
insertTextToWords((String) entry.getKey(), 98, indexRWIEntry.flag_app_reference, RESULT_FLAGS);
insertTextToWords((String) entry.getValue(), 98, indexRWIEntry.flag_app_reference, RESULT_FLAGS);
}
} else {
this.RESULT_NUMB_WORDS = 0;
@ -177,30 +176,28 @@ public final class plasmaCondenser {
}
if (indexMedia) {
kelondroBitfield wflags = (kelondroBitfield) RESULT_FLAGS.clone(); // the template for the word flags, only from position 0..19
// audio
Iterator i = document.getAudiolinks().entrySet().iterator();
while (i.hasNext()) {
entry = (Map.Entry) i.next();
insertTextToWords((String) entry.getKey(), 99, flag_cat_hasaudio, wflags);
insertTextToWords((String) entry.getValue(), 99, flag_cat_hasaudio, wflags);
insertTextToWords((String) entry.getKey(), 99, flag_cat_hasaudio, RESULT_FLAGS);
insertTextToWords((String) entry.getValue(), 99, flag_cat_hasaudio, RESULT_FLAGS);
}
// video
i = document.getVideolinks().entrySet().iterator();
while (i.hasNext()) {
entry = (Map.Entry) i.next();
insertTextToWords((String) entry.getKey(), 99, flag_cat_hasvideo, wflags);
insertTextToWords((String) entry.getValue(), 99, flag_cat_hasvideo, wflags);
insertTextToWords((String) entry.getKey(), 99, flag_cat_hasvideo, RESULT_FLAGS);
insertTextToWords((String) entry.getValue(), 99, flag_cat_hasvideo, RESULT_FLAGS);
}
// applications
i = document.getApplinks().entrySet().iterator();
while (i.hasNext()) {
entry = (Map.Entry) i.next();
insertTextToWords((String) entry.getKey(), 99, flag_cat_hasapp, wflags);
insertTextToWords((String) entry.getValue(), 99, flag_cat_hasapp, wflags);
insertTextToWords((String) entry.getKey(), 99, flag_cat_hasapp, RESULT_FLAGS);
insertTextToWords((String) entry.getValue(), 99, flag_cat_hasapp, RESULT_FLAGS);
}
// images
@ -208,8 +205,8 @@ public final class plasmaCondenser {
htmlFilterImageEntry ientry;
while (i.hasNext()) {
ientry = (htmlFilterImageEntry) i.next();
insertTextToWords(ientry.url().toNormalform(false, true), 99, flag_cat_hasimage, wflags);
insertTextToWords(ientry.alt(), 99, flag_cat_hasimage, wflags);
insertTextToWords(ientry.url().toNormalform(false, true), 99, flag_cat_hasimage, RESULT_FLAGS);
insertTextToWords(ientry.alt(), 99, flag_cat_hasimage, RESULT_FLAGS);
}
// finally check all words for missing flag entry
@ -219,7 +216,7 @@ public final class plasmaCondenser {
entry = (Map.Entry) i.next();
wprop = (wordStatProp) entry.getValue();
if (wprop.flags == null) {
wprop.flags = (kelondroBitfield) wflags.clone();
wprop.flags = (kelondroBitfield) RESULT_FLAGS.clone();
words.put(entry.getKey(), wprop);
}
}
@ -241,8 +238,6 @@ public final class plasmaCondenser {
wprop = (wordStatProp) words.get(word);
if (wprop == null) wprop = new wordStatProp(0, pip, phrase);
if (wprop.flags == null) wprop.flags = (kelondroBitfield) flagstemplate.clone();
wprop.numOfPhrase = 1;
wprop.posInPhrase = pip;
wprop.flags.set(flagpos, true);
words.put(word, wprop);
pip++;

Loading…
Cancel
Save