From ad1e4aa88e4aeffd2aa309c257014cd6f0711459 Mon Sep 17 00:00:00 2001 From: orbiter Date: Fri, 1 Dec 2006 16:21:17 +0000 Subject: [PATCH] added selection of audio, video, image and application resources to search procedure. This function cannot currently be used through the search interface, but only through remote search. added accumulation of search attributes to enable the audio, video, image and application selection. fixed a problem with external URL representation generation git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3036 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/CacheAdmin_p.java | 15 +++- htroot/DetailedSearch.java | 2 +- htroot/yacy/search.java | 5 +- htroot/yacysearch.java | 1 + source/de/anomic/index/indexRWIEntryNew.java | 14 ++-- source/de/anomic/index/indexURLEntry.java | 6 ++ source/de/anomic/index/indexURLEntryNew.java | 11 +-- source/de/anomic/index/indexURLEntryOld.java | 25 ++++++ .../anomic/kelondro/kelondroBase64Order.java | 20 ++--- source/de/anomic/plasma/plasmaCondenser.java | 19 ++++- source/de/anomic/plasma/plasmaParser.java | 61 +++++++++----- .../anomic/plasma/plasmaParserDocument.java | 81 +++++++++---------- .../de/anomic/plasma/plasmaSearchEvent.java | 5 ++ .../anomic/plasma/plasmaSearchPreOrder.java | 6 ++ .../de/anomic/plasma/plasmaSearchQuery.java | 22 ++++- .../de/anomic/plasma/plasmaSwitchboard.java | 19 ++--- source/de/anomic/tools/nxTools.java | 2 +- 17 files changed, 205 insertions(+), 109 deletions(-) diff --git a/htroot/CacheAdmin_p.java b/htroot/CacheAdmin_p.java index 32282cb88..4692aae0b 100644 --- a/htroot/CacheAdmin_p.java +++ b/htroot/CacheAdmin_p.java @@ -54,6 +54,7 @@ import java.util.Map; import java.util.TreeSet; import de.anomic.htmlFilter.htmlFilterContentScraper; +import de.anomic.htmlFilter.htmlFilterImageEntry; import de.anomic.htmlFilter.htmlFilterWriter; import de.anomic.http.httpHeader; import de.anomic.net.URL; @@ -124,7 +125,7 @@ public class CacheAdmin_p { info.append("TITLE:
").append(scraper.getTitle()).append("
").append("
") .append("SECTION HEADLINES:
").append(formatTitles(document.getSectionTitles())).append("
") .append("HREF:
").append(formatAnchor(document.getHyperlinks())).append("
") - .append("IMAGE:
").append(formatAnchor(document.getImagelinks())).append("
") + .append("IMAGE:
").append(formatImageAnchor(document.getImages())).append("
") .append("AUDIO:
").append(formatAnchor(document.getAudiolinks())).append("
") .append("VIDEO:
").append(formatAnchor(document.getVideolinks())).append("
") .append("APPS:
").append(formatAnchor(document.getApplinks())).append("
") @@ -241,6 +242,18 @@ public class CacheAdmin_p { return result.append("").toString(); } + private static String formatImageAnchor(TreeSet anchor) { + final StringBuffer result = new StringBuffer((anchor.size() + 1) * 256); + result.append(""); + final Iterator iter = anchor.iterator(); + htmlFilterImageEntry ie; + while (iter.hasNext()) { + ie = (htmlFilterImageEntry) iter.next(); + result.append(""); + } + return result.append("
").append(ie.alt()).append(" ").append(ie.url().toNormalform()).append("
").toString(); + } + private static String linkPathString(String path, boolean dir){ final String[] elements = path.split("/"); final StringBuffer tmpstr = new StringBuffer(256); diff --git a/htroot/DetailedSearch.java b/htroot/DetailedSearch.java index 0c613969f..a69917c78 100644 --- a/htroot/DetailedSearch.java +++ b/htroot/DetailedSearch.java @@ -135,7 +135,7 @@ public class DetailedSearch { } // do the search - plasmaSearchQuery thisSearch = new plasmaSearchQuery(query, wdist, "", count, searchtime, urlmask, + plasmaSearchQuery thisSearch = new plasmaSearchQuery(query, wdist, "", plasmaSearchQuery.CONTENTDOM_TEXT, count, searchtime, urlmask, ((global) && (yacyonline) && (!(env.getConfig("last-search","").equals(querystring)))) ? plasmaSearchQuery.SEARCHDOM_GLOBALDHT : plasmaSearchQuery.SEARCHDOM_LOCAL, "", 20, plasmaSearchQuery.catchall_constraint); plasmaSearchRankingProfile localRanking = new plasmaSearchRankingProfile("local", post.toString()); diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index edc7600d5..96f929166 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -93,6 +93,7 @@ public final class search { final int count = post.getInt("count", 10); // maximum number of wanted results final int maxdist= post.getInt("maxdist", Integer.MAX_VALUE); final String prefer = post.get("prefer", ""); + final String contentdom = post.get("contentdom", "text"); final String filter = post.get("filter", ".*"); final boolean includesnippet = post.get("includesnippet", "false").equals("true"); final kelondroBitfield constraint = new kelondroBitfield(4, post.get("constraint", "______")); @@ -134,7 +135,7 @@ public final class search { plasmaSearchQuery squery = null; if ((query.length() == 0) && (abstractSet != null)) { // this is _not_ a normal search, only a request for index abstracts - squery = new plasmaSearchQuery(abstractSet, maxdist, prefer, count, duetime, filter, plasmaSearchQuery.catchall_constraint); + squery = new 
plasmaSearchQuery(abstractSet, maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), count, duetime, filter, plasmaSearchQuery.catchall_constraint); squery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL; yacyCore.log.logInfo("INIT HASH SEARCH (abstracts only): " + squery.anonymizedQueryHashes() + " - " + squery.wantedResults + " links"); @@ -161,7 +162,7 @@ public final class search { prop.put("joincount", 0); } else { // retrieve index containers from search request - squery = new plasmaSearchQuery(keyhashes, maxdist, prefer, count, duetime, filter, constraint); + squery = new plasmaSearchQuery(keyhashes, maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), count, duetime, filter, constraint); squery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL; yacyCore.log.logInfo("INIT HASH SEARCH (query-" + abstracts + "): " + squery.anonymizedQueryHashes() + " - " + squery.wantedResults + " links"); diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 05de0214b..308f4de05 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -240,6 +240,7 @@ public class yacysearch { query, maxDistance, prefermask, + plasmaSearchQuery.CONTENTDOM_TEXT, count, searchtime, urlmask, diff --git a/source/de/anomic/index/indexRWIEntryNew.java b/source/de/anomic/index/indexRWIEntryNew.java index 6c97eb6cf..8d857093d 100644 --- a/source/de/anomic/index/indexRWIEntryNew.java +++ b/source/de/anomic/index/indexRWIEntryNew.java @@ -89,13 +89,13 @@ public class indexRWIEntryNew implements Cloneable, indexRWIEntry { private static final int col_reserve = 19; // k 1 reserve // appearance flags, used in RWI entry - // the flags 0..15 are identical to the category flags in plasmaCondenser - public static final int flag_app_url = 16; // word appears in url - public static final int flag_app_descr = 17; // word appears in headline (or any description part) - public static final int flag_app_author = 18; // word appears in author - public static final int 
flag_app_tags = 19; // word appears in header tags - public static final int flag_app_reference = 20; // word appears in anchor description text (the reference to an url), or any alternative text field of a link - public static final int flag_app_emphasized = 21; // word is emphasized in text (i.e. bold, italics, special size) + // the flags 0..23 are identical to the category flags in plasmaCondenser + public static final int flag_app_url = 24; // word appears in url + public static final int flag_app_descr = 25; // word appears in headline (or any description part) + public static final int flag_app_author = 26; // word appears in author + public static final int flag_app_tags = 27; // word appears in header tags + public static final int flag_app_reference = 28; // word appears in anchor description text (the reference to an url), or any alternative text field of a link + public static final int flag_app_emphasized = 29; // word is emphasized in text (i.e. bold, italics, special size) private kelondroRow.Entry entry; diff --git a/source/de/anomic/index/indexURLEntry.java b/source/de/anomic/index/indexURLEntry.java index a4e994d2d..07c9e59ba 100644 --- a/source/de/anomic/index/indexURLEntry.java +++ b/source/de/anomic/index/indexURLEntry.java @@ -49,6 +49,12 @@ public interface indexURLEntry { public String language(); public int size(); public int wordCount(); + public int llocal(); + public int lother(); + public int limage(); + public int laudio(); + public int lvideo(); + public int lapp(); public String snippet(); public kelondroBitfield flags(); public indexRWIEntry word(); diff --git a/source/de/anomic/index/indexURLEntryNew.java b/source/de/anomic/index/indexURLEntryNew.java index 0ac0df9e8..04fb99e5e 100644 --- a/source/de/anomic/index/indexURLEntryNew.java +++ b/source/de/anomic/index/indexURLEntryNew.java @@ -194,6 +194,7 @@ public class indexURLEntryNew implements indexURLEntry { // generate a parseable string; this is a simple property-list 
indexURLEntry.Components comp = this.comp(); final StringBuffer s = new StringBuffer(300); + //System.out.println("author=" + comp.author()); try { s.append("hash=").append(hash()); s.append(",url=").append(crypt.simpleEncode(comp.url().toNormalform())); @@ -249,11 +250,11 @@ public class indexURLEntryNew implements indexURLEntry { public indexURLEntry.Components comp() { ArrayList cl = nxTools.strings(this.entry.getCol("comp", null), "UTF-8"); return new indexURLEntry.Components( - (cl.size() > 0) ? (String) cl.get(0) : "", - (cl.size() > 1) ? (String) cl.get(1) : "", - (cl.size() > 2) ? (String) cl.get(2) : "", - (cl.size() > 3) ? (String) cl.get(3) : "", - (cl.size() > 4) ? (String) cl.get(4) : ""); + (cl.size() > 0) ? ((String) cl.get(0)).trim() : "", + (cl.size() > 1) ? ((String) cl.get(1)).trim() : "", + (cl.size() > 2) ? ((String) cl.get(2)).trim() : "", + (cl.size() > 3) ? ((String) cl.get(3)).trim() : "", + (cl.size() > 4) ? ((String) cl.get(4)).trim() : ""); } public Date moddate() { diff --git a/source/de/anomic/index/indexURLEntryOld.java b/source/de/anomic/index/indexURLEntryOld.java index b28983c11..c71e5a8e5 100644 --- a/source/de/anomic/index/indexURLEntryOld.java +++ b/source/de/anomic/index/indexURLEntryOld.java @@ -363,4 +363,29 @@ public class indexURLEntryOld implements indexURLEntry { System.out.println(); } + // compatibility methods + public int lapp() { + return 0; + } + + public int laudio() { + return 0; + } + + public int limage() { + return 0; + } + + public int llocal() { + return 0; + } + + public int lother() { + return 0; + } + + public int lvideo() { + return 0; + } + } diff --git a/source/de/anomic/kelondro/kelondroBase64Order.java b/source/de/anomic/kelondro/kelondroBase64Order.java index 2ab6c9b76..6ea96c54e 100644 --- a/source/de/anomic/kelondro/kelondroBase64Order.java +++ b/source/de/anomic/kelondro/kelondroBase64Order.java @@ -46,6 +46,7 @@ package de.anomic.kelondro; +import java.io.UnsupportedEncodingException; import 
java.util.Comparator; import de.anomic.server.logging.serverLog; @@ -179,13 +180,18 @@ public class kelondroBase64Order extends kelondroAbstractOrder implements kelond } public final String encodeString(String in) { - return encode(in.getBytes()); + try { + return encode(in.getBytes("UTF-8")); + } catch (UnsupportedEncodingException e) { + return ""; + } } // we will use this encoding to encode strings with 2^8 values to // b64-Strings // we will do that by grouping each three input bytes to four output bytes. public final String encode(byte[] in) { + if (in.length == 0) return ""; StringBuffer out = new StringBuffer(in.length / 3 * 4 + 3); int pos = 0; long l; @@ -195,11 +201,8 @@ public class kelondroBase64Order extends kelondroAbstractOrder implements kelond out = out.append(encodeLong(l, 4)); } // now there may be remaining bytes - if (in.length % 3 != 0) - out = out.append((in.length % 3 == 2) ? encodeLong((((0XffL & (long) in[pos]) << 8) + (0XffL & (long) in[pos + 1])) << 8, 4).substring(0, 3) : encodeLong((((0XffL & (long) in[pos])) << 8) << 8, 4).substring(0, 2)); - if (rfc1113compliant) - while (out.length() % 4 > 0) - out.append("="); + if (in.length % 3 != 0) out = out.append((in.length % 3 == 2) ? 
encodeLong((((0XffL & (long) in[pos]) << 8) + (0XffL & (long) in[pos + 1])) << 8, 4).substring(0, 3) : encodeLong((((0XffL & (long) in[pos])) << 8) << 8, 4).substring(0, 2)); + if (rfc1113compliant) while (out.length() % 4 > 0) out.append("="); // return result return out.toString(); } @@ -215,12 +218,11 @@ public class kelondroBase64Order extends kelondroAbstractOrder implements kelond } public final byte[] decode(String in) { + if ((in == null) || (in.length() == 0)) return new byte[0]; try { int posIn = 0; int posOut = 0; - if (rfc1113compliant) - while (in.charAt(in.length() - 1) == '=') - in = in.substring(0, in.length() - 1); + if (rfc1113compliant) while (in.charAt(in.length() - 1) == '=') in = in.substring(0, in.length() - 1); byte[] out = new byte[in.length() / 4 * 3 + (((in.length() % 4) == 0) ? 0 : in.length() % 4 - 1)]; long l; while (posIn + 3 < in.length()) { diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java index 4c318bc46..ee7274c82 100644 --- a/source/de/anomic/plasma/plasmaCondenser.java +++ b/source/de/anomic/plasma/plasmaCondenser.java @@ -97,7 +97,10 @@ public final class plasmaCondenser { public static final int flag_cat_macos = 17; // pages about macintosh, apple computers and the mac os public static final int flag_cat_windows = 18; // pages about windows os and softare public static final int flag_cat_osreserve = 19; // reserve - + public static final int flag_cat_hasimage = 20; // the page refers to (at least one) images + public static final int flag_cat_hasaudio = 21; // the page refers to (at least one) audio file + public static final int flag_cat_hasvideo = 22; // the page refers to (at least one) videos + public static final int flag_cat_hasapp = 23; // the page refers to (at least one) application file private final static int numlength = 5; @@ -117,6 +120,14 @@ public final class plasmaCondenser { public int RESULT_SIMI_SENTENCES = -1; public kelondroBitfield RESULT_FLAGS = new 
kelondroBitfield(4); + public plasmaCondenser(plasmaParserDocument document) throws UnsupportedEncodingException { + this(document.getText(), document.getCharset()); + if (document.getImages().size() > 0) RESULT_FLAGS.set(flag_cat_hasimage, true); + if (document.getAudiolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasaudio, true); + if (document.getVideolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasvideo, true); + if (document.getApplinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasapp, true); + } + public plasmaCondenser(InputStream text, String charset) throws UnsupportedEncodingException { this(text, charset, 3, 2); } @@ -129,7 +140,7 @@ public final class plasmaCondenser { sentences = new HashMap(); createCondensement(text, charset); } - + // create a word hash public static final String word2hash(String word) { return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(word.toLowerCase())).substring(0, yacySeedDB.commonHashLength); @@ -760,7 +771,7 @@ public final class plasmaCondenser { return new String(s); } - + public static Iterator getWords(InputStream input, String charset) throws UnsupportedEncodingException { if (input == null) return null; plasmaCondenser condenser = new plasmaCondenser(input, charset); @@ -772,7 +783,7 @@ public final class plasmaCondenser { ByteArrayInputStream buffer = new ByteArrayInputStream(text); return getWords(buffer, charset); } - + public static void main(String[] args) { // read a property file and converty them into configuration lines try { diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index 807960784..8a2a99ce5 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -72,6 +72,7 @@ import org.apache.commons.pool.impl.GenericKeyedObjectPool; import org.apache.commons.pool.impl.GenericObjectPool; import de.anomic.htmlFilter.htmlFilterContentScraper; +import de.anomic.htmlFilter.htmlFilterImageEntry; 
import de.anomic.htmlFilter.htmlFilterInputStream; import de.anomic.htmlFilter.htmlFilterWriter; import de.anomic.http.httpc; @@ -819,46 +820,62 @@ public final class plasmaParser { } */ - static Map allReflinks(Map links) { + static Map allReflinks(Set links) { + // links is either a Set of Strings (with urls) or htmlFilterImageEntries // we find all links that are part of a reference inside a url HashMap v = new HashMap(); - Iterator i = links.keySet().iterator(); - String s; + Iterator i = links.iterator(); + Object o; + String url; int pos; loop: while (i.hasNext()) { - s = (String) i.next(); - if ((pos = s.toLowerCase().indexOf("http://",7)) > 0) { + o = i.next(); + if (o instanceof String) url = (String) o; + else if (o instanceof htmlFilterImageEntry) url = ((htmlFilterImageEntry) o).url().toNormalform(); + else { + assert false; + continue; + } + if ((pos = url.toLowerCase().indexOf("http://",7)) > 0) { i.remove(); - s = s.substring(pos); - while ((pos = s.toLowerCase().indexOf("http://",7)) > 0) s = s.substring(pos); - if (!(v.containsKey(s))) v.put(s, "ref"); + url = url.substring(pos); + while ((pos = url.toLowerCase().indexOf("http://",7)) > 0) url = url.substring(pos); + if (!(v.containsKey(url))) v.put(url, "ref"); continue loop; } - if ((pos = s.toLowerCase().indexOf("/www.",7)) > 0) { + if ((pos = url.toLowerCase().indexOf("/www.",7)) > 0) { i.remove(); - s = "http:/" + s.substring(pos); - while ((pos = s.toLowerCase().indexOf("/www.",7)) > 0) s = "http:/" + s.substring(pos); - if (!(v.containsKey(s))) v.put(s, "ref"); + url = "http:/" + url.substring(pos); + while ((pos = url.toLowerCase().indexOf("/www.",7)) > 0) url = "http:/" + url.substring(pos); + if (!(v.containsKey(url))) v.put(url, "ref"); continue loop; } } return v; } - static Map allSubpaths(Map links) { + static Map allSubpaths(Set links) { + // links is either a Set of Strings (urls) or a Set of htmlFilterImageEntries HashMap v = new HashMap(); - Iterator i = links.keySet().iterator(); 
- String s; + Iterator i = links.iterator(); + Object o; + String url; int pos; while (i.hasNext()) { - s = (String) i.next(); - if (s.endsWith("/")) s = s.substring(0, s.length() - 1); - pos = s.lastIndexOf("/"); + o = i.next(); + if (o instanceof String) url = (String) o; + else if (o instanceof htmlFilterImageEntry) url = ((htmlFilterImageEntry) o).url().toNormalform(); + else { + assert false; + continue; + } + if (url.endsWith("/")) url = url.substring(0, url.length() - 1); + pos = url.lastIndexOf("/"); while (pos > 8) { - s = s.substring(0, pos + 1); - if (!(v.containsKey(s))) v.put(s, "sub"); - s = s.substring(0, pos); - pos = s.lastIndexOf("/"); + url = url.substring(0, pos + 1); + if (!(v.containsKey(url))) v.put(url, "sub"); + url = url.substring(0, pos); + pos = url.lastIndexOf("/"); } } return v; diff --git a/source/de/anomic/plasma/plasmaParserDocument.java b/source/de/anomic/plasma/plasmaParserDocument.java index c7ca527ec..3194f3360 100644 --- a/source/de/anomic/plasma/plasmaParserDocument.java +++ b/source/de/anomic/plasma/plasmaParserDocument.java @@ -61,24 +61,23 @@ import de.anomic.net.URL; public class plasmaParserDocument { - URL location; // the source url - String mimeType; // mimeType as taken from http header - String charset; // the charset of the document - String[] keywords; // most resources provide a keyword field - String shortTitle; // a shortTitle mostly appears in the window header (border) + private URL location; // the source url + private String mimeType; // mimeType as taken from http header + private String charset; // the charset of the document + private String[] keywords; // most resources provide a keyword field + private String shortTitle; // a shortTitle mostly appears in the window header (border) private String longTitle; // the real title of the document, commonly h1-tags - String[] sections; // if present: more titles/headlines appearing in the document - String abstrct; // an abstract, if present: short content 
description + private String[] sections; // if present: more titles/headlines appearing in the document + private String abstrct; // an abstract, if present: short content description private Object text; // the clear text, all that is visible - Map anchors; // all links embedded as clickeable entities (anchor tags) - TreeSet images; // all visible pictures in document + private Map anchors; // all links embedded as clickeable entities (anchor tags) + private TreeSet images; // all visible pictures in document // the anchors and images - Maps are URL-to-EntityDescription mappings. // The EntityDescription appear either as visible text in anchors or as alternative // text in image tags. - Map hyperlinks, audiolinks, videolinks, imagelinks, applinks; - Map emaillinks; - plasmaCondenser condenser; - boolean resorted; + private Map hyperlinks, audiolinks, videolinks, applinks; + private Map emaillinks; + private boolean resorted; private InputStream textStream; public plasmaParserDocument(URL location, String mimeType, String charset, @@ -99,10 +98,8 @@ public class plasmaParserDocument { this.hyperlinks = null; this.audiolinks = null; this.videolinks = null; - this.imagelinks = null; this.applinks = null; this.emaillinks = null; - this.condenser = null; this.resorted = false; } @@ -125,13 +122,15 @@ public class plasmaParserDocument { this.hyperlinks = null; this.audiolinks = null; this.videolinks = null; - this.imagelinks = null; this.applinks = null; this.emaillinks = null; - this.condenser = null; this.resorted = false; } + public URL getLocation() { + return this.location; + } + public String getMimeType() { return this.mimeType; } @@ -139,7 +138,7 @@ public class plasmaParserDocument { /** * @return the supposed charset of this document or null if unknown */ - public String getSourceCharset() { + public String getCharset() { return this.charset; } @@ -224,12 +223,6 @@ public class plasmaParserDocument { return anchors; } - public TreeSet getImages() { - // 
returns all links enbedded as pictures (visible in document) - // this resturns a htmlFilterImageEntry collection - if (!resorted) resortLinks(); - return images; - } // the next three methods provide a calculated view on the getAnchors/getImages: @@ -249,9 +242,11 @@ public class plasmaParserDocument { return this.videolinks; } - public Map getImagelinks() { + public TreeSet getImages() { + // returns all links enbedded as pictures (visible in document) + // this resturns a htmlFilterImageEntry collection if (!resorted) resortLinks(); - return this.imagelinks; + return images; } public Map getApplinks() { @@ -275,7 +270,6 @@ public class plasmaParserDocument { String ext = null; i = anchors.entrySet().iterator(); hyperlinks = new HashMap(); - imagelinks = new HashMap(); videolinks = new HashMap(); audiolinks = new HashMap(); applinks = new HashMap(); @@ -301,8 +295,7 @@ public class plasmaParserDocument { if (plasmaParser.mediaExtContains(ext)) { // this is not a normal anchor, its a media link if (plasmaParser.imageExtContains(ext)) { - imagelinks.put(u, entry.getValue()); - collectedImages.add(new htmlFilterImageEntry(url, "", -1, -1)); + collectedImages.add(new htmlFilterImageEntry(url, (String) entry.getValue(), -1, -1)); } else if (plasmaParser.audioExtContains(ext)) audiolinks.put(u, entry.getValue()); else if (plasmaParser.videoExtContains(ext)) videolinks.put(u, entry.getValue()); @@ -316,21 +309,7 @@ public class plasmaParserDocument { } } - // expand the hyperlinks: - // we add artificial hyperlinks to the hyperlink set - // that can be calculated from given hyperlinks and imagelinks - hyperlinks.putAll(plasmaParser.allReflinks(hyperlinks)); - hyperlinks.putAll(plasmaParser.allReflinks(imagelinks)); - hyperlinks.putAll(plasmaParser.allReflinks(audiolinks)); - hyperlinks.putAll(plasmaParser.allReflinks(videolinks)); - hyperlinks.putAll(plasmaParser.allReflinks(applinks)); - hyperlinks.putAll(plasmaParser.allSubpaths(hyperlinks)); - 
hyperlinks.putAll(plasmaParser.allSubpaths(imagelinks)); - hyperlinks.putAll(plasmaParser.allSubpaths(audiolinks)); - hyperlinks.putAll(plasmaParser.allSubpaths(videolinks)); - hyperlinks.putAll(plasmaParser.allSubpaths(applinks)); - - // finally add image links that we collected from the anchors to the image map + // add image links that we collected from the anchors to the image map i = collectedImages.iterator(); htmlFilterImageEntry iEntry; while (i.hasNext()) { @@ -338,6 +317,20 @@ public class plasmaParserDocument { if (!images.contains(iEntry)) images.add(iEntry); } + // expand the hyperlinks: + // we add artificial hyperlinks to the hyperlink set + // that can be calculated from given hyperlinks and imagelinks + hyperlinks.putAll(plasmaParser.allReflinks(hyperlinks.keySet())); + hyperlinks.putAll(plasmaParser.allReflinks(images)); + hyperlinks.putAll(plasmaParser.allReflinks(audiolinks.keySet())); + hyperlinks.putAll(plasmaParser.allReflinks(videolinks.keySet())); + hyperlinks.putAll(plasmaParser.allReflinks(applinks.keySet())); + hyperlinks.putAll(plasmaParser.allSubpaths(hyperlinks.keySet())); + hyperlinks.putAll(plasmaParser.allSubpaths(images)); + hyperlinks.putAll(plasmaParser.allSubpaths(audiolinks.keySet())); + hyperlinks.putAll(plasmaParser.allSubpaths(videolinks.keySet())); + hyperlinks.putAll(plasmaParser.allSubpaths(applinks.keySet())); + // don't do this again this.resorted = true; } diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index 1d0c45ec3..a1669603a 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -403,6 +403,11 @@ public final class plasmaSearchEvent extends Thread implements Runnable { // filter out bad results Iterator wi = query.queryHashes.iterator(); while (wi.hasNext()) wordIndex.removeEntry((String) wi.next(), page.hash(), true); + } else if (query.contentdom != plasmaSearchQuery.CONTENTDOM_TEXT) { + if 
((query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) && (page.laudio() > 0)) acc.addResult(page, preranking); + else if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) && (page.lvideo() > 0)) acc.addResult(page, preranking); + else if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (page.limage() > 0)) acc.addResult(page, preranking); + else if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_APP) && (page.lapp() > 0)) acc.addResult(page, preranking); } else { acc.addResult(page, preranking); } diff --git a/source/de/anomic/plasma/plasmaSearchPreOrder.java b/source/de/anomic/plasma/plasmaSearchPreOrder.java index 20d63db54..fefb3acb7 100644 --- a/source/de/anomic/plasma/plasmaSearchPreOrder.java +++ b/source/de/anomic/plasma/plasmaSearchPreOrder.java @@ -101,6 +101,12 @@ public final class plasmaSearchPreOrder { iEntry = (indexRWIEntry) i.next(); if (iEntry.urlHash().length() != container.row().width(container.primarykey())) continue; if ((!(query.constraint.equals(plasmaSearchQuery.catchall_constraint))) && (!(iEntry.flags().allOf(query.constraint)))) continue; // filter out entries that do not match the search constraint + if (query.contentdom != plasmaSearchQuery.CONTENTDOM_TEXT) { + if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasaudio)))) continue; + if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasvideo)))) continue; + if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasimage)))) continue; + if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_APP ) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasapp )))) continue; + } pageAcc.put(serverCodings.encodeHex(Long.MAX_VALUE - this.ranking.preRanking(iEntry.generateNormalized(this.entryMin, this.entryMax), query.words("")), 16) + iEntry.urlHash(), iEntry); } } diff --git 
a/source/de/anomic/plasma/plasmaSearchQuery.java b/source/de/anomic/plasma/plasmaSearchQuery.java index f318780a7..847d3f1b6 100644 --- a/source/de/anomic/plasma/plasmaSearchQuery.java +++ b/source/de/anomic/plasma/plasmaSearchQuery.java @@ -61,12 +61,19 @@ public final class plasmaSearchQuery { public static final int SEARCHDOM_GLOBALDHT = 3; public static final int SEARCHDOM_GLOBALALL = 4; + public static final int CONTENTDOM_TEXT = 0; + public static final int CONTENTDOM_IMAGE = 1; + public static final int CONTENTDOM_AUDIO = 2; + public static final int CONTENTDOM_VIDEO = 3; + public static final int CONTENTDOM_APP = 4; + public static final kelondroBitfield empty_constraint = new kelondroBitfield(4, "AAAAAA"); public static final kelondroBitfield catchall_constraint = new kelondroBitfield(4, "______"); public Set queryWords, queryHashes; public int wantedResults; public String prefer; + public int contentdom; public long maximumTime; public String urlMask; public int domType; @@ -75,13 +82,14 @@ public final class plasmaSearchQuery { public int maxDistance; public kelondroBitfield constraint; - public plasmaSearchQuery(Set queryWords, int maxDistance, String prefer, + public plasmaSearchQuery(Set queryWords, int maxDistance, String prefer, int contentdom, int wantedResults, long maximumTime, String urlMask, int domType, String domGroupName, int domMaxTargets, kelondroBitfield constraint) { this.queryWords = queryWords; this.maxDistance = maxDistance; this.prefer = prefer; + this.contentdom = contentdom; this.queryHashes = plasmaCondenser.words2hashes(queryWords); this.wantedResults = wantedResults; this.maximumTime = maximumTime; @@ -92,12 +100,13 @@ public final class plasmaSearchQuery { this.constraint = constraint; } - public plasmaSearchQuery(Set queryHashes, int maxDistance, String prefer, + public plasmaSearchQuery(Set queryHashes, int maxDistance, String prefer, int contentdom, int wantedResults, long maximumTime, String urlMask, kelondroBitfield 
constraint) { this.queryWords = null; this.maxDistance = maxDistance; this.prefer = prefer; + this.contentdom = contentdom; this.queryHashes = queryHashes; this.wantedResults = wantedResults; this.maximumTime = maximumTime; @@ -108,6 +117,15 @@ public final class plasmaSearchQuery { this.constraint = constraint; } + public static int contentdomParser(String dom) { + if (dom.equals("text")) return CONTENTDOM_TEXT; + else if (dom.equals("image")) return CONTENTDOM_IMAGE; + else if (dom.equals("audio")) return CONTENTDOM_AUDIO; + else if (dom.equals("video")) return CONTENTDOM_VIDEO; + else if (dom.equals("app")) return CONTENTDOM_APP; + return CONTENTDOM_TEXT; + } + public static Set hashes2Set(String query) { if (query == null) return new HashSet(); final HashSet keyhashes = new HashSet(query.length() / yacySeedDB.commonHashLength); diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index b4291f3f1..d4a62654d 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -1565,7 +1565,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser checkInterruption(); log.logFine("Condensing for '" + entry.normalizedURLString() + "'"); - plasmaCondenser condenser = new plasmaCondenser(document.getText(), document.charset); + plasmaCondenser condenser = new plasmaCondenser(document); // generate citation reference Integer[] ioLinks = generateCitationReference(entry.urlHash(), docDate, document, condenser); // [outlinksSame, outlinksOther] @@ -1593,10 +1593,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser plasmaURL.language(entry.url()), // language ioLinks[0].intValue(), // llocal ioLinks[1].intValue(), // lother - document.audiolinks.size(), // laudio - document.imagelinks.size(), // limage - document.videolinks.size(), // lvideo - document.applinks.size() // lapp + document.getAudiolinks().size(), 
// laudio + document.getImages().size(), // limage + document.getVideolinks().size(), // lvideo + document.getApplinks().size() // lapp ); /* ======================================================================== * STORE URL TO LOADED-URL-DB @@ -1751,9 +1751,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser log.logInfo("*Indexed " + words + " words in URL " + entry.url() + " [" + entry.urlHash() + "]" + "\n\tDescription: " + docDescription + - "\n\tMimeType: " + document.getMimeType() + " | Charset: " + document.getSourceCharset() + " | " + + "\n\tMimeType: " + document.getMimeType() + " | Charset: " + document.getCharset() + " | " + "Size: " + document.getTextLength() + " bytes | " + - "Anchors: " + ((document.anchors==null)?0:document.anchors.size()) + + "Anchors: " + ((document.getAnchors() == null) ? 0 : document.getAnchors().size()) + "\n\tStackingTime: " + (stackEndTime-stackStartTime) + " ms | " + "ParsingTime: " + (parsingEndTime-parsingStartTime) + " ms | " + "IndexingTime: " + (indexingEndTime-indexingStartTime) + " ms | " + @@ -2239,13 +2239,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // parse the resource plasmaParserDocument document = snippetCache.parseDocument(comp.url(), resourceContentLength.longValue(), resourceContent); - // getting parsed body input stream - InputStream docBodyInputStream = document.getText(); - // getting word iterator Iterator witer = null; try { - witer = plasmaCondenser.getWords(docBodyInputStream, document.charset); + witer = new plasmaCondenser(document).words(); } catch (UnsupportedEncodingException e) { e.printStackTrace(); } diff --git a/source/de/anomic/tools/nxTools.java b/source/de/anomic/tools/nxTools.java index 604f36a7d..7fae537ab 100644 --- a/source/de/anomic/tools/nxTools.java +++ b/source/de/anomic/tools/nxTools.java @@ -165,7 +165,7 @@ public class nxTools { e = s; while (e < a.length) { b = a[e]; - if ((b == 10) || (b == 13)) 
break; + if ((b == 10) || (b == 13) || (b == 0)) break; e++; }