diff --git a/htroot/CacheAdmin_p.java b/htroot/CacheAdmin_p.java index 32282cb88..4692aae0b 100644 --- a/htroot/CacheAdmin_p.java +++ b/htroot/CacheAdmin_p.java @@ -54,6 +54,7 @@ import java.util.Map; import java.util.TreeSet; import de.anomic.htmlFilter.htmlFilterContentScraper; +import de.anomic.htmlFilter.htmlFilterImageEntry; import de.anomic.htmlFilter.htmlFilterWriter; import de.anomic.http.httpHeader; import de.anomic.net.URL; @@ -124,7 +125,7 @@ public class CacheAdmin_p { info.append("TITLE:
").append(scraper.getTitle()).append("
").append("
") .append("SECTION HEADLINES:
").append(formatTitles(document.getSectionTitles())).append("
") .append("HREF:
").append(formatAnchor(document.getHyperlinks())).append("
") - .append("IMAGE:
").append(formatAnchor(document.getImagelinks())).append("
") + .append("IMAGE:
").append(formatImageAnchor(document.getImages())).append("
") .append("AUDIO:
").append(formatAnchor(document.getAudiolinks())).append("
") .append("VIDEO:
").append(formatAnchor(document.getVideolinks())).append("
") .append("APPS:
").append(formatAnchor(document.getApplinks())).append("
") @@ -241,6 +242,18 @@ public class CacheAdmin_p { return result.append("").toString(); } + private static String formatImageAnchor(TreeSet anchor) { + final StringBuffer result = new StringBuffer((anchor.size() + 1) * 256); + result.append(""); + final Iterator iter = anchor.iterator(); + htmlFilterImageEntry ie; + while (iter.hasNext()) { + ie = (htmlFilterImageEntry) iter.next(); + result.append(""); + } + return result.append("
").append(ie.alt()).append(" ").append(ie.url().toNormalform()).append("
").toString(); + } + private static String linkPathString(String path, boolean dir){ final String[] elements = path.split("/"); final StringBuffer tmpstr = new StringBuffer(256); diff --git a/htroot/DetailedSearch.java b/htroot/DetailedSearch.java index 0c613969f..a69917c78 100644 --- a/htroot/DetailedSearch.java +++ b/htroot/DetailedSearch.java @@ -135,7 +135,7 @@ public class DetailedSearch { } // do the search - plasmaSearchQuery thisSearch = new plasmaSearchQuery(query, wdist, "", count, searchtime, urlmask, + plasmaSearchQuery thisSearch = new plasmaSearchQuery(query, wdist, "", plasmaSearchQuery.CONTENTDOM_TEXT, count, searchtime, urlmask, ((global) && (yacyonline) && (!(env.getConfig("last-search","").equals(querystring)))) ? plasmaSearchQuery.SEARCHDOM_GLOBALDHT : plasmaSearchQuery.SEARCHDOM_LOCAL, "", 20, plasmaSearchQuery.catchall_constraint); plasmaSearchRankingProfile localRanking = new plasmaSearchRankingProfile("local", post.toString()); diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index edc7600d5..96f929166 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -93,6 +93,7 @@ public final class search { final int count = post.getInt("count", 10); // maximum number of wanted results final int maxdist= post.getInt("maxdist", Integer.MAX_VALUE); final String prefer = post.get("prefer", ""); + final String contentdom = post.get("contentdom", "text"); final String filter = post.get("filter", ".*"); final boolean includesnippet = post.get("includesnippet", "false").equals("true"); final kelondroBitfield constraint = new kelondroBitfield(4, post.get("constraint", "______")); @@ -134,7 +135,7 @@ public final class search { plasmaSearchQuery squery = null; if ((query.length() == 0) && (abstractSet != null)) { // this is _not_ a normal search, only a request for index abstracts - squery = new plasmaSearchQuery(abstractSet, maxdist, prefer, count, duetime, filter, plasmaSearchQuery.catchall_constraint); + squery = new plasmaSearchQuery(abstractSet, maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), count, duetime, filter, plasmaSearchQuery.catchall_constraint); squery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL; yacyCore.log.logInfo("INIT HASH SEARCH (abstracts only): " + squery.anonymizedQueryHashes() + " - " + squery.wantedResults + " links"); @@ -161,7 +162,7 @@ public final class search { prop.put("joincount", 0); } else { // retrieve index containers from search request - squery = new plasmaSearchQuery(keyhashes, maxdist, prefer, count, duetime, filter, constraint); + squery = new plasmaSearchQuery(keyhashes, maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), count, duetime, filter, constraint); squery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL; yacyCore.log.logInfo("INIT HASH SEARCH (query-" + abstracts + "): " + squery.anonymizedQueryHashes() + " - " + squery.wantedResults + " links"); diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 05de0214b..308f4de05 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -240,6 +240,7 @@ public class yacysearch { query, maxDistance, prefermask, + plasmaSearchQuery.CONTENTDOM_TEXT, count, searchtime, urlmask, diff --git a/source/de/anomic/index/indexRWIEntryNew.java b/source/de/anomic/index/indexRWIEntryNew.java index 6c97eb6cf..8d857093d 100644 --- a/source/de/anomic/index/indexRWIEntryNew.java +++ b/source/de/anomic/index/indexRWIEntryNew.java @@ -89,13 +89,13 @@ public class indexRWIEntryNew implements Cloneable, indexRWIEntry { private static final int col_reserve = 19; // k 1 reserve // appearance flags, used in RWI entry - // the flags 0..15 are identical to the category flags in plasmaCondenser - public static final int flag_app_url = 16; // word appears in url - public static final int flag_app_descr = 17; // word appears in headline (or any description part) - public static final int flag_app_author = 18; // word appears in author - public static final int flag_app_tags = 19; // word appears in header tags - public static final int flag_app_reference = 20; // word appears in anchor description text (the reference to an url), or any alternative text field of a link - public static final int flag_app_emphasized = 21; // word is emphasized in text (i.e. bold, italics, special size) + // the flags 0..23 are identical to the category flags in plasmaCondenser + public static final int flag_app_url = 24; // word appears in url + public static final int flag_app_descr = 25; // word appears in headline (or any description part) + public static final int flag_app_author = 26; // word appears in author + public static final int flag_app_tags = 27; // word appears in header tags + public static final int flag_app_reference = 28; // word appears in anchor description text (the reference to an url), or any alternative text field of a link + public static final int flag_app_emphasized = 29; // word is emphasized in text (i.e. bold, italics, special size) private kelondroRow.Entry entry; diff --git a/source/de/anomic/index/indexURLEntry.java b/source/de/anomic/index/indexURLEntry.java index a4e994d2d..07c9e59ba 100644 --- a/source/de/anomic/index/indexURLEntry.java +++ b/source/de/anomic/index/indexURLEntry.java @@ -49,6 +49,12 @@ public interface indexURLEntry { public String language(); public int size(); public int wordCount(); + public int llocal(); + public int lother(); + public int limage(); + public int laudio(); + public int lvideo(); + public int lapp(); public String snippet(); public kelondroBitfield flags(); public indexRWIEntry word(); diff --git a/source/de/anomic/index/indexURLEntryNew.java b/source/de/anomic/index/indexURLEntryNew.java index 0ac0df9e8..04fb99e5e 100644 --- a/source/de/anomic/index/indexURLEntryNew.java +++ b/source/de/anomic/index/indexURLEntryNew.java @@ -194,6 +194,7 @@ public class indexURLEntryNew implements indexURLEntry { // generate a parseable string; this is a simple property-list indexURLEntry.Components comp = this.comp(); final StringBuffer s = new StringBuffer(300); + //System.out.println("author=" + comp.author()); try { s.append("hash=").append(hash()); s.append(",url=").append(crypt.simpleEncode(comp.url().toNormalform())); @@ -249,11 +250,11 @@ public class indexURLEntryNew implements indexURLEntry { public indexURLEntry.Components comp() { ArrayList cl = nxTools.strings(this.entry.getCol("comp", null), "UTF-8"); return new indexURLEntry.Components( - (cl.size() > 0) ? (String) cl.get(0) : "", - (cl.size() > 1) ? (String) cl.get(1) : "", - (cl.size() > 2) ? (String) cl.get(2) : "", - (cl.size() > 3) ? (String) cl.get(3) : "", - (cl.size() > 4) ? (String) cl.get(4) : ""); + (cl.size() > 0) ? ((String) cl.get(0)).trim() : "", + (cl.size() > 1) ? ((String) cl.get(1)).trim() : "", + (cl.size() > 2) ? ((String) cl.get(2)).trim() : "", + (cl.size() > 3) ? ((String) cl.get(3)).trim() : "", + (cl.size() > 4) ? ((String) cl.get(4)).trim() : ""); } public Date moddate() { diff --git a/source/de/anomic/index/indexURLEntryOld.java b/source/de/anomic/index/indexURLEntryOld.java index b28983c11..c71e5a8e5 100644 --- a/source/de/anomic/index/indexURLEntryOld.java +++ b/source/de/anomic/index/indexURLEntryOld.java @@ -363,4 +363,29 @@ public class indexURLEntryOld implements indexURLEntry { System.out.println(); } + // compatibility methods + public int lapp() { + return 0; + } + + public int laudio() { + return 0; + } + + public int limage() { + return 0; + } + + public int llocal() { + return 0; + } + + public int lother() { + return 0; + } + + public int lvideo() { + return 0; + } + } diff --git a/source/de/anomic/kelondro/kelondroBase64Order.java b/source/de/anomic/kelondro/kelondroBase64Order.java index 2ab6c9b76..6ea96c54e 100644 --- a/source/de/anomic/kelondro/kelondroBase64Order.java +++ b/source/de/anomic/kelondro/kelondroBase64Order.java @@ -46,6 +46,7 @@ package de.anomic.kelondro; +import java.io.UnsupportedEncodingException; import java.util.Comparator; import de.anomic.server.logging.serverLog; @@ -179,13 +180,18 @@ public class kelondroBase64Order extends kelondroAbstractOrder implements kelond } public final String encodeString(String in) { - return encode(in.getBytes()); + try { + return encode(in.getBytes("UTF-8")); + } catch (UnsupportedEncodingException e) { + return ""; + } } // we will use this encoding to encode strings with 2^8 values to // b64-Strings // we will do that by grouping each three input bytes to four output bytes. public final String encode(byte[] in) { + if (in.length == 0) return ""; StringBuffer out = new StringBuffer(in.length / 3 * 4 + 3); int pos = 0; long l; @@ -195,11 +201,8 @@ public class kelondroBase64Order extends kelondroAbstractOrder implements kelond out = out.append(encodeLong(l, 4)); } // now there may be remaining bytes - if (in.length % 3 != 0) - out = out.append((in.length % 3 == 2) ? encodeLong((((0XffL & (long) in[pos]) << 8) + (0XffL & (long) in[pos + 1])) << 8, 4).substring(0, 3) : encodeLong((((0XffL & (long) in[pos])) << 8) << 8, 4).substring(0, 2)); - if (rfc1113compliant) - while (out.length() % 4 > 0) - out.append("="); + if (in.length % 3 != 0) out = out.append((in.length % 3 == 2) ? encodeLong((((0XffL & (long) in[pos]) << 8) + (0XffL & (long) in[pos + 1])) << 8, 4).substring(0, 3) : encodeLong((((0XffL & (long) in[pos])) << 8) << 8, 4).substring(0, 2)); + if (rfc1113compliant) while (out.length() % 4 > 0) out.append("="); // return result return out.toString(); } @@ -215,12 +218,11 @@ public class kelondroBase64Order extends kelondroAbstractOrder implements kelond } public final byte[] decode(String in) { + if ((in == null) || (in.length() == 0)) return new byte[0]; try { int posIn = 0; int posOut = 0; - if (rfc1113compliant) - while (in.charAt(in.length() - 1) == '=') - in = in.substring(0, in.length() - 1); + if (rfc1113compliant) while (in.charAt(in.length() - 1) == '=') in = in.substring(0, in.length() - 1); byte[] out = new byte[in.length() / 4 * 3 + (((in.length() % 4) == 0) ? 0 : in.length() % 4 - 1)]; long l; while (posIn + 3 < in.length()) { diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java index 4c318bc46..ee7274c82 100644 --- a/source/de/anomic/plasma/plasmaCondenser.java +++ b/source/de/anomic/plasma/plasmaCondenser.java @@ -97,7 +97,10 @@ public final class plasmaCondenser { public static final int flag_cat_macos = 17; // pages about macintosh, apple computers and the mac os public static final int flag_cat_windows = 18; // pages about windows os and softare public static final int flag_cat_osreserve = 19; // reserve - + public static final int flag_cat_hasimage = 20; // the page refers to (at least one) images + public static final int flag_cat_hasaudio = 21; // the page refers to (at least one) audio file + public static final int flag_cat_hasvideo = 22; // the page refers to (at least one) videos + public static final int flag_cat_hasapp = 23; // the page refers to (at least one) application file private final static int numlength = 5; @@ -117,6 +120,14 @@ public final class plasmaCondenser { public int RESULT_SIMI_SENTENCES = -1; public kelondroBitfield RESULT_FLAGS = new kelondroBitfield(4); + public plasmaCondenser(plasmaParserDocument document) throws UnsupportedEncodingException { + this(document.getText(), document.getCharset()); + if (document.getImages().size() > 0) RESULT_FLAGS.set(flag_cat_hasimage, true); + if (document.getAudiolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasaudio, true); + if (document.getVideolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasvideo, true); + if (document.getApplinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasapp, true); + } + public plasmaCondenser(InputStream text, String charset) throws UnsupportedEncodingException { this(text, charset, 3, 2); } @@ -129,7 +140,7 @@ public final class plasmaCondenser { sentences = new HashMap(); createCondensement(text, charset); } - + // create a word hash public static final String word2hash(String word) { return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(word.toLowerCase())).substring(0, yacySeedDB.commonHashLength); @@ -760,7 +771,7 @@ public final class plasmaCondenser { return new String(s); } - + public static Iterator getWords(InputStream input, String charset) throws UnsupportedEncodingException { if (input == null) return null; plasmaCondenser condenser = new plasmaCondenser(input, charset); @@ -772,7 +783,7 @@ public final class plasmaCondenser { ByteArrayInputStream buffer = new ByteArrayInputStream(text); return getWords(buffer, charset); } - + public static void main(String[] args) { // read a property file and converty them into configuration lines try { diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index 807960784..8a2a99ce5 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -72,6 +72,7 @@ import org.apache.commons.pool.impl.GenericKeyedObjectPool; import org.apache.commons.pool.impl.GenericObjectPool; import de.anomic.htmlFilter.htmlFilterContentScraper; +import de.anomic.htmlFilter.htmlFilterImageEntry; import de.anomic.htmlFilter.htmlFilterInputStream; import de.anomic.htmlFilter.htmlFilterWriter; import de.anomic.http.httpc; @@ -819,46 +820,62 @@ public final class plasmaParser { } */ - static Map allReflinks(Map links) { + static Map allReflinks(Set links) { + // links is either a Set of Strings (with urls) or htmlFilterImageEntries // we find all links that are part of a reference inside a url HashMap v = new HashMap(); - Iterator i = links.keySet().iterator(); - String s; + Iterator i = links.iterator(); + Object o; + String url; int pos; loop: while (i.hasNext()) { - s = (String) i.next(); - if ((pos = s.toLowerCase().indexOf("http://",7)) > 0) { + o = i.next(); + if (o instanceof String) url = (String) o; + else if (o instanceof htmlFilterImageEntry) url = ((htmlFilterImageEntry) o).url().toNormalform(); + else { + assert false; + continue; + } + if ((pos = url.toLowerCase().indexOf("http://",7)) > 0) { i.remove(); - s = s.substring(pos); - while ((pos = s.toLowerCase().indexOf("http://",7)) > 0) s = s.substring(pos); - if (!(v.containsKey(s))) v.put(s, "ref"); + url = url.substring(pos); + while ((pos = url.toLowerCase().indexOf("http://",7)) > 0) url = url.substring(pos); + if (!(v.containsKey(url))) v.put(url, "ref"); continue loop; } - if ((pos = s.toLowerCase().indexOf("/www.",7)) > 0) { + if ((pos = url.toLowerCase().indexOf("/www.",7)) > 0) { i.remove(); - s = "http:/" + s.substring(pos); - while ((pos = s.toLowerCase().indexOf("/www.",7)) > 0) s = "http:/" + s.substring(pos); - if (!(v.containsKey(s))) v.put(s, "ref"); + url = "http:/" + url.substring(pos); + while ((pos = url.toLowerCase().indexOf("/www.",7)) > 0) url = "http:/" + url.substring(pos); + if (!(v.containsKey(url))) v.put(url, "ref"); continue loop; } } return v; } - static Map allSubpaths(Map links) { + static Map allSubpaths(Set links) { + // links is either a Set of Strings (urls) or a Set of htmlFilterImageEntries HashMap v = new HashMap(); - Iterator i = links.keySet().iterator(); - String s; + Iterator i = links.iterator(); + Object o; + String url; int pos; while (i.hasNext()) { - s = (String) i.next(); - if (s.endsWith("/")) s = s.substring(0, s.length() - 1); - pos = s.lastIndexOf("/"); + o = i.next(); + if (o instanceof String) url = (String) o; + else if (o instanceof htmlFilterImageEntry) url = ((htmlFilterImageEntry) o).url().toNormalform(); + else { + assert false; + continue; + } + if (url.endsWith("/")) url = url.substring(0, url.length() - 1); + pos = url.lastIndexOf("/"); while (pos > 8) { - s = s.substring(0, pos + 1); - if (!(v.containsKey(s))) v.put(s, "sub"); - s = s.substring(0, pos); - pos = s.lastIndexOf("/"); + url = url.substring(0, pos + 1); + if (!(v.containsKey(url))) v.put(url, "sub"); + url = url.substring(0, pos); + pos = url.lastIndexOf("/"); } } return v; diff --git a/source/de/anomic/plasma/plasmaParserDocument.java b/source/de/anomic/plasma/plasmaParserDocument.java index c7ca527ec..3194f3360 100644 --- a/source/de/anomic/plasma/plasmaParserDocument.java +++ b/source/de/anomic/plasma/plasmaParserDocument.java @@ -61,24 +61,23 @@ import de.anomic.net.URL; public class plasmaParserDocument { - URL location; // the source url - String mimeType; // mimeType as taken from http header - String charset; // the charset of the document - String[] keywords; // most resources provide a keyword field - String shortTitle; // a shortTitle mostly appears in the window header (border) + private URL location; // the source url + private String mimeType; // mimeType as taken from http header + private String charset; // the charset of the document + private String[] keywords; // most resources provide a keyword field + private String shortTitle; // a shortTitle mostly appears in the window header (border) private String longTitle; // the real title of the document, commonly h1-tags - String[] sections; // if present: more titles/headlines appearing in the document - String abstrct; // an abstract, if present: short content description + private String[] sections; // if present: more titles/headlines appearing in the document + private String abstrct; // an abstract, if present: short content description private Object text; // the clear text, all that is visible - Map anchors; // all links embedded as clickeable entities (anchor tags) - TreeSet images; // all visible pictures in document + private Map anchors; // all links embedded as clickeable entities (anchor tags) + private TreeSet images; // all visible pictures in document // the anchors and images - Maps are URL-to-EntityDescription mappings. // The EntityDescription appear either as visible text in anchors or as alternative // text in image tags. - Map hyperlinks, audiolinks, videolinks, imagelinks, applinks; - Map emaillinks; - plasmaCondenser condenser; - boolean resorted; + private Map hyperlinks, audiolinks, videolinks, applinks; + private Map emaillinks; + private boolean resorted; private InputStream textStream; public plasmaParserDocument(URL location, String mimeType, String charset, @@ -99,10 +98,8 @@ public class plasmaParserDocument { this.hyperlinks = null; this.audiolinks = null; this.videolinks = null; - this.imagelinks = null; this.applinks = null; this.emaillinks = null; - this.condenser = null; this.resorted = false; } @@ -125,13 +122,15 @@ public class plasmaParserDocument { this.hyperlinks = null; this.audiolinks = null; this.videolinks = null; - this.imagelinks = null; this.applinks = null; this.emaillinks = null; - this.condenser = null; this.resorted = false; } + public URL getLocation() { + return this.location; + } + public String getMimeType() { return this.mimeType; } @@ -139,7 +138,7 @@ public class plasmaParserDocument { /** * @return the supposed charset of this document or null if unknown */ - public String getSourceCharset() { + public String getCharset() { return this.charset; } @@ -224,12 +223,6 @@ public class plasmaParserDocument { return anchors; } - public TreeSet getImages() { - // returns all links enbedded as pictures (visible in document) - // this resturns a htmlFilterImageEntry collection - if (!resorted) resortLinks(); - return images; - } // the next three methods provide a calculated view on the getAnchors/getImages: @@ -249,9 +242,11 @@ public class plasmaParserDocument { return this.videolinks; } - public Map getImagelinks() { + public TreeSet getImages() { + // returns all links enbedded as pictures (visible in document) + // this resturns a htmlFilterImageEntry collection if (!resorted) resortLinks(); - return this.imagelinks; + return images; } public Map getApplinks() { @@ -275,7 +270,6 @@ public class plasmaParserDocument { String ext = null; i = anchors.entrySet().iterator(); hyperlinks = new HashMap(); - imagelinks = new HashMap(); videolinks = new HashMap(); audiolinks = new HashMap(); applinks = new HashMap(); @@ -301,8 +295,7 @@ public class plasmaParserDocument { if (plasmaParser.mediaExtContains(ext)) { // this is not a normal anchor, its a media link if (plasmaParser.imageExtContains(ext)) { - imagelinks.put(u, entry.getValue()); - collectedImages.add(new htmlFilterImageEntry(url, "", -1, -1)); + collectedImages.add(new htmlFilterImageEntry(url, (String) entry.getValue(), -1, -1)); } else if (plasmaParser.audioExtContains(ext)) audiolinks.put(u, entry.getValue()); else if (plasmaParser.videoExtContains(ext)) videolinks.put(u, entry.getValue()); @@ -316,21 +309,7 @@ public class plasmaParserDocument { } } - // expand the hyperlinks: - // we add artificial hyperlinks to the hyperlink set - // that can be calculated from given hyperlinks and imagelinks - hyperlinks.putAll(plasmaParser.allReflinks(hyperlinks)); - hyperlinks.putAll(plasmaParser.allReflinks(imagelinks)); - hyperlinks.putAll(plasmaParser.allReflinks(audiolinks)); - hyperlinks.putAll(plasmaParser.allReflinks(videolinks)); - hyperlinks.putAll(plasmaParser.allReflinks(applinks)); - hyperlinks.putAll(plasmaParser.allSubpaths(hyperlinks)); - hyperlinks.putAll(plasmaParser.allSubpaths(imagelinks)); - hyperlinks.putAll(plasmaParser.allSubpaths(audiolinks)); - hyperlinks.putAll(plasmaParser.allSubpaths(videolinks)); - hyperlinks.putAll(plasmaParser.allSubpaths(applinks)); - - // finally add image links that we collected from the anchors to the image map + // add image links that we collected from the anchors to the image map i = collectedImages.iterator(); htmlFilterImageEntry iEntry; while (i.hasNext()) { @@ -338,6 +317,20 @@ public class plasmaParserDocument { if (!images.contains(iEntry)) images.add(iEntry); } + // expand the hyperlinks: + // we add artificial hyperlinks to the hyperlink set + // that can be calculated from given hyperlinks and imagelinks + hyperlinks.putAll(plasmaParser.allReflinks(hyperlinks.keySet())); + hyperlinks.putAll(plasmaParser.allReflinks(images)); + hyperlinks.putAll(plasmaParser.allReflinks(audiolinks.keySet())); + hyperlinks.putAll(plasmaParser.allReflinks(videolinks.keySet())); + hyperlinks.putAll(plasmaParser.allReflinks(applinks.keySet())); + hyperlinks.putAll(plasmaParser.allSubpaths(hyperlinks.keySet())); + hyperlinks.putAll(plasmaParser.allSubpaths(images)); + hyperlinks.putAll(plasmaParser.allSubpaths(audiolinks.keySet())); + hyperlinks.putAll(plasmaParser.allSubpaths(videolinks.keySet())); + hyperlinks.putAll(plasmaParser.allSubpaths(applinks.keySet())); + // don't do this again this.resorted = true; } diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index 1d0c45ec3..a1669603a 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -403,6 +403,11 @@ public final class plasmaSearchEvent extends Thread implements Runnable { // filter out bad results Iterator wi = query.queryHashes.iterator(); while (wi.hasNext()) wordIndex.removeEntry((String) wi.next(), page.hash(), true); + } else if (query.contentdom != plasmaSearchQuery.CONTENTDOM_TEXT) { + if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) && (page.laudio() > 0)) acc.addResult(page, preranking); + else if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) && (page.lvideo() > 0)) acc.addResult(page, preranking); + else if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (page.limage() > 0)) acc.addResult(page, preranking); + else if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_APP) && (page.lapp() > 0)) acc.addResult(page, preranking); } else { acc.addResult(page, preranking); } diff --git a/source/de/anomic/plasma/plasmaSearchPreOrder.java b/source/de/anomic/plasma/plasmaSearchPreOrder.java index 20d63db54..fefb3acb7 100644 --- a/source/de/anomic/plasma/plasmaSearchPreOrder.java +++ b/source/de/anomic/plasma/plasmaSearchPreOrder.java @@ -101,6 +101,12 @@ public final class plasmaSearchPreOrder { iEntry = (indexRWIEntry) i.next(); if (iEntry.urlHash().length() != container.row().width(container.primarykey())) continue; if ((!(query.constraint.equals(plasmaSearchQuery.catchall_constraint))) && (!(iEntry.flags().allOf(query.constraint)))) continue; // filter out entries that do not match the search constraint + if (query.contentdom != plasmaSearchQuery.CONTENTDOM_TEXT) { + if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasaudio)))) continue; + if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasvideo)))) continue; + if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasimage)))) continue; + if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_APP ) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasapp )))) continue; + } pageAcc.put(serverCodings.encodeHex(Long.MAX_VALUE - this.ranking.preRanking(iEntry.generateNormalized(this.entryMin, this.entryMax), query.words("")), 16) + iEntry.urlHash(), iEntry); } } diff --git a/source/de/anomic/plasma/plasmaSearchQuery.java b/source/de/anomic/plasma/plasmaSearchQuery.java index f318780a7..847d3f1b6 100644 --- a/source/de/anomic/plasma/plasmaSearchQuery.java +++ b/source/de/anomic/plasma/plasmaSearchQuery.java @@ -61,12 +61,19 @@ public final class plasmaSearchQuery { public static final int SEARCHDOM_GLOBALDHT = 3; public static final int SEARCHDOM_GLOBALALL = 4; + public static final int CONTENTDOM_TEXT = 0; + public static final int CONTENTDOM_IMAGE = 1; + public static final int CONTENTDOM_AUDIO = 2; + public static final int CONTENTDOM_VIDEO = 3; + public static final int CONTENTDOM_APP = 4; + public static final kelondroBitfield empty_constraint = new kelondroBitfield(4, "AAAAAA"); public static final kelondroBitfield catchall_constraint = new kelondroBitfield(4, "______"); public Set queryWords, queryHashes; public int wantedResults; public String prefer; + public int contentdom; public long maximumTime; public String urlMask; public int domType; @@ -75,13 +82,14 @@ public final class plasmaSearchQuery { public int maxDistance; public kelondroBitfield constraint; - public plasmaSearchQuery(Set queryWords, int maxDistance, String prefer, + public plasmaSearchQuery(Set queryWords, int maxDistance, String prefer, int contentdom, int wantedResults, long maximumTime, String urlMask, int domType, String domGroupName, int domMaxTargets, kelondroBitfield constraint) { this.queryWords = queryWords; this.maxDistance = maxDistance; this.prefer = prefer; + this.contentdom = contentdom; this.queryHashes = plasmaCondenser.words2hashes(queryWords); this.wantedResults = wantedResults; this.maximumTime = maximumTime; @@ -92,12 +100,13 @@ public final class plasmaSearchQuery { this.constraint = constraint; } - public plasmaSearchQuery(Set queryHashes, int maxDistance, String prefer, + public plasmaSearchQuery(Set queryHashes, int maxDistance, String prefer, int contentdom, int wantedResults, long maximumTime, String urlMask, kelondroBitfield constraint) { this.queryWords = null; this.maxDistance = maxDistance; this.prefer = prefer; + this.contentdom = contentdom; this.queryHashes = queryHashes; this.wantedResults = wantedResults; this.maximumTime = maximumTime; @@ -108,6 +117,15 @@ public final class plasmaSearchQuery { this.constraint = constraint; } + public static int contentdomParser(String dom) { + if (dom.equals("text")) return CONTENTDOM_TEXT; + else if (dom.equals("image")) return CONTENTDOM_IMAGE; + else if (dom.equals("audio")) return CONTENTDOM_AUDIO; + else if (dom.equals("video")) return CONTENTDOM_VIDEO; + else if (dom.equals("app")) return CONTENTDOM_APP; + return CONTENTDOM_TEXT; + } + public static Set hashes2Set(String query) { if (query == null) return new HashSet(); final HashSet keyhashes = new HashSet(query.length() / yacySeedDB.commonHashLength); diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index b4291f3f1..d4a62654d 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -1565,7 +1565,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser checkInterruption(); log.logFine("Condensing for '" + entry.normalizedURLString() + "'"); - plasmaCondenser condenser = new plasmaCondenser(document.getText(), document.charset); + plasmaCondenser condenser = new plasmaCondenser(document); // generate citation reference Integer[] ioLinks = generateCitationReference(entry.urlHash(), docDate, document, condenser); // [outlinksSame, outlinksOther] @@ -1593,10 +1593,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser plasmaURL.language(entry.url()), // language ioLinks[0].intValue(), // llocal ioLinks[1].intValue(), // lother - document.audiolinks.size(), // laudio - document.imagelinks.size(), // limage - document.videolinks.size(), // lvideo - document.applinks.size() // lapp + document.getAudiolinks().size(), // laudio + document.getImages().size(), // limage + document.getVideolinks().size(), // lvideo + document.getApplinks().size() // lapp ); /* ======================================================================== * STORE URL TO LOADED-URL-DB @@ -1751,9 +1751,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser log.logInfo("*Indexed " + words + " words in URL " + entry.url() + " [" + entry.urlHash() + "]" + "\n\tDescription: " + docDescription + - "\n\tMimeType: " + document.getMimeType() + " | Charset: " + document.getSourceCharset() + " | " + + "\n\tMimeType: " + document.getMimeType() + " | Charset: " + document.getCharset() + " | " + "Size: " + document.getTextLength() + " bytes | " + - "Anchors: " + ((document.anchors==null)?0:document.anchors.size()) + + "Anchors: " + ((document.getAnchors() == null) ? 0 : document.getAnchors().size()) + "\n\tStackingTime: " + (stackEndTime-stackStartTime) + " ms | " + "ParsingTime: " + (parsingEndTime-parsingStartTime) + " ms | " + "IndexingTime: " + (indexingEndTime-indexingStartTime) + " ms | " + @@ -2239,13 +2239,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // parse the resource plasmaParserDocument document = snippetCache.parseDocument(comp.url(), resourceContentLength.longValue(), resourceContent); - // getting parsed body input stream - InputStream docBodyInputStream = document.getText(); - // getting word iterator Iterator witer = null; try { - witer = plasmaCondenser.getWords(docBodyInputStream, document.charset); + witer = new plasmaCondenser(document).words(); } catch (UnsupportedEncodingException e) { e.printStackTrace(); } diff --git a/source/de/anomic/tools/nxTools.java b/source/de/anomic/tools/nxTools.java index 604f36a7d..7fae537ab 100644 --- a/source/de/anomic/tools/nxTools.java +++ b/source/de/anomic/tools/nxTools.java @@ -165,7 +165,7 @@ public class nxTools { e = s; while (e < a.length) { b = a[e]; - if ((b == 10) || (b == 13)) break; + if ((b == 10) || (b == 13) || (b == 0)) break; e++; }