added selection of audio, video, image and application resources

to search procedure. This function currently cannot be used through the
search interface, but only through remote search.

added accumulation of search attributes to enable the audio, video,
image and application selection.

fixed a problem with external URL representation generation


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3036 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent fb7902aa68
commit ad1e4aa88e

@ -54,6 +54,7 @@ import java.util.Map;
import java.util.TreeSet;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.htmlFilter.htmlFilterWriter;
import de.anomic.http.httpHeader;
import de.anomic.net.URL;
@ -124,7 +125,7 @@ public class CacheAdmin_p {
info.append("<b>TITLE:</b><br>").append(scraper.getTitle()).append("<br>").append("<br>")
.append("<b>SECTION HEADLINES:</b><br>").append(formatTitles(document.getSectionTitles())).append("<br>")
.append("<b>HREF:</b><br>").append(formatAnchor(document.getHyperlinks())).append("<br>")
.append("<b>IMAGE:</b><br>").append(formatAnchor(document.getImagelinks())).append("<br>")
.append("<b>IMAGE:</b><br>").append(formatImageAnchor(document.getImages())).append("<br>")
.append("<b>AUDIO:</b><br>").append(formatAnchor(document.getAudiolinks())).append("<br>")
.append("<b>VIDEO:</b><br>").append(formatAnchor(document.getVideolinks())).append("<br>")
.append("<b>APPS:</b><br>").append(formatAnchor(document.getApplinks())).append("<br>")
@ -241,6 +242,18 @@ public class CacheAdmin_p {
return result.append("</table>").toString();
}
/**
 * Renders a set of image entries as an HTML table with one row per image:
 * the alternative text in the first cell, the normal form of the image url
 * in the second cell.
 *
 * Alternative texts and urls originate from parsed documents, so both are
 * HTML-escaped before they are written into the markup.
 *
 * @param anchor a set of htmlFilterImageEntry objects
 * @return the HTML table as string
 */
private static String formatImageAnchor(TreeSet anchor) {
    final StringBuffer result = new StringBuffer((anchor.size() + 1) * 256);
    result.append("<table border=\"0\" cellspacing=\"0\" cellpadding=\"0\">");
    final Iterator iter = anchor.iterator();
    htmlFilterImageEntry ie;
    while (iter.hasNext()) {
        ie = (htmlFilterImageEntry) iter.next();
        result.append("<tr valign=\"top\"><td><span class=\"small\">")
              .append(escapeForHtml(ie.alt()))
              .append("&nbsp;</span></td><td class=\"tt\">")
              .append(escapeForHtml(ie.url().toNormalform()))
              .append("</td></tr>");
    }
    return result.append("</table>").toString();
}

/**
 * Replaces the characters that have a special meaning in HTML
 * (&amp;, &lt;, &gt; and the double quote) by their entities.
 *
 * @param text the raw text, may be <code>null</code>
 * @return the escaped text; an empty string if <code>text</code> is <code>null</code>
 */
private static String escapeForHtml(String text) {
    if (text == null) return "";
    final StringBuffer escaped = new StringBuffer(text.length() + 16);
    for (int i = 0; i < text.length(); i++) {
        final char c = text.charAt(i);
        switch (c) {
            case '&': escaped.append("&amp;"); break;
            case '<': escaped.append("&lt;"); break;
            case '>': escaped.append("&gt;"); break;
            case '"': escaped.append("&quot;"); break;
            default: escaped.append(c);
        }
    }
    return escaped.toString();
}
private static String linkPathString(String path, boolean dir){
final String[] elements = path.split("/");
final StringBuffer tmpstr = new StringBuffer(256);

@ -135,7 +135,7 @@ public class DetailedSearch {
}
// do the search
plasmaSearchQuery thisSearch = new plasmaSearchQuery(query, wdist, "", count, searchtime, urlmask,
plasmaSearchQuery thisSearch = new plasmaSearchQuery(query, wdist, "", plasmaSearchQuery.CONTENTDOM_TEXT, count, searchtime, urlmask,
((global) && (yacyonline) && (!(env.getConfig("last-search","").equals(querystring)))) ? plasmaSearchQuery.SEARCHDOM_GLOBALDHT : plasmaSearchQuery.SEARCHDOM_LOCAL,
"", 20, plasmaSearchQuery.catchall_constraint);
plasmaSearchRankingProfile localRanking = new plasmaSearchRankingProfile("local", post.toString());

@ -93,6 +93,7 @@ public final class search {
final int count = post.getInt("count", 10); // maximum number of wanted results
final int maxdist= post.getInt("maxdist", Integer.MAX_VALUE);
final String prefer = post.get("prefer", "");
final String contentdom = post.get("contentdom", "text");
final String filter = post.get("filter", ".*");
final boolean includesnippet = post.get("includesnippet", "false").equals("true");
final kelondroBitfield constraint = new kelondroBitfield(4, post.get("constraint", "______"));
@ -134,7 +135,7 @@ public final class search {
plasmaSearchQuery squery = null;
if ((query.length() == 0) && (abstractSet != null)) {
// this is _not_ a normal search, only a request for index abstracts
squery = new plasmaSearchQuery(abstractSet, maxdist, prefer, count, duetime, filter, plasmaSearchQuery.catchall_constraint);
squery = new plasmaSearchQuery(abstractSet, maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), count, duetime, filter, plasmaSearchQuery.catchall_constraint);
squery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL;
yacyCore.log.logInfo("INIT HASH SEARCH (abstracts only): " + squery.anonymizedQueryHashes() + " - " + squery.wantedResults + " links");
@ -161,7 +162,7 @@ public final class search {
prop.put("joincount", 0);
} else {
// retrieve index containers from search request
squery = new plasmaSearchQuery(keyhashes, maxdist, prefer, count, duetime, filter, constraint);
squery = new plasmaSearchQuery(keyhashes, maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), count, duetime, filter, constraint);
squery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL;
yacyCore.log.logInfo("INIT HASH SEARCH (query-" + abstracts + "): " + squery.anonymizedQueryHashes() + " - " + squery.wantedResults + " links");

@ -240,6 +240,7 @@ public class yacysearch {
query,
maxDistance,
prefermask,
plasmaSearchQuery.CONTENTDOM_TEXT,
count,
searchtime,
urlmask,

@ -89,13 +89,13 @@ public class indexRWIEntryNew implements Cloneable, indexRWIEntry {
private static final int col_reserve = 19; // k 1 reserve
// appearance flags, used in RWI entry
// the flags 0..15 are identical to the category flags in plasmaCondenser
public static final int flag_app_url = 16; // word appears in url
public static final int flag_app_descr = 17; // word appears in headline (or any description part)
public static final int flag_app_author = 18; // word appears in author
public static final int flag_app_tags = 19; // word appears in header tags
public static final int flag_app_reference = 20; // word appears in anchor description text (the reference to an url), or any alternative text field of a link
public static final int flag_app_emphasized = 21; // word is emphasized in text (i.e. bold, italics, special size)
// the flags 0..23 are identical to the category flags in plasmaCondenser
public static final int flag_app_url = 24; // word appears in url
public static final int flag_app_descr = 25; // word appears in headline (or any description part)
public static final int flag_app_author = 26; // word appears in author
public static final int flag_app_tags = 27; // word appears in header tags
public static final int flag_app_reference = 28; // word appears in anchor description text (the reference to an url), or any alternative text field of a link
public static final int flag_app_emphasized = 29; // word is emphasized in text (i.e. bold, italics, special size)
private kelondroRow.Entry entry;

@ -49,6 +49,12 @@ public interface indexURLEntry {
public String language();
public int size();
public int wordCount();
// per-entry link counters
// NOTE(review): presumably llocal/lother count links to the same/other hosts and
// limage/laudio/lvideo/lapp count links by media type - confirm against the implementations
public int llocal();
public int lother();
public int limage();
public int laudio();
public int lvideo();
public int lapp();
public String snippet();
public kelondroBitfield flags();
public indexRWIEntry word();

@ -194,6 +194,7 @@ public class indexURLEntryNew implements indexURLEntry {
// generate a parseable string; this is a simple property-list
indexURLEntry.Components comp = this.comp();
final StringBuffer s = new StringBuffer(300);
//System.out.println("author=" + comp.author());
try {
s.append("hash=").append(hash());
s.append(",url=").append(crypt.simpleEncode(comp.url().toNormalform()));
@ -249,11 +250,11 @@ public class indexURLEntryNew implements indexURLEntry {
public indexURLEntry.Components comp() {
ArrayList cl = nxTools.strings(this.entry.getCol("comp", null), "UTF-8");
return new indexURLEntry.Components(
(cl.size() > 0) ? (String) cl.get(0) : "",
(cl.size() > 1) ? (String) cl.get(1) : "",
(cl.size() > 2) ? (String) cl.get(2) : "",
(cl.size() > 3) ? (String) cl.get(3) : "",
(cl.size() > 4) ? (String) cl.get(4) : "");
(cl.size() > 0) ? ((String) cl.get(0)).trim() : "",
(cl.size() > 1) ? ((String) cl.get(1)).trim() : "",
(cl.size() > 2) ? ((String) cl.get(2)).trim() : "",
(cl.size() > 3) ? ((String) cl.get(3)).trim() : "",
(cl.size() > 4) ? ((String) cl.get(4)).trim() : "");
}
public Date moddate() {

@ -363,4 +363,29 @@ public class indexURLEntryOld implements indexURLEntry {
System.out.println();
}
// compatibility methods
// the link counters below are declared by the indexURLEntry interface;
// this implementation reports 0 for every one of them
/** @return always 0 in this implementation */
public int lapp() {
return 0;
}
/** @return always 0 in this implementation */
public int laudio() {
return 0;
}
/** @return always 0 in this implementation */
public int limage() {
return 0;
}
/** @return always 0 in this implementation */
public int llocal() {
return 0;
}
/** @return always 0 in this implementation */
public int lother() {
return 0;
}
/** @return always 0 in this implementation */
public int lvideo() {
return 0;
}
}

@ -46,6 +46,7 @@
package de.anomic.kelondro;
import java.io.UnsupportedEncodingException;
import java.util.Comparator;
import de.anomic.server.logging.serverLog;
@ -179,13 +180,18 @@ public class kelondroBase64Order extends kelondroAbstractOrder implements kelond
}
public final String encodeString(String in) {
return encode(in.getBytes());
try {
return encode(in.getBytes("UTF-8"));
} catch (UnsupportedEncodingException e) {
return "";
}
}
// we will use this encoding to encode strings with 2^8 values to
// b64-Strings
// we will do that by grouping each three input bytes to four output bytes.
public final String encode(byte[] in) {
if (in.length == 0) return "";
StringBuffer out = new StringBuffer(in.length / 3 * 4 + 3);
int pos = 0;
long l;
@ -195,11 +201,8 @@ public class kelondroBase64Order extends kelondroAbstractOrder implements kelond
out = out.append(encodeLong(l, 4));
}
// now there may be remaining bytes
if (in.length % 3 != 0)
out = out.append((in.length % 3 == 2) ? encodeLong((((0XffL & (long) in[pos]) << 8) + (0XffL & (long) in[pos + 1])) << 8, 4).substring(0, 3) : encodeLong((((0XffL & (long) in[pos])) << 8) << 8, 4).substring(0, 2));
if (rfc1113compliant)
while (out.length() % 4 > 0)
out.append("=");
if (in.length % 3 != 0) out = out.append((in.length % 3 == 2) ? encodeLong((((0XffL & (long) in[pos]) << 8) + (0XffL & (long) in[pos + 1])) << 8, 4).substring(0, 3) : encodeLong((((0XffL & (long) in[pos])) << 8) << 8, 4).substring(0, 2));
if (rfc1113compliant) while (out.length() % 4 > 0) out.append("=");
// return result
return out.toString();
}
@ -215,12 +218,11 @@ public class kelondroBase64Order extends kelondroAbstractOrder implements kelond
}
public final byte[] decode(String in) {
if ((in == null) || (in.length() == 0)) return new byte[0];
try {
int posIn = 0;
int posOut = 0;
if (rfc1113compliant)
while (in.charAt(in.length() - 1) == '=')
in = in.substring(0, in.length() - 1);
if (rfc1113compliant) while (in.charAt(in.length() - 1) == '=') in = in.substring(0, in.length() - 1);
byte[] out = new byte[in.length() / 4 * 3 + (((in.length() % 4) == 0) ? 0 : in.length() % 4 - 1)];
long l;
while (posIn + 3 < in.length()) {

@ -97,7 +97,10 @@ public final class plasmaCondenser {
public static final int flag_cat_macos = 17; // pages about macintosh, apple computers and the mac os
public static final int flag_cat_windows = 18; // pages about windows os and softare
public static final int flag_cat_osreserve = 19; // reserve
public static final int flag_cat_hasimage = 20; // the page refers to (at least one) images
public static final int flag_cat_hasaudio = 21; // the page refers to (at least one) audio file
public static final int flag_cat_hasvideo = 22; // the page refers to (at least one) videos
public static final int flag_cat_hasapp = 23; // the page refers to (at least one) application file
private final static int numlength = 5;
@ -117,6 +120,14 @@ public final class plasmaCondenser {
public int RESULT_SIMI_SENTENCES = -1;
public kelondroBitfield RESULT_FLAGS = new kelondroBitfield(4);
/**
 * Condenses the text of a parsed document and records in RESULT_FLAGS
 * which kinds of media resources the document refers to.
 *
 * @param document the parsed document
 * @throws UnsupportedEncodingException passed on from the stream-based constructor
 */
public plasmaCondenser(plasmaParserDocument document) throws UnsupportedEncodingException {
    // the text itself is condensed by the stream-based constructor
    this(document.getText(), document.getCharset());

    // raise one category flag per media type that occurs at least once
    if (!document.getImages().isEmpty()) {
        RESULT_FLAGS.set(flag_cat_hasimage, true);
    }
    if (!document.getAudiolinks().isEmpty()) {
        RESULT_FLAGS.set(flag_cat_hasaudio, true);
    }
    if (!document.getVideolinks().isEmpty()) {
        RESULT_FLAGS.set(flag_cat_hasvideo, true);
    }
    if (!document.getApplinks().isEmpty()) {
        RESULT_FLAGS.set(flag_cat_hasapp, true);
    }
}
// convenience constructor: delegates to the four-argument constructor with the fixed values 3 and 2
// NOTE(review): the meaning of 3 and 2 is defined by the four-argument constructor, which is not visible here
public plasmaCondenser(InputStream text, String charset) throws UnsupportedEncodingException {
this(text, charset, 3, 2);
}
@ -129,7 +140,7 @@ public final class plasmaCondenser {
sentences = new HashMap();
createCondensement(text, charset);
}
// create a word hash
public static final String word2hash(String word) {
return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(word.toLowerCase())).substring(0, yacySeedDB.commonHashLength);
@ -760,7 +771,7 @@ public final class plasmaCondenser {
return new String(s);
}
public static Iterator getWords(InputStream input, String charset) throws UnsupportedEncodingException {
if (input == null) return null;
plasmaCondenser condenser = new plasmaCondenser(input, charset);
@ -772,7 +783,7 @@ public final class plasmaCondenser {
ByteArrayInputStream buffer = new ByteArrayInputStream(text);
return getWords(buffer, charset);
}
public static void main(String[] args) {
// read a property file and converty them into configuration lines
try {

@ -72,6 +72,7 @@ import org.apache.commons.pool.impl.GenericKeyedObjectPool;
import org.apache.commons.pool.impl.GenericObjectPool;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.htmlFilter.htmlFilterInputStream;
import de.anomic.htmlFilter.htmlFilterWriter;
import de.anomic.http.httpc;
@ -819,46 +820,62 @@ public final class plasmaParser {
}
*/
static Map allReflinks(Map links) {
static Map allReflinks(Set links) {
// links is either a Set of Strings (with urls) or htmlFilterImageEntries
// we find all links that are part of a reference inside a url
HashMap v = new HashMap();
Iterator i = links.keySet().iterator();
String s;
Iterator i = links.iterator();
Object o;
String url;
int pos;
loop: while (i.hasNext()) {
s = (String) i.next();
if ((pos = s.toLowerCase().indexOf("http://",7)) > 0) {
o = i.next();
if (o instanceof String) url = (String) o;
else if (o instanceof htmlFilterImageEntry) url = ((htmlFilterImageEntry) o).url().toNormalform();
else {
assert false;
continue;
}
if ((pos = url.toLowerCase().indexOf("http://",7)) > 0) {
i.remove();
s = s.substring(pos);
while ((pos = s.toLowerCase().indexOf("http://",7)) > 0) s = s.substring(pos);
if (!(v.containsKey(s))) v.put(s, "ref");
url = url.substring(pos);
while ((pos = url.toLowerCase().indexOf("http://",7)) > 0) url = url.substring(pos);
if (!(v.containsKey(url))) v.put(url, "ref");
continue loop;
}
if ((pos = s.toLowerCase().indexOf("/www.",7)) > 0) {
if ((pos = url.toLowerCase().indexOf("/www.",7)) > 0) {
i.remove();
s = "http:/" + s.substring(pos);
while ((pos = s.toLowerCase().indexOf("/www.",7)) > 0) s = "http:/" + s.substring(pos);
if (!(v.containsKey(s))) v.put(s, "ref");
url = "http:/" + url.substring(pos);
while ((pos = url.toLowerCase().indexOf("/www.",7)) > 0) url = "http:/" + url.substring(pos);
if (!(v.containsKey(url))) v.put(url, "ref");
continue loop;
}
}
return v;
}
static Map allSubpaths(Map links) {
static Map allSubpaths(Set links) {
// links is either a Set of Strings (urls) or a Set of htmlFilterImageEntries
HashMap v = new HashMap();
Iterator i = links.keySet().iterator();
String s;
Iterator i = links.iterator();
Object o;
String url;
int pos;
while (i.hasNext()) {
s = (String) i.next();
if (s.endsWith("/")) s = s.substring(0, s.length() - 1);
pos = s.lastIndexOf("/");
o = i.next();
if (o instanceof String) url = (String) o;
else if (o instanceof htmlFilterImageEntry) url = ((htmlFilterImageEntry) o).url().toNormalform();
else {
assert false;
continue;
}
if (url.endsWith("/")) url = url.substring(0, url.length() - 1);
pos = url.lastIndexOf("/");
while (pos > 8) {
s = s.substring(0, pos + 1);
if (!(v.containsKey(s))) v.put(s, "sub");
s = s.substring(0, pos);
pos = s.lastIndexOf("/");
url = url.substring(0, pos + 1);
if (!(v.containsKey(url))) v.put(url, "sub");
url = url.substring(0, pos);
pos = url.lastIndexOf("/");
}
}
return v;

@ -61,24 +61,23 @@ import de.anomic.net.URL;
public class plasmaParserDocument {
URL location; // the source url
String mimeType; // mimeType as taken from http header
String charset; // the charset of the document
String[] keywords; // most resources provide a keyword field
String shortTitle; // a shortTitle mostly appears in the window header (border)
private URL location; // the source url
private String mimeType; // mimeType as taken from http header
private String charset; // the charset of the document
private String[] keywords; // most resources provide a keyword field
private String shortTitle; // a shortTitle mostly appears in the window header (border)
private String longTitle; // the real title of the document, commonly h1-tags
String[] sections; // if present: more titles/headlines appearing in the document
String abstrct; // an abstract, if present: short content description
private String[] sections; // if present: more titles/headlines appearing in the document
private String abstrct; // an abstract, if present: short content description
private Object text; // the clear text, all that is visible
Map anchors; // all links embedded as clickeable entities (anchor tags)
TreeSet images; // all visible pictures in document
private Map anchors; // all links embedded as clickeable entities (anchor tags)
private TreeSet images; // all visible pictures in document
// the anchors and images - Maps are URL-to-EntityDescription mappings.
// The EntityDescription appear either as visible text in anchors or as alternative
// text in image tags.
Map hyperlinks, audiolinks, videolinks, imagelinks, applinks;
Map emaillinks;
plasmaCondenser condenser;
boolean resorted;
private Map hyperlinks, audiolinks, videolinks, applinks;
private Map emaillinks;
private boolean resorted;
private InputStream textStream;
public plasmaParserDocument(URL location, String mimeType, String charset,
@ -99,10 +98,8 @@ public class plasmaParserDocument {
this.hyperlinks = null;
this.audiolinks = null;
this.videolinks = null;
this.imagelinks = null;
this.applinks = null;
this.emaillinks = null;
this.condenser = null;
this.resorted = false;
}
@ -125,13 +122,15 @@ public class plasmaParserDocument {
this.hyperlinks = null;
this.audiolinks = null;
this.videolinks = null;
this.imagelinks = null;
this.applinks = null;
this.emaillinks = null;
this.condenser = null;
this.resorted = false;
}
/**
 * @return the source url of this document
 */
public URL getLocation() {
return this.location;
}
/**
 * @return the mime type of this document, as taken from the http header
 */
public String getMimeType() {
return this.mimeType;
}
@ -139,7 +138,7 @@ public class plasmaParserDocument {
/**
* @return the supposed charset of this document or <code>null</code> if unknown
*/
public String getSourceCharset() {
public String getCharset() {
return this.charset;
}
@ -224,12 +223,6 @@ public class plasmaParserDocument {
return anchors;
}
public TreeSet getImages() {
// returns all links embedded as pictures (visible in document)
// this returns a collection of htmlFilterImageEntry objects
// the link views are computed lazily: resortLinks() is called as long as the resorted flag is not set
if (!resorted) resortLinks();
return images;
}
// the next three methods provide a calculated view on the getAnchors/getImages:
@ -249,9 +242,11 @@ public class plasmaParserDocument {
return this.videolinks;
}
public Map getImagelinks() {
public TreeSet getImages() {
// returns all links enbedded as pictures (visible in document)
// this resturns a htmlFilterImageEntry collection
if (!resorted) resortLinks();
return this.imagelinks;
return images;
}
public Map getApplinks() {
@ -275,7 +270,6 @@ public class plasmaParserDocument {
String ext = null;
i = anchors.entrySet().iterator();
hyperlinks = new HashMap();
imagelinks = new HashMap();
videolinks = new HashMap();
audiolinks = new HashMap();
applinks = new HashMap();
@ -301,8 +295,7 @@ public class plasmaParserDocument {
if (plasmaParser.mediaExtContains(ext)) {
// this is not a normal anchor, its a media link
if (plasmaParser.imageExtContains(ext)) {
imagelinks.put(u, entry.getValue());
collectedImages.add(new htmlFilterImageEntry(url, "", -1, -1));
collectedImages.add(new htmlFilterImageEntry(url, (String) entry.getValue(), -1, -1));
}
else if (plasmaParser.audioExtContains(ext)) audiolinks.put(u, entry.getValue());
else if (plasmaParser.videoExtContains(ext)) videolinks.put(u, entry.getValue());
@ -316,21 +309,7 @@ public class plasmaParserDocument {
}
}
// expand the hyperlinks:
// we add artificial hyperlinks to the hyperlink set
// that can be calculated from given hyperlinks and imagelinks
hyperlinks.putAll(plasmaParser.allReflinks(hyperlinks));
hyperlinks.putAll(plasmaParser.allReflinks(imagelinks));
hyperlinks.putAll(plasmaParser.allReflinks(audiolinks));
hyperlinks.putAll(plasmaParser.allReflinks(videolinks));
hyperlinks.putAll(plasmaParser.allReflinks(applinks));
hyperlinks.putAll(plasmaParser.allSubpaths(hyperlinks));
hyperlinks.putAll(plasmaParser.allSubpaths(imagelinks));
hyperlinks.putAll(plasmaParser.allSubpaths(audiolinks));
hyperlinks.putAll(plasmaParser.allSubpaths(videolinks));
hyperlinks.putAll(plasmaParser.allSubpaths(applinks));
// finally add image links that we collected from the anchors to the image map
// add image links that we collected from the anchors to the image map
i = collectedImages.iterator();
htmlFilterImageEntry iEntry;
while (i.hasNext()) {
@ -338,6 +317,20 @@ public class plasmaParserDocument {
if (!images.contains(iEntry)) images.add(iEntry);
}
// expand the hyperlinks:
// we add artificial hyperlinks to the hyperlink set
// that can be calculated from given hyperlinks and imagelinks
hyperlinks.putAll(plasmaParser.allReflinks(hyperlinks.keySet()));
hyperlinks.putAll(plasmaParser.allReflinks(images));
hyperlinks.putAll(plasmaParser.allReflinks(audiolinks.keySet()));
hyperlinks.putAll(plasmaParser.allReflinks(videolinks.keySet()));
hyperlinks.putAll(plasmaParser.allReflinks(applinks.keySet()));
hyperlinks.putAll(plasmaParser.allSubpaths(hyperlinks.keySet()));
hyperlinks.putAll(plasmaParser.allSubpaths(images));
hyperlinks.putAll(plasmaParser.allSubpaths(audiolinks.keySet()));
hyperlinks.putAll(plasmaParser.allSubpaths(videolinks.keySet()));
hyperlinks.putAll(plasmaParser.allSubpaths(applinks.keySet()));
// don't do this again
this.resorted = true;
}

@ -403,6 +403,11 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
// filter out bad results
Iterator wi = query.queryHashes.iterator();
while (wi.hasNext()) wordIndex.removeEntry((String) wi.next(), page.hash(), true);
} else if (query.contentdom != plasmaSearchQuery.CONTENTDOM_TEXT) {
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) && (page.laudio() > 0)) acc.addResult(page, preranking);
else if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) && (page.lvideo() > 0)) acc.addResult(page, preranking);
else if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (page.limage() > 0)) acc.addResult(page, preranking);
else if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_APP) && (page.lapp() > 0)) acc.addResult(page, preranking);
} else {
acc.addResult(page, preranking);
}

@ -101,6 +101,12 @@ public final class plasmaSearchPreOrder {
iEntry = (indexRWIEntry) i.next();
if (iEntry.urlHash().length() != container.row().width(container.primarykey())) continue;
if ((!(query.constraint.equals(plasmaSearchQuery.catchall_constraint))) && (!(iEntry.flags().allOf(query.constraint)))) continue; // filter out entries that do not match the search constraint
if (query.contentdom != plasmaSearchQuery.CONTENTDOM_TEXT) {
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasaudio)))) continue;
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasvideo)))) continue;
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasimage)))) continue;
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_APP ) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasapp )))) continue;
}
pageAcc.put(serverCodings.encodeHex(Long.MAX_VALUE - this.ranking.preRanking(iEntry.generateNormalized(this.entryMin, this.entryMax), query.words("")), 16) + iEntry.urlHash(), iEntry);
}
}

@ -61,12 +61,19 @@ public final class plasmaSearchQuery {
public static final int SEARCHDOM_GLOBALDHT = 3;
public static final int SEARCHDOM_GLOBALALL = 4;
public static final int CONTENTDOM_TEXT = 0;
public static final int CONTENTDOM_IMAGE = 1;
public static final int CONTENTDOM_AUDIO = 2;
public static final int CONTENTDOM_VIDEO = 3;
public static final int CONTENTDOM_APP = 4;
public static final kelondroBitfield empty_constraint = new kelondroBitfield(4, "AAAAAA");
public static final kelondroBitfield catchall_constraint = new kelondroBitfield(4, "______");
public Set queryWords, queryHashes;
public int wantedResults;
public String prefer;
public int contentdom;
public long maximumTime;
public String urlMask;
public int domType;
@ -75,13 +82,14 @@ public final class plasmaSearchQuery {
public int maxDistance;
public kelondroBitfield constraint;
public plasmaSearchQuery(Set queryWords, int maxDistance, String prefer,
public plasmaSearchQuery(Set queryWords, int maxDistance, String prefer, int contentdom,
int wantedResults, long maximumTime, String urlMask,
int domType, String domGroupName, int domMaxTargets,
kelondroBitfield constraint) {
this.queryWords = queryWords;
this.maxDistance = maxDistance;
this.prefer = prefer;
this.contentdom = contentdom;
this.queryHashes = plasmaCondenser.words2hashes(queryWords);
this.wantedResults = wantedResults;
this.maximumTime = maximumTime;
@ -92,12 +100,13 @@ public final class plasmaSearchQuery {
this.constraint = constraint;
}
public plasmaSearchQuery(Set queryHashes, int maxDistance, String prefer,
public plasmaSearchQuery(Set queryHashes, int maxDistance, String prefer, int contentdom,
int wantedResults, long maximumTime, String urlMask,
kelondroBitfield constraint) {
this.queryWords = null;
this.maxDistance = maxDistance;
this.prefer = prefer;
this.contentdom = contentdom;
this.queryHashes = queryHashes;
this.wantedResults = wantedResults;
this.maximumTime = maximumTime;
@ -108,6 +117,15 @@ public final class plasmaSearchQuery {
this.constraint = constraint;
}
/**
 * Maps the textual content domain of a search request to the matching
 * CONTENTDOM_* constant.
 *
 * @param dom the content domain name: "text", "image", "audio", "video" or "app";
 *            may be <code>null</code>
 * @return the matching CONTENTDOM_* constant; CONTENTDOM_TEXT if <code>dom</code>
 *         is <code>null</code> or not one of the known names
 */
public static int contentdomParser(String dom) {
    // constant-first equals() is null-safe, so a missing value falls through to the default
    if ("image".equals(dom)) return CONTENTDOM_IMAGE;
    if ("audio".equals(dom)) return CONTENTDOM_AUDIO;
    if ("video".equals(dom)) return CONTENTDOM_VIDEO;
    if ("app".equals(dom)) return CONTENTDOM_APP;
    // "text", null and every unknown name select the text domain
    return CONTENTDOM_TEXT;
}
public static Set hashes2Set(String query) {
if (query == null) return new HashSet();
final HashSet keyhashes = new HashSet(query.length() / yacySeedDB.commonHashLength);

@ -1565,7 +1565,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
checkInterruption();
log.logFine("Condensing for '" + entry.normalizedURLString() + "'");
plasmaCondenser condenser = new plasmaCondenser(document.getText(), document.charset);
plasmaCondenser condenser = new plasmaCondenser(document);
// generate citation reference
Integer[] ioLinks = generateCitationReference(entry.urlHash(), docDate, document, condenser); // [outlinksSame, outlinksOther]
@ -1593,10 +1593,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
plasmaURL.language(entry.url()), // language
ioLinks[0].intValue(), // llocal
ioLinks[1].intValue(), // lother
document.audiolinks.size(), // laudio
document.imagelinks.size(), // limage
document.videolinks.size(), // lvideo
document.applinks.size() // lapp
document.getAudiolinks().size(), // laudio
document.getImages().size(), // limage
document.getVideolinks().size(), // lvideo
document.getApplinks().size() // lapp
);
/* ========================================================================
* STORE URL TO LOADED-URL-DB
@ -1751,9 +1751,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
log.logInfo("*Indexed " + words + " words in URL " + entry.url() +
" [" + entry.urlHash() + "]" +
"\n\tDescription: " + docDescription +
"\n\tMimeType: " + document.getMimeType() + " | Charset: " + document.getSourceCharset() + " | " +
"\n\tMimeType: " + document.getMimeType() + " | Charset: " + document.getCharset() + " | " +
"Size: " + document.getTextLength() + " bytes | " +
"Anchors: " + ((document.anchors==null)?0:document.anchors.size()) +
"Anchors: " + ((document.getAnchors() == null) ? 0 : document.getAnchors().size()) +
"\n\tStackingTime: " + (stackEndTime-stackStartTime) + " ms | " +
"ParsingTime: " + (parsingEndTime-parsingStartTime) + " ms | " +
"IndexingTime: " + (indexingEndTime-indexingStartTime) + " ms | " +
@ -2239,13 +2239,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// parse the resource
plasmaParserDocument document = snippetCache.parseDocument(comp.url(), resourceContentLength.longValue(), resourceContent);
// getting parsed body input stream
InputStream docBodyInputStream = document.getText();
// getting word iterator
Iterator witer = null;
try {
witer = plasmaCondenser.getWords(docBodyInputStream, document.charset);
witer = new plasmaCondenser(document).words();
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}

@ -165,7 +165,7 @@ public class nxTools {
e = s;
while (e < a.length) {
b = a[e];
if ((b == 10) || (b == 13)) break;
if ((b == 10) || (b == 13) || (b == 0)) break;
e++;
}

Loading…
Cancel
Save