new indexing strategy: ALL links that appear anywhere are indexed, not

only links where the content can be parsed. All non-parseable links are
placed into the noload queue. The search process must therefore be able
to filter out non-text search results.
- This fixes the problem that image search results appeared in the text
search.
- The interactive search can now retrieve ALL types of links
- The p2p interface is now extended to retrieve only certain types of
links (text, image, video, apps)
- The search process has an extension to filter the right document type
according to the search query
pull/1/head
Michael Peter Christen 13 years ago
parent 14f67f217c
commit f8cd57c92f

@ -36,7 +36,7 @@ function search(search, count, offset) {
} else if (window.ActiveXObject) { // IE
self.xmlHttpReq = new ActiveXObject("Microsoft.XMLHTTP");
}
self.xmlHttpReq.open('GET', "yacysearch.json?verify=false&resource=local&nav=all&maximumRecords=" + maximumRecords + "&startRecord=" + startRecord + "&query=" + query, true);
self.xmlHttpReq.open('GET', "yacysearch.json?verify=false&resource=local&nav=all&contentdom=all&maximumRecords=" + maximumRecords + "&startRecord=" + startRecord + "&query=" + query, true);
self.xmlHttpReq.setRequestHeader('Content-Type', 'application/x-www-form-urlencoded');
self.xmlHttpReq.onreadystatechange = function() {
if (self.xmlHttpReq.readyState == 4) {

@ -119,7 +119,7 @@ public final class search {
final int maxdist= post.getInt("maxdist", Integer.MAX_VALUE);
final String prefer = post.get("prefer", "");
final String modifier = post.get("modifier", "").trim();
final String contentdom = post.get("contentdom", "text");
final String contentdom = post.get("contentdom", "all");
final String filter = post.get("filter", ".*"); // a filter on the url
final Pattern snippetPattern = Pattern.compile(post.get("snippet", ".*")); // a filter on the snippet
String sitehash = post.get("sitehash", ""); if (sitehash.length() == 0) sitehash = null;

@ -56,6 +56,9 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
<fieldset class="yacys">
<input type="hidden" name="maximumRecords" value="#[maximumRecords]#" />
<input type="hidden" name="startRecord" value="#[startRecord]#" />
<input type="hidden" name="verify" value="false" />
<input type="hidden" name="resource" value="local" />
<input type="hidden" name="contentdom" value="all" />
<input id="search" class="searchinput" name="query" type="text" value="#[query]#" size="40" maxlength="80" onFocus="this.select()" />
#(allowrealtime)#
<input id="Enter" type="submit" name="Enter" value="Search" />::

@ -272,7 +272,7 @@ public class yacysearch {
// find search domain
final Classification.ContentDomain contentdom =
ContentDomain.contentdomParser(post == null ? "text" : post.get("contentdom", "text"));
ContentDomain.contentdomParser(post == null ? "all" : post.get("contentdom", "all"));
// patch until better search profiles are available
if ( contentdom == ContentDomain.TEXT ) {

@ -105,7 +105,7 @@ public class yacysearchitem {
prop.put("navurlBase", QueryParams.navurlBase("html", theQuery, null, theQuery.urlMask.toString(), theQuery.navigators).toString());
final String target_special_pattern = sb.getConfig(SwitchboardConstants.SEARCH_TARGET_SPECIAL_PATTERN, "");
if (theQuery.contentdom == Classification.ContentDomain.TEXT) {
if (theQuery.contentdom == Classification.ContentDomain.TEXT || theQuery.contentdom == Classification.ContentDomain.ALL) {
// text search
// generate result object

@ -39,6 +39,7 @@ import java.util.Properties;
import java.util.concurrent.BlockingQueue;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.Classification.ContentDomain;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.protocol.Domains;
@ -353,9 +354,11 @@ public final class CrawlStacker {
// check availability of parser and maxfilesize
String warning = null;
if (entry.size() > maxFileSize /*||
(entry.url().getFileExtension().length() > 0 && TextParser.supports(entry.url(), null) != null)
*/) {
if (entry.size() > maxFileSize ||
entry.url().getContentDomain() == ContentDomain.APP ||
entry.url().getContentDomain() == ContentDomain.IMAGE ||
entry.url().getContentDomain() == ContentDomain.AUDIO ||
entry.url().getContentDomain() == ContentDomain.VIDEO ) {
warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry);
//if (warning != null) this.log.logWarning("CrawlStacker.stackCrawl of URL " + entry.url().toNormalform(true, false) + " - not pushed: " + warning);
return null;

@ -24,11 +24,13 @@ import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.util.HashSet;
import java.util.Map.Entry;
import java.util.Properties;
import java.util.Set;
public class Classification {
private static final Set<String> textExtSet = new HashSet<String>();
private static final Set<String> mediaExtSet = new HashSet<String>();
private static final Set<String> imageExtSet = new HashSet<String>();
private static final Set<String> audioExtSet = new HashSet<String>();
@ -78,11 +80,13 @@ public class Classification {
static {
final String apps = "7z,ace,arc,arj,apk,asf,asx,bat,bin,bkf,bz2,cab,com,css,dcm,deb,dll,dmg,exe,gho,ghs,gz,hqx,img,iso,jar,lha,rar,sh,sit,sitx,tar,tbz,tgz,tib,torrent,vbs,war,zip";
final String text = "htm,html,phtml,shtml,xhtml,php,php3,php4,php5,cfm,asp,aspx,tex,txt,jsp,mf,asp,aspx,csv,gpx,vcf,xsl,xml,pdf,doc,docx,xls,xlsx,ppt,pptx";
final String apps = "7z,ace,arc,arj,apk,asf,asx,bat,bin,bkf,bz2,cab,com,css,dcm,deb,dll,dmg,exe,java,gho,ghs,gz,hqx,img,iso,jar,lha,rar,sh,sit,sitx,tar,tbz,tgz,tib,torrent,vbs,war,zip";
final String audio = "aac,aif,aiff,flac,m4a,m4p,mid,mp2,mp3,oga,ogg,ram,sid,wav,wma";
final String video = "3g2,3gp,3gp2,3gpp,3gpp2,3ivx,asf,asx,avi,div,divx,dv,dvx,env,f4v,flv,hdmov,m1v,m4v,m-jpeg,moov,mov,movie,mp2v,mp4,mpe,mpeg,mpg,mpg4,mv4,ogm,ogv,qt,rm,rv,vid,swf,wmv";
final String image = "ai,bmp,cdr,cmx,emf,eps,gif,img,jpeg,jpg,mng,pct,pdd,pdn,pict,png,psb,psd,psp,tif,tiff,wmf";
addSet(textExtSet, text); // text formats
addSet(imageExtSet, image); // image formats
addSet(audioExtSet, audio); // audio formats
addSet(videoExtSet, video); // video formats
@ -95,6 +99,11 @@ public class Classification {
for (String s: extString.split(",")) set.add(s.toLowerCase().trim());
}
/**
 * Tests whether a file extension is contained in the set of text extensions.
 * The extension is trimmed and lower-cased before the lookup.
 *
 * @param textExt the file extension to test; may be {@code null}
 * @return {@code true} if the normalized extension is in the text extension set,
 *         {@code false} otherwise or if {@code textExt} is {@code null}
 */
public static boolean isTextExtension(String textExt) {
    return textExt != null && textExtSet.contains(textExt.trim().toLowerCase());
}
public static boolean isMediaExtension(String mediaExt) {
if (mediaExt == null) return false;
return mediaExtSet.contains(mediaExt.trim().toLowerCase());
@ -120,12 +129,20 @@ public class Classification {
return appsExtSet.contains(appsExt.trim().toLowerCase());
}
/**
 * Maps a file extension to its content domain.
 * The extension checks are applied in the order text, image, audio, video,
 * application; the first check that matches decides the result.
 *
 * @param ext the file extension to classify
 * @return the content domain of the first matching extension check, or
 *         {@code ContentDomain.ALL} if none of the checks matches
 */
public static ContentDomain getContentDomain(String ext) {
    final ContentDomain domain;
    if (isTextExtension(ext)) {
        domain = ContentDomain.TEXT;
    } else if (isImageExtension(ext)) {
        domain = ContentDomain.IMAGE;
    } else if (isAudioExtension(ext)) {
        domain = ContentDomain.AUDIO;
    } else if (isVideoExtension(ext)) {
        domain = ContentDomain.VIDEO;
    } else if (isApplicationExtension(ext)) {
        domain = ContentDomain.APP;
    } else {
        // no extension set claims this extension: do not restrict the domain
        domain = ContentDomain.ALL;
    }
    return domain;
}
/**
 * Tests whether a mime type denotes an image, i.e. whether it starts with
 * "image" in any letter case.
 *
 * The comparison is done with a locale-independent, case-insensitive region
 * match. Upper-casing the string with the default locale would break this
 * check on systems with a Turkish locale, where 'i' is upper-cased to the
 * dotted capital 'İ' and "image/png" would no longer start with "IMAGE".
 *
 * @param mimeType the mime type to test; may be {@code null}
 * @return {@code true} if the mime type starts with "image" (ignoring case),
 *         {@code false} otherwise or if {@code mimeType} is {@code null}
 */
public static boolean isPictureMime(final String mimeType) {
    if (mimeType == null) return false;
    final String prefix = "IMAGE";
    // regionMatches returns false if mimeType is shorter than the prefix
    return mimeType.regionMatches(true, 0, prefix, 0, prefix.length());
}
private static final Properties mimeTable = new Properties();
public static void init(final File mimeFile) {
@ -140,6 +157,14 @@ public class Classification {
if (mimeTableInputStream != null) try { mimeTableInputStream.close(); } catch (final Exception e1) {}
}
}
for (Entry<Object, Object> entry: mimeTable.entrySet()) {
String ext = (String) entry.getKey();
String mime = (String) entry.getValue();
if (mime.startsWith("text/")) textExtSet.add(ext.toLowerCase());
if (mime.startsWith("audio/")) audioExtSet.add(ext.toLowerCase());
if (mime.startsWith("video/")) videoExtSet.add(ext.toLowerCase());
if (mime.startsWith("application/")) appsExtSet.add(ext.toLowerCase());
}
}
public static int countMimes() {

@ -45,6 +45,7 @@ import java.util.regex.Pattern;
import jcifs.smb.SmbException;
import jcifs.smb.SmbFile;
import jcifs.smb.SmbFileInputStream;
import net.yacy.cora.document.Classification.ContentDomain;
import net.yacy.cora.document.Punycode.PunycodeException;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.TimeoutRequest;
@ -89,6 +90,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
protected String host, path, quest, ref;
protected int port;
protected InetAddress hostAddress;
protected ContentDomain contentDomain;
/**
* initialization of a MultiProtocolURI to produce poison pills for concurrent blocking queues
@ -101,6 +103,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
this.path = null;
this.quest = null;
this.ref = null;
this.contentDomain = null;
this.port = -1;
}
@ -116,6 +119,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
this.path = url.path;
this.quest = url.quest;
this.ref = url.ref;
this.contentDomain = null;
this.port = url.port;
}
@ -123,6 +127,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
if (url == null) throw new MalformedURLException("url string is null");
this.hostAddress = null;
this.contentDomain = null;
// identify protocol
assert (url != null);
@ -258,6 +263,13 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
public final boolean isFile() { return this.protocol.equals("file"); }
public final boolean isSMB() { return this.protocol.equals("smb"); }
/**
 * Returns the content domain of this URL, derived from its file extension.
 * The value is computed on first access and then kept in the
 * {@code contentDomain} field for subsequent calls.
 *
 * @return the content domain as classified by
 *         {@code Classification.getContentDomain} for this URL's file extension
 */
public final ContentDomain getContentDomain() {
    ContentDomain domain = this.contentDomain;
    if (domain == null) {
        // not classified yet: derive the domain from the file extension and remember it
        domain = Classification.getContentDomain(this.getFileExtension());
        this.contentDomain = domain;
    }
    return domain;
}
public static MultiProtocolURI newURL(final String baseURL, final String relPath) throws MalformedURLException {
if ((baseURL == null) ||
isHTTP(relPath) ||

@ -38,6 +38,7 @@ import java.util.SortedSet;
import java.util.TreeMap;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.Classification.ContentDomain;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.document.language.Identificator;
@ -112,10 +113,10 @@ public final class Condenser {
this.RESULT_FLAGS = new Bitfield(4);
// construct flag set for document
if (!document.getImages().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasimage, true);
if (!document.getAudiolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasaudio, true);
if (!document.getVideolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasvideo, true);
if (!document.getApplinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasapp, true);
if (document.dc_source().getContentDomain() == ContentDomain.IMAGE || !document.getImages().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasimage, true);
if (document.dc_source().getContentDomain() == ContentDomain.AUDIO || !document.getAudiolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasaudio, true);
if (document.dc_source().getContentDomain() == ContentDomain.VIDEO || !document.getVideolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasvideo, true);
if (document.dc_source().getContentDomain() == ContentDomain.APP || !document.getApplinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasapp, true);
if (document.lat() != 0.0f && document.lon() != 0.0f) this.RESULT_FLAGS.set(flag_cat_haslocation, true);
this.languageIdentificator = new Identificator();

@ -833,5 +833,22 @@ dc_rights
return result;
}
/**
 * Collects the audio links of all given documents into a single map.
 * Entries are merged in array order, so for a URL that occurs in several
 * documents the value of the last document wins.
 *
 * @param documents the documents whose {@code audiolinks} maps are merged
 * @return a new map containing the audio links of all documents
 */
public static Map<MultiProtocolURI, String> getAudiolinks(final Document[] documents) {
    final Map<MultiProtocolURI, String> merged = new HashMap<MultiProtocolURI, String>();
    for (final Document doc : documents) {
        merged.putAll(doc.audiolinks);
    }
    return merged;
}
/**
 * Collects the video links of all given documents into a single map.
 * Entries are merged in array order, so for a URL that occurs in several
 * documents the value of the last document wins.
 *
 * @param documents the documents whose {@code videolinks} maps are merged
 * @return a new map containing the video links of all documents
 */
public static Map<MultiProtocolURI, String> getVideolinks(final Document[] documents) {
    final Map<MultiProtocolURI, String> merged = new HashMap<MultiProtocolURI, String>();
    for (final Document doc : documents) {
        merged.putAll(doc.videolinks);
    }
    return merged;
}
/**
 * Collects the application links of all given documents into a single map.
 * Entries are merged in array order, so for a URL that occurs in several
 * documents the value of the last document wins.
 *
 * @param documents the documents whose {@code applinks} maps are merged
 * @return a new map containing the application links of all documents
 */
public static Map<MultiProtocolURI, String> getApplinks(final Document[] documents) {
    final Map<MultiProtocolURI, String> merged = new HashMap<MultiProtocolURI, String>();
    for (final Document doc : documents) {
        merged.putAll(doc.applinks);
    }
    return merged;
}
}

@ -589,6 +589,7 @@ public final class Protocol
final String language,
final String sitehash,
final String authorhash,
final String contentdom,
final int count,
final long time,
final int maxDistance,
@ -634,6 +635,7 @@ public final class Protocol
language,
sitehash,
authorhash,
contentdom,
count,
time,
maxDistance,
@ -893,6 +895,7 @@ public final class Protocol
final String language,
final String sitehash,
final String authorhash,
final String contentdom,
final int count,
final long time,
final int maxDistance,
@ -945,6 +948,7 @@ public final class Protocol
parts.put("language", UTF8.StringBody(language));
parts.put("sitehash", UTF8.StringBody(sitehash));
parts.put("authorhash", UTF8.StringBody(authorhash));
parts.put("contentdom", UTF8.StringBody(contentdom));
parts.put("ttl", UTF8.StringBody("0"));
parts.put("maxdist", UTF8.StringBody(Integer.toString(maxDistance)));
parts.put("profile", UTF8.StringBody(crypt.simpleEncode(rankingProfile.toExternalString())));
@ -1516,6 +1520,7 @@ public final class Protocol
"", // language,
"", // sitehash,
"", // authorhash,
"all", // contentdom,
10, // count,
3000, // time,
1000, // maxDistance,

@ -46,7 +46,7 @@ public class RemoteSearch extends Thread {
private static final ThreadGroup ysThreadGroup = new ThreadGroup("yacySearchThreadGroup");
final private String wordhashes, excludehashes, urlhashes, sitehash, authorhash;
final private String wordhashes, excludehashes, urlhashes, sitehash, authorhash, contentdom;
final private boolean global;
final private int partitions;
final private Segment indexSegment;
@ -72,7 +72,7 @@ public class RemoteSearch extends Thread {
final Pattern snippet,
final QueryParams.Modifier modifier,
final String language,
final String sitehash, final String authorhash,
final String sitehash, final String authorhash, final String contentdom,
final int count, final long time, final int maxDistance,
final boolean global, final int partitions,
final Seed targetPeer,
@ -96,6 +96,7 @@ public class RemoteSearch extends Thread {
this.language = language;
this.sitehash = sitehash;
this.authorhash = authorhash;
this.contentdom = contentdom;
this.global = global;
this.partitions = partitions;
this.indexSegment = indexSegment;
@ -120,7 +121,7 @@ public class RemoteSearch extends Thread {
this.peers.mySeed(),
this.wordhashes, this.excludehashes, this.urlhashes,
this.prefer, this.filter, this.snippet, this.modifier.getModifier(),
this.language, this.sitehash, this.authorhash,
this.language, this.sitehash, this.authorhash, this.contentdom,
this.count, this.time, this.maxDistance, this.global, this.partitions,
this.targetPeer, this.indexSegment, this.containerCache, this.secondarySearchSuperviser,
this.blacklist, this.rankingProfile, this.constraint);
@ -166,6 +167,7 @@ public class RemoteSearch extends Thread {
final String language,
final String sitehash,
final String authorhash,
final String contentdom,
final int count, final long time, final int maxDist,
final Segment indexSegment,
final SeedDB peers,
@ -200,7 +202,7 @@ public class RemoteSearch extends Thread {
try {
RemoteSearch rs = new RemoteSearch(
wordhashes, excludehashes, "", prefer, filter, snippet, modifier,
language, sitehash, authorhash,
language, sitehash, authorhash, contentdom,
count, time, maxDist, true, targets, targetPeers[i],
indexSegment, peers, containerCache, secondarySearchSuperviser, blacklist, rankingProfile, constraint);
rs.start();
@ -233,7 +235,7 @@ public class RemoteSearch extends Thread {
if (targetPeer == null || targetPeer.hash == null) return null;
if (clusterselection != null) targetPeer.setAlternativeAddress(clusterselection.get(ASCII.getBytes(targetPeer.hash)));
final RemoteSearch searchThread = new RemoteSearch(
wordhashes, "", urlhashes, QueryParams.matchnothing_pattern, QueryParams.catchall_pattern, QueryParams.catchall_pattern, new QueryParams.Modifier(""), "", "", "", 20, time, 9999, true, 0, targetPeer,
wordhashes, "", urlhashes, QueryParams.matchnothing_pattern, QueryParams.catchall_pattern, QueryParams.catchall_pattern, new QueryParams.Modifier(""), "", "", "", "all", 20, time, 9999, true, 0, targetPeer,
indexSegment, peers, containerCache, null, blacklist, rankingProfile, constraint);
searchThread.start();
return searchThread;

@ -2355,8 +2355,11 @@ public final class Switchboard extends serverSwitch
// get the hyperlinks
final Map<MultiProtocolURI, String> hl = Document.getHyperlinks(documents);
// add all images also to the crawl stack
// add all media links also to the crawl stack. They will be re-sorted to the NOLOAD queue and indexed afterwards as pure links
hl.putAll(Document.getImagelinks(documents));
hl.putAll(Document.getApplinks(documents));
hl.putAll(Document.getVideolinks(documents));
hl.putAll(Document.getAudiolinks(documents));
// insert those hyperlinks to the crawler
MultiProtocolURI nextUrl;

@ -289,7 +289,7 @@ public final class RWIProcess extends Thread
}
// check document domain
if ( this.query.contentdom != Classification.ContentDomain.TEXT ) {
if ( this.query.contentdom != Classification.ContentDomain.ALL ) {
if ( (this.query.contentdom == ContentDomain.AUDIO)
&& (!(iEntry.flags().get(Condenser.flag_cat_hasaudio))) ) {
continue pollloop;
@ -588,6 +588,12 @@ public final class RWIProcess extends Thread
continue; // rare case where the url is corrupted
}
// check content domain
if (this.query.contentdom != Classification.ContentDomain.ALL && page.url().getContentDomain() != this.query.contentdom) {
this.sortout++;
continue;
}
final String pageurl = page.url().toNormalform(true, true);
final String pageauthor = page.dc_creator();
final String pagetitle = page.dc_title().toLowerCase();
@ -620,15 +626,6 @@ public final class RWIProcess extends Thread
continue;
}
// check content domain
if ( (this.query.contentdom == ContentDomain.AUDIO && page.laudio() == 0)
|| (this.query.contentdom == ContentDomain.VIDEO && page.lvideo() == 0)
|| (this.query.contentdom == ContentDomain.IMAGE && page.limage() == 0)
|| (this.query.contentdom == ContentDomain.APP && page.lapp() == 0) ) {
this.sortout++;
continue;
}
// check vocabulary constraint
final String tags = page.dc_subject();
final String[] taglist = tags == null || tags.length() == 0 ? new String[0] : SPACE_PATTERN.split(page.dc_subject());

@ -176,6 +176,7 @@ public final class SearchEvent
SearchEvent.this.query.targetlang == null ? "" : SearchEvent.this.query.targetlang,
SearchEvent.this.query.sitehash == null ? "" : SearchEvent.this.query.sitehash,
SearchEvent.this.query.authorhash == null ? "" : SearchEvent.this.query.authorhash,
SearchEvent.this.query.contentdom == null ? "all" : SearchEvent.this.query.contentdom.toString(),
remote_maxcount,
remote_maxtime,
SearchEvent.this.query.maxDistance,

@ -580,7 +580,7 @@ public class SnippetProcess {
}
// load snippet
if (this.query.contentdom == Classification.ContentDomain.TEXT) {
if (page.url().getContentDomain() == Classification.ContentDomain.TEXT) {
// attach text snippet
startTime = System.currentTimeMillis();
final TextSnippet snippet = new TextSnippet(
@ -612,7 +612,7 @@ public class SnippetProcess {
Log.logInfo("SEARCH", "sorted out url " + page.url().toNormalform(true, false) + " during search: " + reason);
return null;
}
} else {
} else if (page.url().getContentDomain() == Classification.ContentDomain.IMAGE) {
// attach media information
startTime = System.currentTimeMillis();
final List<MediaSnippet> mediaSnippets = MediaSnippet.retrieveMediaSnippets(page.url(), this.snippetFetchWordHashes, this.query.contentdom, cacheStrategy, 6000, !this.query.isLocal());
@ -633,6 +633,8 @@ public class SnippetProcess {
Log.logInfo("SEARCH", "sorted out url " + page.url().toNormalform(true, false) + " during search: " + reason);
return null;
}
} else {
return new ResultEntry(page, this.query.getSegment(), this.peers, null, null, dbRetrievalTime, 0); // result without snippet
}
// finished, no more actions possible here
}

Loading…
Cancel
Save