diff --git a/htroot/js/yacyinteractive.js b/htroot/js/yacyinteractive.js index f7d979175..71bcde91d 100644 --- a/htroot/js/yacyinteractive.js +++ b/htroot/js/yacyinteractive.js @@ -36,7 +36,7 @@ function search(search, count, offset) { } else if (window.ActiveXObject) { // IE self.xmlHttpReq = new ActiveXObject("Microsoft.XMLHTTP"); } - self.xmlHttpReq.open('GET', "yacysearch.json?verify=false&resource=local&nav=all&maximumRecords=" + maximumRecords + "&startRecord=" + startRecord + "&query=" + query, true); + self.xmlHttpReq.open('GET', "yacysearch.json?verify=false&resource=local&nav=all&contentdom=all&maximumRecords=" + maximumRecords + "&startRecord=" + startRecord + "&query=" + query, true); self.xmlHttpReq.setRequestHeader('Content-Type', 'application/x-www-form-urlencoded'); self.xmlHttpReq.onreadystatechange = function() { if (self.xmlHttpReq.readyState == 4) { diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index bb6265536..da33bc4ff 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -119,7 +119,7 @@ public final class search { final int maxdist= post.getInt("maxdist", Integer.MAX_VALUE); final String prefer = post.get("prefer", ""); final String modifier = post.get("modifier", "").trim(); - final String contentdom = post.get("contentdom", "text"); + final String contentdom = post.get("contentdom", "all"); final String filter = post.get("filter", ".*"); // a filter on the url final Pattern snippetPattern = Pattern.compile(post.get("snippet", ".*")); // a filter on the snippet String sitehash = post.get("sitehash", ""); if (sitehash.length() == 0) sitehash = null; diff --git a/htroot/yacyinteractive.html b/htroot/yacyinteractive.html index f24120475..f0309e03c 100644 --- a/htroot/yacyinteractive.html +++ b/htroot/yacyinteractive.html @@ -56,6 +56,9 @@ To see a list of all APIs, please visit the + + + #(allowrealtime)# :: diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index c4ff6c123..8cb0468ba 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -272,7 +272,7 @@ public class yacysearch { // find search domain final Classification.ContentDomain contentdom = - ContentDomain.contentdomParser(post == null ? "text" : post.get("contentdom", "text")); + ContentDomain.contentdomParser(post == null ? "all" : post.get("contentdom", "all")); // patch until better search profiles are available if ( contentdom == ContentDomain.TEXT ) { diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java index 93577644a..f57ab8be5 100644 --- a/htroot/yacysearchitem.java +++ b/htroot/yacysearchitem.java @@ -105,7 +105,7 @@ public class yacysearchitem { prop.put("navurlBase", QueryParams.navurlBase("html", theQuery, null, theQuery.urlMask.toString(), theQuery.navigators).toString()); final String target_special_pattern = sb.getConfig(SwitchboardConstants.SEARCH_TARGET_SPECIAL_PATTERN, ""); - if (theQuery.contentdom == Classification.ContentDomain.TEXT) { + if (theQuery.contentdom == Classification.ContentDomain.TEXT || theQuery.contentdom == Classification.ContentDomain.ALL) { // text search // generate result object diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java index 3c3c53cbc..6c5e96080 100644 --- a/source/de/anomic/crawler/CrawlStacker.java +++ b/source/de/anomic/crawler/CrawlStacker.java @@ -39,6 +39,7 @@ import java.util.Properties; import java.util.concurrent.BlockingQueue; import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.Classification.ContentDomain; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.UTF8; import net.yacy.cora.protocol.Domains; @@ -353,9 +354,11 @@ public final class CrawlStacker { // check availability of parser and maxfilesize String warning = null; - if (entry.size() > maxFileSize /*|| - (entry.url().getFileExtension().length() > 0 && TextParser.supports(entry.url(), null) != null) - */) { + if (entry.size() > maxFileSize || + entry.url().getContentDomain() == ContentDomain.APP || + entry.url().getContentDomain() == ContentDomain.IMAGE || + entry.url().getContentDomain() == ContentDomain.AUDIO || + entry.url().getContentDomain() == ContentDomain.VIDEO ) { warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry); //if (warning != null) this.log.logWarning("CrawlStacker.stackCrawl of URL " + entry.url().toNormalform(true, false) + " - not pushed: " + warning); return null; diff --git a/source/net/yacy/cora/document/Classification.java b/source/net/yacy/cora/document/Classification.java index c70f4b1e0..f0caad698 100644 --- a/source/net/yacy/cora/document/Classification.java +++ b/source/net/yacy/cora/document/Classification.java @@ -24,11 +24,13 @@ import java.io.BufferedInputStream; import java.io.File; import java.io.FileInputStream; import java.util.HashSet; +import java.util.Map.Entry; import java.util.Properties; import java.util.Set; public class Classification { + private static final Set textExtSet = new HashSet(); private static final Set mediaExtSet = new HashSet(); private static final Set imageExtSet = new HashSet(); private static final Set audioExtSet = new HashSet(); @@ -78,11 +80,13 @@ public class Classification { static { - final String apps = "7z,ace,arc,arj,apk,asf,asx,bat,bin,bkf,bz2,cab,com,css,dcm,deb,dll,dmg,exe,gho,ghs,gz,hqx,img,iso,jar,lha,rar,sh,sit,sitx,tar,tbz,tgz,tib,torrent,vbs,war,zip"; + final String text = "htm,html,phtml,shtml,xhtml,php,php3,php4,php5,cfm,asp,aspx,tex,txt,jsp,mf,asp,aspx,csv,gpx,vcf,xsl,xml,pdf,doc,docx,xls,xlsx,ppt,pptx"; + final String apps = "7z,ace,arc,arj,apk,asf,asx,bat,bin,bkf,bz2,cab,com,css,dcm,deb,dll,dmg,exe,java,gho,ghs,gz,hqx,img,iso,jar,lha,rar,sh,sit,sitx,tar,tbz,tgz,tib,torrent,vbs,war,zip"; final String audio = "aac,aif,aiff,flac,m4a,m4p,mid,mp2,mp3,oga,ogg,ram,sid,wav,wma"; final String video = "3g2,3gp,3gp2,3gpp,3gpp2,3ivx,asf,asx,avi,div,divx,dv,dvx,env,f4v,flv,hdmov,m1v,m4v,m-jpeg,moov,mov,movie,mp2v,mp4,mpe,mpeg,mpg,mpg4,mv4,ogm,ogv,qt,rm,rv,vid,swf,wmv"; final String image = "ai,bmp,cdr,cmx,emf,eps,gif,img,jpeg,jpg,mng,pct,pdd,pdn,pict,png,psb,psd,psp,tif,tiff,wmf"; + addSet(textExtSet, text); // image formats addSet(imageExtSet, image); // image formats addSet(audioExtSet, audio); // audio formats addSet(videoExtSet, video); // video formats @@ -95,6 +99,11 @@ public class Classification { for (String s: extString.split(",")) set.add(s.toLowerCase().trim()); } + public static boolean isTextExtension(String textExt) { + if (textExt == null) return false; + return textExtSet.contains(textExt.trim().toLowerCase()); + } + public static boolean isMediaExtension(String mediaExt) { if (mediaExt == null) return false; return mediaExtSet.contains(mediaExt.trim().toLowerCase()); @@ -120,12 +129,20 @@ public class Classification { return appsExtSet.contains(appsExt.trim().toLowerCase()); } + public static ContentDomain getContentDomain(String ext) { + if (isTextExtension(ext)) return ContentDomain.TEXT; + if (isImageExtension(ext)) return ContentDomain.IMAGE; + if (isAudioExtension(ext)) return ContentDomain.AUDIO; + if (isVideoExtension(ext)) return ContentDomain.VIDEO; + if (isApplicationExtension(ext)) return ContentDomain.APP; + return ContentDomain.ALL; + } + public static boolean isPictureMime(final String mimeType) { if (mimeType == null) return false; return mimeType.toUpperCase().startsWith("IMAGE"); } - private static final Properties mimeTable = new Properties(); public static void init(final File mimeFile) { @@ -140,6 +157,14 @@ public class Classification { if (mimeTableInputStream != null) try { mimeTableInputStream.close(); } catch (final Exception e1) {} } } + for (Entry entry: mimeTable.entrySet()) { + String ext = (String) entry.getKey(); + String mime = (String) entry.getValue(); + if (mime.startsWith("text/")) textExtSet.add(ext.toLowerCase()); + if (mime.startsWith("audio/")) audioExtSet.add(ext.toLowerCase()); + if (mime.startsWith("video/")) videoExtSet.add(ext.toLowerCase()); + if (mime.startsWith("application/")) appsExtSet.add(ext.toLowerCase()); + } } public static int countMimes() { diff --git a/source/net/yacy/cora/document/MultiProtocolURI.java b/source/net/yacy/cora/document/MultiProtocolURI.java index cc6d67d24..f0c79bb89 100644 --- a/source/net/yacy/cora/document/MultiProtocolURI.java +++ b/source/net/yacy/cora/document/MultiProtocolURI.java @@ -45,6 +45,7 @@ import java.util.regex.Pattern; import jcifs.smb.SmbException; import jcifs.smb.SmbFile; import jcifs.smb.SmbFileInputStream; +import net.yacy.cora.document.Classification.ContentDomain; import net.yacy.cora.document.Punycode.PunycodeException; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.TimeoutRequest; @@ -89,6 +90,7 @@ public class MultiProtocolURI implements Serializable, Comparable getAudiolinks(final Document[] documents) { + final Map result = new HashMap(); + for (final Document d: documents) result.putAll(d.audiolinks); + return result; + } + + public static Map getVideolinks(final Document[] documents) { + final Map result = new HashMap(); + for (final Document d: documents) result.putAll(d.videolinks); + return result; + } + + public static Map getApplinks(final Document[] documents) { + final Map result = new HashMap(); + for (final Document d: documents) result.putAll(d.applinks); + return result; + } } diff --git a/source/net/yacy/peers/Protocol.java b/source/net/yacy/peers/Protocol.java index 6de5f4a00..c74b77f34 100644 --- a/source/net/yacy/peers/Protocol.java +++ b/source/net/yacy/peers/Protocol.java @@ -589,6 +589,7 @@ public final class Protocol final String language, final String sitehash, final String authorhash, + final String contentdom, final int count, final long time, final int maxDistance, @@ -634,6 +635,7 @@ public final class Protocol language, sitehash, authorhash, + contentdom, count, time, maxDistance, @@ -893,6 +895,7 @@ public final class Protocol final String language, final String sitehash, final String authorhash, + final String contentdom, final int count, final long time, final int maxDistance, @@ -945,6 +948,7 @@ public final class Protocol parts.put("language", UTF8.StringBody(language)); parts.put("sitehash", UTF8.StringBody(sitehash)); parts.put("authorhash", UTF8.StringBody(authorhash)); + parts.put("contentdom", UTF8.StringBody(contentdom)); parts.put("ttl", UTF8.StringBody("0")); parts.put("maxdist", UTF8.StringBody(Integer.toString(maxDistance))); parts.put("profile", UTF8.StringBody(crypt.simpleEncode(rankingProfile.toExternalString()))); @@ -1516,6 +1520,7 @@ public final class Protocol "", // language, "", // sitehash, "", // authorhash, + "all", // contentdom, 10, // count, 3000, // time, 1000, // maxDistance, diff --git a/source/net/yacy/peers/RemoteSearch.java b/source/net/yacy/peers/RemoteSearch.java index cfacb3f4c..0db60280c 100644 --- a/source/net/yacy/peers/RemoteSearch.java +++ b/source/net/yacy/peers/RemoteSearch.java @@ -46,7 +46,7 @@ public class RemoteSearch extends Thread { private static final ThreadGroup ysThreadGroup = new ThreadGroup("yacySearchThreadGroup"); - final private String wordhashes, excludehashes, urlhashes, sitehash, authorhash; + final private String wordhashes, excludehashes, urlhashes, sitehash, authorhash, contentdom; final private boolean global; final private int partitions; final private Segment indexSegment; @@ -72,7 +72,7 @@ public class RemoteSearch extends Thread { final Pattern snippet, final QueryParams.Modifier modifier, final String language, - final String sitehash, final String authorhash, + final String sitehash, final String authorhash, final String contentdom, final int count, final long time, final int maxDistance, final boolean global, final int partitions, final Seed targetPeer, @@ -96,6 +96,7 @@ public class RemoteSearch extends Thread { this.language = language; this.sitehash = sitehash; this.authorhash = authorhash; + this.contentdom = contentdom; this.global = global; this.partitions = partitions; this.indexSegment = indexSegment; @@ -120,7 +121,7 @@ public class RemoteSearch extends Thread { this.peers.mySeed(), this.wordhashes, this.excludehashes, this.urlhashes, this.prefer, this.filter, this.snippet, this.modifier.getModifier(), - this.language, this.sitehash, this.authorhash, + this.language, this.sitehash, this.authorhash, this.contentdom, this.count, this.time, this.maxDistance, this.global, this.partitions, this.targetPeer, this.indexSegment, this.containerCache, this.secondarySearchSuperviser, this.blacklist, this.rankingProfile, this.constraint); @@ -166,6 +167,7 @@ public class RemoteSearch extends Thread { final String language, final String sitehash, final String authorhash, + final String contentdom, final int count, final long time, final int maxDist, final Segment indexSegment, final SeedDB peers, @@ -200,7 +202,7 @@ public class RemoteSearch extends Thread { try { RemoteSearch rs = new RemoteSearch( wordhashes, excludehashes, "", prefer, filter, snippet, modifier, - language, sitehash, authorhash, + language, sitehash, authorhash, contentdom, count, time, maxDist, true, targets, targetPeers[i], indexSegment, peers, containerCache, secondarySearchSuperviser, blacklist, rankingProfile, constraint); rs.start(); @@ -233,7 +235,7 @@ public class RemoteSearch extends Thread { if (targetPeer == null || targetPeer.hash == null) return null; if (clusterselection != null) targetPeer.setAlternativeAddress(clusterselection.get(ASCII.getBytes(targetPeer.hash))); final RemoteSearch searchThread = new RemoteSearch( - wordhashes, "", urlhashes, QueryParams.matchnothing_pattern, QueryParams.catchall_pattern, QueryParams.catchall_pattern, new QueryParams.Modifier(""), "", "", "", 20, time, 9999, true, 0, targetPeer, + wordhashes, "", urlhashes, QueryParams.matchnothing_pattern, QueryParams.catchall_pattern, QueryParams.catchall_pattern, new QueryParams.Modifier(""), "", "", "", "all", 20, time, 9999, true, 0, targetPeer, indexSegment, peers, containerCache, null, blacklist, rankingProfile, constraint); searchThread.start(); return searchThread; diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 291c83cbc..ce4252d9b 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -2355,8 +2355,11 @@ public final class Switchboard extends serverSwitch // get the hyperlinks final Map hl = Document.getHyperlinks(documents); - // add all images also to the crawl stack + // add all media links also to the crawl stack. They will be re-sorted to the NOLOAD queue and indexed afterwards as pure links hl.putAll(Document.getImagelinks(documents)); + hl.putAll(Document.getApplinks(documents)); + hl.putAll(Document.getVideolinks(documents)); + hl.putAll(Document.getAudiolinks(documents)); // insert those hyperlinks to the crawler MultiProtocolURI nextUrl; diff --git a/source/net/yacy/search/query/RWIProcess.java b/source/net/yacy/search/query/RWIProcess.java index dce9fcf8e..648bdedc2 100644 --- a/source/net/yacy/search/query/RWIProcess.java +++ b/source/net/yacy/search/query/RWIProcess.java @@ -289,7 +289,7 @@ public final class RWIProcess extends Thread } // check document domain - if ( this.query.contentdom != Classification.ContentDomain.TEXT ) { + if ( this.query.contentdom != Classification.ContentDomain.ALL ) { if ( (this.query.contentdom == ContentDomain.AUDIO) && (!(iEntry.flags().get(Condenser.flag_cat_hasaudio))) ) { continue pollloop; @@ -588,6 +588,12 @@ public final class RWIProcess extends Thread continue; // rare case where the url is corrupted } + // check content domain + if (this.query.contentdom != Classification.ContentDomain.ALL && page.url().getContentDomain() != this.query.contentdom) { + this.sortout++; + continue; + } + final String pageurl = page.url().toNormalform(true, true); final String pageauthor = page.dc_creator(); final String pagetitle = page.dc_title().toLowerCase(); @@ -620,15 +626,6 @@ public final class RWIProcess extends Thread continue; } - // check content domain - if ( (this.query.contentdom == ContentDomain.AUDIO && page.laudio() == 0) - || (this.query.contentdom == ContentDomain.VIDEO && page.lvideo() == 0) - || (this.query.contentdom == ContentDomain.IMAGE && page.limage() == 0) - || (this.query.contentdom == ContentDomain.APP && page.lapp() == 0) ) { - this.sortout++; - continue; - } - // check vocabulary constraint final String tags = page.dc_subject(); final String[] taglist = tags == null || tags.length() == 0 ? new String[0] : SPACE_PATTERN.split(page.dc_subject()); diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index 5e2887812..e49f072e8 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -176,6 +176,7 @@ public final class SearchEvent SearchEvent.this.query.targetlang == null ? "" : SearchEvent.this.query.targetlang, SearchEvent.this.query.sitehash == null ? "" : SearchEvent.this.query.sitehash, SearchEvent.this.query.authorhash == null ? "" : SearchEvent.this.query.authorhash, + SearchEvent.this.query.contentdom == null ? "all" : SearchEvent.this.query.contentdom.toString(), remote_maxcount, remote_maxtime, SearchEvent.this.query.maxDistance, diff --git a/source/net/yacy/search/query/SnippetProcess.java b/source/net/yacy/search/query/SnippetProcess.java index 745b113b2..554f0cca6 100644 --- a/source/net/yacy/search/query/SnippetProcess.java +++ b/source/net/yacy/search/query/SnippetProcess.java @@ -580,7 +580,7 @@ public class SnippetProcess { } // load snippet - if (this.query.contentdom == Classification.ContentDomain.TEXT) { + if (page.url().getContentDomain() == Classification.ContentDomain.TEXT) { // attach text snippet startTime = System.currentTimeMillis(); final TextSnippet snippet = new TextSnippet( @@ -612,7 +612,7 @@ public class SnippetProcess { Log.logInfo("SEARCH", "sorted out url " + page.url().toNormalform(true, false) + " during search: " + reason); return null; } - } else { + } else if (page.url().getContentDomain() == Classification.ContentDomain.IMAGE) { // attach media information startTime = System.currentTimeMillis(); final List mediaSnippets = MediaSnippet.retrieveMediaSnippets(page.url(), this.snippetFetchWordHashes, this.query.contentdom, cacheStrategy, 6000, !this.query.isLocal()); @@ -633,6 +633,8 @@ public class SnippetProcess { Log.logInfo("SEARCH", "sorted out url " + page.url().toNormalform(true, false) + " during search: " + reason); return null; } + } else { + return new ResultEntry(page, this.query.getSegment(), this.peers, null, null, dbRetrievalTime, 0); // result without snippet } // finished, no more actions possible here }