From 8e555d79a3ecc0a50be9c5f3e090b8e936b4320c Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Thu, 1 Oct 2015 13:03:22 +0200 Subject: [PATCH 1/6] also add 1-character tokens to the token list because they could also be searched for. A full-string search for a filename may fail if those 1-char tokens are omitted --- source/net/yacy/cora/document/id/MultiProtocolURL.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/net/yacy/cora/document/id/MultiProtocolURL.java b/source/net/yacy/cora/document/id/MultiProtocolURL.java index 1cf94289e..84298f3fd 100644 --- a/source/net/yacy/cora/document/id/MultiProtocolURL.java +++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java @@ -995,7 +995,7 @@ public class MultiProtocolURL implements Serializable, Comparable 1) sb.append(v).append(' '); + for (final String v: token) if (v.length() >= 1) sb.append(v).append(' '); return sb.length() == 0 ? "" : sb.substring(0, sb.length() - 1); } From c737ff235d240b5efef7d6a6681303c4b5a9e03e Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Thu, 1 Oct 2015 13:09:33 +0200 Subject: [PATCH 2/6] in case that the include_string contains several entries including 1-char tokens and also more-than-1-char tokens, then remove the 1-char tokens to prevent that we are too strict. This will make it possible to be a bit more fuzzy in the search where it is appropriate. --- source/net/yacy/search/query/QueryGoal.java | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/source/net/yacy/search/query/QueryGoal.java b/source/net/yacy/search/query/QueryGoal.java index c17b72591..70b551c4c 100644 --- a/source/net/yacy/search/query/QueryGoal.java +++ b/source/net/yacy/search/query/QueryGoal.java @@ -184,6 +184,17 @@ public class QueryGoal { } } } + // in case that the include_string contains several entries including 1-char tokens and also more-than-1-char tokens, + // then remove the 1-char tokens to prevent that we are to strict. 
This will make it possible to be a bit more fuzzy + // in the search where it is appropriate + boolean contains_single = false, contains_multiple = false; + for (String token: include_string) { + if (token.length() == 1) contains_single = true; else contains_multiple = true; + } + if (contains_single && contains_multiple) { + Iterator i = include_string.iterator(); + while (i.hasNext()) if (i.next().length() == 1) i.remove(); + } } /** From 225200194a19cbc4c8232dae71c0616ff2a5a1c5 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Thu, 1 Oct 2015 13:18:44 +0200 Subject: [PATCH 3/6] every time a crawl is started, the user expects a different search result behaviour. This requires that the search cache is flushed for each crawl start. TODO: this should also be done if a crawl is terminated. --- htroot/Crawler_p.java | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index 763b47305..7b5d705ba 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -66,6 +66,7 @@ import net.yacy.search.Switchboard; import net.yacy.search.SwitchboardConstants; import net.yacy.search.index.Fulltext; import net.yacy.search.index.Segment; +import net.yacy.search.query.SearchEventCache; import net.yacy.search.schema.CollectionSchema; import net.yacy.server.serverObjects; import net.yacy.server.serverSwitch; @@ -76,8 +77,14 @@ public class Crawler_p { // this servlet starts a web crawl. 
The interface for entering the web crawl parameters is in IndexCreate_p.html public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { + // return variable that accumulates replacements final Switchboard sb = (Switchboard) env; + + // clean up all search events + SearchEventCache.cleanupEvents(true); + sb.index.clearCaches(); // every time the ranking is changed we need to remove old orderings + // inital values for AJAX Elements (without JavaScript) final serverObjects prop = new serverObjects(); prop.put("rejected", 0); From 3d7dd9d3aaa219e656fa42cc3a0eeec0fad0db04 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Thu, 1 Oct 2015 13:21:28 +0200 Subject: [PATCH 4/6] follow-up to latest commit: also flush the search cache if all crawls had been terminated. --- source/net/yacy/search/Switchboard.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 3bd21d71e..1ecbffa5e 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -2492,6 +2492,10 @@ public final class Switchboard extends serverSwitch { } } if (allCrawlsFinished) { + // refresh the search cache + SearchEventCache.cleanupEvents(true); + sb.index.clearCaches(); // every time the ranking is changed we need to remove old orderings + if (postprocessing) { // run postprocessing on all profiles ReferenceReportCache rrCache = index.getReferenceReportCache(); From d5330391de90dfc98f926548b23c4c059cdd3b64 Mon Sep 17 00:00:00 2001 From: reger Date: Thu, 1 Oct 2015 23:11:58 +0200 Subject: [PATCH 5/6] remove some unused var allocation in parser --- source/net/yacy/document/content/DCEntry.java | 1 - .../net/yacy/document/parser/images/metadataImageParser.java | 2 +- source/net/yacy/document/parser/rssParser.java | 2 +- source/net/yacy/document/parser/sitemapParser.java | 2 +- 
source/net/yacy/document/parser/swfParser.java | 5 ++--- 5 files changed, 5 insertions(+), 7 deletions(-) diff --git a/source/net/yacy/document/content/DCEntry.java b/source/net/yacy/document/content/DCEntry.java index ded95a9d3..153900b4e 100644 --- a/source/net/yacy/document/content/DCEntry.java +++ b/source/net/yacy/document/content/DCEntry.java @@ -311,7 +311,6 @@ public class DCEntry extends MultiMapSolrParams { public double getLat() { String t = this.get("geo:lat"); - if (t == null) t = this.get("geo:lat"); t = stripCDATA(t); if (t == null) return 0.0d; return Double.parseDouble(t); diff --git a/source/net/yacy/document/parser/images/metadataImageParser.java b/source/net/yacy/document/parser/images/metadataImageParser.java index 04b20b948..d36a39cdd 100644 --- a/source/net/yacy/document/parser/images/metadataImageParser.java +++ b/source/net/yacy/document/parser/images/metadataImageParser.java @@ -169,7 +169,7 @@ public class metadataImageParser extends AbstractParser implements Parser { singleList(title), // title author == null ? 
"" : author, // author location.getHost(), // Publisher - new String[]{}, // sections + null, // sections descriptions, // description gpslon, gpslat, // location imgInfotxt.toString(), // content text diff --git a/source/net/yacy/document/parser/rssParser.java b/source/net/yacy/document/parser/rssParser.java index 7005e85fe..8f0952bfb 100644 --- a/source/net/yacy/document/parser/rssParser.java +++ b/source/net/yacy/document/parser/rssParser.java @@ -94,7 +94,7 @@ public class rssParser extends AbstractParser implements Parser { singleList(item.getTitle()), item.getAuthor(), item.getCopyright(), - new String[0], + null, item.getDescriptions(), item.getLon(), item.getLat(), diff --git a/source/net/yacy/document/parser/sitemapParser.java b/source/net/yacy/document/parser/sitemapParser.java index 11742179f..5297d9893 100644 --- a/source/net/yacy/document/parser/sitemapParser.java +++ b/source/net/yacy/document/parser/sitemapParser.java @@ -96,7 +96,7 @@ public class sitemapParser extends AbstractParser implements Parser { singleList(""), "", "", - new String[0], + null, new ArrayList(), 0.0f, 0.0f, null, diff --git a/source/net/yacy/document/parser/swfParser.java b/source/net/yacy/document/parser/swfParser.java index 502782b3b..154a85bf9 100644 --- a/source/net/yacy/document/parser/swfParser.java +++ b/source/net/yacy/document/parser/swfParser.java @@ -81,7 +81,6 @@ public class swfParser extends AbstractParser implements Parser { String url = null; String urlnr = null; final String linebreak = System.getProperty("line.separator"); - final String[] sections = null; final List abstrct = new ArrayList(); //TreeSet images = null; final List anchors = new ArrayList(); @@ -100,7 +99,7 @@ public class swfParser extends AbstractParser implements Parser { while ((urlStart = contents.indexOf("http://",urlEnd)) >= 0){ urlEnd = contents.indexOf(linebreak,urlStart); url = contents.substring(urlStart,urlEnd); - urlnr = Integer.toString(++urls).toString(); + urlnr = 
Integer.toString(++urls); AnchorURL u = new AnchorURL(url); u.setNameProperty(urlnr); anchors.add(u); @@ -122,7 +121,7 @@ public class swfParser extends AbstractParser implements Parser { replaceAll("\t"," ")), // title "", // TODO: AUTHOR "", - sections, // an array of section headlines + null, // an array of section headlines abstrct, // an abstract 0.0f, 0.0f, contents, // the parsed document text From 688f7b2a5c194124dc2a253be45a1af1b814bef2 Mon Sep 17 00:00:00 2001 From: reger Date: Fri, 2 Oct 2015 01:48:48 +0200 Subject: [PATCH 6/6] allow/display svg images in image results previews. svg is not supported by awt but by most browsers. Image content is delivered as received (without size adjustment) --- htroot/ViewImage.java | 5 ++++- htroot/yacysearchitem.java | 5 ++--- source/net/yacy/cora/document/analysis/Classification.java | 4 ++-- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/htroot/ViewImage.java b/htroot/ViewImage.java index 8d80c1fac..932c86b43 100644 --- a/htroot/ViewImage.java +++ b/htroot/ViewImage.java @@ -136,8 +136,11 @@ public class ViewImage { // gif images are not loaded because of an animated gif bug within jvm which sends java into an endless loop with high CPU if (ext.equals("gif") && "gif".equals(MultiProtocolURL.getFileExtension(url.getFileName()))) { return new ByteArrayInputStream(resourceb); + } else if (ext.equals("svg") && "svg".equals(MultiProtocolURL.getFileExtension(url.getFileName()))) { + // svg images not supported by awt, but by most browser, deliver just content (without crop/scale) + return new ByteArrayInputStream(resourceb); } - + // read image image = ImageParser.parse(urlString, resourceb); if (image == null) { diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java index 7e2aaa8c5..ec50e3cc9 100644 --- a/htroot/yacysearchitem.java +++ b/htroot/yacysearchitem.java @@ -308,16 +308,15 @@ public class yacysearchitem { // image search; shows thumbnails prop.put("content", 
theSearch.query.contentdom.getCode() + 1); // switch on specific content - SearchEvent.ImageResult image = null; try { - image = theSearch.oneImageResult(item, timeout); + SearchEvent.ImageResult image = theSearch.oneImageResult(item, timeout); final String imageUrlstring = image.imageUrl.toNormalform(true); final String imageUrlExt = MultiProtocolURL.getFileExtension(image.imageUrl.getFileName()); final String target = sb.getConfig(imageUrlstring.matches(target_special_pattern) ? SwitchboardConstants.SEARCH_TARGET_SPECIAL : SwitchboardConstants.SEARCH_TARGET_DEFAULT, "_self"); final String license = URLLicense.aquireLicense(image.imageUrl); // this is just the license key to get the image forwarded through the YaCy thumbnail viewer, not an actual lawful license //sb.loader.loadIfNotExistBackground(image.imageUrl, 1024 * 1024 * 10, null, ClientIdentification.yacyIntranetCrawlerAgent); - prop.putHTML("content_item_hrefCache", "/ViewImage." + ("gif".equals(imageUrlExt) ? "gif" : "png") + "?maxwidth=128&maxheight=128&code="+license+"&isStatic=true&quadratic=&url=" + imageUrlstring); + prop.putHTML("content_item_hrefCache", "ViewImage." + ("gif.png.svg".contains(imageUrlExt) ? 
imageUrlExt : "png") + "?maxwidth=128&maxheight=128&code="+license+"&isStatic=true&quadratic=&url=" + imageUrlstring); prop.putHTML("content_item_href", imageUrlstring); prop.putHTML("content_item_target", target); prop.put("content_item_code", license); diff --git a/source/net/yacy/cora/document/analysis/Classification.java b/source/net/yacy/cora/document/analysis/Classification.java index 52d68711e..a09519b5d 100644 --- a/source/net/yacy/cora/document/analysis/Classification.java +++ b/source/net/yacy/cora/document/analysis/Classification.java @@ -91,10 +91,10 @@ public class Classification { final String apps = "7z,ace,arc,arj,apk,asf,asx,bat,bin,bkf,bz2,cab,com,css,dcm,deb,dll,dmg,exe,java,gho,ghs,gz,hqx,img,iso,jar,lha,rar,sh,sit,sitx,tar,tbz,tgz,tib,torrent,vbs,war,zip"; final String audio = "aac,aif,aiff,flac,m4a,m4p,mid,mp2,mp3,oga,ogg,ram,sid,wav,wma"; final String video = "3g2,3gp,3gp2,3gpp,3gpp2,3ivx,asf,asx,avi,div,divx,dv,dvx,env,f4v,flv,hdmov,m1v,m4v,m-jpeg,mkv,moov,mov,movie,mp2v,mp4,mpe,mpeg,mpg,mpg4,mv4,ogm,ogv,qt,rm,rv,vid,swf,webm,wmv"; - final String image = "ai,bmp,cdr,cmx,emf,eps,gif,img,jpeg,jpg,mng,pct,pdd,pdn,pict,png,psb,psd,psp,tif,tiff,wmf"; + final String image = "ai,bmp,cdr,cmx,emf,eps,gif,img,jpeg,jpg,mng,pct,pdd,pdn,pict,png,psb,psd,psp,svg,tif,tiff,wmf"; final String ctrl = "sha1,md5,crc32,sfv"; - addSet(textExtSet, text); // image formats + addSet(textExtSet, text); // text formats addSet(imageExtSet, image); // image formats addSet(audioExtSet, audio); // audio formats addSet(videoExtSet, video); // video formats