From 0a37d8af898760dbe6dac4cb712ec7e9edc365c7 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Sat, 5 Sep 2015 14:07:23 +0200 Subject: [PATCH 1/6] in case that a site crawl is started for urls with file:// path, the host filter does not work because there is no host given in such urls. In that case, patch the filter to be a sub-path filter. --- htroot/Crawler_p.java | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index d8fbcd50d..763b47305 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -219,8 +219,8 @@ public class Crawler_p { String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL_STRING); String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING); if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL_STRING; // avoid that all urls are filtered out if bad value was submitted - final boolean fullDomain = "domain".equals(post.get("range", "wide")); // special property in simple crawl start - final boolean subPath = "subpath".equals(post.get("range", "wide")); // special property in simple crawl start + boolean fullDomain = "domain".equals(post.get("range", "wide")); // special property in simple crawl start + boolean subPath = "subpath".equals(post.get("range", "wide")); // special property in simple crawl start final boolean restrictedcrawl = fullDomain || subPath || !CrawlProfile.MATCH_ALL_STRING.equals(newcrawlingMustMatch); final boolean deleteage = restrictedcrawl && "age".equals(post.get("deleteold","off")); @@ -261,6 +261,10 @@ public class Crawler_p { if (p >= 8) crawlName = crawlName.substring(0, p); } if (crawlName.length() == 0 && sitemapURLStr.length() > 0) crawlName = "sitemap loader for " + sitemapURLStr; + // in case that a root url has a file protocol, then the site filter does not work, patch that: + if (fullDomain) { + for (DigestURL u: rootURLs) if (u.isFile()) 
{fullDomain = false; subPath = true; break;} + } // delete old robots entries for (DigestURL ru : rootURLs) { From c40c302748cfe76aba8687f052edb622f2ace025 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Sat, 5 Sep 2015 14:12:17 +0200 Subject: [PATCH 2/6] when many crawl queues are generated, this NPE can occur; probably caused as concurrency issue: W 2015/09/05 14:09:10 ConcurrentLog java.lang.NullPointerException java.lang.NullPointerException at java.util.TreeMap.rotateRight(TreeMap.java:2239) at java.util.TreeMap.fixAfterInsertion(TreeMap.java:2271) at java.util.TreeMap.put(TreeMap.java:582) at net.yacy.kelondro.table.Table.(Table.java:235) at net.yacy.crawler.HostQueue.openStack(HostQueue.java:229) at net.yacy.crawler.HostQueue.getStack(HostQueue.java:204) at net.yacy.crawler.HostQueue.push(HostQueue.java:397) at net.yacy.crawler.HostBalancer.push(HostBalancer.java:237) at net.yacy.crawler.data.NoticedURL.push(NoticedURL.java:184) at net.yacy.crawler.CrawlStacker.stackCrawl(CrawlStacker.java:355) at net.yacy.crawler.CrawlStacker.job(CrawlStacker.java:134) at sun.reflect.GeneratedMethodAccessor6.invoke(Unknown Source) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:483) at net.yacy.kelondro.workflow.InstantBlockingThread.job(InstantBlockingThread.java:101) at net.yacy.kelondro.workflow.AbstractBlockingThread.run(AbstractBlockingThread.java:82) at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745) --- source/net/yacy/kelondro/table/Table.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/net/yacy/kelondro/table/Table.java 
b/source/net/yacy/kelondro/table/Table.java index 212bd99b9..f7f77ac0b 100644 --- a/source/net/yacy/kelondro/table/Table.java +++ b/source/net/yacy/kelondro/table/Table.java @@ -232,7 +232,7 @@ public class Table implements Index, Iterable { } // track this table - tableTracker.put(tablefile.toString(), this); + synchronized (tableTracker) {tableTracker.put(tablefile.toString(), this);} } public synchronized void warmUp() { From 87688969753d258880fa10681868320b900fc9bc Mon Sep 17 00:00:00 2001 From: reger Date: Sun, 6 Sep 2015 00:04:54 +0200 Subject: [PATCH 3/6] extract lastmodified from openoffice doc set lastmod date in office document parsers --- .../net/yacy/document/parser/docParser.java | 38 +++++++++---------- .../net/yacy/document/parser/odtParser.java | 4 +- .../net/yacy/document/parser/ooxmlParser.java | 4 +- .../net/yacy/document/parser/pptParser.java | 38 +++++++++---------- .../document/parser/xml/ODMetaHandler.java | 27 ++++++++++++- 5 files changed, 70 insertions(+), 41 deletions(-) diff --git a/source/net/yacy/document/parser/docParser.java b/source/net/yacy/document/parser/docParser.java index a33844382..f6a9af827 100644 --- a/source/net/yacy/document/parser/docParser.java +++ b/source/net/yacy/document/parser/docParser.java @@ -29,7 +29,6 @@ package net.yacy.document.parser; import java.io.InputStream; import java.util.ArrayList; -import java.util.Date; import java.util.List; import net.yacy.cora.document.id.AnchorURL; @@ -110,24 +109,25 @@ public class docParser extends AbstractParser implements Parser { Document[] docs; docs = new Document[]{new Document( - location, - mimeType, - "UTF-8", - this, - null, - keywlist, - singleList(title), - extractor.getSummaryInformation().getAuthor(), // constuctor can handle null - extractor.getDocSummaryInformation().getCompany(), // publisher - null, - descriptions, - 0.0f, 0.0f, - contents.toString(), - null, - null, - null, - false, - new Date())}; + location, + mimeType, + "UTF-8", + this, + null, + 
keywlist, + singleList(title), + extractor.getSummaryInformation().getAuthor(), // constuctor can handle null + extractor.getDocSummaryInformation().getCompany(), // publisher + null, + descriptions, + 0.0f, 0.0f, + contents.toString(), + null, + null, + null, + false, + extractor.getSummaryInformation().getLastSaveDateTime() // maybe null + )}; return docs; } diff --git a/source/net/yacy/document/parser/odtParser.java b/source/net/yacy/document/parser/odtParser.java index 2f574f0c0..859f308fe 100644 --- a/source/net/yacy/document/parser/odtParser.java +++ b/source/net/yacy/document/parser/odtParser.java @@ -120,6 +120,7 @@ public class odtParser extends AbstractParser implements Parser { String docLongTitle = null; String docAuthor = null; String docLanguage = null; + Date docModified = null; // opening the file as zip file final ZipFile zipFile = new ZipFile(dest); @@ -160,6 +161,7 @@ public class odtParser extends AbstractParser implements Parser { docLongTitle = metaData.getSubject(); docAuthor = metaData.getCreator(); docLanguage = metaData.getLanguage(); + docModified = metaData.getLastModified(); // maybe null } } @@ -201,7 +203,7 @@ public class odtParser extends AbstractParser implements Parser { null, null, false, - new Date() + docModified )}; return docs; } catch (final Exception e) { diff --git a/source/net/yacy/document/parser/ooxmlParser.java b/source/net/yacy/document/parser/ooxmlParser.java index 9072938f4..0da5b725b 100644 --- a/source/net/yacy/document/parser/ooxmlParser.java +++ b/source/net/yacy/document/parser/ooxmlParser.java @@ -102,6 +102,7 @@ public class ooxmlParser extends AbstractParser implements Parser { String docLongTitle = null; String docAuthor = null; String docLanguage = null; + Date docModified = null; // opening the file as zip file final ZipFile zipFile= new ZipFile(dest); @@ -145,6 +146,7 @@ public class ooxmlParser extends AbstractParser implements Parser { docLongTitle = metaData.getSubject(); docAuthor = 
metaData.getCreator(); docLanguage = metaData.getLanguage(); + docModified = metaData.getLastModified(); } } @@ -185,7 +187,7 @@ public class ooxmlParser extends AbstractParser implements Parser { null, null, false, - new Date())}; + docModified)}; return docs; } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; diff --git a/source/net/yacy/document/parser/pptParser.java b/source/net/yacy/document/parser/pptParser.java index f05cf8dec..b41ff3eac 100644 --- a/source/net/yacy/document/parser/pptParser.java +++ b/source/net/yacy/document/parser/pptParser.java @@ -30,7 +30,6 @@ package net.yacy.document.parser; import java.io.BufferedInputStream; import java.io.InputStream; import java.util.ArrayList; -import java.util.Date; import java.util.List; import net.yacy.cora.document.id.AnchorURL; @@ -103,24 +102,25 @@ public class pptParser extends AbstractParser implements Parser { * and set shortText and bodyText properly */ final Document[] docs = new Document[]{new Document( - location, - mimeType, - "UTF-8", - this, - null, - keywlist, - singleList(title), - pptExtractor.getSummaryInformation().getAuthor(), // may be null - pptExtractor.getDocSummaryInformation().getCompany(), - null, - descriptions, - 0.0f, 0.0f, - contents, - null, - null, - null, - false, - new Date())}; + location, + mimeType, + "UTF-8", + this, + null, + keywlist, + singleList(title), + pptExtractor.getSummaryInformation().getAuthor(), // may be null + pptExtractor.getDocSummaryInformation().getCompany(), + null, + descriptions, + 0.0f, 0.0f, + contents, + null, + null, + null, + false, + pptExtractor.getSummaryInformation().getLastSaveDateTime() // may be null + )}; return docs; } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; diff --git a/source/net/yacy/document/parser/xml/ODMetaHandler.java b/source/net/yacy/document/parser/xml/ODMetaHandler.java index b068548c4..8a9a6bee8 100644 --- 
a/source/net/yacy/document/parser/xml/ODMetaHandler.java +++ b/source/net/yacy/document/parser/xml/ODMetaHandler.java @@ -26,6 +26,9 @@ package net.yacy.document.parser.xml; +import java.text.ParseException; +import java.util.Date; +import net.yacy.cora.date.ISO8601Formatter; import org.xml.sax.Attributes; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; @@ -39,6 +42,7 @@ public class ODMetaHandler extends DefaultHandler { private String docSubject = null; private String docTitle = null; private String docDescription = null; + private String docLastmodified = null; public ODMetaHandler() { } @@ -67,7 +71,9 @@ public class ODMetaHandler extends DefaultHandler { this.docTitle = buffer.toString(); } else if ("dc:description".equals(tag)) { this.docDescription = buffer.toString(); - } + } else if ("dcterms:modified".equals(tag) || "dc:date".equals(tag)) { // Microsoft uses <dcterms:modified>, OpenOffice <dc:date> + this.docLastmodified = buffer.toString(); + } } public String getCreator() { @@ -89,5 +95,24 @@ public class ODMetaHandler extends DefaultHandler { public String getDescription() { return docDescription; } + + /** + * get the last modification date of the document + * + * @return date or null + */ + public Date getLastModified() { + Date d; + if (docLastmodified != null && !docLastmodified.isEmpty()) { + try { + d = ISO8601Formatter.FORMATTER.parse(this.docLastmodified, 0).getTime(); + } catch (ParseException ex) { + d = null; + } + } else { + d = null; + } + return d; + } } From 41c4eade51e67fc38ef8b38f641def79377dcb95 Mon Sep 17 00:00:00 2001 From: reger Date: Sun, 6 Sep 2015 04:28:27 +0200 Subject: [PATCH 4/6] extract modification date from vCard (vcfParser) --- source/net/yacy/document/parser/vcfParser.java | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/source/net/yacy/document/parser/vcfParser.java b/source/net/yacy/document/parser/vcfParser.java index f4c4120e2..0676153f5 100644 --- 
a/source/net/yacy/document/parser/vcfParser.java +++ b/source/net/yacy/document/parser/vcfParser.java @@ -32,6 +32,7 @@ import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.net.MalformedURLException; +import java.text.ParseException; import java.util.ArrayList; import java.util.Date; import java.util.HashMap; @@ -39,6 +40,7 @@ import java.util.Iterator; import java.util.LinkedList; import java.util.List; +import net.yacy.cora.date.ISO8601Formatter; import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.order.Base64Order; @@ -81,6 +83,7 @@ public class vcfParser extends AbstractParser implements Parser { final HashMap parsedData = new HashMap(); final List anchors = new ArrayList(); final LinkedList parsedNames = new LinkedList(); + Date revDate = null; // rev=modified date boolean useLastLine = false; int lineNr = 0; @@ -183,6 +186,13 @@ public class vcfParser extends AbstractParser implements Parser { parsedDataText.append(value).append("\r\n"); } parsedDataText.append("\r\n"); + // get specific meta data from parsed key-value + value = parsedData.get("REV"); // modified date + if (value != null && !value.isEmpty()) { + try { + revDate = ISO8601Formatter.FORMATTER.parse(value, 0).getTime(); + } catch(ParseException ex){ } + } parsedData.clear(); } else if (key.toUpperCase().startsWith("URL")) { try { @@ -235,7 +245,7 @@ public class vcfParser extends AbstractParser implements Parser { null, null, // a treeset of image URLs false, - new Date())}; + revDate)}; // modified date } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e; From e37a4f0b3da605e68b5256b244ea1aa5bf198a68 Mon Sep 17 00:00:00 2001 From: reger Date: Sun, 6 Sep 2015 22:19:05 +0200 Subject: [PATCH 5/6] prevent metadata records in index w/o valid url by throwing MalformedURL exception on 
URIMetadataNode creation --- .../AbstractFederateSearchConnector.java | 6 ++-- .../federate/SolrFederateSearchConnector.java | 7 ++-- .../kelondro/data/meta/URIMetadataNode.java | 33 ++++--------------- source/net/yacy/peers/Protocol.java | 8 ++++- source/net/yacy/search/Switchboard.java | 10 ++++-- 5 files changed, 29 insertions(+), 35 deletions(-) diff --git a/source/net/yacy/cora/federate/AbstractFederateSearchConnector.java b/source/net/yacy/cora/federate/AbstractFederateSearchConnector.java index b9e7c297a..932b128ca 100644 --- a/source/net/yacy/cora/federate/AbstractFederateSearchConnector.java +++ b/source/net/yacy/cora/federate/AbstractFederateSearchConnector.java @@ -21,6 +21,7 @@ package net.yacy.cora.federate; import java.io.File; import java.io.IOException; +import java.net.MalformedURLException; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; @@ -143,7 +144,7 @@ abstract public class AbstractFederateSearchConnector implements FederateSearchC * @param remote result (with remote fieldnames) * @return SolrDocument with field names according to the YaCy schema */ - protected URIMetadataNode toYaCySchema(final SolrDocument doc) { + protected URIMetadataNode toYaCySchema(final SolrDocument doc) throws MalformedURLException { // set YaCy id String urlstr; if (localcfg.contains("sku")) { @@ -156,7 +157,8 @@ abstract public class AbstractFederateSearchConnector implements FederateSearchC } } - URIMetadataNode newdoc = new URIMetadataNode(urlstr); + final DigestURL url = new DigestURL(urlstr); + URIMetadataNode newdoc = new URIMetadataNode(url); Iterator it = localcfg.entryIterator(); while (it.hasNext()) { Configuration.Entry et = it.next(); diff --git a/source/net/yacy/cora/federate/SolrFederateSearchConnector.java b/source/net/yacy/cora/federate/SolrFederateSearchConnector.java index 7e9fceaaa..1a134a0fb 100644 --- a/source/net/yacy/cora/federate/SolrFederateSearchConnector.java +++ 
b/source/net/yacy/cora/federate/SolrFederateSearchConnector.java @@ -20,6 +20,7 @@ package net.yacy.cora.federate; import java.io.IOException; +import java.net.MalformedURLException; import java.util.ArrayList; import java.util.Collection; import java.util.List; @@ -101,8 +102,10 @@ public class SolrFederateSearchConnector extends AbstractFederateSearchConnector SolrDocumentList docList = solrConnector.getDocumentListByParams(msp); // convert to YaCy schema documentlist for (SolrDocument doc : docList) { - URIMetadataNode anew = toYaCySchema(doc); - docs.add(anew); + try { + URIMetadataNode anew = toYaCySchema(doc); + docs.add(anew); + } catch (MalformedURLException ex) { } } } catch (IOException | SolrException e) { } finally { diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java index 511e571ee..928ae92d2 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java @@ -90,18 +90,13 @@ public class URIMetadataNode extends SolrDocument /* implements Comparable Date: Mon, 7 Sep 2015 02:36:22 +0200 Subject: [PATCH 6/6] improve filtering by filetype navigator. The used url-filter for filetype doesn't require ".ext" resulting in too many matches, add a sort-out filter for RWI results. --- source/net/yacy/search/query/QueryParams.java | 2 +- source/net/yacy/search/query/SearchEvent.java | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/source/net/yacy/search/query/QueryParams.java b/source/net/yacy/search/query/QueryParams.java index 74b2697ee..fea69f626 100644 --- a/source/net/yacy/search/query/QueryParams.java +++ b/source/net/yacy/search/query/QueryParams.java @@ -205,7 +205,7 @@ public final class QueryParams { String protocolfilter = modifier.protocol == null ? ".*" : modifier.protocol; String defaulthostprefix = modifier.protocol == null ? 
"www" : modifier.protocol; String hostfilter = modifier.sitehost == null && tld == null ? ".*" : modifier.sitehost == null ? ".*\\." + tld : modifier.sitehost.startsWith(defaulthostprefix + ".") ? "(" + defaulthostprefix + "\\.)?" + modifier.sitehost.substring(4) : "(" + defaulthostprefix + "\\.)?" + modifier.sitehost; - String filefilter = modifier.filetype == null ? ".*" : ".*" + modifier.filetype + ".*"; + String filefilter = modifier.filetype == null ? ".*" : ".*" + modifier.filetype + ".*"; // TODO: should be ".ext" but while/comment above suggests not -> add filetype constraint in pullOneFilteredFromRWI() String filter = protocolfilter + "..." + hostfilter + "." + filefilter; if (!filter.equals(".*....*..*")) { Pattern r = Pattern.compile("(\\.|(\\.\\*))\\.\\*"); diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index b2df72d37..ab2ab70b1 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -1159,6 +1159,13 @@ public final class SearchEvent { continue; } + // check modifier constraint filetype (using file extension) + if (this.query.modifier.filetype != null && !this.query.modifier.filetype.equals(ext)) { + if (log.isFine()) log.fine("dropped RWI: file type constraint = " + this.query.modifier.filetype); + if (page.word().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet(); + continue; + } + // check modifier constraint (language) // TODO: : page.language() never null but defaults to "en" (may cause false drop of result) if (this.query.modifier.language != null && !this.query.modifier.language.equals(page.language())) {