diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index d8fbcd50d..763b47305 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -219,8 +219,8 @@ public class Crawler_p { String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL_STRING); String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING); if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL_STRING; // avoid that all urls are filtered out if bad value was submitted - final boolean fullDomain = "domain".equals(post.get("range", "wide")); // special property in simple crawl start - final boolean subPath = "subpath".equals(post.get("range", "wide")); // special property in simple crawl start + boolean fullDomain = "domain".equals(post.get("range", "wide")); // special property in simple crawl start + boolean subPath = "subpath".equals(post.get("range", "wide")); // special property in simple crawl start final boolean restrictedcrawl = fullDomain || subPath || !CrawlProfile.MATCH_ALL_STRING.equals(newcrawlingMustMatch); final boolean deleteage = restrictedcrawl && "age".equals(post.get("deleteold","off")); @@ -261,6 +261,10 @@ public class Crawler_p { if (p >= 8) crawlName = crawlName.substring(0, p); } if (crawlName.length() == 0 && sitemapURLStr.length() > 0) crawlName = "sitemap loader for " + sitemapURLStr; + // in case that a root url has a file protocol, then the site filter does not work, patch that: + if (fullDomain) { + for (DigestURL u: rootURLs) if (u.isFile()) {fullDomain = false; subPath = true; break;} + } // delete old robots entries for (DigestURL ru : rootURLs) { diff --git a/source/net/yacy/cora/federate/AbstractFederateSearchConnector.java b/source/net/yacy/cora/federate/AbstractFederateSearchConnector.java index b9e7c297a..932b128ca 100644 --- a/source/net/yacy/cora/federate/AbstractFederateSearchConnector.java +++ b/source/net/yacy/cora/federate/AbstractFederateSearchConnector.java @@ 
-21,6 +21,7 @@ package net.yacy.cora.federate; import java.io.File; import java.io.IOException; +import java.net.MalformedURLException; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; @@ -143,7 +144,7 @@ abstract public class AbstractFederateSearchConnector implements FederateSearchC * @param remote result (with remote fieldnames) * @return SolrDocument with field names according to the YaCy schema */ - protected URIMetadataNode toYaCySchema(final SolrDocument doc) { + protected URIMetadataNode toYaCySchema(final SolrDocument doc) throws MalformedURLException { // set YaCy id String urlstr; if (localcfg.contains("sku")) { @@ -156,7 +157,8 @@ abstract public class AbstractFederateSearchConnector implements FederateSearchC } } - URIMetadataNode newdoc = new URIMetadataNode(urlstr); + final DigestURL url = new DigestURL(urlstr); + URIMetadataNode newdoc = new URIMetadataNode(url); Iterator it = localcfg.entryIterator(); while (it.hasNext()) { Configuration.Entry et = it.next(); diff --git a/source/net/yacy/cora/federate/SolrFederateSearchConnector.java b/source/net/yacy/cora/federate/SolrFederateSearchConnector.java index 7e9fceaaa..1a134a0fb 100644 --- a/source/net/yacy/cora/federate/SolrFederateSearchConnector.java +++ b/source/net/yacy/cora/federate/SolrFederateSearchConnector.java @@ -20,6 +20,7 @@ package net.yacy.cora.federate; import java.io.IOException; +import java.net.MalformedURLException; import java.util.ArrayList; import java.util.Collection; import java.util.List; @@ -101,8 +102,10 @@ public class SolrFederateSearchConnector extends AbstractFederateSearchConnector SolrDocumentList docList = solrConnector.getDocumentListByParams(msp); // convert to YaCy schema documentlist for (SolrDocument doc : docList) { - URIMetadataNode anew = toYaCySchema(doc); - docs.add(anew); + try { + URIMetadataNode anew = toYaCySchema(doc); + docs.add(anew); + } catch (MalformedURLException ex) { } } } catch (IOException | SolrException e) { 
} finally { diff --git a/source/net/yacy/document/parser/docParser.java b/source/net/yacy/document/parser/docParser.java index a33844382..f6a9af827 100644 --- a/source/net/yacy/document/parser/docParser.java +++ b/source/net/yacy/document/parser/docParser.java @@ -29,7 +29,6 @@ package net.yacy.document.parser; import java.io.InputStream; import java.util.ArrayList; -import java.util.Date; import java.util.List; import net.yacy.cora.document.id.AnchorURL; @@ -110,24 +109,25 @@ public class docParser extends AbstractParser implements Parser { Document[] docs; docs = new Document[]{new Document( - location, - mimeType, - "UTF-8", - this, - null, - keywlist, - singleList(title), - extractor.getSummaryInformation().getAuthor(), // constuctor can handle null - extractor.getDocSummaryInformation().getCompany(), // publisher - null, - descriptions, - 0.0f, 0.0f, - contents.toString(), - null, - null, - null, - false, - new Date())}; + location, + mimeType, + "UTF-8", + this, + null, + keywlist, + singleList(title), + extractor.getSummaryInformation().getAuthor(), // constuctor can handle null + extractor.getDocSummaryInformation().getCompany(), // publisher + null, + descriptions, + 0.0f, 0.0f, + contents.toString(), + null, + null, + null, + false, + extractor.getSummaryInformation().getLastSaveDateTime() // maybe null + )}; return docs; } diff --git a/source/net/yacy/document/parser/odtParser.java b/source/net/yacy/document/parser/odtParser.java index 2f574f0c0..859f308fe 100644 --- a/source/net/yacy/document/parser/odtParser.java +++ b/source/net/yacy/document/parser/odtParser.java @@ -120,6 +120,7 @@ public class odtParser extends AbstractParser implements Parser { String docLongTitle = null; String docAuthor = null; String docLanguage = null; + Date docModified = null; // opening the file as zip file final ZipFile zipFile = new ZipFile(dest); @@ -160,6 +161,7 @@ public class odtParser extends AbstractParser implements Parser { docLongTitle = metaData.getSubject(); 
docAuthor = metaData.getCreator(); docLanguage = metaData.getLanguage(); + docModified = metaData.getLastModified(); // maybe null } } @@ -201,7 +203,7 @@ public class odtParser extends AbstractParser implements Parser { null, null, false, - new Date() + docModified )}; return docs; } catch (final Exception e) { diff --git a/source/net/yacy/document/parser/ooxmlParser.java b/source/net/yacy/document/parser/ooxmlParser.java index 9072938f4..0da5b725b 100644 --- a/source/net/yacy/document/parser/ooxmlParser.java +++ b/source/net/yacy/document/parser/ooxmlParser.java @@ -102,6 +102,7 @@ public class ooxmlParser extends AbstractParser implements Parser { String docLongTitle = null; String docAuthor = null; String docLanguage = null; + Date docModified = null; // opening the file as zip file final ZipFile zipFile= new ZipFile(dest); @@ -145,6 +146,7 @@ public class ooxmlParser extends AbstractParser implements Parser { docLongTitle = metaData.getSubject(); docAuthor = metaData.getCreator(); docLanguage = metaData.getLanguage(); + docModified = metaData.getLastModified(); } } @@ -185,7 +187,7 @@ public class ooxmlParser extends AbstractParser implements Parser { null, null, false, - new Date())}; + docModified)}; return docs; } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; diff --git a/source/net/yacy/document/parser/pptParser.java b/source/net/yacy/document/parser/pptParser.java index f05cf8dec..b41ff3eac 100644 --- a/source/net/yacy/document/parser/pptParser.java +++ b/source/net/yacy/document/parser/pptParser.java @@ -30,7 +30,6 @@ package net.yacy.document.parser; import java.io.BufferedInputStream; import java.io.InputStream; import java.util.ArrayList; -import java.util.Date; import java.util.List; import net.yacy.cora.document.id.AnchorURL; @@ -103,24 +102,25 @@ public class pptParser extends AbstractParser implements Parser { * and set shortText and bodyText properly */ final Document[] docs = new 
Document[]{new Document( - location, - mimeType, - "UTF-8", - this, - null, - keywlist, - singleList(title), - pptExtractor.getSummaryInformation().getAuthor(), // may be null - pptExtractor.getDocSummaryInformation().getCompany(), - null, - descriptions, - 0.0f, 0.0f, - contents, - null, - null, - null, - false, - new Date())}; + location, + mimeType, + "UTF-8", + this, + null, + keywlist, + singleList(title), + pptExtractor.getSummaryInformation().getAuthor(), // may be null + pptExtractor.getDocSummaryInformation().getCompany(), + null, + descriptions, + 0.0f, 0.0f, + contents, + null, + null, + null, + false, + pptExtractor.getSummaryInformation().getLastSaveDateTime() // may be null + )}; return docs; } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; diff --git a/source/net/yacy/document/parser/vcfParser.java b/source/net/yacy/document/parser/vcfParser.java index f4c4120e2..0676153f5 100644 --- a/source/net/yacy/document/parser/vcfParser.java +++ b/source/net/yacy/document/parser/vcfParser.java @@ -32,6 +32,7 @@ import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.net.MalformedURLException; +import java.text.ParseException; import java.util.ArrayList; import java.util.Date; import java.util.HashMap; @@ -39,6 +40,7 @@ import java.util.Iterator; import java.util.LinkedList; import java.util.List; +import net.yacy.cora.date.ISO8601Formatter; import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.order.Base64Order; @@ -81,6 +83,7 @@ public class vcfParser extends AbstractParser implements Parser { final HashMap parsedData = new HashMap(); final List anchors = new ArrayList(); final LinkedList parsedNames = new LinkedList(); + Date revDate = null; // rev=modified date boolean useLastLine = false; int lineNr = 0; @@ -183,6 +186,13 @@ public class vcfParser extends AbstractParser implements Parser { 
parsedDataText.append(value).append("\r\n"); } parsedDataText.append("\r\n"); + // get specific meta data from parsed key-value + value = parsedData.get("REV"); // modified date + if (value != null && !value.isEmpty()) { + try { + revDate = ISO8601Formatter.FORMATTER.parse(value, 0).getTime(); + } catch(ParseException ex){ } + } parsedData.clear(); } else if (key.toUpperCase().startsWith("URL")) { try { @@ -235,7 +245,7 @@ public class vcfParser extends AbstractParser implements Parser { null, null, // a treeset of image URLs false, - new Date())}; + revDate)}; // modified date } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e; diff --git a/source/net/yacy/document/parser/xml/ODMetaHandler.java b/source/net/yacy/document/parser/xml/ODMetaHandler.java index b068548c4..8a9a6bee8 100644 --- a/source/net/yacy/document/parser/xml/ODMetaHandler.java +++ b/source/net/yacy/document/parser/xml/ODMetaHandler.java @@ -26,6 +26,9 @@ package net.yacy.document.parser.xml; +import java.text.ParseException; +import java.util.Date; +import net.yacy.cora.date.ISO8601Formatter; import org.xml.sax.Attributes; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; @@ -39,6 +42,7 @@ public class ODMetaHandler extends DefaultHandler { private String docSubject = null; private String docTitle = null; private String docDescription = null; + private String docLastmodified = null; public ODMetaHandler() { } @@ -67,7 +71,9 @@ public class ODMetaHandler extends DefaultHandler { this.docTitle = buffer.toString(); } else if ("dc:description".equals(tag)) { this.docDescription = buffer.toString(); - } + } else if ("dcterms:modified".equals(tag) || "dc:date".equals(tag)) { // Microsoft uses dcterms:modified, OpenOffice uses dc:date + this.docLastmodified = buffer.toString(); + } } public String getCreator() { @@ -89,5 +95,24 @@ public class ODMetaHandler extends DefaultHandler { public String
getDescription() { return docDescription; } + + /** + * get the last modification date of the document + * + * @return date or null + */ + public Date getLastModified() { + Date d; + if (docLastmodified != null && !docLastmodified.isEmpty()) { + try { + d = ISO8601Formatter.FORMATTER.parse(this.docLastmodified, 0).getTime(); + } catch (ParseException ex) { + d = null; + } + } else { + d = null; + } + return d; + } } diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java index 511e571ee..928ae92d2 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java @@ -90,18 +90,13 @@ public class URIMetadataNode extends SolrDocument /* implements Comparable { } // track this table - tableTracker.put(tablefile.toString(), this); + synchronized (tableTracker) {tableTracker.put(tablefile.toString(), this);} } public synchronized void warmUp() { diff --git a/source/net/yacy/peers/Protocol.java b/source/net/yacy/peers/Protocol.java index bc7463c90..59198b53f 100644 --- a/source/net/yacy/peers/Protocol.java +++ b/source/net/yacy/peers/Protocol.java @@ -48,6 +48,7 @@ import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStreamReader; import java.net.InetAddress; +import java.net.MalformedURLException; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; @@ -1089,7 +1090,12 @@ public final class Protocol { if ( doc == null ) { continue; } - URIMetadataNode urlEntry = new URIMetadataNode(doc); + URIMetadataNode urlEntry; + try { + urlEntry = new URIMetadataNode(doc); + } catch (MalformedURLException ex) { + continue; + } if ( blacklist.isListed(BlacklistType.SEARCH, urlEntry.url()) ) { if ( Network.log.isInfo() ) { diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index a61bc4ae1..ad269ff2c 100644 --- 
a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -165,6 +165,7 @@ import net.yacy.document.parser.pdfParser; import net.yacy.document.parser.html.Evaluation; import net.yacy.gui.Audio; import net.yacy.gui.Tray; +import net.yacy.http.YaCyHttpServer; import net.yacy.kelondro.blob.BEncodedHeap; import net.yacy.kelondro.blob.Tables; import net.yacy.kelondro.data.meta.URIMetadataNode; @@ -215,7 +216,6 @@ import net.yacy.visualization.CircleTool; import com.google.common.io.Files; -import net.yacy.http.YaCyHttpServer; public final class Switchboard extends serverSwitch { @@ -2998,8 +2998,12 @@ public final class Switchboard extends serverSwitch { final Seed initiatorPeer = this.peers.getConnected(queueEntry.initiator()); if ( initiatorPeer != null ) { // start a thread for receipt sending to avoid a blocking here - SolrDocument sd = this.index.fulltext().getDefaultConfiguration().toSolrDocument(newEntry); - new Thread(new receiptSending(initiatorPeer, new URIMetadataNode(sd)), "sending receipt to " + ASCII.String(queueEntry.initiator())).start(); + try { + SolrDocument sd = this.index.fulltext().getDefaultConfiguration().toSolrDocument(newEntry); + new Thread(new receiptSending(initiatorPeer, new URIMetadataNode(sd)), "sending receipt to " + ASCII.String(queueEntry.initiator())).start(); + } catch (MalformedURLException ex) { + this.log.info("malformed url: "+ex.getMessage()); + } } } } diff --git a/source/net/yacy/search/query/QueryParams.java b/source/net/yacy/search/query/QueryParams.java index 74b2697ee..fea69f626 100644 --- a/source/net/yacy/search/query/QueryParams.java +++ b/source/net/yacy/search/query/QueryParams.java @@ -205,7 +205,7 @@ public final class QueryParams { String protocolfilter = modifier.protocol == null ? ".*" : modifier.protocol; String defaulthostprefix = modifier.protocol == null ? "www" : modifier.protocol; String hostfilter = modifier.sitehost == null && tld == null ? 
".*" : modifier.sitehost == null ? ".*\\." + tld : modifier.sitehost.startsWith(defaulthostprefix + ".") ? "(" + defaulthostprefix + "\\.)?" + modifier.sitehost.substring(4) : "(" + defaulthostprefix + "\\.)?" + modifier.sitehost; - String filefilter = modifier.filetype == null ? ".*" : ".*" + modifier.filetype + ".*"; + String filefilter = modifier.filetype == null ? ".*" : ".*" + modifier.filetype + ".*"; // TODO: should be ".ext" but while/comment above suggests not -> add filetype contrain pullOneFilteredFromRWI() String filter = protocolfilter + "..." + hostfilter + "." + filefilter; if (!filter.equals(".*....*..*")) { Pattern r = Pattern.compile("(\\.|(\\.\\*))\\.\\*"); diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index b2df72d37..ab2ab70b1 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -1159,6 +1159,13 @@ public final class SearchEvent { continue; } + // check modifier constraint filetype (using fileextension) + if (this.query.modifier.filetype != null && !this.query.modifier.filetype.equals(ext)) { + if (log.isFine()) log.fine("dropped RWI: file type constraint = " + this.query.modifier.filetype); + if (page.word().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet(); + continue; + } + // check modifier constraint (language) // TODO: : page.language() never null but defaults to "en" (may cause false drop of result) if (this.query.modifier.language != null && !this.query.modifier.language.equals(page.language())) {