From fed26f33a85658d176f9a38227316a7f5b34fa0a Mon Sep 17 00:00:00 2001
From: Michael Peter Christen
Date: Wed, 15 Apr 2015 13:17:23 +0200
Subject: [PATCH] enhanced timezone management for indexed data:

To support the new time parser and search functions in YaCy, high-precision detection of the date and time of day is necessary. This requires detecting both the time zone of the document content and the time zone of the user doing a search. The time zone of the search request is determined automatically using the browser's time zone offset, which is delivered with the search request automatically and invisibly to the user. The time zone for the content of web pages cannot be detected automatically and must therefore be an attribute of crawl starts. The advanced crawl start now provides an input field to set the time zone as an offset number in minutes. All parsers must be passed a time zone offset, which required a change of the parser Java API. Many other changes correct the previously wrong handling of dates in YaCy, which added a correction based on the time zone of the server. Now no correction is added and all dates in YaCy are in the UTC/GMT time zone, a normalized time zone for all peers.
---
 htroot/CrawlStartExpert.html | 13 +++-
 htroot/CrawlStartSite.html | 1 +
 htroot/Crawler_p.java | 9 ++-
 htroot/HostBrowser.java | 3 +-
 htroot/IndexControlRWIs_p.java | 3 +-
 htroot/NetworkHistory.java | 2 +-
 htroot/QuickCrawlLink_p.java | 9 ++-
 htroot/api/bookmarks/posts/get.java | 2 +-
 htroot/api/push_p.java | 3 +-
 htroot/api/timeline_p.java | 4 +-
 htroot/index.html | 1 +
 htroot/rct_p.java | 3 +-
 htroot/yacy/search.java | 5 +-
 htroot/yacy/transferURL.java | 2 +-
 htroot/yacysearch.html | 1 +
 htroot/yacysearch.java | 6 +-
 htroot/yacysearchtrailer.java | 4 +-
 .../net/yacy/cora/date/AbstractFormatter.java | 12 +++-
 source/net/yacy/cora/date/DateFormatter.java | 3 +-
 .../net/yacy/cora/date/GenericFormatter.java | 67 +++++++++----------
 .../net/yacy/cora/date/ISO8601Formatter.java | 15 ++---
 .../yacy/cora/document/feed/RSSMessage.java | 2 +-
 .../cora/federate/FederateSearchManager.java | 3 +-
 source/net/yacy/crawler/CrawlStacker.java | 35 +++++++---
 source/net/yacy/crawler/CrawlSwitchboard.java | 27 +++++---
 .../net/yacy/crawler/data/CrawlProfile.java | 18 ++++-
 source/net/yacy/crawler/data/CrawlQueues.java | 3 +-
 source/net/yacy/crawler/data/Snapshots.java | 4 +-
 .../net/yacy/crawler/retrieval/Request.java | 14 +++-
 .../net/yacy/crawler/retrieval/Response.java | 29 ++++----
 .../crawler/retrieval/SitemapImporter.java | 3 +-
 source/net/yacy/data/BlogBoard.java | 4 +-
 source/net/yacy/data/BookmarkHelper.java | 4 +-
 .../net/yacy/data/ymark/YMarkAutoTagger.java | 2 +-
 .../net/yacy/data/ymark/YMarkCrawlStart.java | 5 +-
 source/net/yacy/document/Condenser.java | 5 +-
 source/net/yacy/document/DateDetection.java | 8 +--
 source/net/yacy/document/Parser.java | 1 +
 source/net/yacy/document/TextParser.java | 17 +++--
 source/net/yacy/document/content/DCEntry.java | 2 +-
 .../document/importer/MediawikiImporter.java | 2 +-
 .../document/importer/ResumptionToken.java | 2 +-
 .../net/yacy/document/parser/apkParser.java | 8 ++-
 .../yacy/document/parser/audioTagParser.java | 9 ++-
 .../parser/augment/AugmentParser.java | 12 +++-
 .../net/yacy/document/parser/bzipParser.java | 11 ++-
 .../net/yacy/document/parser/csvParser.java | 8 ++-
 .../net/yacy/document/parser/docParser.java | 9 ++-
 .../net/yacy/document/parser/dwgParser.java | 8 ++-
 .../yacy/document/parser/genericParser.java | 9 ++-
 .../net/yacy/document/parser/gzipParser.java | 10 ++-
 .../document/parser/html/ContentScraper.java | 24 ++++---
 .../parser/html/ScraperInputStream.java | 5 +-
 .../net/yacy/document/parser/htmlParser.java | 17 +++--
 .../parser/images/genericImageParser.java | 14 ++--
 .../parser/images/metadataImageParser.java | 10 +--
 .../document/parser/linkScraperParser.java | 11 ++-
 source/net/yacy/document/parser/mmParser.java | 9 ++-
 .../net/yacy/document/parser/odtParser.java | 8 ++-
 .../net/yacy/document/parser/ooxmlParser.java | 8 ++-
 .../net/yacy/document/parser/pdfParser.java | 10 ++-
 .../net/yacy/document/parser/pptParser.java | 9 ++-
 source/net/yacy/document/parser/psParser.java | 9 ++-
 .../net/yacy/document/parser/rdfParser.java | 11 ++-
 .../document/parser/rdfa/impl/RDFaParser.java | 24 +++++--
 .../net/yacy/document/parser/rssParser.java | 11 ++-
 .../net/yacy/document/parser/rtfParser.java | 9 ++-
 .../yacy/document/parser/sevenzipParser.java | 40 ++++++++---
 .../yacy/document/parser/sidAudioParser.java | 9 ++-
 .../yacy/document/parser/sitemapParser.java | 15 +++--
 .../net/yacy/document/parser/swfParser.java | 9 ++-
 .../net/yacy/document/parser/tarParser.java | 14 ++--
 .../yacy/document/parser/torrentParser.java | 12 +++-
 .../net/yacy/document/parser/vcfParser.java | 14 ++--
 .../net/yacy/document/parser/vsdParser.java | 8 ++-
 .../net/yacy/document/parser/xlsParser.java | 9 ++-
 .../net/yacy/document/parser/zipParser.java | 15 +++--
 source/net/yacy/http/ProxyCacheHandler.java | 3 +-
 source/net/yacy/http/ProxyHandler.java | 3 +-
 .../yacy/http/servlets/SolrSelectServlet.java | 2 +-
 source/net/yacy/kelondro/blob/ArrayStack.java | 8 +--
 .../yacy/kelondro/blob/BEncodedHeapBag.java | 4 +-
 source/net/yacy/kelondro/blob/Tables.java | 2 +-
 .../kelondro/data/meta/URIMetadataNode.java | 6 +-
 .../net/yacy/kelondro/table/SplitTable.java | 4 +-
 source/net/yacy/peers/NewsDB.java | 32 +++++++--
 source/net/yacy/peers/Seed.java | 4 +-
 .../peers/graphics/WebStructureGraph.java | 2 +-
 .../net/yacy/repository/LoaderDispatcher.java | 25 ++++---
 source/net/yacy/search/EventTracker.java | 4 +-
 source/net/yacy/search/Switchboard.java | 31 ++++++---
 .../net/yacy/search/index/DocumentIndex.java | 23 +++++--
 source/net/yacy/search/index/Segment.java | 2 +-
 .../net/yacy/search/query/AccessTracker.java | 6 +-
 .../net/yacy/search/query/QueryModifier.java | 22 +++---
 source/net/yacy/search/query/QueryParams.java | 12 ++--
 .../yacy/server/http/HTTPDProxyHandler.java | 7 +-
 97 files changed, 659 insertions(+), 319 deletions(-)

diff --git a/htroot/CrawlStartExpert.html b/htroot/CrawlStartExpert.html
index a68125815..90b83ef59 100644
--- a/htroot/CrawlStartExpert.html
+++ b/htroot/CrawlStartExpert.html
@@ -513,7 +513,7 @@
- Index Administration + Index Attributes
Indexing
@@ -561,6 +561,17 @@
+
+
+ info The time zone is required when the parser detects a date in the crawled web page. Content can be searched with the on: - modifier, which
+ also requires a time zone when a query is made. To normalize all given dates, the date is stored in the UTC time zone. To get the right offset
+ from dates without time zones to UTC, this offset must be given here. The offset is given in minutes;
+ time zone offsets for locations east of UTC must be negative; offsets for zones west of UTC must be positive.
+
+
+
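The sign convention above matches what the browser reports: JavaScript's Date.getTimezoneOffset() returns the minutes that must be added to local time to reach UTC, so zones east of UTC yield negative values. A minimal standalone sketch (an illustration only, not part of the patch; the class name TimezoneOffsetDemo is invented) showing which value a user would enter for the zone a server runs in:

import java.util.Calendar;

public class TimezoneOffsetDemo {
    public static void main(String[] args) {
        // ZONE_OFFSET + DST_OFFSET is the offset of the local time zone to UTC in milliseconds
        final Calendar cal = Calendar.getInstance();
        final int offsetMillis = cal.get(Calendar.ZONE_OFFSET) + cal.get(Calendar.DST_OFFSET);
        // negate and convert to minutes to match the form convention:
        // UTC+2 (east) yields -120, UTC-5 (west) yields +300
        final int timezoneOffset = -offsetMillis / (60 * 1000);
        System.out.println("timezoneOffset: " + timezoneOffset);
    }
}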
diff --git a/htroot/CrawlStartSite.html b/htroot/CrawlStartSite.html index dddbc4ff2..8127e7770 100644 --- a/htroot/CrawlStartSite.html +++ b/htroot/CrawlStartSite.html @@ -91,6 +91,7 @@ + diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index 9a6e786de..8b0e39801 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -470,6 +470,8 @@ public class Crawler_p { } } + int timezoneOffset = post.getInt("timezoneOffset", 0); + // prepare a new crawling profile final CrawlProfile profile; byte[] handle; @@ -502,7 +504,8 @@ public class Crawler_p { cachePolicy, collection, agentName, - new VocabularyScraper(vocabulary_scraper)); + new VocabularyScraper(vocabulary_scraper), + timezoneOffset); handle = ASCII.getBytes(profile.handle()); // before we fire up a new crawl, we make sure that another crawl with the same name is not running @@ -585,7 +588,7 @@ public class Crawler_p { try { // check if the crawl filter works correctly Pattern.compile(newcrawlingMustMatch); - final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new VocabularyScraper()); + final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new VocabularyScraper(), timezoneOffset); final Writer writer = new TransformerWriter(null, null, scraper, null, false); if (crawlingFile != null && crawlingFile.exists()) { FileUtils.copy(new FileInputStream(crawlingFile), writer); @@ -605,7 +608,7 @@ public class Crawler_p { } sb.crawler.putActive(handle, profile); - sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks); + sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks, profile.timezoneOffset()); } catch (final PatternSyntaxException e) { prop.put("info", "4"); // crawlfilter does not match url prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch); diff --git a/htroot/HostBrowser.java b/htroot/HostBrowser.java index 61629bb50..bb63d90a3 100644 --- a/htroot/HostBrowser.java +++ b/htroot/HostBrowser.java @@ -161,7 +161,8 @@ public class HostBrowser { sb.peers.mySeed().hash.getBytes(), url, null, load, new Date(), sb.crawler.defaultProxyProfile.handle(), - 0 + 0, + sb.crawler.defaultProxyProfile.timezoneOffset() )); prop.putHTML("result", reasonString == null ? 
("added url to indexer: " + load) : ("not indexed url '" + load + "': " + reasonString)); if (wait) waitloop: for (int i = 0; i < 30; i++) { diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java index 98c8c317b..a7e13a0b8 100644 --- a/htroot/IndexControlRWIs_p.java +++ b/htroot/IndexControlRWIs_p.java @@ -637,11 +637,12 @@ public class IndexControlRWIs_p { final QueryGoal qg = new QueryGoal(queryhashes, null); final QueryParams query = new QueryParams( qg, - new QueryModifier(), + new QueryModifier(0), Integer.MAX_VALUE, "", ContentDomain.ALL, "", //lang + 0, //timezoneOffset null, CacheStrategy.IFFRESH, 1000, 0, //count, offset diff --git a/htroot/NetworkHistory.java b/htroot/NetworkHistory.java index ef7c329df..cc723ef89 100644 --- a/htroot/NetworkHistory.java +++ b/htroot/NetworkHistory.java @@ -74,7 +74,7 @@ public class NetworkHistory { while (rowi.hasNext()) { Row row = rowi.next(); String d = ASCII.String(row.getPK()); - Date date = GenericFormatter.SHORT_MINUTE_FORMATTER.parse(d); + Date date = GenericFormatter.SHORT_MINUTE_FORMATTER.parse(d, 0).getTime(); if (date.getTime() < timelimit) break; statrow = new HashMap<>(); for (String key: columns) { diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java index 106b10151..2b0b599b8 100644 --- a/htroot/QuickCrawlLink_p.java +++ b/htroot/QuickCrawlLink_p.java @@ -128,7 +128,8 @@ public class QuickCrawlLink_p { final byte[] urlhash = crawlingStartURL.hash(); indexSegment.fulltext().remove(urlhash); sb.crawlQueues.noticeURL.removeByURLHash(urlhash); - + int timezoneOffset = post.getInt("timezoneOffset", 0); + // create crawling profile CrawlProfile pe = null; try { @@ -156,7 +157,8 @@ public class QuickCrawlLink_p { CacheStrategy.IFFRESH, collection, ClientIdentification.yacyIntranetCrawlerAgentName, - null); + null, + timezoneOffset); sb.crawler.putActive(pe.handle().getBytes(), pe); } catch (final Exception e) { // mist @@ -175,7 +177,8 @@ public class QuickCrawlLink_p { (title==null)?"CRAWLING-ROOT":title, new Date(), pe.handle(), - 0 + 0, + pe.timezoneOffset() )); // validate rejection reason diff --git a/htroot/api/bookmarks/posts/get.java b/htroot/api/bookmarks/posts/get.java index f95cd391c..fabc9b38b 100644 --- a/htroot/api/bookmarks/posts/get.java +++ b/htroot/api/bookmarks/posts/get.java @@ -39,7 +39,7 @@ public class get { Date parsedDate = null; try { - parsedDate = ISO8601Formatter.FORMATTER.parse(date); + parsedDate = ISO8601Formatter.FORMATTER.parse(date, 0).getTime(); } catch (final ParseException e) { parsedDate = new Date(); } diff --git a/htroot/api/push_p.java b/htroot/api/push_p.java index a78e1d776..84689af62 100644 --- a/htroot/api/push_p.java +++ b/htroot/api/push_p.java @@ -103,7 +103,8 @@ public class push_p { "", // the name of the document to crawl new Date(), // current date profile.handle(), // the name of the prefetch profile. This must not be null! 
- 0); // forkfactor sum of anchors of all ancestors + 0, // forkfactor sum of anchors of all ancestors + profile.timezoneOffset()); Response response = new Response( request, requestHeader, diff --git a/htroot/api/timeline_p.java b/htroot/api/timeline_p.java index 9a129edbc..b9e4991b0 100644 --- a/htroot/api/timeline_p.java +++ b/htroot/api/timeline_p.java @@ -75,8 +75,8 @@ public final class timeline_p { // get a time period Date fromDate = new Date(0); Date toDate = new Date(); - try {fromDate = GenericFormatter.SHORT_SECOND_FORMATTER.parse(post.get("from", "20031215182700"));} catch (ParseException e) {} - try {toDate = GenericFormatter.SHORT_SECOND_FORMATTER.parse(post.get("to", GenericFormatter.SHORT_SECOND_FORMATTER.format(new Date())));} catch (ParseException e) {} + try {fromDate = GenericFormatter.SHORT_SECOND_FORMATTER.parse(post.get("from", "20031215182700"), 0).getTime();} catch (ParseException e) {} + try {toDate = GenericFormatter.SHORT_SECOND_FORMATTER.parse(post.get("to", GenericFormatter.SHORT_SECOND_FORMATTER.format(new Date())), 0).getTime();} catch (ParseException e) {} // get latest dump; AccessTracker.dumpLog(); diff --git a/htroot/index.html b/htroot/index.html index b92b46652..ba3a2544e 100644 --- a/htroot/index.html +++ b/htroot/index.html @@ -80,6 +80,7 @@ + :: diff --git a/htroot/rct_p.java b/htroot/rct_p.java index e32092485..4fb381ac0 100644 --- a/htroot/rct_p.java +++ b/htroot/rct_p.java @@ -78,7 +78,8 @@ public class rct_p { "REMOTE-CRAWLING", loaddate, sb.crawler.defaultRemoteProfile.handle(), - 0)); + 0, + sb.crawler.defaultRemoteProfile.timezoneOffset())); } else { env.getLog().warn("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason); } diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index 58dae90da..a5ce1170b 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -118,7 +118,8 @@ public final class search { final String prefer = post.get("prefer", ""); final String contentdom = post.get("contentdom", "all"); final String filter = post.get("filter", ".*"); // a filter on the url - QueryModifier modifier = new QueryModifier(); + final int timezoneOffset = post.getInt("timezoneOffset", 0); + QueryModifier modifier = new QueryModifier(timezoneOffset); modifier.sitehost = post.get("sitehost", ""); if (modifier.sitehost.isEmpty()) modifier.sitehost = null; modifier.sitehash = post.get("sitehash", ""); if (modifier.sitehash.isEmpty()) modifier.sitehash = null; modifier.author = post.get("author", ""); if (modifier.author.isEmpty()) modifier.author = null; @@ -232,6 +233,7 @@ public final class search { prefer, ContentDomain.contentdomParser(contentdom), language, + timezoneOffset, new HashSet(), null, // no snippet computation count, @@ -297,6 +299,7 @@ public final class search { prefer, ContentDomain.contentdomParser(contentdom), language, + timezoneOffset, new HashSet(), null, // no snippet computation count, diff --git a/htroot/yacy/transferURL.java b/htroot/yacy/transferURL.java index 4b042376f..980bd276a 100644 --- a/htroot/yacy/transferURL.java +++ b/htroot/yacy/transferURL.java @@ -55,7 +55,7 @@ public final class transferURL { public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) { final long start = System.currentTimeMillis(); long freshdate = 0; - try {freshdate = GenericFormatter.SHORT_DAY_FORMATTER.parse("20061101").getTime();} catch (final ParseException e1) {} + try {freshdate = 
GenericFormatter.SHORT_DAY_FORMATTER.parse("20061101", 0).getTime().getTime();} catch (final ParseException e1) {} // return variable that accumulates replacements final Switchboard sb = (Switchboard) env; diff --git a/htroot/yacysearch.html b/htroot/yacysearch.html index afaf443ea..c9ba12167 100644 --- a/htroot/yacysearch.html +++ b/htroot/yacysearch.html @@ -108,6 +108,7 @@ Use the RSS search result format to add static searches to your RSS reader, if y + diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index a27aaf109..8494ab05e 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -214,6 +214,9 @@ public class yacysearch { prop.setOutgoingHeader(outgoingHeader); } + // time zone + int timezoneOffset = post.getInt("timezoneOffset", 0); + // collect search attributes int itemsPerPage = @@ -359,7 +362,7 @@ public class yacysearch { } final RankingProfile ranking = sb.getRanking(); - final QueryModifier modifier = new QueryModifier(); + final QueryModifier modifier = new QueryModifier(timezoneOffset); querystring = modifier.parse(querystring); if (modifier.sitehost != null && modifier.sitehost.length() > 0 && querystring.length() == 0) querystring = "*"; // allow to search for all documents on a host @@ -643,6 +646,7 @@ public class yacysearch { prefermask, contentdom, language, + timezoneOffset, metatags, snippetFetchStrategy, itemsPerPage, diff --git a/htroot/yacysearchtrailer.java b/htroot/yacysearchtrailer.java index f0b445056..a0f39d297 100644 --- a/htroot/yacysearchtrailer.java +++ b/htroot/yacysearchtrailer.java @@ -390,9 +390,9 @@ public class yacysearchtrailer { navigatorIterator = theSearch.dateNavigator.iterator(); // this iterator is different as it iterates by the key order (which is a date order) int i = 0, pos = 0, neg = 0; long dx = -1; - Date fromconstraint = theSearch.getQuery().modifier.from == null ? null : DateDetection.parseLine(theSearch.getQuery().modifier.from); + Date fromconstraint = theSearch.getQuery().modifier.from == null ? null : DateDetection.parseLine(theSearch.getQuery().modifier.from, theSearch.getQuery().timezoneOffset); if (fromconstraint == null) fromconstraint = new Date(System.currentTimeMillis() - AbstractFormatter.normalyearMillis); - Date toconstraint = theSearch.getQuery().modifier.to == null ? null : DateDetection.parseLine(theSearch.getQuery().modifier.to); + Date toconstraint = theSearch.getQuery().modifier.to == null ? 
null : DateDetection.parseLine(theSearch.getQuery().modifier.to, theSearch.getQuery().timezoneOffset); if (toconstraint == null) toconstraint = new Date(System.currentTimeMillis() + AbstractFormatter.normalyearMillis); while (i < QueryParams.FACETS_DATE_MAXCOUNT && navigatorIterator.hasNext()) { name = navigatorIterator.next().trim(); diff --git a/source/net/yacy/cora/date/AbstractFormatter.java b/source/net/yacy/cora/date/AbstractFormatter.java index 2a54df377..932fae059 100644 --- a/source/net/yacy/cora/date/AbstractFormatter.java +++ b/source/net/yacy/cora/date/AbstractFormatter.java @@ -25,13 +25,19 @@ package net.yacy.cora.date; import java.text.ParseException; +import java.util.Calendar; import java.util.Date; import java.util.TimeZone; public abstract class AbstractFormatter implements DateFormatter { - protected static final TimeZone TZ_GMT = TimeZone.getTimeZone("GMT"); - + public final static Calendar testCalendar = Calendar.getInstance(); // a calendar in the current time zone of the server + public final static Calendar UTCCalendar = Calendar.getInstance(); + public final static TimeZone UTCtimeZone = TimeZone.getTimeZone("UTC"); + static { + UTCCalendar.setTimeZone(UTCtimeZone); + } + // statics public final static long secondMillis = 1000; public final static long minuteMillis = 60 * secondMillis; @@ -45,7 +51,7 @@ public abstract class AbstractFormatter implements DateFormatter { protected String last_format; @Override - public abstract Date parse(String s) throws ParseException; + public abstract Calendar parse(String s, int timezoneOffset) throws ParseException; @Override public abstract String format(final Date date); @Override diff --git a/source/net/yacy/cora/date/DateFormatter.java b/source/net/yacy/cora/date/DateFormatter.java index 0e1e2e787..f929534d1 100644 --- a/source/net/yacy/cora/date/DateFormatter.java +++ b/source/net/yacy/cora/date/DateFormatter.java @@ -25,11 +25,12 @@ package net.yacy.cora.date; import java.text.ParseException; +import java.util.Calendar; import java.util.Date; public interface DateFormatter { - public Date parse(String s) throws ParseException; + public Calendar parse(String s, int timezoneOffset) throws ParseException; public String format(final Date date); public String format(); diff --git a/source/net/yacy/cora/date/GenericFormatter.java b/source/net/yacy/cora/date/GenericFormatter.java index e824f383d..16c6084d2 100644 --- a/source/net/yacy/cora/date/GenericFormatter.java +++ b/source/net/yacy/cora/date/GenericFormatter.java @@ -30,6 +30,7 @@ import java.text.SimpleDateFormat; import java.util.Calendar; import java.util.Date; import java.util.Locale; +import java.util.TimeZone; import net.yacy.cora.util.NumberTools; @@ -51,14 +52,11 @@ public class GenericFormatter extends AbstractFormatter implements DateFormatter public static final SimpleDateFormat FORMAT_ANSIC = new SimpleDateFormat(PATTERN_ANSIC, Locale.US); public static final SimpleDateFormat FORMAT_SIMPLE = new SimpleDateFormat(PATTERN_SIMPLE, Locale.US); - // find out time zone and DST offset - private static Calendar thisCalendar = Calendar.getInstance(); - static { // we want GMT times on the formats as well as they don't support any timezone - FORMAT_SHORT_DAY.setTimeZone(TZ_GMT); - FORMAT_SHORT_SECOND.setTimeZone(TZ_GMT); - FORMAT_SHORT_MILSEC.setTimeZone(TZ_GMT); + FORMAT_SHORT_DAY.setTimeZone(UTCtimeZone); + FORMAT_SHORT_SECOND.setTimeZone(UTCtimeZone); + FORMAT_SHORT_MILSEC.setTimeZone(UTCtimeZone); } public static final long time_second = 1000L; @@ -124,56 +122,55 @@ 
public class GenericFormatter extends AbstractFormatter implements DateFormatter * the String. */ @Override - public Date parse(final String timeString) throws ParseException { + public Calendar parse(final String timeString, final int timezoneOffset) throws ParseException { synchronized (this.dateFormat) { - return this.dateFormat.parse(timeString); + Calendar cal = Calendar.getInstance(UTCtimeZone); + cal.setTime(this.dateFormat.parse(timeString)); + cal.add(Calendar.MINUTE, timezoneOffset); // add a correction; i.e. for UTC+1 -60 minutes is added to patch a time given in UTC+1 to the actual time at UTC + return cal; } } - + /** * Like {@link #parseShortSecond(String)} using additional timezone information provided in an * offset String, like "+0100" for CET. + * @throws ParseException */ - public Date parse(final String timeString, final String UTCOffset) { + public Calendar parse(final String timeString, final String UTCOffset) throws ParseException { // FIXME: This method returns an incorrect date, check callers! // ex: de.anomic.server.serverDate.parseShortSecond("20070101120000", "+0200").toGMTString() // => 1 Jan 2007 13:00:00 GMT - if (timeString == null || timeString.isEmpty()) { return new Date(); } - if (UTCOffset == null || UTCOffset.isEmpty()) { return new Date(); } - try { - synchronized (this.dateFormat) { - return new Date(this.dateFormat.parse(timeString).getTime() - UTCDiff() + UTCDiff(UTCOffset)); - } - } catch (final Throwable e) { - //serverLog.logFinest("parseUniversalDate", e.getMessage() + ", remoteTimeString=[" + remoteTimeString + "]"); - return new Date(); - } + if (timeString == null || timeString.isEmpty()) { return Calendar.getInstance(UTCtimeZone); } + if (UTCOffset == null || UTCOffset.isEmpty()) { return Calendar.getInstance(UTCtimeZone); } + return parse(timeString, UTCDiff(UTCOffset)); } - private static long UTCDiff(final String diffString) { + private static int UTCDiff(final String diffString) { if (diffString.length() != 5) throw new IllegalArgumentException("UTC String malformed (wrong size):" + diffString); boolean ahead = true; if (diffString.length() > 0 && diffString.charAt(0) == '+') ahead = true; else if (diffString.length() > 0 && diffString.charAt(0) == '-') ahead = false; else throw new IllegalArgumentException("UTC String malformed (wrong sign):" + diffString); - final long oh = NumberTools.parseLongDecSubstring(diffString, 1, 3); - final long om = NumberTools.parseLongDecSubstring(diffString, 3); - return ((ahead) ? (long) 1 : (long) -1) * (oh * AbstractFormatter.hourMillis + om * AbstractFormatter.minuteMillis); + final int oh = NumberTools.parseIntDecSubstring(diffString, 1, 3); + final int om = NumberTools.parseIntDecSubstring(diffString, 3); + return (int) ((ahead) ? 
-(oh * 60 + om) : (oh * 60 + om)); // offset in minutes; a zone east of UTC ("+HHMM") maps to a negative value as required by parse(String, int) } - + + /** + * get the difference of this server's time zone to UTC/GMT in milliseconds + * @return + */ private static long UTCDiff() { // DST_OFFSET is dependent on the time of the Calendar, so it has to be updated // to get the correct current offset - synchronized (thisCalendar) { - thisCalendar.setTimeInMillis(System.currentTimeMillis()); - final long zoneOffsetHours = thisCalendar.get(Calendar.ZONE_OFFSET); - final long DSTOffsetHours = thisCalendar.get(Calendar.DST_OFFSET); + synchronized (testCalendar) { + testCalendar.setTimeInMillis(System.currentTimeMillis()); + final long zoneOffsetHours = testCalendar.get(Calendar.ZONE_OFFSET); + final long DSTOffsetHours = testCalendar.get(Calendar.DST_OFFSET); return zoneOffsetHours + DSTOffsetHours; } } - - private final static DecimalFormat D2 = new DecimalFormat("00"); - + public static String UTCDiffString() { // we express the UTC Difference in 5 digits: // SHHMM @@ -195,11 +192,9 @@ public class GenericFormatter extends AbstractFormatter implements DateFormatter return sb.toString(); } - public static long correctedUTCTime() { - return System.currentTimeMillis() - UTCDiff(); - } + private final static DecimalFormat D2 = new DecimalFormat("00"); - public static void main(final String[] args) { + public static void main(String[] args) { System.out.println(UTCDiffString()); } } diff --git a/source/net/yacy/cora/date/ISO8601Formatter.java b/source/net/yacy/cora/date/ISO8601Formatter.java index 27ff6f45f..e57dfbfa6 100644 --- a/source/net/yacy/cora/date/ISO8601Formatter.java +++ b/source/net/yacy/cora/date/ISO8601Formatter.java @@ -41,7 +41,7 @@ public class ISO8601Formatter extends AbstractFormatter implements DateFormatter private static final SimpleDateFormat FORMAT_ISO8601 = new SimpleDateFormat(PATTERN_ISO8601, Locale.US); static { - FORMAT_ISO8601.setTimeZone(TZ_GMT); + FORMAT_ISO8601.setTimeZone(AbstractFormatter.UTCtimeZone); } public static final ISO8601Formatter FORMATTER = new ISO8601Formatter(); @@ -72,7 +72,7 @@ public class ISO8601Formatter extends AbstractFormatter implements DateFormatter * @throws ParseException */ @Override - public Date parse(String s) throws ParseException { + public Calendar parse(String s, final int timezoneOffset) throws ParseException { // do some lazy checks here s = s.trim(); while (!s.isEmpty() && s.endsWith("?")) s = s.substring(0, s.length() - 1); // sometimes used if write is not sure about date @@ -87,7 +87,7 @@ public class ISO8601Formatter extends AbstractFormatter implements DateFormatter while (!s.isEmpty() && s.endsWith("?")) s = s.substring(0, s.length() - 1); // sometimes used if write is not sure about date // no go for exact parsing - final Calendar cal = Calendar.getInstance(TZ_GMT, Locale.US); + final Calendar cal = Calendar.getInstance(AbstractFormatter.UTCtimeZone, Locale.US); cal.clear(); // split 2007-12-19T10:20:30.789+0500 into its parts @@ -103,13 +103,13 @@ if (t.nextToken().equals("-")) { cal.set(Calendar.MONTH, Integer.parseInt(t.nextToken()) - 1); } else { - return cal.getTime(); + return cal; } // day if (t.nextToken().equals("-")) { cal.set(Calendar.DAY_OF_MONTH, Integer.parseInt(t.nextToken())); } else { - return cal.getTime(); + return cal; } // The standard says: // if there is an hour there has to be a minute and a timezone token, too.
@@ -147,7 +147,7 @@ public class ISO8601Formatter extends AbstractFormatter implements DateFormatter sign = -1; } else { // no legal TZ offset found - return cal.getTime(); + return cal; } offset = sign * Integer.parseInt(t.nextToken()) * 10 * 3600; } @@ -168,8 +168,7 @@ public class ISO8601Formatter extends AbstractFormatter implements DateFormatter // in case we couldn't even parse a year if (!cal.isSet(Calendar.YEAR)) throw new ParseException("parseISO8601: Cannot parse '" + s + "'", 0); - Date d = cal.getTime(); - return d; + return cal; } diff --git a/source/net/yacy/cora/document/feed/RSSMessage.java b/source/net/yacy/cora/document/feed/RSSMessage.java index aea58547e..340d01e99 100644 --- a/source/net/yacy/cora/document/feed/RSSMessage.java +++ b/source/net/yacy/cora/document/feed/RSSMessage.java @@ -224,7 +224,7 @@ public class RSSMessage implements Hit, Comparable, Comparator hyperlinks, final boolean replace) { + private void enqueueEntries( + final byte[] initiator, + final String profileHandle, + final List hyperlinks, + final boolean replace, + final int timezoneOffset) { if (replace) { // delete old entries, if exists to force a re-load of the url (thats wanted here) Set hosthashes = new HashSet(); @@ -199,7 +208,7 @@ public final class CrawlStacker { int p = userInfo == null ? -1 : userInfo.indexOf(':'); String user = userInfo == null ? FTPClient.ANONYMOUS : userInfo.substring(0, p); String pw = userInfo == null || p == -1 ? "anomic" : userInfo.substring(p + 1); - enqueueEntriesFTP(initiator, profileHandle, url.getHost(), url.getPort(), user, pw, replace); + enqueueEntriesFTP(initiator, profileHandle, url.getHost(), url.getPort(), user, pw, replace, timezoneOffset); } else { // put entry on crawl stack enqueueEntry(new Request( @@ -209,13 +218,22 @@ public final class CrawlStacker { url.getNameProperty(), new Date(), profileHandle, - 0 + 0, + timezoneOffset )); } } } - public void enqueueEntriesFTP(final byte[] initiator, final String profileHandle, final String host, final int port, final String user, final String pw, final boolean replace) { + public void enqueueEntriesFTP( + final byte[] initiator, + final String profileHandle, + final String host, + final int port, + final String user, + final String pw, + final boolean replace, + final int timezoneOffset) { final CrawlQueues cq = this.nextQueue; new Thread() { @Override @@ -248,7 +266,8 @@ public final class CrawlStacker { MultiProtocolURL.unescape(entry.name), entry.date, profileHandle, - 0)); + 0, + timezoneOffset)); } } catch (final IOException e1) { ConcurrentLog.logException(e1); @@ -272,7 +291,7 @@ public final class CrawlStacker { "CRAWLING-ROOT", new Date(), pe.handle(), - 0)); + 0, 0)); } /** diff --git a/source/net/yacy/crawler/CrawlSwitchboard.java b/source/net/yacy/crawler/CrawlSwitchboard.java index 4472c59e0..fcce03c4b 100644 --- a/source/net/yacy/crawler/CrawlSwitchboard.java +++ b/source/net/yacy/crawler/CrawlSwitchboard.java @@ -296,7 +296,8 @@ public final class CrawlSwitchboard { CacheStrategy.IFFRESH, "robot_" + CRAWL_PROFILE_PROXY, ClientIdentification.yacyProxyAgentName, - null); + null, + 0); this.profilesActiveCrawls.put( UTF8.getBytes(this.defaultProxyProfile.handle()), this.defaultProxyProfile); @@ -327,7 +328,8 @@ public final class CrawlSwitchboard { CacheStrategy.IFFRESH, "robot_" + CRAWL_PROFILE_REMOTE, ClientIdentification.yacyInternetCrawlerAgentName, - null); + null, + 0); this.profilesActiveCrawls.put( UTF8.getBytes(this.defaultRemoteProfile.handle()), this.defaultRemoteProfile); @@ 
-358,7 +360,8 @@ public final class CrawlSwitchboard { CacheStrategy.IFEXIST, "robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, ClientIdentification.yacyIntranetCrawlerAgentName, - null); + null, + 0); this.profilesActiveCrawls.put( UTF8.getBytes(this.defaultTextSnippetLocalProfile.handle()), this.defaultTextSnippetLocalProfile); @@ -389,7 +392,8 @@ public final class CrawlSwitchboard { CacheStrategy.IFEXIST, "robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, ClientIdentification.yacyIntranetCrawlerAgentName, - null); + null, + 0); this.profilesActiveCrawls.put( UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()), this.defaultTextSnippetGlobalProfile); @@ -421,7 +425,8 @@ public final class CrawlSwitchboard { CacheStrategy.IFEXIST, "robot_" + CRAWL_PROFILE_GREEDY_LEARNING_TEXT, ClientIdentification.browserAgentName, - null); + null, + 0); this.profilesActiveCrawls.put( UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()), this.defaultTextSnippetGlobalProfile); @@ -452,7 +457,8 @@ public final class CrawlSwitchboard { CacheStrategy.IFEXIST, "robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, ClientIdentification.yacyIntranetCrawlerAgentName, - null); + null, + 0); this.profilesActiveCrawls.put( UTF8.getBytes(this.defaultMediaSnippetLocalProfile.handle()), this.defaultMediaSnippetLocalProfile); @@ -483,7 +489,8 @@ public final class CrawlSwitchboard { CacheStrategy.IFEXIST, "robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, ClientIdentification.yacyIntranetCrawlerAgentName, - null); + null, + 0); this.profilesActiveCrawls.put( UTF8.getBytes(this.defaultMediaSnippetGlobalProfile.handle()), this.defaultMediaSnippetGlobalProfile); @@ -514,7 +521,8 @@ public final class CrawlSwitchboard { CacheStrategy.NOCACHE, "robot_" + CRAWL_PROFILE_SURROGATE, ClientIdentification.yacyIntranetCrawlerAgentName, - null); + null, + 0); this.profilesActiveCrawls.put( UTF8.getBytes(this.defaultSurrogateProfile.handle()), this.defaultSurrogateProfile); @@ -548,7 +556,8 @@ public final class CrawlSwitchboard { CacheStrategy.NOCACHE, collection, ClientIdentification.yacyIntranetCrawlerAgentName, - null); + null, + 0); this.profilesActiveCrawls.put(UTF8.getBytes(genericPushProfile.handle()), genericPushProfile); this.defaultPushProfiles.put(collection, genericPushProfile); return genericPushProfile; diff --git a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java index f90b25a7f..5a87a2f10 100644 --- a/source/net/yacy/crawler/data/CrawlProfile.java +++ b/source/net/yacy/crawler/data/CrawlProfile.java @@ -80,6 +80,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M public static final String CACHE_STRAGEGY = "cacheStrategy"; public static final String COLLECTIONS = "collections"; public static final String SCRAPER = "scraper"; + public static final String TIMEZONEOFFSET = "timezoneOffset"; public static final String CRAWLER_URL_MUSTMATCH = "crawlerURLMustMatch"; public static final String CRAWLER_URL_MUSTNOTMATCH = "crawlerURLMustNotMatch"; public static final String CRAWLER_IP_MUSTMATCH = "crawlerIPMustMatch"; @@ -131,6 +132,9 @@ public class CrawlProfile extends ConcurrentHashMap implements M * @param xpstopw true if parent stop words shall be ignored * @param cacheStrategy determines if and how cache is used loading content * @param collections a comma-separated list of tags which are attached to index entries + * @param userAgentName the profile name of the user agent to be used + * @param scraper a scraper for vocabularies + * @param timezoneOffset the 
time offset in minutes for scraped dates in text without time zone */ public CrawlProfile( String name, @@ -155,7 +159,8 @@ public class CrawlProfile extends ConcurrentHashMap implements M final CacheStrategy cacheStrategy, final String collections, final String userAgentName, - final VocabularyScraper scraper) { + final VocabularyScraper scraper, + final int timezoneOffset) { super(40); if (name == null || name.isEmpty()) { throw new NullPointerException("name must not be null or empty"); @@ -198,6 +203,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M String jsonString = this.scraper.toString(); assert jsonString != null && jsonString.length() > 0 && jsonString.charAt(0) == '{' : "jsonString = " + jsonString; put(SCRAPER, jsonString); + put(TIMEZONEOFFSET, timezoneOffset); } /** @@ -623,6 +629,16 @@ public class CrawlProfile extends ConcurrentHashMap implements M return (r.equals(Boolean.TRUE.toString())); } + public int timezoneOffset() { + final String timezoneOffset = get(TIMEZONEOFFSET); + if (timezoneOffset == null) return 0; + try { + return Integer.parseInt(timezoneOffset); + } catch (NumberFormatException e) { + return 0; + } + } + /** * get a recrawl date for a given age in minutes * @param oldTimeMinutes diff --git a/source/net/yacy/crawler/data/CrawlQueues.java b/source/net/yacy/crawler/data/CrawlQueues.java index 62962e045..5a9b0c4a1 100644 --- a/source/net/yacy/crawler/data/CrawlQueues.java +++ b/source/net/yacy/crawler/data/CrawlQueues.java @@ -531,7 +531,8 @@ public class CrawlQueues { item.getDescriptions().size() > 0 ? item.getDescriptions().get(0) : "", loaddate, this.sb.crawler.defaultRemoteProfile.handle(), - 0 + 0, + this.sb.crawler.defaultRemoteProfile.timezoneOffset() )); } else { CrawlQueues.log.warn("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason); diff --git a/source/net/yacy/crawler/data/Snapshots.java b/source/net/yacy/crawler/data/Snapshots.java index 40e5fce30..abf8f981e 100644 --- a/source/net/yacy/crawler/data/Snapshots.java +++ b/source/net/yacy/crawler/data/Snapshots.java @@ -359,10 +359,10 @@ public class Snapshots { private static Date parseDate(String d) { try { - return GenericFormatter.SHORT_MINUTE_FORMATTER.parse(d); + return GenericFormatter.SHORT_MINUTE_FORMATTER.parse(d, 0).getTime(); } catch (ParseException e) { try { - return GenericFormatter.SHORT_DAY_FORMATTER.parse(d); + return GenericFormatter.SHORT_DAY_FORMATTER.parse(d, 0).getTime(); } catch (ParseException ee) { return null; } diff --git a/source/net/yacy/crawler/retrieval/Request.java b/source/net/yacy/crawler/retrieval/Request.java index 81bbaa96f..e02b2fdcb 100644 --- a/source/net/yacy/crawler/retrieval/Request.java +++ b/source/net/yacy/crawler/retrieval/Request.java @@ -92,7 +92,8 @@ public class Request extends WorkflowJob private Bitfield flags; private String statusMessage; private int initialHash; // to provide a object hash that does not change even if the url changes because of redirection - + private int timezoneOffset; + public Request() { // used only to create poison entries this.initiator = null; @@ -106,6 +107,7 @@ public class Request extends WorkflowJob this.statusMessage = null; this.initialHash = 0; this.status = 0; + this.timezoneOffset = 0; } /** @@ -115,7 +117,7 @@ public class Request extends WorkflowJob * @param referrerhash */ public Request(final DigestURL url, final byte[] referrerhash) { - this(null, url, referrerhash, null, null, null, 0); + this(null, url, referrerhash, null, null, null, 0, 0); } /** @@ -136,7 
+138,8 @@ public class Request extends WorkflowJob final String name, final Date appdate, final String profileHandle, - final int depth) { + final int depth, + final int timezoneOffset) { // create new entry and store it into database assert url != null; assert profileHandle == null || profileHandle.length() == Word.commonHashLength : profileHandle @@ -150,6 +153,7 @@ public class Request extends WorkflowJob this.appdate = (appdate == null) ? 0 : appdate.getTime(); this.profileHandle = profileHandle; // must not be null this.depth = depth; + this.timezoneOffset = timezoneOffset; this.flags = new Bitfield(rowdef.width(10)); this.statusMessage = "loaded(args)"; this.initialHash = url.hashCode(); @@ -271,6 +275,10 @@ public class Request extends WorkflowJob // crawl depth where the url appeared return this.depth; } + + public int timezoneOffset() { + return this.timezoneOffset; + } public String profileHandle() { // the handle of the crawl profile diff --git a/source/net/yacy/crawler/retrieval/Response.java b/source/net/yacy/crawler/retrieval/Response.java index 615465199..4e1acb6ef 100644 --- a/source/net/yacy/crawler/retrieval/Response.java +++ b/source/net/yacy/crawler/retrieval/Response.java @@ -28,7 +28,6 @@ package net.yacy.crawler.retrieval; import java.util.Date; -import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.encoding.UTF8; @@ -260,7 +259,7 @@ public class Response { if (docDate == null) docDate = this.responseHeader.date(); } if (docDate == null && this.request != null) docDate = this.request.appdate(); - if (docDate == null) docDate = new Date(GenericFormatter.correctedUTCTime()); + if (docDate == null) docDate = new Date(); return docDate; } @@ -372,7 +371,7 @@ public class Response { if (date == null) return "stale_no_date_given_in_response"; try { final long ttl = 1000 * NumberTools.parseLongDecSubstring(cacheControl, 8); // milliseconds to live - if (GenericFormatter.correctedUTCTime() - date.getTime() > ttl) { + if (System.currentTimeMillis() - date.getTime() > ttl) { //System.out.println("***not indexed because cache-control"); return "stale_expired"; } @@ -461,8 +460,8 @@ public class Response { if (!this.responseHeader.containsKey(HeaderFramework.LAST_MODIFIED)) { return false; } // parse date Date d1, d2; - d2 = this.responseHeader.lastModified(); if (d2 == null) { d2 = new Date(GenericFormatter.correctedUTCTime()); } - d1 = this.requestHeader.ifModifiedSince(); if (d1 == null) { d1 = new Date(GenericFormatter.correctedUTCTime()); } + d2 = this.responseHeader.lastModified(); if (d2 == null) { d2 = new Date(); } + d1 = this.requestHeader.ifModifiedSince(); if (d1 == null) { d1 = new Date(); } // finally, we shall treat the cache as stale if the modification time is after the if-.. 
time if (d2.after(d1)) { return false; } } @@ -501,9 +500,10 @@ public class Response { // -expires in cached response // the expires value gives us a very easy hint when the cache is stale final Date expires = this.responseHeader.expires(); + final Date now = new Date(); if (expires != null) { // System.out.println("EXPIRES-TEST: expires=" + expires + ", NOW=" + serverDate.correctedGMTDate() + ", url=" + url); - if (expires.before(new Date(GenericFormatter.correctedUTCTime()))) { return false; } + if (expires.before(now)) { return false; } } final Date lastModified = this.responseHeader.lastModified(); cacheControl = this.responseHeader.get(HeaderFramework.CACHE_CONTROL); @@ -517,13 +517,13 @@ public class Response { // file may only be treated as fresh for one more month, not more. Date date = this.responseHeader.date(); if (lastModified != null) { - if (date == null) { date = new Date(GenericFormatter.correctedUTCTime()); } + if (date == null) { date = now; } final long age = date.getTime() - lastModified.getTime(); if (age < 0) { return false; } // TTL (Time-To-Live) is age/10 = (d2.getTime() - d1.getTime()) / 10 // the actual living-time is serverDate.correctedGMTDate().getTime() - d2.getTime() // therefore the cache is stale, if serverDate.correctedGMTDate().getTime() - d2.getTime() > age/10 - if (GenericFormatter.correctedUTCTime() - date.getTime() > age / 10) { return false; } + if (now.getTime() - date.getTime() > age / 10) { return false; } } // -cache-control in cached response @@ -542,7 +542,7 @@ public class Response { if (date == null) { return false; } try { final long ttl = 1000 * NumberTools.parseLongDecSubstring(cacheControl, 8); // milliseconds to live - if (GenericFormatter.correctedUTCTime() - date.getTime() > ttl) { + if (now.getTime() - date.getTime() > ttl) { return false; } } catch (final Exception e) { @@ -626,12 +626,11 @@ public class Response { // -if-modified-since in request // if the page is fresh at the very moment we can index it final Date ifModifiedSince = this.ifModifiedSince(); + final Date now = new Date(); if ((ifModifiedSince != null) && (this.responseHeader.containsKey(HeaderFramework.LAST_MODIFIED))) { // parse date Date d = this.responseHeader.lastModified(); - if (d == null) { - d = new Date(GenericFormatter.correctedUTCTime()); - } + if (d == null) d = now; // finally, we shall treat the cache as stale if the modification time is after the if-.. time if (d.after(ifModifiedSince)) { //System.out.println("***not indexed because if-modified-since"); @@ -655,7 +654,7 @@ public class Response { // sometimes, the expires date is set to the past to prevent that a page is cached // we use that information to see if we should index it final Date expires = this.responseHeader.expires(); - if (expires != null && expires.before(new Date(GenericFormatter.correctedUTCTime()))) { + if (expires != null && expires.before(now)) { return "Stale_(Expired)"; } @@ -688,7 +687,7 @@ public class Response { } try { final long ttl = 1000 * NumberTools.parseLongDecSubstring(cacheControl,8); // milliseconds to live - if (GenericFormatter.correctedUTCTime() - date.getTime() > ttl) { + if (now.getTime() - date.getTime() > ttl) { //System.out.println("***not indexed because cache-control"); return "Stale_(expired_by_cache-control)"; } @@ -865,7 +864,7 @@ public class Response { final String supportError = TextParser.supports(url(), this.responseHeader == null ? 
null : this.responseHeader.mime()); if (supportError != null) throw new Parser.Failure("no parser support:" + supportError, url()); try { - return TextParser.parseSource(new AnchorURL(url()), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? "UTF-8" : this.responseHeader.getCharacterEncoding(), new VocabularyScraper(), this.request.depth(), this.content); + return TextParser.parseSource(new AnchorURL(url()), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? "UTF-8" : this.responseHeader.getCharacterEncoding(), new VocabularyScraper(), this.request.timezoneOffset(), this.request.depth(), this.content); } catch (final Exception e) { return null; } diff --git a/source/net/yacy/crawler/retrieval/SitemapImporter.java b/source/net/yacy/crawler/retrieval/SitemapImporter.java index 240f8239d..b28e13f11 100644 --- a/source/net/yacy/crawler/retrieval/SitemapImporter.java +++ b/source/net/yacy/crawler/retrieval/SitemapImporter.java @@ -108,7 +108,8 @@ public class SitemapImporter extends Thread { entry.url(), entry.lastmod(new Date()), this.crawlingProfile.handle(), - 0 + 0, + this.crawlingProfile.timezoneOffset() )); logger.info("New URL '" + entry.url() + "' added for loading."); } diff --git a/source/net/yacy/data/BlogBoard.java b/source/net/yacy/data/BlogBoard.java index c1ec79f15..f97f7c794 100644 --- a/source/net/yacy/data/BlogBoard.java +++ b/source/net/yacy/data/BlogBoard.java @@ -210,7 +210,7 @@ public class BlogBoard { } try { - date = GenericFormatter.SHORT_SECOND_FORMATTER.parse(StrDate); + date = GenericFormatter.SHORT_SECOND_FORMATTER.parse(StrDate, 0).getTime(); } catch (final ParseException e1) { date = new Date(); } @@ -404,7 +404,7 @@ public class BlogBoard { } return new Date(); } - return GenericFormatter.SHORT_SECOND_FORMATTER.parse(date); + return GenericFormatter.SHORT_SECOND_FORMATTER.parse(date, 0).getTime(); } catch (final ParseException ex) { return new Date(); } diff --git a/source/net/yacy/data/BookmarkHelper.java b/source/net/yacy/data/BookmarkHelper.java index c10c144c1..86f17ad90 100644 --- a/source/net/yacy/data/BookmarkHelper.java +++ b/source/net/yacy/data/BookmarkHelper.java @@ -139,7 +139,7 @@ public class BookmarkHelper { final Set tags=ListManager.string2set(tag); //this allow multiple default tags try { //load the links - final ContentScraper scraper = new ContentScraper(baseURL, 10000, new VocabularyScraper()); + final ContentScraper scraper = new ContentScraper(baseURL, 10000, new VocabularyScraper(), 0); //OutputStream os = new htmlFilterOutputStream(null, scraper, null, false); final Writer writer = new TransformerWriter(null, null, scraper, null, false); FileUtils.copy(input,writer); @@ -232,7 +232,7 @@ public class BookmarkHelper { Date parsedDate = null; try { - parsedDate = ISO8601Formatter.FORMATTER.parse(time); + parsedDate = ISO8601Formatter.FORMATTER.parse(time, 0).getTime(); } catch (final ParseException e) { parsedDate = new Date(); } diff --git a/source/net/yacy/data/ymark/YMarkAutoTagger.java b/source/net/yacy/data/ymark/YMarkAutoTagger.java index c80ff37a3..df5a2939d 100644 --- a/source/net/yacy/data/ymark/YMarkAutoTagger.java +++ b/source/net/yacy/data/ymark/YMarkAutoTagger.java @@ -87,7 +87,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle } //get words from document - final Map words = new Condenser(document, null, true, true, LibraryProvider.dymLib, false, false).words(); + final Map words = new Condenser(document, 
null, true, true, LibraryProvider.dymLib, false, false, 0).words(); // generate potential tags from document title, description and subject final int bufferSize = document.dc_title().length() + document.dc_description().length + document.dc_subject(' ').length() + 32; diff --git a/source/net/yacy/data/ymark/YMarkCrawlStart.java b/source/net/yacy/data/ymark/YMarkCrawlStart.java index b14c10dc9..562a9703f 100644 --- a/source/net/yacy/data/ymark/YMarkCrawlStart.java +++ b/source/net/yacy/data/ymark/YMarkCrawlStart.java @@ -190,7 +190,8 @@ public class YMarkCrawlStart extends HashMap{ CacheStrategy.IFFRESH, "robot_" + CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, ClientIdentification.yacyIntranetCrawlerAgentName, - null); // TODO: make this a default profile in CrawlSwitchboard + null, + 0); // TODO: make this a default profile in CrawlSwitchboard sb.crawler.putActive(pe.handle().getBytes(), pe); return sb.crawlStacker.stackCrawl(new Request( sb.peers.mySeed().hash.getBytes(), @@ -198,7 +199,7 @@ public class YMarkCrawlStart extends HashMap{ null, "CRAWLING-ROOT", new Date(), - pe.handle(), 0 + pe.handle(), 0, pe.timezoneOffset() )); } } diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java index 2f7c2ffb5..6cf125d17 100644 --- a/source/net/yacy/document/Condenser.java +++ b/source/net/yacy/document/Condenser.java @@ -97,7 +97,8 @@ public final class Condenser { final boolean indexMedia, final WordCache meaningLib, final boolean doAutotagging, - final boolean findDatesInContent + final boolean findDatesInContent, + final int timezoneOffset ) { Thread.currentThread().setName("condenser-" + document.dc_identifier()); // for debugging // if addMedia == true, then all the media links are also parsed and added to the words @@ -123,7 +124,7 @@ public final class Condenser { Map.Entry entry; if (indexText) { String text = document.getTextString(); - if (findDatesInContent) this.dates_in_content = DateDetection.parse(text); + if (findDatesInContent) this.dates_in_content = DateDetection.parse(text, timezoneOffset); createCondensement(document.dc_source(), text, meaningLib, doAutotagging, scraper); // the phrase counter: // phrase 0 are words taken from the URL diff --git a/source/net/yacy/document/DateDetection.java b/source/net/yacy/document/DateDetection.java index 9964aedfd..73662ac56 100644 --- a/source/net/yacy/document/DateDetection.java +++ b/source/net/yacy/document/DateDetection.java @@ -499,7 +499,7 @@ public class DateDetection { * @param text * @return a set of dates, ordered by time. first date in the ordered set is the oldest time. 
*/ - public static LinkedHashSet parse(String text) { + public static LinkedHashSet parse(String text, int timezoneOffset) { Long offset; if ((offset = specialDayOffset.get(text)) != null) { LinkedHashSet dates = new LinkedHashSet<>(); dates.add(new Date((System.currentTimeMillis() / AbstractFormatter.dayMillis) * AbstractFormatter.dayMillis + offset.longValue())); return dates; @@ -513,7 +513,7 @@ public class DateDetection { return dates; } - public static Date parseLine(String text) { + public static Date parseLine(final String text, final int timezoneOffset) { Date d = null; try {d = CONFORM.parse(text);} catch (ParseException e) {} //if (d == null) try {d = GenericFormatter.FORMAT_SHORT_DAY.parse(text);} catch (ParseException e) {} // did not work well and fired for wrong formats; do not use @@ -521,7 +521,7 @@ public class DateDetection { if (d == null) try {d = GenericFormatter.FORMAT_ANSIC.parse(text);} catch (ParseException e) {} if (d == null) { - Set dd = parse(text); + Set dd = parse(text, timezoneOffset); if (dd.size() >= 1) d = dd.iterator().next(); } return d; @@ -601,7 +601,7 @@ public class DateDetection { }; long t = System.currentTimeMillis(); for (String s: test) { - String parsed = parse(fill + " " + s + " " + fill).toString(); + String parsed = parse(fill + " " + s + " " + fill, 0).toString(); System.out.println("SOURCE: " + s); System.out.println("DATE : " + parsed); System.out.println(); diff --git a/source/net/yacy/document/Parser.java b/source/net/yacy/document/Parser.java index be7b49eba..b9139340a 100644 --- a/source/net/yacy/document/Parser.java +++ b/source/net/yacy/document/Parser.java @@ -59,6 +59,7 @@ public interface Parser { String mimeType, String charset, VocabularyScraper scraper, + int timezoneOffset, InputStream source ) throws Parser.Failure, InterruptedException; diff --git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java index 0898f3c35..191793ca0 100644 --- a/source/net/yacy/document/TextParser.java +++ b/source/net/yacy/document/TextParser.java @@ -167,6 +167,7 @@ public final class TextParser { final String mimeType, final String charset, final VocabularyScraper scraper, + final int timezoneOffset, final int depth, final File sourceFile ) throws InterruptedException, Parser.Failure { @@ -181,7 +182,7 @@ public final class TextParser { throw new Parser.Failure(errorMsg, location); } sourceStream = new BufferedInputStream(new FileInputStream(sourceFile)); - docs = parseSource(location, mimeType, charset, scraper, depth, sourceFile.length(), sourceStream); + docs = parseSource(location, mimeType, charset, scraper, timezoneOffset, depth, sourceFile.length(), sourceStream); } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e; @@ -199,6 +200,7 @@ public final class TextParser { String mimeType, final String charset, final VocabularyScraper scraper, + final int timezoneOffset, final int depth, final byte[] content ) throws Parser.Failure { @@ -214,7 +216,7 @@ public final class TextParser { } assert !idioms.isEmpty() : "no parsers applied for url " + location.toNormalform(true); - Document[] docs = parseSource(location, mimeType, idioms, charset, scraper, depth, content); + Document[] docs = parseSource(location, mimeType, idioms, charset, scraper, timezoneOffset, depth, content); return docs; } @@ -224,6 +226,7 @@ public final class TextParser { String mimeType, final String charset, final VocabularyScraper 
scraper, + final int timezoneOffset, final int depth, final long contentLength, final InputStream sourceStream @@ -244,7 +247,7 @@ public final class TextParser { // then we use only one stream-oriented parser. if (idioms.size() == 1 || contentLength > Integer.MAX_VALUE) { // use a specific stream-oriented parser - return parseSource(location, mimeType, idioms.iterator().next(), charset, scraper, sourceStream); + return parseSource(location, mimeType, idioms.iterator().next(), charset, scraper, timezoneOffset, sourceStream); } // in case that we know more parsers we first transform the content into a byte[] and use that as base @@ -255,7 +258,7 @@ public final class TextParser { } catch (final IOException e) { throw new Parser.Failure(e.getMessage(), location); } - Document[] docs = parseSource(location, mimeType, idioms, charset, scraper, depth, b); + Document[] docs = parseSource(location, mimeType, idioms, charset, scraper, timezoneOffset, depth, b); return docs; } @@ -266,6 +269,7 @@ public final class TextParser { final Parser parser, final String charset, final VocabularyScraper scraper, + final int timezoneOffset, final InputStream sourceStream ) throws Parser.Failure { if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing '" + location + "' from stream"); @@ -275,7 +279,7 @@ public final class TextParser { if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'."); try { - final Document[] docs = parser.parse(location, mimeType, documentCharset, scraper, sourceStream); + final Document[] docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, sourceStream); return docs; } catch (final Exception e) { throw new Parser.Failure("parser failed: " + parser.getName(), location); @@ -288,6 +292,7 @@ public final class TextParser { final Set parsers, final String charset, final VocabularyScraper scraper, + final int timezoneOffset, final int depth, final byte[] sourceArray ) throws Parser.Failure { @@ -310,7 +315,7 @@ public final class TextParser { bis = new ByteArrayInputStream(sourceArray); } try { - docs = parser.parse(location, mimeType, documentCharset, scraper, bis); + docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, bis); } catch (final Parser.Failure e) { failedParser.put(parser, e); //log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e); diff --git a/source/net/yacy/document/content/DCEntry.java b/source/net/yacy/document/content/DCEntry.java index 5c44d3dc7..68dbd095a 100644 --- a/source/net/yacy/document/content/DCEntry.java +++ b/source/net/yacy/document/content/DCEntry.java @@ -107,7 +107,7 @@ public class DCEntry extends MultiMapSolrParams { if (d == null) return null; if (d.isEmpty()) return null; try { - Date x = ISO8601Formatter.FORMATTER.parse(d); + Date x = ISO8601Formatter.FORMATTER.parse(d, 0).getTime(); Date now = new Date(); return x.after(now) ? 
now : x; } catch (final ParseException e) { diff --git a/source/net/yacy/document/importer/MediawikiImporter.java b/source/net/yacy/document/importer/MediawikiImporter.java index 9e6ba1116..b9557f803 100644 --- a/source/net/yacy/document/importer/MediawikiImporter.java +++ b/source/net/yacy/document/importer/MediawikiImporter.java @@ -524,7 +524,7 @@ public class MediawikiImporter extends Thread implements Importer { public void genDocument() throws Parser.Failure { try { this.url = new AnchorURL(this.urlStub + this.title); - final Document[] parsed = TextParser.parseSource(this.url, "text/html", "UTF-8", new VocabularyScraper(), 1, UTF8.getBytes(this.html)); + final Document[] parsed = TextParser.parseSource(this.url, "text/html", "UTF-8", new VocabularyScraper(), 0, 1, UTF8.getBytes(this.html)); this.document = Document.mergeDocuments(this.url, "text/html", parsed); // the wiki parser is not able to find the proper title in the source text, so it must be set here this.document.setTitle(this.title); diff --git a/source/net/yacy/document/importer/ResumptionToken.java b/source/net/yacy/document/importer/ResumptionToken.java index 785c12d26..25075410d 100644 --- a/source/net/yacy/document/importer/ResumptionToken.java +++ b/source/net/yacy/document/importer/ResumptionToken.java @@ -158,7 +158,7 @@ public class ResumptionToken extends TreeMap { final String d = get("expirationDate"); if (d == null) return null; try { - return ISO8601Formatter.FORMATTER.parse(d); + return ISO8601Formatter.FORMATTER.parse(d, 0).getTime(); } catch (final ParseException e) { ConcurrentLog.logException(e); return new Date(); diff --git a/source/net/yacy/document/parser/apkParser.java b/source/net/yacy/document/parser/apkParser.java index 0eacb05f6..6df35f26d 100644 --- a/source/net/yacy/document/parser/apkParser.java +++ b/source/net/yacy/document/parser/apkParser.java @@ -54,7 +54,13 @@ public class apkParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { /* * things to discover: diff --git a/source/net/yacy/document/parser/audioTagParser.java b/source/net/yacy/document/parser/audioTagParser.java index 73195c0a0..ed0a386aa 100644 --- a/source/net/yacy/document/parser/audioTagParser.java +++ b/source/net/yacy/document/parser/audioTagParser.java @@ -70,8 +70,13 @@ public class audioTagParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL location, final String mimeType, - final String charset, final VocabularyScraper scraper, final InputStream source) + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { String filename = location.getFileName(); diff --git a/source/net/yacy/document/parser/augment/AugmentParser.java b/source/net/yacy/document/parser/augment/AugmentParser.java index 6b78cf0d3..aa4dcf3df 100644 --- a/source/net/yacy/document/parser/augment/AugmentParser.java +++ b/source/net/yacy/document/parser/augment/AugmentParser.java @@ 
-38,13 +38,19 @@ public class AugmentParser extends AbstractParser implements Parser { } @Override - public Document[] parse(AnchorURL url, String mimeType, String charset, final VocabularyScraper scraper, InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { - Document[] htmlDocs = this.rdfaParser.parse(url, mimeType, charset, scraper, source); + Document[] htmlDocs = this.rdfaParser.parse(location, mimeType, charset, scraper, timezoneOffset, source); for (final Document doc : htmlDocs) { /* analyze(doc, url, mimeType, charset); // enrich document text */ - parseAndAugment(doc, url, mimeType, charset); // enrich document with additional tags + parseAndAugment(doc, location, mimeType, charset); // enrich document with additional tags } return htmlDocs; } diff --git a/source/net/yacy/document/parser/bzipParser.java b/source/net/yacy/document/parser/bzipParser.java index 4d2c9dd6f..4e16fbfce 100644 --- a/source/net/yacy/document/parser/bzipParser.java +++ b/source/net/yacy/document/parser/bzipParser.java @@ -57,8 +57,13 @@ public class bzipParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL location, final String mimeType, - final String charset, final VocabularyScraper scraper, final InputStream source) + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { File tempFile = null; @@ -95,7 +100,7 @@ public class bzipParser extends AbstractParser implements Parser { out.close(); // creating a new parser class to parse the unzipped content - docs = TextParser.parseSource(location, null, null, scraper, 999, tempFile); + docs = TextParser.parseSource(location, null, null, scraper, timezoneOffset, 999, tempFile); } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e; diff --git a/source/net/yacy/document/parser/csvParser.java b/source/net/yacy/document/parser/csvParser.java index 717aadf2b..25bba2fff 100644 --- a/source/net/yacy/document/parser/csvParser.java +++ b/source/net/yacy/document/parser/csvParser.java @@ -53,7 +53,13 @@ public class csvParser extends AbstractParser implements Parser { } @Override - public Document[] parse(AnchorURL location, String mimeType, String charset, final VocabularyScraper scraper, InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { // construct a document using all cells of the document // the first row is used as headline // all lines are artificially terminated by a '.' to separate them as sentence for the condenser. 
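The same mechanical change repeats across all parser hunks in this patch: Parser.parse and TextParser.parseSource gain a timezoneOffset parameter that carries the time zone of the document content in minutes, where 0 means dates in the content are interpreted as UTC/GMT. A minimal caller sketch, illustrative only and not part of the patch (the URL, content, and offset value are invented; the signatures follow the hunks above):

    import java.nio.charset.StandardCharsets;
    import net.yacy.cora.document.id.AnchorURL;
    import net.yacy.document.Document;
    import net.yacy.document.TextParser;
    import net.yacy.document.VocabularyScraper;

    public class TimezoneParseExample {
        public static void main(final String[] args) throws Exception {
            // hypothetical page used only for this example
            final AnchorURL url = new AnchorURL("http://localhost/example.html");
            final byte[] content = "<html><body>event: 2015-04-15</body></html>"
                    .getBytes(StandardCharsets.UTF_8);
            // content time zone offset in minutes, as configured at crawl start
            // (value and sign convention assumed for this example)
            final int timezoneOffset = 120;
            final Document[] docs = TextParser.parseSource(
                    url, "text/html", "UTF-8", new VocabularyScraper(),
                    timezoneOffset, // the parameter added by this patch
                    0,              // crawl depth
                    content);
            System.out.println(docs[0].dc_title());
        }
    }

Threading the offset through every call site, instead of correcting dates with the server's own time zone afterwards, is what keeps the indexed dates identical on all peers.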
diff --git a/source/net/yacy/document/parser/docParser.java b/source/net/yacy/document/parser/docParser.java index 6d3e74fd8..a33844382 100644 --- a/source/net/yacy/document/parser/docParser.java +++ b/source/net/yacy/document/parser/docParser.java @@ -59,8 +59,13 @@ public class docParser extends AbstractParser implements Parser { @SuppressWarnings("deprecation") @Override - public Document[] parse(final AnchorURL location, final String mimeType, - final String charset, final VocabularyScraper scraper, final InputStream source) + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { final WordExtractor extractor; diff --git a/source/net/yacy/document/parser/dwgParser.java b/source/net/yacy/document/parser/dwgParser.java index 66b902eeb..25c2d29b6 100644 --- a/source/net/yacy/document/parser/dwgParser.java +++ b/source/net/yacy/document/parser/dwgParser.java @@ -61,7 +61,13 @@ public class dwgParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { // check memory for parser if (!MemoryControl.request(200 * 1024 * 1024, true)) diff --git a/source/net/yacy/document/parser/genericParser.java b/source/net/yacy/document/parser/genericParser.java index 53e6e46cb..2ff09475d 100644 --- a/source/net/yacy/document/parser/genericParser.java +++ b/source/net/yacy/document/parser/genericParser.java @@ -46,8 +46,13 @@ public class genericParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL location, final String mimeType, - final String charset, final VocabularyScraper scraper, final InputStream source1) + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { String filename = location.getFileName(); final Document[] docs = new Document[]{new Document( diff --git a/source/net/yacy/document/parser/gzipParser.java b/source/net/yacy/document/parser/gzipParser.java index 5a57e219a..58f788f37 100644 --- a/source/net/yacy/document/parser/gzipParser.java +++ b/source/net/yacy/document/parser/gzipParser.java @@ -56,7 +56,13 @@ public class gzipParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { File tempFile = null; Document[] docs = null; @@ -80,7 +86,7 @@ public class gzipParser extends AbstractParser implements Parser { out.close(); // creating a new parser class to parse the unzipped content - docs = 
TextParser.parseSource(location, null, null, scraper, 999, tempFile); + docs = TextParser.parseSource(location, null, null, scraper, timezoneOffset, 999, tempFile); } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e; diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 244dad876..17f9362c7 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -188,6 +188,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { private AnchorURL canonical, publisher; private final int maxLinks; private final VocabularyScraper vocabularyScraper; + private final int timezoneOffset; private int breadcrumbs; @@ -213,7 +214,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { * @param classDetector a map from class names to vocabulary names to scrape content from the DOM with associated class name */ @SuppressWarnings("unchecked") - public ContentScraper(final DigestURL root, int maxLinks, final VocabularyScraper vocabularyScraper) { + public ContentScraper(final DigestURL root, int maxLinks, final VocabularyScraper vocabularyScraper, int timezoneOffset) { // the root value here will not be used to load the resource. // it is only the reference for relative links super(linkTags0, linkTags1); @@ -221,6 +222,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { this.root = root; this.maxLinks = maxLinks; this.vocabularyScraper = vocabularyScraper; + this.timezoneOffset = timezoneOffset; this.evaluationScores = new Evaluation(); this.rss = new SizeLimitedMap(maxLinks); this.css = new SizeLimitedMap(maxLinks); @@ -389,12 +391,12 @@ public class ContentScraper extends AbstractScraper implements Scraper { if (content != null) { if ("startDate".equals(itemprop)) try { // parse ISO 8601 date - Date startDate = ISO8601Formatter.FORMATTER.parse(content); + Date startDate = ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime(); this.startDates.add(startDate); } catch (ParseException e) {} if ("endDate".equals(itemprop)) try { // parse ISO 8601 date - Date endDate = ISO8601Formatter.FORMATTER.parse(content); + Date endDate = ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime(); this.endDates.add(endDate); } catch (ParseException e) {} } @@ -651,7 +653,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { // start a new scraper to parse links inside this text // parsing the content - final ContentScraper scraper = new ContentScraper(this.root, this.maxLinks, this.vocabularyScraper); + final ContentScraper scraper = new ContentScraper(this.root, this.maxLinks, this.vocabularyScraper, this.timezoneOffset); final TransformerWriter writer = new TransformerWriter(null, null, scraper, null, false); try { FileUtils.copy(new CharArrayReader(inlineHtml), writer); @@ -1003,19 +1005,19 @@ public class ContentScraper extends AbstractScraper implements Scraper { // content = this.metas.get("date"); - if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content);} catch (ParseException e) {} + if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();} catch (ParseException e) {} // content = this.metas.get("dc.date"); - if (content != null) try {return 
ISO8601Formatter.FORMATTER.parse(content);} catch (ParseException e) {} + if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();} catch (ParseException e) {} // content = this.metas.get("dc:date"); - if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content);} catch (ParseException e) {} + if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();} catch (ParseException e) {} // content = this.metas.get("last-modified"); - if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content);} catch (ParseException e) {} + if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();} catch (ParseException e) {} return new Date(); } @@ -1153,19 +1155,19 @@ public class ContentScraper extends AbstractScraper implements Scraper { } } - public static ContentScraper parseResource(final File file, final int maxLinks) throws IOException { + public static ContentScraper parseResource(final File file, final int maxLinks, final int timezoneOffset) throws IOException { // load page final byte[] page = FileUtils.read(file); if (page == null) throw new IOException("no content in file " + file.toString()); // scrape document to look up charset - final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page), "UTF-8", new VocabularyScraper(), new DigestURL("http://localhost"), null, false, maxLinks); + final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page), "UTF-8", new VocabularyScraper(), new DigestURL("http://localhost"), null, false, maxLinks, timezoneOffset); String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset()); htmlFilter.close(); if (charset == null) charset = Charset.defaultCharset().toString(); // scrape content - final ContentScraper scraper = new ContentScraper(new DigestURL("http://localhost"), maxLinks, new VocabularyScraper()); + final ContentScraper scraper = new ContentScraper(new DigestURL("http://localhost"), maxLinks, new VocabularyScraper(), timezoneOffset); final Writer writer = new TransformerWriter(null, null, scraper, null, false); FileUtils.copy(new ByteArrayInputStream(page), writer, Charset.forName(charset)); writer.close(); diff --git a/source/net/yacy/document/parser/html/ScraperInputStream.java b/source/net/yacy/document/parser/html/ScraperInputStream.java index b63a56cc4..ae681f97f 100644 --- a/source/net/yacy/document/parser/html/ScraperInputStream.java +++ b/source/net/yacy/document/parser/html/ScraperInputStream.java @@ -64,13 +64,14 @@ public class ScraperInputStream extends InputStream implements ScraperListener { final DigestURL rooturl, final Transformer transformer, final boolean passbyIfBinarySuspect, - final int maxLinks + final int maxLinks, + final int timezoneOffset ) { // create an input stream for buffering this.bufferedIn = new BufferedInputStream(inStream, (int) preBufferSize); this.bufferedIn.mark((int) preBufferSize); - final ContentScraper scraper = new ContentScraper(rooturl, maxLinks, vocabularyScraper); + final ContentScraper scraper = new ContentScraper(rooturl, maxLinks, vocabularyScraper, timezoneOffset); scraper.registerHtmlFilterEventListener(this); try { diff --git a/source/net/yacy/document/parser/htmlParser.java b/source/net/yacy/document/parser/htmlParser.java index db1cf3a23..654716e63 100644 --- a/source/net/yacy/document/parser/htmlParser.java +++
b/source/net/yacy/document/parser/htmlParser.java @@ -87,13 +87,15 @@ public class htmlParser extends AbstractParser implements Parser { public Document[] parse( final AnchorURL location, final String mimeType, - final String documentCharset, final VocabularyScraper vocscraper, + final String documentCharset, + final VocabularyScraper vocscraper, + final int timezoneOffset, final InputStream sourceStream) throws Parser.Failure, InterruptedException { try { // first get a document from the parsed html Charset[] detectedcharsetcontainer = new Charset[]{null}; - final ContentScraper scraper = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, sourceStream, maxLinks); + final ContentScraper scraper = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks); // parseToScraper also detects/corrects/sets charset from html content tag final Document document = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraper); @@ -151,7 +153,7 @@ public class htmlParser extends AbstractParser implements Parser { return ppd; } - public static ContentScraper parseToScraper(final DigestURL location, final String documentCharset, final VocabularyScraper vocabularyScraper, String input, int maxLinks) throws IOException { + public static ContentScraper parseToScraper(final DigestURL location, final String documentCharset, final VocabularyScraper vocabularyScraper, final int timezoneOffset, final String input, final int maxLinks) throws IOException { Charset[] detectedcharsetcontainer = new Charset[]{null}; InputStream sourceStream; try { @@ -161,7 +163,7 @@ public class htmlParser extends AbstractParser implements Parser { } ContentScraper scraper; try { - scraper = parseToScraper(location, documentCharset, vocabularyScraper, detectedcharsetcontainer, sourceStream, maxLinks); + scraper = parseToScraper(location, documentCharset, vocabularyScraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks); } catch (Failure e) { throw new IOException(e.getMessage()); } @@ -173,6 +175,7 @@ public class htmlParser extends AbstractParser implements Parser { final String documentCharset, final VocabularyScraper vocabularyScraper, Charset[] detectedcharsetcontainer, + final int timezoneOffset, InputStream sourceStream, final int maxLinks) throws Parser.Failure, IOException { @@ -188,7 +191,7 @@ public class htmlParser extends AbstractParser implements Parser { if (charset == null) { ScraperInputStream htmlFilter = null; try { - htmlFilter = new ScraperInputStream(sourceStream, documentCharset, vocabularyScraper, location, null, false, maxLinks); + htmlFilter = new ScraperInputStream(sourceStream, documentCharset, vocabularyScraper, location, null, false, maxLinks, timezoneOffset); sourceStream = htmlFilter; charset = htmlFilter.detectCharset(); } catch (final IOException e1) { @@ -222,7 +225,7 @@ public class htmlParser extends AbstractParser implements Parser { } // parsing the content - final ContentScraper scraper = new ContentScraper(location, maxLinks, vocabularyScraper); + final ContentScraper scraper = new ContentScraper(location, maxLinks, vocabularyScraper, timezoneOffset); final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false, Math.max(64, Math.min(4096, sourceStream.available()))); try { FileUtils.copy(sourceStream, writer, detectedcharsetcontainer[0]); @@ -324,7 +327,7 @@ public class htmlParser extends AbstractParser implements Parser { try { url = new 
AnchorURL(args[0]); final byte[] content = url.get(ClientIdentification.yacyInternetCrawlerAgent, null, null); - final Document[] document = new htmlParser().parse(url, "text/html", "utf-8", new VocabularyScraper(), new ByteArrayInputStream(content)); + final Document[] document = new htmlParser().parse(url, "text/html", "utf-8", new VocabularyScraper(), 0, new ByteArrayInputStream(content)); final String title = document[0].dc_title(); System.out.println(title); } catch (final MalformedURLException e) { diff --git a/source/net/yacy/document/parser/images/genericImageParser.java b/source/net/yacy/document/parser/images/genericImageParser.java index db08ac783..4f69b7eb6 100644 --- a/source/net/yacy/document/parser/images/genericImageParser.java +++ b/source/net/yacy/document/parser/images/genericImageParser.java @@ -93,8 +93,10 @@ public class genericImageParser extends AbstractParser implements Parser { public Document[] parse( final AnchorURL location, final String mimeType, - final String documentCharset, final VocabularyScraper scraper, - final InputStream sourceStream) throws Parser.Failure, InterruptedException { + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { ImageInfo ii = null; String title = null; @@ -108,7 +110,7 @@ public class genericImageParser extends AbstractParser implements Parser { if (mimeType.equals("image/bmp") || ext.equals("bmp")) { byte[] b; try { - b = FileUtils.read(sourceStream); + b = FileUtils.read(source); } catch (final IOException e) { ConcurrentLog.logException(e); throw new Parser.Failure(e.getMessage(), location); @@ -126,7 +128,7 @@ public class genericImageParser extends AbstractParser implements Parser { // a tutorial is at: http://www.drewnoakes.com/drewnoakes.com/code/exif/sampleUsage.html byte[] b; try { - b = FileUtils.read(sourceStream); + b = FileUtils.read(source); } catch (final IOException e) { ConcurrentLog.logException(e); throw new Parser.Failure(e.getMessage(), location); @@ -182,7 +184,7 @@ public class genericImageParser extends AbstractParser implements Parser { // just ignore } } else { - ii = parseJavaImage(location, sourceStream); + ii = parseJavaImage(location, source); } final HashSet languages = new HashSet(); @@ -315,7 +317,7 @@ public class genericImageParser extends AbstractParser implements Parser { AnchorURL uri; try { uri = new AnchorURL("http://localhost/" + image.getName()); - final Document[] document = parser.parse(uri, "image/" + MultiProtocolURL.getFileExtension(uri.getFileName()), "UTF-8", new VocabularyScraper(), new FileInputStream(image)); + final Document[] document = parser.parse(uri, "image/" + MultiProtocolURL.getFileExtension(uri.getFileName()), "UTF-8", new VocabularyScraper(), 0, new FileInputStream(image)); System.out.println(document[0].toString()); } catch (final MalformedURLException e) { e.printStackTrace(); diff --git a/source/net/yacy/document/parser/images/metadataImageParser.java b/source/net/yacy/document/parser/images/metadataImageParser.java index eef448faf..04b20b948 100644 --- a/source/net/yacy/document/parser/images/metadataImageParser.java +++ b/source/net/yacy/document/parser/images/metadataImageParser.java @@ -87,8 +87,10 @@ public class metadataImageParser extends AbstractParser implements Parser { public Document[] parse( final AnchorURL location, final String mimeType, - final String documentCharset, final VocabularyScraper scraper, - final InputStream sourceStream) throws 
Parser.Failure, InterruptedException { + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { String title = null; String author = null; @@ -99,7 +101,7 @@ public class metadataImageParser extends AbstractParser implements Parser { StringBuilder imgInfotxt = new StringBuilder(); try { - final Metadata metadata = ImageMetadataReader.readMetadata(new BufferedInputStream(sourceStream)); + final Metadata metadata = ImageMetadataReader.readMetadata(new BufferedInputStream(source)); final Iterator directories = metadata.getDirectories().iterator(); final HashMap props = new HashMap(); @@ -160,7 +162,7 @@ public class metadataImageParser extends AbstractParser implements Parser { return new Document[]{new Document( location, mimeType, - documentCharset, + charset, this, new HashSet(0), // languages keywords == null ? new String[]{} : keywords.split(keywords.indexOf(',') > 0 ? "," : " "), // keywords diff --git a/source/net/yacy/document/parser/linkScraperParser.java b/source/net/yacy/document/parser/linkScraperParser.java index 4c0abbdd4..f0ccbe4d9 100644 --- a/source/net/yacy/document/parser/linkScraperParser.java +++ b/source/net/yacy/document/parser/linkScraperParser.java @@ -59,11 +59,16 @@ public class linkScraperParser extends AbstractParser implements Parser { this.SUPPORTED_MIME_TYPES.add("text/sgml"); } @Override - public Document[] parse(final AnchorURL location, final String mimeType, - final String charset, final VocabularyScraper scraper, final InputStream source) + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { - Document[] htmlParserDocs = new htmlParser().parse(location, mimeType, charset, scraper, source); + Document[] htmlParserDocs = new htmlParser().parse(location, mimeType, charset, scraper, timezoneOffset, source); Document htmlParserDoc = htmlParserDocs == null ? 
null : Document.mergeDocuments(location, mimeType, htmlParserDocs); diff --git a/source/net/yacy/document/parser/mmParser.java b/source/net/yacy/document/parser/mmParser.java index 0781eea3c..686b9cddb 100644 --- a/source/net/yacy/document/parser/mmParser.java +++ b/source/net/yacy/document/parser/mmParser.java @@ -71,8 +71,13 @@ public class mmParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL location, final String mimeType, - final String charset, final VocabularyScraper scraper, final InputStream source) + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { final StringBuilder sb = new StringBuilder(); diff --git a/source/net/yacy/document/parser/odtParser.java b/source/net/yacy/document/parser/odtParser.java index 588d1432d..2f574f0c0 100644 --- a/source/net/yacy/document/parser/odtParser.java +++ b/source/net/yacy/document/parser/odtParser.java @@ -216,7 +216,13 @@ public class odtParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { File dest = null; try { // creating a tempfile diff --git a/source/net/yacy/document/parser/ooxmlParser.java b/source/net/yacy/document/parser/ooxmlParser.java index 6535c95ed..9072938f4 100644 --- a/source/net/yacy/document/parser/ooxmlParser.java +++ b/source/net/yacy/document/parser/ooxmlParser.java @@ -202,7 +202,13 @@ public class ooxmlParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { File dest = null; try { // creating a tempfile diff --git a/source/net/yacy/document/parser/pdfParser.java b/source/net/yacy/document/parser/pdfParser.java index 52df35bba..1a526a6f5 100644 --- a/source/net/yacy/document/parser/pdfParser.java +++ b/source/net/yacy/document/parser/pdfParser.java @@ -86,7 +86,13 @@ public class pdfParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { // check memory for parser if (!MemoryControl.request(200 * 1024 * 1024, false)) @@ -376,7 +382,7 @@ public class pdfParser extends AbstractParser implements Parser { final AbstractParser parser = new pdfParser(); Document 
document = null; try { - document = Document.mergeDocuments(null, "application/pdf", parser.parse(null, "application/pdf", null, new VocabularyScraper(), new FileInputStream(pdfFile))); + document = Document.mergeDocuments(null, "application/pdf", parser.parse(null, "application/pdf", null, new VocabularyScraper(), 0, new FileInputStream(pdfFile))); } catch (final Parser.Failure e) { System.err.println("Cannot parse file " + pdfFile.getAbsolutePath()); ConcurrentLog.logException(e); diff --git a/source/net/yacy/document/parser/pptParser.java b/source/net/yacy/document/parser/pptParser.java index 0f793b0f2..f05cf8dec 100644 --- a/source/net/yacy/document/parser/pptParser.java +++ b/source/net/yacy/document/parser/pptParser.java @@ -64,8 +64,13 @@ public class pptParser extends AbstractParser implements Parser { * all extracted information about the parsed document */ @Override - public Document[] parse(final AnchorURL location, final String mimeType, - final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { try { /* diff --git a/source/net/yacy/document/parser/psParser.java b/source/net/yacy/document/parser/psParser.java index 09cda757e..e25f6439c 100644 --- a/source/net/yacy/document/parser/psParser.java +++ b/source/net/yacy/document/parser/psParser.java @@ -258,8 +258,13 @@ public class psParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL location, final String mimeType, - final String charset, final VocabularyScraper scraper, final InputStream source) + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { File tempFile = null; diff --git a/source/net/yacy/document/parser/rdfParser.java b/source/net/yacy/document/parser/rdfParser.java index 6f3b6fee8..dba55415b 100644 --- a/source/net/yacy/document/parser/rdfParser.java +++ b/source/net/yacy/document/parser/rdfParser.java @@ -46,8 +46,13 @@ public class rdfParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL url, final String mimeType, - final String charset, final VocabularyScraper scraper, final InputStream source) + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Failure, InterruptedException { @@ -60,7 +65,7 @@ public class rdfParser extends AbstractParser implements Parser { Document doc; String all = "rdfdatasource"; - doc = new Document(url, mimeType, charset, null, null, null, singleList(""), "", + doc = new Document(location, mimeType, charset, null, null, null, singleList(""), "", "", null, new ArrayList(0), 0, 0, all, null, null, null, false, new Date()); docs.add(doc); diff --git a/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java b/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java index 2a36f962d..f95cca2ae 100644 --- a/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java +++ b/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java @@ -48,11 +48,16 @@ public class RDFaParser extends 
AbstractParser implements Parser { } @Override - public Document[] parse(AnchorURL url, String mimeType, - String charset, final VocabularyScraper scraper, InputStream source) throws Failure, + public Document[] parse( + final AnchorURL url, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Failure, InterruptedException { - Document[] htmlDocs = parseHtml(url, mimeType, charset, scraper, source); + Document[] htmlDocs = parseHtml(url, mimeType, charset, scraper, timezoneOffset, source); // TODO: current hardcoded restriction: apply rdfa parser only on selected sources. @@ -97,13 +102,18 @@ public class RDFaParser extends AbstractParser implements Parser { return doc; } - private Document[] parseHtml(AnchorURL url, String mimeType, - String charset, VocabularyScraper scraper, InputStream source) throws Failure, + private Document[] parseHtml( + final AnchorURL url, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Failure, InterruptedException { Document[] htmlDocs = null; try { - htmlDocs = this.hp.parse(url, mimeType, charset, scraper, source); + htmlDocs = this.hp.parse(url, mimeType, charset, scraper, timezoneOffset, source); source.reset(); } catch (final IOException e1) { @@ -180,7 +190,7 @@ public class RDFaParser extends AbstractParser implements Parser { if (aReader != null) { RDFaParser aParser = new RDFaParser(); try { - aParser.parse(new AnchorURL(args[0]), "", "", new VocabularyScraper(), aURL.openStream()); + aParser.parse(new AnchorURL(args[0]), "", "", new VocabularyScraper(), 0, aURL.openStream()); } catch (final FileNotFoundException e) { e.printStackTrace(); } catch (final IOException e) { diff --git a/source/net/yacy/document/parser/rssParser.java b/source/net/yacy/document/parser/rssParser.java index f58a14441..7005e85fe 100644 --- a/source/net/yacy/document/parser/rssParser.java +++ b/source/net/yacy/document/parser/rssParser.java @@ -59,14 +59,19 @@ public class rssParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL feedurl, final String mimeType, - final String charset, final VocabularyScraper scraper, final InputStream source) + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Failure, InterruptedException { RSSReader rssReader; try { rssReader = new RSSReader(RSSFeed.DEFAULT_MAXSIZE, source); } catch (final IOException e) { - throw new Parser.Failure("Load error:" + e.getMessage(), feedurl, e); + throw new Parser.Failure("Load error:" + e.getMessage(), location, e); } final RSSFeed feed = rssReader.getFeed(); diff --git a/source/net/yacy/document/parser/rtfParser.java b/source/net/yacy/document/parser/rtfParser.java index 06d7bd5ee..e6ea7d334 100644 --- a/source/net/yacy/document/parser/rtfParser.java +++ b/source/net/yacy/document/parser/rtfParser.java @@ -53,8 +53,13 @@ public class rtfParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL location, final String mimeType, - final String charset, final VocabularyScraper scraper, final InputStream source) + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final 
InputStream source) throws Parser.Failure, InterruptedException { try { diff --git a/source/net/yacy/document/parser/sevenzipParser.java b/source/net/yacy/document/parser/sevenzipParser.java index 5c22533aa..ddfdd8153 100644 --- a/source/net/yacy/document/parser/sevenzipParser.java +++ b/source/net/yacy/document/parser/sevenzipParser.java @@ -56,7 +56,12 @@ public class sevenzipParser extends AbstractParser implements Parser { this.SUPPORTED_MIME_TYPES.add("application/x-7z-compressed"); } - public Document parse(final AnchorURL location, final String mimeType, final String charset, final IInStream source) throws Parser.Failure, InterruptedException { + public Document parse( + final AnchorURL location, + final String mimeType, + final String charset, + final int timezoneOffset, + final IInStream source) throws Parser.Failure, InterruptedException { final Document doc = new Document( location, mimeType, @@ -83,7 +88,7 @@ public class sevenzipParser extends AbstractParser implements Parser { } catch (final IOException e) { throw new Parser.Failure("error opening 7zip archive: " + e.getMessage(), location); } - final SZParserExtractCallback aec = new SZParserExtractCallback(AbstractParser.log, archive, doc, location.getFile()); + final SZParserExtractCallback aec = new SZParserExtractCallback(AbstractParser.log, archive, doc, location.getFile(), timezoneOffset); AbstractParser.log.fine("processing archive contents..."); try { archive.Extract(null, -1, 0, aec); @@ -101,16 +106,27 @@ public class sevenzipParser extends AbstractParser implements Parser { } } - public Document parse(final AnchorURL location, final String mimeType, final String charset, final byte[] source) throws Parser.Failure, InterruptedException { - return parse(location, mimeType, charset, new ByteArrayIInStream(source)); + public Document parse( + final AnchorURL location, + final String mimeType, + final String charset, + final int timezoneOffset, + final byte[] source) throws Parser.Failure, InterruptedException { + return parse(location, mimeType, charset, timezoneOffset, new ByteArrayIInStream(source)); } @Override - public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { try { final ByteArrayOutputStream cfos = new ByteArrayOutputStream(); FileUtils.copy(source, cfos); - return new Document[]{parse(location, mimeType, charset, cfos.toByteArray())}; + return new Document[]{parse(location, mimeType, charset, timezoneOffset, cfos.toByteArray())}; } catch (final IOException e) { throw new Parser.Failure("error processing 7zip archive: " + e.getMessage(), location); } @@ -124,13 +140,19 @@ public class sevenzipParser extends AbstractParser implements Parser { private ByteArrayOutputStream cfos = null; private final Document doc; private final String prefix; + private final int timezoneOffset; - public SZParserExtractCallback(final ConcurrentLog logger, final IInArchive handler, - final Document doc, final String prefix) { + public SZParserExtractCallback( + final ConcurrentLog logger, + final IInArchive handler, + final Document doc, + final String prefix, + final int timezoneOffset) { super.Init(handler); this.log = logger; this.doc = doc; 
this.prefix = prefix; + this.timezoneOffset = timezoneOffset; } @Override @@ -172,7 +194,7 @@ public class sevenzipParser extends AbstractParser implements Parser { // below for reversion of the effects final AnchorURL url = AnchorURL.newAnchor(this.doc.dc_source(), this.prefix + "/" + super.filePath); final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1)); - theDocs = TextParser.parseSource(url, mime, null, new VocabularyScraper(), this.doc.getDepth() + 1, this.cfos.toByteArray()); + theDocs = TextParser.parseSource(url, mime, null, new VocabularyScraper(), timezoneOffset, this.doc.getDepth() + 1, this.cfos.toByteArray()); this.doc.addSubDocuments(theDocs); } diff --git a/source/net/yacy/document/parser/sidAudioParser.java b/source/net/yacy/document/parser/sidAudioParser.java index 4f1cbf5c1..1eb216a3b 100644 --- a/source/net/yacy/document/parser/sidAudioParser.java +++ b/source/net/yacy/document/parser/sidAudioParser.java @@ -58,8 +58,13 @@ public class sidAudioParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL location, final String mimeType, - final String charset, final VocabularyScraper scraper, final InputStream source) + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { try { final int available = source.available(); diff --git a/source/net/yacy/document/parser/sitemapParser.java b/source/net/yacy/document/parser/sitemapParser.java index ecc5eb393..11742179f 100644 --- a/source/net/yacy/document/parser/sitemapParser.java +++ b/source/net/yacy/document/parser/sitemapParser.java @@ -70,8 +70,13 @@ public class sitemapParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL url, final String mimeType, - final String charset, final VocabularyScraper scraper, final InputStream source) + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Failure, InterruptedException { final List docs = new ArrayList(); SitemapReader sitemap = new SitemapReader(source, ClientIdentification.yacyInternetCrawlerAgent); @@ -83,7 +88,7 @@ public class sitemapParser extends AbstractParser implements Parser { uri = new DigestURL(item.loc); doc = new Document( uri, - TextParser.mimeOf(url), + TextParser.mimeOf(location), charset, this, null, @@ -224,7 +229,7 @@ public class sitemapParser extends AbstractParser implements Parser { public Date lastmod(final Date dflt) { try { - return ISO8601Formatter.FORMATTER.parse(this.lastmod); + return ISO8601Formatter.FORMATTER.parse(this.lastmod, 0).getTime(); } catch (final ParseException e) { return dflt; } @@ -245,7 +250,7 @@ public class sitemapParser extends AbstractParser implements Parser { public Date lastmod(final Date dflt) { try { - return ISO8601Formatter.FORMATTER.parse(this.lastmod); + return ISO8601Formatter.FORMATTER.parse(this.lastmod, 0).getTime(); } catch (final ParseException e) { return dflt; } diff --git a/source/net/yacy/document/parser/swfParser.java b/source/net/yacy/document/parser/swfParser.java index ac1c9c2ce..502782b3b 100644 --- a/source/net/yacy/document/parser/swfParser.java +++ b/source/net/yacy/document/parser/swfParser.java @@ -56,8 +56,13 @@ public class 
swfParser extends AbstractParser implements Parser { * all extracted information about the parsed document */ @Override - public Document[] parse(final AnchorURL location, final String mimeType, - final String charset, final VocabularyScraper scraper, final InputStream source) + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { diff --git a/source/net/yacy/document/parser/tarParser.java b/source/net/yacy/document/parser/tarParser.java index e9bdb96bc..52a84e296 100644 --- a/source/net/yacy/document/parser/tarParser.java +++ b/source/net/yacy/document/parser/tarParser.java @@ -62,16 +62,22 @@ public class tarParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL url, final String mimeType, final String charset, final VocabularyScraper scraper, InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + InputStream source) throws Parser.Failure, InterruptedException { final List docacc = new ArrayList(); Document[] subDocs = null; - final String ext = MultiProtocolURL.getFileExtension(url.getFileName()); + final String ext = MultiProtocolURL.getFileExtension(location.getFileName()); if (ext.equals("gz") || ext.equals("tgz")) { try { source = new GZIPInputStream(source); } catch (final IOException e) { - throw new Parser.Failure("tar parser: " + e.getMessage(), url); + throw new Parser.Failure("tar parser: " + e.getMessage(), location); } } TarEntry entry; @@ -91,7 +97,7 @@ public class tarParser extends AbstractParser implements Parser { try { tmp = FileUtils.createTempFile(this.getClass(), name); FileUtils.copy(tis, tmp, entry.getSize()); - subDocs = TextParser.parseSource(AnchorURL.newAnchor(url, "#" + name), mime, null, scraper, 999, tmp); + subDocs = TextParser.parseSource(AnchorURL.newAnchor(location, "#" + name), mime, null, scraper, timezoneOffset, 999, tmp); if (subDocs == null) continue; for (final Document d: subDocs) docacc.add(d); } catch (final Parser.Failure e) { diff --git a/source/net/yacy/document/parser/torrentParser.java b/source/net/yacy/document/parser/torrentParser.java index abe9caed4..3b096ebf1 100644 --- a/source/net/yacy/document/parser/torrentParser.java +++ b/source/net/yacy/document/parser/torrentParser.java @@ -57,7 +57,13 @@ public class torrentParser extends AbstractParser implements Parser { } @Override - public Document[] parse(AnchorURL location, String mimeType, String charset, final VocabularyScraper scraper, InputStream source) + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { byte[] b = null; try { @@ -120,8 +126,8 @@ public class torrentParser extends AbstractParser implements Parser { try { byte[] b = FileUtils.read(new File(args[0])); torrentParser parser = new torrentParser(); - Document[] d = parser.parse(new AnchorURL("http://localhost/test.torrent"), null, "UTF-8", new VocabularyScraper(), new ByteArrayInputStream(b)); - Condenser c = new Condenser(d[0], null, true, true, LibraryProvider.dymLib, false, false); + Document[] d = parser.parse(new 
AnchorURL("http://localhost/test.torrent"), null, "UTF-8", new VocabularyScraper(), 0, new ByteArrayInputStream(b)); + Condenser c = new Condenser(d[0], null, true, true, LibraryProvider.dymLib, false, false, 0); Map<String, Word> w = c.words(); for (Map.Entry<String, Word> e: w.entrySet()) System.out.println("Word: " + e.getKey() + " - " + e.getValue().posInText); } catch (final IOException e) { diff --git a/source/net/yacy/document/parser/vcfParser.java b/source/net/yacy/document/parser/vcfParser.java index 107e89feb..f4c4120e2 100644 --- a/source/net/yacy/document/parser/vcfParser.java +++ b/source/net/yacy/document/parser/vcfParser.java @@ -66,7 +66,13 @@ public class vcfParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL url, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { try { @@ -201,7 +207,7 @@ public class vcfParser extends AbstractParser implements Parser { } else { if (AbstractParser.log.isFinest()) AbstractParser.log.finest("Invalid data in vcf file" + - "\n\tURL: " + url + + "\n\tURL: " + location + "\n\tLine: " + line + "\n\tLine-Nr: " + lineNr); } @@ -212,7 +218,7 @@ public class vcfParser extends AbstractParser implements Parser { final byte[] text = UTF8.getBytes(parsedDataText.toString()); final List<String> descriptions = new ArrayList<String>(1); descriptions.add("vCard"); return new Document[]{new Document( - url, // url of the source document + location, // url of the source document mimeType, // the documents mime type null, // charset this, @@ -234,7 +240,7 @@ public class vcfParser extends AbstractParser implements Parser { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e; - throw new Parser.Failure("Unexpected error while parsing vcf resource. " + e.getMessage(),url); + throw new Parser.Failure("Unexpected error while parsing vcf resource.
" + e.getMessage(), location); } } diff --git a/source/net/yacy/document/parser/vsdParser.java b/source/net/yacy/document/parser/vsdParser.java index 9e53f1085..16290f363 100644 --- a/source/net/yacy/document/parser/vsdParser.java +++ b/source/net/yacy/document/parser/vsdParser.java @@ -67,7 +67,13 @@ public class vsdParser extends AbstractParser implements Parser { * all extracted information about the parsed document */ @Override - public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { Document theDoc = null; diff --git a/source/net/yacy/document/parser/xlsParser.java b/source/net/yacy/document/parser/xlsParser.java index 40c925493..cf178c85e 100644 --- a/source/net/yacy/document/parser/xlsParser.java +++ b/source/net/yacy/document/parser/xlsParser.java @@ -68,8 +68,13 @@ public class xlsParser extends AbstractParser implements Parser { * all extracted information about the parsed document */ @Override - public Document[] parse(final AnchorURL location, final String mimeType, - final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { return new XLSHSSFListener().parse(location, mimeType, charset, source); } diff --git a/source/net/yacy/document/parser/zipParser.java b/source/net/yacy/document/parser/zipParser.java index 2438354f1..a924a6e03 100644 --- a/source/net/yacy/document/parser/zipParser.java +++ b/source/net/yacy/document/parser/zipParser.java @@ -62,12 +62,17 @@ public class zipParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL url, final String mimeType, - final String charset, final VocabularyScraper scraper, final InputStream source) + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { // check memory for parser if (!MemoryControl.request(200 * 1024 * 1024, false)) - throw new Parser.Failure("Not enough Memory available for zip parser: " + MemoryControl.available(), url); + throw new Parser.Failure("Not enough Memory available for zip parser: " + MemoryControl.available(), location); Document[] docs = null; final List docacc = new ArrayList(); @@ -88,9 +93,9 @@ public class zipParser extends AbstractParser implements Parser { try { tmp = FileUtils.createTempFile(this.getClass(), name); FileUtils.copy(zis, tmp, entry.getSize()); - final DigestURL virtualURL = DigestURL.newURL(url, "#" + name); + final DigestURL virtualURL = DigestURL.newURL(location, "#" + name); //this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false)); - docs = TextParser.parseSource(new AnchorURL(virtualURL), mime, null, scraper, 999, tmp); + docs = TextParser.parseSource(new AnchorURL(virtualURL), mime, null, scraper, timezoneOffset, 999, tmp); if (docs == null) continue; for (final Document d: docs) docacc.add(d); } catch (final 
Parser.Failure e) { diff --git a/source/net/yacy/http/ProxyCacheHandler.java b/source/net/yacy/http/ProxyCacheHandler.java index d5417b641..50a200025 100644 --- a/source/net/yacy/http/ProxyCacheHandler.java +++ b/source/net/yacy/http/ProxyCacheHandler.java @@ -74,7 +74,8 @@ public class ProxyCacheHandler extends AbstractRemoteHandler implements Handler "", cachedResponseHeader.lastModified(), sb.crawler.defaultProxyProfile.handle(), - 0); + 0, + sb.crawler.defaultProxyProfile.timezoneOffset()); final Response cachedResponse = new Response( yacyRequest, diff --git a/source/net/yacy/http/ProxyHandler.java b/source/net/yacy/http/ProxyHandler.java index 2658e031a..d558aac0a 100644 --- a/source/net/yacy/http/ProxyHandler.java +++ b/source/net/yacy/http/ProxyHandler.java @@ -180,7 +180,8 @@ public class ProxyHandler extends AbstractRemoteHandler implements Handler { "", responseHeaderLegacy.lastModified(), sb.crawler.defaultProxyProfile.handle(), - 0); //sizeBeforeDelete < 0 ? 0 : sizeBeforeDelete); + 0, + sb.crawler.defaultProxyProfile.timezoneOffset()); //sizeBeforeDelete < 0 ? 0 : sizeBeforeDelete); final Response yacyResponse = new Response( yacyRequest, null, diff --git a/source/net/yacy/http/servlets/SolrSelectServlet.java b/source/net/yacy/http/servlets/SolrSelectServlet.java index 36fc7aa80..bba3de81f 100644 --- a/source/net/yacy/http/servlets/SolrSelectServlet.java +++ b/source/net/yacy/http/servlets/SolrSelectServlet.java @@ -137,7 +137,7 @@ public class SolrSelectServlet extends HttpServlet { if (!mmsp.getMap().containsKey(CommonParams.Q) && mmsp.getMap().containsKey(CommonParams.QUERY)) { querystring = mmsp.get(CommonParams.QUERY, ""); mmsp.getMap().remove(CommonParams.QUERY); - QueryModifier modifier = new QueryModifier(); + QueryModifier modifier = new QueryModifier(0); querystring = modifier.parse(querystring); modifier.apply(mmsp); QueryGoal qg = new QueryGoal(querystring); diff --git a/source/net/yacy/kelondro/blob/ArrayStack.java b/source/net/yacy/kelondro/blob/ArrayStack.java index bf2e1d781..f577cc119 100644 --- a/source/net/yacy/kelondro/blob/ArrayStack.java +++ b/source/net/yacy/kelondro/blob/ArrayStack.java @@ -172,7 +172,7 @@ public class ArrayStack implements BLOB { f.delete(); deletions = true; } else try { - d = GenericFormatter.SHORT_SECOND_FORMATTER.parse(file.substring(0, 14)); + d = GenericFormatter.SHORT_SECOND_FORMATTER.parse(file.substring(0, 14), 0).getTime(); f.renameTo(newBLOB(d)); deletions = true; } catch (final ParseException e) {continue;} @@ -188,7 +188,7 @@ public class ArrayStack implements BLOB { for (final String file : files) { if (file.length() >= 22 && file.charAt(this.prefix.length()) == '.' && file.endsWith(".blob")) { try { - d = my_SHORT_MILSEC_FORMATTER.parse(file.substring(this.prefix.length() + 1, this.prefix.length() + 18)); + d = my_SHORT_MILSEC_FORMATTER.parse(file.substring(this.prefix.length() + 1, this.prefix.length() + 18), 0).getTime(); time = d.getTime(); if (time > maxtime) maxtime = time; } catch (final ParseException e) {continue;} @@ -199,7 +199,7 @@ public class ArrayStack implements BLOB { for (final String file : files) { if (file.length() >= 22 && file.charAt(this.prefix.length()) == '.' 
&& file.endsWith(".blob")) { try { - d = my_SHORT_MILSEC_FORMATTER.parse(file.substring(this.prefix.length() + 1, this.prefix.length() + 18)); + d = my_SHORT_MILSEC_FORMATTER.parse(file.substring(this.prefix.length() + 1, this.prefix.length() + 18), 0).getTime(); f = new File(heapLocation, file); time = d.getTime(); try { @@ -253,7 +253,7 @@ public class ArrayStack implements BLOB { public synchronized void mountBLOB(final File location, final boolean full) throws IOException { Date d; try { - d = my_SHORT_MILSEC_FORMATTER.parse(location.getName().substring(this.prefix.length() + 1, this.prefix.length() + 18)); + d = my_SHORT_MILSEC_FORMATTER.parse(location.getName().substring(this.prefix.length() + 1, this.prefix.length() + 18), 0).getTime(); } catch (final ParseException e) { throw new IOException("date parse problem with file " + location.toString() + ": " + e.getMessage()); } diff --git a/source/net/yacy/kelondro/blob/BEncodedHeapBag.java b/source/net/yacy/kelondro/blob/BEncodedHeapBag.java index 3b7ae63c9..1c55cb8d3 100644 --- a/source/net/yacy/kelondro/blob/BEncodedHeapBag.java +++ b/source/net/yacy/kelondro/blob/BEncodedHeapBag.java @@ -95,7 +95,7 @@ public class BEncodedHeapBag extends AbstractMapStore implements MapStore { (element.length() == this.prefix.length() + 23)) { f = new File(this.baseDir, element); try { - d = GenericFormatter.SHORT_MILSEC_FORMATTER.parse(element.substring(this.prefix.length() + 1, this.prefix.length() + 18)); + d = GenericFormatter.SHORT_MILSEC_FORMATTER.parse(element.substring(this.prefix.length() + 1, this.prefix.length() + 18), 0).getTime(); } catch (final ParseException e) { ConcurrentLog.severe("BEncodedHeapBag", "", e); continue; @@ -203,7 +203,7 @@ public class BEncodedHeapBag extends AbstractMapStore implements MapStore { final String name = heap.getFile().getName(); long d; try { - d = GenericFormatter.SHORT_MILSEC_FORMATTER.parse(name.substring(this.prefix.length() + 1, this.prefix.length() + 18)).getTime(); + d = GenericFormatter.SHORT_MILSEC_FORMATTER.parse(name.substring(this.prefix.length() + 1, this.prefix.length() + 18), 0).getTime().getTime(); } catch (final ParseException e) { ConcurrentLog.severe("BEncodedHeapBag", "", e); d = 0; diff --git a/source/net/yacy/kelondro/blob/Tables.java b/source/net/yacy/kelondro/blob/Tables.java index 7dc399db8..bd9de5329 100644 --- a/source/net/yacy/kelondro/blob/Tables.java +++ b/source/net/yacy/kelondro/blob/Tables.java @@ -764,7 +764,7 @@ public class Tables implements Iterable { final byte[] r = this.get(colname); if (r == null) return dflt; try { - return my_SHORT_MILSEC_FORMATTER.parse(UTF8.String(r)); + return my_SHORT_MILSEC_FORMATTER.parse(UTF8.String(r), 0).getTime(); } catch (final ParseException e) { return dflt; } diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java index 17f2c772d..cbe6ccc52 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java @@ -107,17 +107,17 @@ public class URIMetadataNode extends SolrDocument { final GenericFormatter formatter = new GenericFormatter(GenericFormatter.FORMAT_SHORT_DAY, GenericFormatter.time_minute); try { - this.setField(CollectionSchema.last_modified.name(), formatter.parse(prop.getProperty("mod", "20000101"))); + this.setField(CollectionSchema.last_modified.name(), formatter.parse(prop.getProperty("mod", "20000101"), 0).getTime()); } catch (final ParseException e) { 
diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java
index 17f2c772d..cbe6ccc52 100644
--- a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java
+++ b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java
@@ -107,17 +107,17 @@ public class URIMetadataNode extends SolrDocument {
         final GenericFormatter formatter = new GenericFormatter(GenericFormatter.FORMAT_SHORT_DAY, GenericFormatter.time_minute);
         try {
-            this.setField(CollectionSchema.last_modified.name(), formatter.parse(prop.getProperty("mod", "20000101")));
+            this.setField(CollectionSchema.last_modified.name(), formatter.parse(prop.getProperty("mod", "20000101"), 0).getTime());
         } catch (final ParseException e) {
             this.setField(CollectionSchema.last_modified.name(), new Date());
         }
         try {
-            this.setField(CollectionSchema.load_date_dt.name(), formatter.parse(prop.getProperty("load", "20000101")));
+            this.setField(CollectionSchema.load_date_dt.name(), formatter.parse(prop.getProperty("load", "20000101"), 0).getTime());
         } catch (final ParseException e) {
             this.setField(CollectionSchema.load_date_dt.name(), new Date());
         }
         try {
-            this.setField(CollectionSchema.fresh_date_dt.name(), formatter.parse(prop.getProperty("fresh", "20000101")));
+            this.setField(CollectionSchema.fresh_date_dt.name(), formatter.parse(prop.getProperty("fresh", "20000101"), 0).getTime());
         } catch (final ParseException e) {
             this.setField(CollectionSchema.fresh_date_dt.name(), new Date());
         }
diff --git a/source/net/yacy/kelondro/table/SplitTable.java b/source/net/yacy/kelondro/table/SplitTable.java
index a70c0ff1f..ca8bbf90e 100644
--- a/source/net/yacy/kelondro/table/SplitTable.java
+++ b/source/net/yacy/kelondro/table/SplitTable.java
@@ -179,7 +179,7 @@ public class SplitTable implements Index, Iterable<Row.Entry> {
                 (element.length() == this.prefix.length() + 24)) {
                 f = new File(this.path, element);
                 try {
-                    d = GenericFormatter.SHORT_MILSEC_FORMATTER.parse(element.substring(this.prefix.length() + 1, this.prefix.length() + 18));
+                    d = GenericFormatter.SHORT_MILSEC_FORMATTER.parse(element.substring(this.prefix.length() + 1, this.prefix.length() + 18), 0).getTime();
                 } catch (final ParseException e) {
                     ConcurrentLog.severe("SplitTable", "", e);
                     continue;
@@ -372,7 +372,7 @@ public class SplitTable implements Index, Iterable<Row.Entry> {
         final String name = new File(table.filename()).getName();
         long d;
         try {
-            d = GenericFormatter.SHORT_MILSEC_FORMATTER.parse(name.substring(this.prefix.length() + 1, this.prefix.length() + 18)).getTime();
+            d = GenericFormatter.SHORT_MILSEC_FORMATTER.parse(name.substring(this.prefix.length() + 1, this.prefix.length() + 18), 0).getTime().getTime();
         } catch (final ParseException e) {
             ConcurrentLog.severe("SplitTable", "", e);
             d = 0;
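ArrayStack, BEncodedHeapBag and SplitTable all recover a creation date from file names that embed a timestamp between the table prefix and the file suffix; the substring arithmetic above cuts out the 17-character millisecond pattern yyyyMMddHHmmssSSS. A sketch of that naming convention, with an invented file name:

    import java.text.ParseException;
    import java.util.Date;

    import net.yacy.cora.date.GenericFormatter;

    public final class BlobNameExample {
        public static void main(final String[] args) throws ParseException {
            final String prefix = "index";                       // illustrative prefix
            final String name = "index.20150415131723123.blob";  // illustrative file name
            // prefix, one dot, then 17 characters of yyyyMMddHHmmssSSS:
            final String stamp = name.substring(prefix.length() + 1, prefix.length() + 18);
            final Date d = GenericFormatter.SHORT_MILSEC_FORMATTER.parse(stamp, 0).getTime();
            System.out.println(d);
        }
    }
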
diff --git a/source/net/yacy/peers/NewsDB.java b/source/net/yacy/peers/NewsDB.java
index f6926d512..e2dbbde26 100644
--- a/source/net/yacy/peers/NewsDB.java
+++ b/source/net/yacy/peers/NewsDB.java
@@ -46,6 +46,8 @@ package net.yacy.peers;
 
 import java.io.File;
 import java.io.IOException;
+import java.text.ParseException;
+import java.util.Calendar;
 import java.util.Date;
 import java.util.HashMap;
 import java.util.Iterator;
@@ -164,10 +166,16 @@ public class NewsDB {
 
     private Record b2r(final Row.Entry b) {
         if (b == null) return null;
+        Calendar c;
+        try {
+            c = b.empty(2) ? null : my_SHORT_SECOND_FORMATTER.parse(b.getColASCII(2), 0);
+        } catch (ParseException e) {
+            c = null;
+        }
         return new NewsDB.Record(
             b.getPrimaryKeyASCII(),
             b.getColUTF8(1),
-            (b.empty(2)) ? null : my_SHORT_SECOND_FORMATTER.parse(b.getColASCII(2), GenericFormatter.UTCDiffString()),
+            c == null ? null : c.getTime(),
             (int) b.getColLong(3),
             MapTools.string2map(b.getColUTF8(4), ",")
         );
@@ -226,8 +234,8 @@ public class NewsDB {
 
     public class Record {
 
         private final String originator; // hash of originating peer
-        private final Date created;      // Date when news was created by originator
-        private final Date received;     // Date when news was received here at this peer
+        private Date created;            // Date when news was created by originator
+        private Date received;           // Date when news was received here at this peer
         private final String category;   // keyword that addresses possible actions
         private int distributed;         // counter that counts number of distributions of this news record
         private final Map<String, String> attributes; // elements of the news for a special category
@@ -238,8 +246,16 @@ public class NewsDB {
             if (this.attributes.toString().length() > NewsDB.this.attributesMaxLength) throw new IllegalArgumentException("attributes length (" + this.attributes.toString().length() + ") exceeds maximum (" + NewsDB.this.attributesMaxLength + ")");
             this.category = (this.attributes.containsKey("cat")) ? this.attributes.get("cat") : "";
             if (this.category.length() > NewsDB.categoryStringLength) throw new IllegalArgumentException("category length (" + this.category.length() + ") exceeds maximum (" + NewsDB.categoryStringLength + ")");
-            this.received = (this.attributes.containsKey("rec")) ? my_SHORT_SECOND_FORMATTER.parse(this.attributes.get("rec"), GenericFormatter.UTCDiffString()) : new Date();
-            this.created = (this.attributes.containsKey("cre")) ? my_SHORT_SECOND_FORMATTER.parse(this.attributes.get("cre"), GenericFormatter.UTCDiffString()) : new Date();
+            try {
+                this.received = (this.attributes.containsKey("rec")) ? my_SHORT_SECOND_FORMATTER.parse(this.attributes.get("rec"), 0).getTime() : new Date();
+            } catch (ParseException e) {
+                this.received = new Date();
+            }
+            try {
+                this.created = (this.attributes.containsKey("cre")) ? my_SHORT_SECOND_FORMATTER.parse(this.attributes.get("cre"), 0).getTime() : new Date();
+            } catch (ParseException e) {
+                this.created = new Date();
+            }
             this.distributed = (this.attributes.containsKey("dis")) ? Integer.parseInt(this.attributes.get("dis")) : 0;
             this.originator = (this.attributes.containsKey("ori")) ? this.attributes.get("ori") : "";
             removeStandards();
@@ -262,7 +278,11 @@ public class NewsDB {
             if (attributes.toString().length() > NewsDB.this.attributesMaxLength) throw new IllegalArgumentException("attributes length (" + attributes.toString().length() + ") exceeds maximum (" + NewsDB.this.attributesMaxLength + ")");
             this.attributes = attributes;
             this.received = received;
-            this.created = my_SHORT_SECOND_FORMATTER.parse(id.substring(0, GenericFormatter.PATTERN_SHORT_SECOND.length()), GenericFormatter.UTCDiffString());
+            try {
+                this.created = my_SHORT_SECOND_FORMATTER.parse(id.substring(0, GenericFormatter.PATTERN_SHORT_SECOND.length()), 0).getTime();
+            } catch (ParseException e) {
+                this.created = new Date();
+            }
             this.category = category;
             this.distributed = distributed;
             this.originator = id.substring(GenericFormatter.PATTERN_SHORT_SECOND.length());
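Because parse now throws a checked ParseException instead of silently applying a server-local correction, NewsDB repeats the same try/catch fallback three times. The repetition could be folded into a small helper; a hypothetical sketch, not part of the patch:

    import java.text.ParseException;
    import java.util.Date;

    import net.yacy.cora.date.GenericFormatter;

    final class ParseOrNow {
        /** Parse s as a UTC timestamp; fall back to 'now' on malformed input. */
        static Date parseOrNow(final GenericFormatter formatter, final String s) {
            try {
                return formatter.parse(s, 0).getTime();
            } catch (final ParseException e) {
                return new Date();
            }
        }
    }
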
this.attributes.get("ori") : ""; removeStandards(); @@ -262,7 +278,11 @@ public class NewsDB { if (attributes.toString().length() > NewsDB.this.attributesMaxLength) throw new IllegalArgumentException("attributes length (" + attributes.toString().length() + ") exceeds maximum (" + NewsDB.this.attributesMaxLength + ")"); this.attributes = attributes; this.received = received; - this.created = my_SHORT_SECOND_FORMATTER.parse(id.substring(0, GenericFormatter.PATTERN_SHORT_SECOND.length()), GenericFormatter.UTCDiffString()); + try { + this.created = my_SHORT_SECOND_FORMATTER.parse(id.substring(0, GenericFormatter.PATTERN_SHORT_SECOND.length()), 0).getTime(); + } catch (ParseException e) { + this.created = new Date(); + } this.category = category; this.distributed = distributed; this.originator = id.substring(GenericFormatter.PATTERN_SHORT_SECOND.length()); diff --git a/source/net/yacy/peers/Seed.java b/source/net/yacy/peers/Seed.java index 64c5f9938..c7e44bf79 100644 --- a/source/net/yacy/peers/Seed.java +++ b/source/net/yacy/peers/Seed.java @@ -797,7 +797,7 @@ public class Seed implements Cloneable, Comparable, Comparator try { final GenericFormatter my_SHORT_SECOND_FORMATTER = new GenericFormatter(GenericFormatter.FORMAT_SHORT_SECOND, GenericFormatter.time_second); // use our own formatter to prevent concurrency locks with other processes - final long t = my_SHORT_SECOND_FORMATTER.parse(get(Seed.LASTSEEN, "20040101000000")).getTime(); + final long t = my_SHORT_SECOND_FORMATTER.parse(get(Seed.LASTSEEN, "20040101000000"), 0).getTime().getTime(); // getTime creates a UTC time number. But in this case java thinks, that the given // time string is a local time, which has a local UTC offset applied. // Therefore java subtracts the local UTC offset, to get a UTC number. @@ -831,7 +831,7 @@ public class Seed implements Cloneable, Comparable, Comparator try { final GenericFormatter my_SHORT_SECOND_FORMATTER = new GenericFormatter(GenericFormatter.FORMAT_SHORT_SECOND, GenericFormatter.time_second); // use our own formatter to prevent concurrency locks with other processes - b = my_SHORT_SECOND_FORMATTER.parse(get(Seed.BDATE, "20040101000000")).getTime(); + b = my_SHORT_SECOND_FORMATTER.parse(get(Seed.BDATE, "20040101000000"), 0).getTime().getTime(); } catch (final ParseException e ) { b = System.currentTimeMillis(); } diff --git a/source/net/yacy/peers/graphics/WebStructureGraph.java b/source/net/yacy/peers/graphics/WebStructureGraph.java index d6b7f3139..5c3bea554 100644 --- a/source/net/yacy/peers/graphics/WebStructureGraph.java +++ b/source/net/yacy/peers/graphics/WebStructureGraph.java @@ -503,7 +503,7 @@ public class WebStructureGraph { hr = new HostReference( ASCII.getBytes(sentry.hosthash), - GenericFormatter.SHORT_DAY_FORMATTER.parse(sentry.date).getTime(), + GenericFormatter.SHORT_DAY_FORMATTER.parse(sentry.date, 0).getTime().getTime(), refhosthashandcounter.getValue().intValue()); } catch (final ParseException e ) { continue refloop; diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java index 84a01a08d..1da658f65 100644 --- a/source/net/yacy/repository/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -112,21 +112,24 @@ public final class LoaderDispatcher { final boolean forText, final boolean global ) { + CrawlProfile profile = + (forText) ? + ((global) ? + this.sb.crawler.defaultTextSnippetGlobalProfile : + this.sb.crawler.defaultTextSnippetLocalProfile) + : + ((global) ? 
+ this.sb.crawler.defaultMediaSnippetGlobalProfile : + this.sb.crawler.defaultMediaSnippetLocalProfile); return new Request( ASCII.getBytes(this.sb.peers.mySeed().hash), url, null, "", new Date(), - (forText) ? - ((global) ? - this.sb.crawler.defaultTextSnippetGlobalProfile.handle() : - this.sb.crawler.defaultTextSnippetLocalProfile.handle()) - : - ((global) ? - this.sb.crawler.defaultMediaSnippetGlobalProfile.handle() : - this.sb.crawler.defaultMediaSnippetLocalProfile.handle()), // crawl profile - 0); + profile.handle(), + 0, + profile.timezoneOffset()); } public void load(final DigestURL url, final CacheStrategy cacheStratgy, final int maxFileSize, final File targetFile, BlacklistType blacklistType, ClientIdentification.Agent agent) throws IOException { @@ -407,7 +410,7 @@ public final class LoaderDispatcher { * @return a map from URLs to the anchor texts of the urls * @throws IOException */ - public final Map loadLinks(final AnchorURL url, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException { + public final Map loadLinks(final AnchorURL url, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final ClientIdentification.Agent agent, final int timezoneOffset) throws IOException { final Response response = load(request(url, true, false), cacheStrategy, Integer.MAX_VALUE, blacklistType, agent); if (response == null) throw new IOException("response == null"); final ResponseHeader responseHeader = response.getResponseHeader(); @@ -418,7 +421,7 @@ public final class LoaderDispatcher { final String supportError = TextParser.supports(url, responseHeader.mime()); if (supportError != null) throw new IOException("no parser support: " + supportError); try { - documents = TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), response.profile().scraper(), response.depth(), response.getContent()); + documents = TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), response.profile().scraper(), timezoneOffset, response.depth(), response.getContent()); if (documents == null) throw new IOException("document == null"); } catch (final Exception e) { throw new IOException("parser error: " + e.getMessage()); diff --git a/source/net/yacy/search/EventTracker.java b/source/net/yacy/search/EventTracker.java index 2479e285c..bba8b335f 100644 --- a/source/net/yacy/search/EventTracker.java +++ b/source/net/yacy/search/EventTracker.java @@ -152,7 +152,7 @@ public class EventTracker { } public long getTime() { if (this.time instanceof String) try { - return GenericFormatter.SHORT_SECOND_FORMATTER.parse((String) this.time).getTime(); + return GenericFormatter.SHORT_SECOND_FORMATTER.parse((String) this.time, 0).getTime().getTime(); } catch (ParseException e) { return -1L; } @@ -162,7 +162,7 @@ public class EventTracker { } public Date getDate() { if (this.time instanceof String) try { - return GenericFormatter.SHORT_SECOND_FORMATTER.parse((String) this.time); + return GenericFormatter.SHORT_SECOND_FORMATTER.parse((String) this.time, 0).getTime(); } catch (ParseException e) { return null; }if (this.time instanceof Long) return new Date((Long) this.time); diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index ad9724d44..c1b29eb95 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -1942,7 +1942,8 @@ public final class Switchboard extends serverSwitch { "", 
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index ad9724d44..c1b29eb95 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -1942,7 +1942,8 @@ public final class Switchboard extends serverSwitch {
                 "",
                 surrogate.getDate(),
                 this.crawler.defaultSurrogateProfile.handle(),
-                0);
+                0,
+                this.crawler.defaultSurrogateProfile.timezoneOffset());
             response = new Response(request, null, null, this.crawler.defaultSurrogateProfile, false, null);
             final IndexingQueueEntry queueEntry = new IndexingQueueEntry(response, new Document[] {document}, null);
@@ -2571,6 +2572,7 @@ public final class Switchboard extends serverSwitch {
                     response.getMimeType(),
                     response.getCharacterEncoding(),
                     response.profile().scraper(),
+                    response.profile().timezoneOffset(),
                     response.depth(),
                     response.getContent());
             if ( documents == null ) {
@@ -2673,7 +2675,8 @@ public final class Switchboard extends serverSwitch {
                         nextEntry.getValue(),
                         new Date(),
                         response.profile().handle(),
-                        nextdepth));
+                        nextdepth,
+                        response.profile().timezoneOffset()));
                 } catch (final MalformedURLException e ) {
                     ConcurrentLog.logException(e);
                 }
@@ -2754,7 +2757,8 @@ public final class Switchboard extends serverSwitch {
                     in.documents[i], in.queueEntry.profile().scraper(), in.queueEntry.profile().indexText(), in.queueEntry.profile().indexMedia(),
                     LibraryProvider.dymLib, true,
-                    this.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.dates_in_content_dts));
+                    this.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.dates_in_content_dts),
+                    profile.timezoneOffset());
 
             // update image result list statistics
             // its good to do this concurrently here, because it needs a DNS lookup
@@ -3043,7 +3047,15 @@ public final class Switchboard extends serverSwitch {
             int p = userInfo == null ? -1 : userInfo.indexOf(':');
             String user = userInfo == null ? FTPClient.ANONYMOUS : userInfo.substring(0, p);
             String pw = userInfo == null || p == -1 ? "anomic" : userInfo.substring(p + 1);
-            this.crawlStacker.enqueueEntriesFTP(this.peers.mySeed().hash.getBytes(), profile.handle(), url.getHost(), url.getPort(), user, pw, false);
+            this.crawlStacker.enqueueEntriesFTP(
+                    this.peers.mySeed().hash.getBytes(),
+                    profile.handle(),
+                    url.getHost(),
+                    url.getPort(),
+                    user,
+                    pw,
+                    false,
+                    profile.timezoneOffset());
             return null;
         } catch (final Exception e) {
             // mist
@@ -3080,7 +3092,8 @@ public final class Switchboard extends serverSwitch {
                 "CRAWLING-ROOT",
                 new Date(),
                 profile.handle(),
-                0
+                0,
+                profile.timezoneOffset()
                 ));
         if (reasonString != null) return reasonString;
@@ -3134,7 +3147,7 @@ public final class Switchboard extends serverSwitch {
      * @throws IOException
      * @throws Parser.Failure
      */
-    public void addToIndex(final Collection<DigestURL> urls, final SearchEvent searchEvent, final String heuristicName, final Map<String, Pattern> collections, boolean doublecheck) {
+    public void addToIndex(final Collection<DigestURL> urls, final SearchEvent searchEvent, final String heuristicName, final Map<String, Pattern> collections, final boolean doublecheck) {
         Map<String, DigestURL> urlmap = new HashMap<String, DigestURL>();
         for (DigestURL url: urls) urlmap.put(ASCII.String(url.hash()), url);
         if (searchEvent != null) {
@@ -3192,7 +3205,7 @@ public final class Switchboard extends serverSwitch {
                     }
                     final Condenser condenser = new Condenser(
                             document, null, true, true, LibraryProvider.dymLib, true,
-                            Switchboard.this.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.dates_in_content_dts));
+                            Switchboard.this.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.dates_in_content_dts), searchEvent.query.timezoneOffset);
                     ResultImages.registerImages(url, document, true);
                     Switchboard.this.webStructure.generateCitationReference(url, document);
                     storeDocumentIndex(
@@ -3546,7 +3559,7 @@ public final class Switchboard extends serverSwitch {
                 final Map<AnchorURL, String> links;
                 searchEvent.oneFeederStarted();
                 try {
-                    links = Switchboard.this.loader.loadLinks(url, CacheStrategy.NOCACHE, BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent);
+                    links = Switchboard.this.loader.loadLinks(url, CacheStrategy.NOCACHE, BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent, searchEvent.query.timezoneOffset);
                     if ( links != null ) {
                         final Iterator<AnchorURL> i = links.keySet().iterator();
                         while ( i.hasNext() ) {
@@ -3585,7 +3598,7 @@ public final class Switchboard extends serverSwitch {
                 final Map<AnchorURL, String> links;
                 DigestURL url;
                 try {
-                    links = Switchboard.this.loader.loadLinks(startUrl, CacheStrategy.IFFRESH, BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent);
+                    links = Switchboard.this.loader.loadLinks(startUrl, CacheStrategy.IFFRESH, BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent, 0);
                     if (links != null) {
                         if (links.size() < 1000) { // limit to 1000 to skip large index pages
                             final Iterator<AnchorURL> i = links.keySet().iterator();
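Every crawl entry point above now forwards the profile's timezone offset as the final argument of the Request constructor. A hedged sketch of the extended constructor, with argument roles inferred from the call sites in this patch; 'sb' and 'profile' are the usual Switchboard and CrawlProfile objects:

    final Request request = new Request(
            ASCII.getBytes(sb.peers.mySeed().hash), // initiator peer hash
            url,                                    // DigestURL to be crawled
            null,                                   // hash of the referrer URL
            "CRAWLING-ROOT",                        // name of this entry
            new Date(),                             // appearance/load date
            profile.handle(),                       // crawl profile handle
            0,                                      // crawl depth
            profile.timezoneOffset());              // content timezone in minutes
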
diff --git a/source/net/yacy/search/index/DocumentIndex.java b/source/net/yacy/search/index/DocumentIndex.java
index a8ef16402..aa805c4b7 100644
--- a/source/net/yacy/search/index/DocumentIndex.java
+++ b/source/net/yacy/search/index/DocumentIndex.java
@@ -61,18 +61,27 @@ public class DocumentIndex extends Segment {
         } catch (final MalformedURLException e ) {
         }
     }
 
-    BlockingQueue<AnchorURL> queue; // a queue of document ID's
+    private BlockingQueue<AnchorURL> queue; // a queue of document ID's
     private final Worker[] worker;
-    CallbackListener callback;
+    private CallbackListener callback;
+    private int timezoneOffset;
 
     static final ThreadGroup workerThreadGroup = new ThreadGroup("workerThreadGroup");
 
-    public DocumentIndex(final File segmentPath, final File archivePath, final File collectionConfigurationPath, final File webgraphConfigurationPath, final CallbackListener callback, final int cachesize)
+    public DocumentIndex(
+            final File segmentPath,
+            final File archivePath,
+            final File collectionConfigurationPath,
+            final File webgraphConfigurationPath,
+            final CallbackListener callback,
+            final int cachesize,
+            final int timezoneOffset)
             throws IOException {
         super(new ConcurrentLog("DocumentIndex"), segmentPath, archivePath,
                 collectionConfigurationPath == null ? null : new CollectionConfiguration(collectionConfigurationPath, true),
                 webgraphConfigurationPath == null ? null : new WebgraphConfiguration(webgraphConfigurationPath, true)
                 );
+        this.timezoneOffset = timezoneOffset;
         super.connectRWI(cachesize, targetFileSize * 4 - 1);
         super.connectCitation(cachesize, targetFileSize * 4 - 1);
         super.fulltext().connectLocalSolr();
@@ -99,7 +108,7 @@ public class DocumentIndex extends Segment {
             try {
                 while ( (f = DocumentIndex.this.queue.take()) != poison ) {
                     try {
-                        resultRows = add(f);
+                        resultRows = add(f, DocumentIndex.this.timezoneOffset);
                         for ( final SolrInputDocument resultRow : resultRows ) {
                             if ( DocumentIndex.this.callback != null ) {
                                 if ( resultRow == null ) {
@@ -132,7 +141,7 @@ public class DocumentIndex extends Segment {
         this.queue.clear();
     }
 
-    private SolrInputDocument[] add(final AnchorURL url) throws IOException {
+    private SolrInputDocument[] add(final AnchorURL url, final int timezoneOffset) throws IOException {
         if ( url == null ) {
             throw new IOException("file = null");
         }
@@ -150,7 +159,7 @@ public class DocumentIndex extends Segment {
             length = -1;
         }
         try {
-            documents = TextParser.parseSource(url, null, null, new VocabularyScraper(), 0, length, url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent, null, null));
+            documents = TextParser.parseSource(url, null, null, new VocabularyScraper(), timezoneOffset, 0, length, url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent, null, null));
         } catch (final Exception e ) {
             throw new IOException("cannot parse " + url.toNormalform(false) + ": " + e.getMessage());
         }
@@ -159,7 +168,7 @@ public class DocumentIndex extends Segment {
         int c = 0;
         for ( final Document document : documents ) {
             if (document == null) continue;
-            final Condenser condenser = new Condenser(document, null, true, true, LibraryProvider.dymLib, true, true);
+            final Condenser condenser = new Condenser(document, null, true, true, LibraryProvider.dymLib, true, true, 0);
             rows[c++] = super.storeDocument(
                     url,
diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java
index b5bd460e2..236be7537 100644
--- a/source/net/yacy/search/index/Segment.java
+++ b/source/net/yacy/search/index/Segment.java
@@ -761,7 +761,7 @@ public class Segment {
         }
         // get the word set
         Set<String> words = null;
-        words = new Condenser(document, null, true, true, null, false, false).words().keySet();
+        words = new Condenser(document, null, true, true, null, false, false, 0).words().keySet();
 
         // delete all word references
         int count = 0;
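DocumentIndex keeps the offset as instance state because its worker threads pick URLs from the queue long after the constructor ran. An illustrative instantiation of the extended constructor; the paths and cache size are placeholders, and the callback may be null when no notification is needed:

    // Throws IOException on initialization failure.
    final DocumentIndex di = new DocumentIndex(
            new File("DATA/INDEX"),    // segmentPath (placeholder)
            new File("DATA/ARCHIVE"),  // archivePath (placeholder)
            null,                      // collectionConfigurationPath
            null,                      // webgraphConfigurationPath
            null,                      // CallbackListener
            100,                       // cachesize
            0);                        // timezoneOffset: document dates read as UTC
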
diff --git a/source/net/yacy/search/query/AccessTracker.java b/source/net/yacy/search/query/AccessTracker.java
index 07d379873..b050ee4ae 100644
--- a/source/net/yacy/search/query/AccessTracker.java
+++ b/source/net/yacy/search/query/AccessTracker.java
@@ -315,7 +315,7 @@ public class AccessTracker {
         byte[] b = new byte[GenericFormatter.PATTERN_SHORT_SECOND.length()];
         raf.readFully(b);
         try {
-            return GenericFormatter.SHORT_SECOND_FORMATTER.parse(UTF8.String(b));
+            return GenericFormatter.SHORT_SECOND_FORMATTER.parse(UTF8.String(b), 0).getTime();
         } catch (ParseException e) {
             throw new IOException(e.getMessage());
         }
@@ -326,8 +326,8 @@ public class AccessTracker {
         String file = args[0];
         Date from;
         try {
-            from = GenericFormatter.SHORT_SECOND_FORMATTER.parse(args[1]);
-            Date to = GenericFormatter.SHORT_SECOND_FORMATTER.parse(args[2]);
+            from = GenericFormatter.SHORT_SECOND_FORMATTER.parse(args[1], 0).getTime();
+            Date to = GenericFormatter.SHORT_SECOND_FORMATTER.parse(args[2], 0).getTime();
             List<EventTracker.Event> dump = readLog(new File(file), from, to);
             for (EventTracker.Event s: dump) System.out.println(s.toString());
         } catch (ParseException e) {
diff --git a/source/net/yacy/search/query/QueryModifier.java b/source/net/yacy/search/query/QueryModifier.java
index e7daf4acb..0cb0f6942 100644
--- a/source/net/yacy/search/query/QueryModifier.java
+++ b/source/net/yacy/search/query/QueryModifier.java
@@ -41,8 +41,10 @@ public class QueryModifier {
 
     private final StringBuilder modifier;
     public String sitehost, sitehash, filetype, protocol, language, author, collection, on, from, to;
+    public int timezoneOffset;
 
-    public QueryModifier() {
+    public QueryModifier(final int timezoneOffset) {
+        this.timezoneOffset = timezoneOffset;
         this.sitehash = null;
         this.sitehost = null;
         this.filetype = null;
@@ -274,19 +276,19 @@ public class QueryModifier {
 
         if (fq.indexOf(CollectionSchema.dates_in_content_dts.getSolrFieldName()) < 0) {
 
             if (this.on != null && this.on.length() > 0) {
-                fq.append(" AND ").append(QueryModifier.parseOnExpression(this.on));
+                fq.append(" AND ").append(QueryModifier.parseOnExpression(this.on, this.timezoneOffset));
             }
 
             if (this.from != null && this.from.length() > 0 && (this.to == null || this.to.equals("*"))) {
-                fq.append(" AND ").append(QueryModifier.parseFromToExpression(this.from, null));
+                fq.append(" AND ").append(QueryModifier.parseFromToExpression(this.from, null, this.timezoneOffset));
             }
 
             if ((this.from == null || this.from.equals("*")) && this.to != null && this.to.length() > 0) {
-                fq.append(" AND ").append(QueryModifier.parseFromToExpression(null, this.to));
+                fq.append(" AND ").append(QueryModifier.parseFromToExpression(null, this.to, this.timezoneOffset));
             }
 
             if (this.from != null && this.from.length() > 0 && this.to != null && this.to.length() > 0) {
-                fq.append(" AND ").append(QueryModifier.parseFromToExpression(this.from, this.to));
+                fq.append(" AND ").append(QueryModifier.parseFromToExpression(this.from, this.to, this.timezoneOffset));
             }
         }
@@ -348,9 +350,9 @@ public class QueryModifier {
         return fq.toString();
     }
 
-    public static String parseOnExpression(String onDescription) {
+    public static String parseOnExpression(final String onDescription, final int timezoneOffset) {
         assert onDescription != null;
-        Date onDate = DateDetection.parseLine(onDescription);
+        Date onDate = DateDetection.parseLine(onDescription, timezoneOffset);
         StringBuilder filterQuery = new StringBuilder(20);
         if (onDate != null) {
             @SuppressWarnings({ "deprecation", "static-access" })
@@ -360,9 +362,9 @@ public class QueryModifier {
         return filterQuery.toString();
     }
 
-    public static String parseFromToExpression(String from, String to) {
-        Date fromDate = from == null || from.equals("*") ? null : DateDetection.parseLine(from);
-        Date toDate = to == null || to.equals("*") ? null : DateDetection.parseLine(to);
+    public static String parseFromToExpression(final String from, final String to, final int timezoneOffset) {
+        Date fromDate = from == null || from.equals("*") ? null : DateDetection.parseLine(from, timezoneOffset);
+        Date toDate = to == null || to.equals("*") ? null : DateDetection.parseLine(to, timezoneOffset);
         StringBuilder filterQuery = new StringBuilder(20);
         if (fromDate != null && toDate != null) {
             @SuppressWarnings({ "deprecation", "static-access" })
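The two static helpers now shift user-entered day expressions by the searcher's offset before building the Solr range filter on dates_in_content_dts. A usage fragment; the date strings and the sign of the offset are examples, since DateDetection.parseLine accepts free-form day descriptions and the offset convention is assumed to match what the browser reports:

    // A browser two hours east of UTC reports -120 in the JavaScript
    // Date.getTimezoneOffset() convention, assumed here:
    final int tz = -120;
    final String onFilter = QueryModifier.parseOnExpression("15.04.2015", tz);
    final String rangeFilter = QueryModifier.parseFromToExpression("1.4.2015", "15.4.2015", tz);
    // Both strings are fragments meant to be AND-combined into the Solr
    // filter query, as the fq.append(" AND ") calls above do.
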
null : DateDetection.parseLine(to, timezoneOffset); StringBuilder filterQuery = new StringBuilder(20); if (fromDate != null && toDate != null) { @SuppressWarnings({ "deprecation", "static-access" }) diff --git a/source/net/yacy/search/query/QueryParams.java b/source/net/yacy/search/query/QueryParams.java index d99d524b9..5adfbc0dc 100644 --- a/source/net/yacy/search/query/QueryParams.java +++ b/source/net/yacy/search/query/QueryParams.java @@ -70,7 +70,6 @@ import org.apache.lucene.util.automaton.Automata; import org.apache.lucene.util.automaton.Automaton; import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.SolrQuery.SortClause; -import org.apache.solr.common.params.CommonParams; import org.apache.solr.common.params.DisMaxParams; import org.apache.solr.common.params.FacetParams; import org.apache.solr.schema.TrieDateField; @@ -146,6 +145,7 @@ public final class QueryParams { public LinkedHashSet facetfields; private SolrQuery cachedQuery; private CollectionConfiguration solrSchema; + public final int timezoneOffset; public QueryParams( final QueryGoal queryGoal, @@ -154,6 +154,7 @@ public final class QueryParams { final String prefer, final ContentDomain contentdom, final String language, + final int timezoneOffset, final Collection metatags, final CacheStrategy snippetCacheStrategy, final int itemsPerPage, @@ -183,6 +184,7 @@ public final class QueryParams { this.ranking = ranking; this.maxDistance = maxDistance; this.contentdom = contentdom; + this.timezoneOffset = timezoneOffset; this.itemsPerPage = Math.min((specialRights) ? 10000 : 1000, itemsPerPage); this.offset = Math.max(0, Math.min((specialRights) ? 10000 - this.itemsPerPage : 1000 - this.itemsPerPage, offset)); try { @@ -527,19 +529,19 @@ public final class QueryParams { if (this.solrSchema.contains(CollectionSchema.dates_in_content_dts)) { if (this.modifier.on != null && this.modifier.on.length() > 0) { - fqs.add(QueryModifier.parseOnExpression(this.modifier.on)); + fqs.add(QueryModifier.parseOnExpression(this.modifier.on, this.timezoneOffset)); } if (this.modifier.from != null && this.modifier.from.length() > 0 && (this.modifier.to == null || this.modifier.to.equals("*"))) { - fqs.add(QueryModifier.parseFromToExpression(this.modifier.from, null)); + fqs.add(QueryModifier.parseFromToExpression(this.modifier.from, null, this.timezoneOffset)); } if ((this.modifier.from == null || this.modifier.from.equals("*")) && this.modifier.to != null && this.modifier.to.length() > 0) { - fqs.add(QueryModifier.parseFromToExpression(null, this.modifier.to)); + fqs.add(QueryModifier.parseFromToExpression(null, this.modifier.to, this.timezoneOffset)); } if (this.modifier.from != null && this.modifier.from.length() > 0 && this.modifier.to != null && this.modifier.to.length() > 0) { - fqs.add(QueryModifier.parseFromToExpression(this.modifier.from, this.modifier.to)); + fqs.add(QueryModifier.parseFromToExpression(this.modifier.from, this.modifier.to, this.timezoneOffset)); } } diff --git a/source/net/yacy/server/http/HTTPDProxyHandler.java b/source/net/yacy/server/http/HTTPDProxyHandler.java index 3463a0552..533b37dc3 100644 --- a/source/net/yacy/server/http/HTTPDProxyHandler.java +++ b/source/net/yacy/server/http/HTTPDProxyHandler.java @@ -358,7 +358,8 @@ public final class HTTPDProxyHandler { "", cachedResponseHeader.lastModified(), sb.crawler.defaultProxyProfile.handle(), - 0); + 0, + sb.crawler.defaultProxyProfile.timezoneOffset()); final Response response = new Response( request, requestHeader, @@ -473,8 +474,8 
diff --git a/source/net/yacy/server/http/HTTPDProxyHandler.java b/source/net/yacy/server/http/HTTPDProxyHandler.java
index 3463a0552..533b37dc3 100644
--- a/source/net/yacy/server/http/HTTPDProxyHandler.java
+++ b/source/net/yacy/server/http/HTTPDProxyHandler.java
@@ -358,7 +358,8 @@ public final class HTTPDProxyHandler {
                 "",
                 cachedResponseHeader.lastModified(),
                 sb.crawler.defaultProxyProfile.handle(),
-                0);
+                0,
+                sb.crawler.defaultProxyProfile.timezoneOffset());
             final Response response = new Response(
                 request,
                 requestHeader,
@@ -473,8 +474,8 @@ public final class HTTPDProxyHandler {
                 "",
                 responseHeader.lastModified(),
                 sb.crawler.defaultProxyProfile.handle(),
-                0);
-
+                0,
+                sb.crawler.defaultProxyProfile.timezoneOffset());
             // handle incoming cookies
             handleIncomingCookies(responseHeader, host, ip);