diff --git a/htroot/CrawlStartExpert.html b/htroot/CrawlStartExpert.html index a68125815..90b83ef59 100644 --- a/htroot/CrawlStartExpert.html +++ b/htroot/CrawlStartExpert.html @@ -513,7 +513,7 @@
- Index Administration + Index Attributes
Indexing
@@ -561,6 +561,17 @@
+
+
+ info + The time zone is required when the parser detects a date in a crawled web page. Content can then be searched with the on: modifier, which + also requires a time zone when the query is made. To normalize all given dates, each date is stored in the UTC time zone. To get the right offset + from dates without time zones to UTC, that offset must be given here, in minutes; + time zone offsets for locations east of UTC must be negative, and offsets for zones west of UTC must be positive. + + +
+
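The info text above defines the offset convention: the value is the number of minutes to add to a local wall-clock time to reach UTC, so zones east of UTC get negative values (UTC+1 → -60) and zones west of UTC get positive ones (UTC-5 → +300). A minimal stand-alone sketch of that normalization, mirroring the steps the patched GenericFormatter.parse performs; the class and method names here are illustrative, not part of the patch:

```java
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Locale;
import java.util.TimeZone;

// Illustrative only: normalize a zone-less wall-clock string to UTC
// using the minute-offset convention described in the info text.
public class OffsetDemo {
    private static final TimeZone UTC = TimeZone.getTimeZone("UTC");

    public static Calendar parseToUTC(String s, int timezoneOffset) throws ParseException {
        SimpleDateFormat format = new SimpleDateFormat("yyyyMMddHHmmss", Locale.US);
        format.setTimeZone(UTC);                  // read the digits as if they were UTC
        Calendar cal = Calendar.getInstance(UTC);
        cal.setTime(format.parse(s));
        cal.add(Calendar.MINUTE, timezoneOffset); // -60 shifts a UTC+1 wall clock back to UTC
        return cal;
    }

    public static void main(String[] args) throws ParseException {
        // 12:00 on the wall clock in a UTC+1 zone (offset -60) is 11:00 UTC
        System.out.println(parseToUTC("20150101120000", -60).getTime());
    }
}
```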
diff --git a/htroot/CrawlStartSite.html b/htroot/CrawlStartSite.html index dddbc4ff2..8127e7770 100644 --- a/htroot/CrawlStartSite.html +++ b/htroot/CrawlStartSite.html @@ -91,6 +91,7 @@ + diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index 9a6e786de..8b0e39801 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -470,6 +470,8 @@ public class Crawler_p { } } + int timezoneOffset = post.getInt("timezoneOffset", 0); + // prepare a new crawling profile final CrawlProfile profile; byte[] handle; @@ -502,7 +504,8 @@ public class Crawler_p { cachePolicy, collection, agentName, - new VocabularyScraper(vocabulary_scraper)); + new VocabularyScraper(vocabulary_scraper), + timezoneOffset); handle = ASCII.getBytes(profile.handle()); // before we fire up a new crawl, we make sure that another crawl with the same name is not running @@ -585,7 +588,7 @@ public class Crawler_p { try { // check if the crawl filter works correctly Pattern.compile(newcrawlingMustMatch); - final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new VocabularyScraper()); + final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new VocabularyScraper(), timezoneOffset); final Writer writer = new TransformerWriter(null, null, scraper, null, false); if (crawlingFile != null && crawlingFile.exists()) { FileUtils.copy(new FileInputStream(crawlingFile), writer); @@ -605,7 +608,7 @@ public class Crawler_p { } sb.crawler.putActive(handle, profile); - sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks); + sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks, profile.timezoneOffset()); } catch (final PatternSyntaxException e) { prop.put("info", "4"); // crawlfilter does not match url prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch); diff --git a/htroot/HostBrowser.java b/htroot/HostBrowser.java index 61629bb50..bb63d90a3 100644 --- a/htroot/HostBrowser.java +++ b/htroot/HostBrowser.java @@ -161,7 +161,8 @@ public class HostBrowser { sb.peers.mySeed().hash.getBytes(), url, null, load, new Date(), sb.crawler.defaultProxyProfile.handle(), - 0 + 0, + sb.crawler.defaultProxyProfile.timezoneOffset() )); prop.putHTML("result", reasonString == null ? 
("added url to indexer: " + load) : ("not indexed url '" + load + "': " + reasonString)); if (wait) waitloop: for (int i = 0; i < 30; i++) { diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java index 98c8c317b..a7e13a0b8 100644 --- a/htroot/IndexControlRWIs_p.java +++ b/htroot/IndexControlRWIs_p.java @@ -637,11 +637,12 @@ public class IndexControlRWIs_p { final QueryGoal qg = new QueryGoal(queryhashes, null); final QueryParams query = new QueryParams( qg, - new QueryModifier(), + new QueryModifier(0), Integer.MAX_VALUE, "", ContentDomain.ALL, "", //lang + 0, //timezoneOffset null, CacheStrategy.IFFRESH, 1000, 0, //count, offset diff --git a/htroot/NetworkHistory.java b/htroot/NetworkHistory.java index ef7c329df..cc723ef89 100644 --- a/htroot/NetworkHistory.java +++ b/htroot/NetworkHistory.java @@ -74,7 +74,7 @@ public class NetworkHistory { while (rowi.hasNext()) { Row row = rowi.next(); String d = ASCII.String(row.getPK()); - Date date = GenericFormatter.SHORT_MINUTE_FORMATTER.parse(d); + Date date = GenericFormatter.SHORT_MINUTE_FORMATTER.parse(d, 0).getTime(); if (date.getTime() < timelimit) break; statrow = new HashMap<>(); for (String key: columns) { diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java index 106b10151..2b0b599b8 100644 --- a/htroot/QuickCrawlLink_p.java +++ b/htroot/QuickCrawlLink_p.java @@ -128,7 +128,8 @@ public class QuickCrawlLink_p { final byte[] urlhash = crawlingStartURL.hash(); indexSegment.fulltext().remove(urlhash); sb.crawlQueues.noticeURL.removeByURLHash(urlhash); - + int timezoneOffset = post.getInt("timezoneOffset", 0); + // create crawling profile CrawlProfile pe = null; try { @@ -156,7 +157,8 @@ public class QuickCrawlLink_p { CacheStrategy.IFFRESH, collection, ClientIdentification.yacyIntranetCrawlerAgentName, - null); + null, + timezoneOffset); sb.crawler.putActive(pe.handle().getBytes(), pe); } catch (final Exception e) { // mist @@ -175,7 +177,8 @@ public class QuickCrawlLink_p { (title==null)?"CRAWLING-ROOT":title, new Date(), pe.handle(), - 0 + 0, + pe.timezoneOffset() )); // validate rejection reason diff --git a/htroot/api/bookmarks/posts/get.java b/htroot/api/bookmarks/posts/get.java index f95cd391c..fabc9b38b 100644 --- a/htroot/api/bookmarks/posts/get.java +++ b/htroot/api/bookmarks/posts/get.java @@ -39,7 +39,7 @@ public class get { Date parsedDate = null; try { - parsedDate = ISO8601Formatter.FORMATTER.parse(date); + parsedDate = ISO8601Formatter.FORMATTER.parse(date, 0).getTime(); } catch (final ParseException e) { parsedDate = new Date(); } diff --git a/htroot/api/push_p.java b/htroot/api/push_p.java index a78e1d776..84689af62 100644 --- a/htroot/api/push_p.java +++ b/htroot/api/push_p.java @@ -103,7 +103,8 @@ public class push_p { "", // the name of the document to crawl new Date(), // current date profile.handle(), // the name of the prefetch profile. This must not be null! 
- 0); // forkfactor sum of anchors of all ancestors + 0, // forkfactor sum of anchors of all ancestors + profile.timezoneOffset()); Response response = new Response( request, requestHeader, diff --git a/htroot/api/timeline_p.java b/htroot/api/timeline_p.java index 9a129edbc..b9e4991b0 100644 --- a/htroot/api/timeline_p.java +++ b/htroot/api/timeline_p.java @@ -75,8 +75,8 @@ public final class timeline_p { // get a time period Date fromDate = new Date(0); Date toDate = new Date(); - try {fromDate = GenericFormatter.SHORT_SECOND_FORMATTER.parse(post.get("from", "20031215182700"));} catch (ParseException e) {} - try {toDate = GenericFormatter.SHORT_SECOND_FORMATTER.parse(post.get("to", GenericFormatter.SHORT_SECOND_FORMATTER.format(new Date())));} catch (ParseException e) {} + try {fromDate = GenericFormatter.SHORT_SECOND_FORMATTER.parse(post.get("from", "20031215182700"), 0).getTime();} catch (ParseException e) {} + try {toDate = GenericFormatter.SHORT_SECOND_FORMATTER.parse(post.get("to", GenericFormatter.SHORT_SECOND_FORMATTER.format(new Date())), 0).getTime();} catch (ParseException e) {} // get latest dump; AccessTracker.dumpLog(); diff --git a/htroot/index.html b/htroot/index.html index b92b46652..ba3a2544e 100644 --- a/htroot/index.html +++ b/htroot/index.html @@ -80,6 +80,7 @@ + :: diff --git a/htroot/rct_p.java b/htroot/rct_p.java index e32092485..4fb381ac0 100644 --- a/htroot/rct_p.java +++ b/htroot/rct_p.java @@ -78,7 +78,8 @@ public class rct_p { "REMOTE-CRAWLING", loaddate, sb.crawler.defaultRemoteProfile.handle(), - 0)); + 0, + sb.crawler.defaultRemoteProfile.timezoneOffset())); } else { env.getLog().warn("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason); } diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index 58dae90da..a5ce1170b 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -118,7 +118,8 @@ public final class search { final String prefer = post.get("prefer", ""); final String contentdom = post.get("contentdom", "all"); final String filter = post.get("filter", ".*"); // a filter on the url - QueryModifier modifier = new QueryModifier(); + final int timezoneOffset = post.getInt("timezoneOffset", 0); + QueryModifier modifier = new QueryModifier(timezoneOffset); modifier.sitehost = post.get("sitehost", ""); if (modifier.sitehost.isEmpty()) modifier.sitehost = null; modifier.sitehash = post.get("sitehash", ""); if (modifier.sitehash.isEmpty()) modifier.sitehash = null; modifier.author = post.get("author", ""); if (modifier.author.isEmpty()) modifier.author = null; @@ -232,6 +233,7 @@ public final class search { prefer, ContentDomain.contentdomParser(contentdom), language, + timezoneOffset, new HashSet(), null, // no snippet computation count, @@ -297,6 +299,7 @@ public final class search { prefer, ContentDomain.contentdomParser(contentdom), language, + timezoneOffset, new HashSet(), null, // no snippet computation count, diff --git a/htroot/yacy/transferURL.java b/htroot/yacy/transferURL.java index 4b042376f..980bd276a 100644 --- a/htroot/yacy/transferURL.java +++ b/htroot/yacy/transferURL.java @@ -55,7 +55,7 @@ public final class transferURL { public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) { final long start = System.currentTimeMillis(); long freshdate = 0; - try {freshdate = GenericFormatter.SHORT_DAY_FORMATTER.parse("20061101").getTime();} catch (final ParseException e1) {} + try {freshdate = 
GenericFormatter.SHORT_DAY_FORMATTER.parse("20061101", 0).getTime().getTime();} catch (final ParseException e1) {} // return variable that accumulates replacements final Switchboard sb = (Switchboard) env; diff --git a/htroot/yacysearch.html b/htroot/yacysearch.html index afaf443ea..c9ba12167 100644 --- a/htroot/yacysearch.html +++ b/htroot/yacysearch.html @@ -108,6 +108,7 @@ Use the RSS search result format to add static searches to your RSS reader, if y + diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index a27aaf109..8494ab05e 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -214,6 +214,9 @@ public class yacysearch { prop.setOutgoingHeader(outgoingHeader); } + // time zone + int timezoneOffset = post.getInt("timezoneOffset", 0); + // collect search attributes int itemsPerPage = @@ -359,7 +362,7 @@ public class yacysearch { } final RankingProfile ranking = sb.getRanking(); - final QueryModifier modifier = new QueryModifier(); + final QueryModifier modifier = new QueryModifier(timezoneOffset); querystring = modifier.parse(querystring); if (modifier.sitehost != null && modifier.sitehost.length() > 0 && querystring.length() == 0) querystring = "*"; // allow to search for all documents on a host @@ -643,6 +646,7 @@ public class yacysearch { prefermask, contentdom, language, + timezoneOffset, metatags, snippetFetchStrategy, itemsPerPage, diff --git a/htroot/yacysearchtrailer.java b/htroot/yacysearchtrailer.java index f0b445056..a0f39d297 100644 --- a/htroot/yacysearchtrailer.java +++ b/htroot/yacysearchtrailer.java @@ -390,9 +390,9 @@ public class yacysearchtrailer { navigatorIterator = theSearch.dateNavigator.iterator(); // this iterator is different as it iterates by the key order (which is a date order) int i = 0, pos = 0, neg = 0; long dx = -1; - Date fromconstraint = theSearch.getQuery().modifier.from == null ? null : DateDetection.parseLine(theSearch.getQuery().modifier.from); + Date fromconstraint = theSearch.getQuery().modifier.from == null ? null : DateDetection.parseLine(theSearch.getQuery().modifier.from, theSearch.getQuery().timezoneOffset); if (fromconstraint == null) fromconstraint = new Date(System.currentTimeMillis() - AbstractFormatter.normalyearMillis); - Date toconstraint = theSearch.getQuery().modifier.to == null ? null : DateDetection.parseLine(theSearch.getQuery().modifier.to); + Date toconstraint = theSearch.getQuery().modifier.to == null ? 
null : DateDetection.parseLine(theSearch.getQuery().modifier.to, theSearch.getQuery().timezoneOffset); if (toconstraint == null) toconstraint = new Date(System.currentTimeMillis() + AbstractFormatter.normalyearMillis); while (i < QueryParams.FACETS_DATE_MAXCOUNT && navigatorIterator.hasNext()) { name = navigatorIterator.next().trim(); diff --git a/source/net/yacy/cora/date/AbstractFormatter.java b/source/net/yacy/cora/date/AbstractFormatter.java index 2a54df377..932fae059 100644 --- a/source/net/yacy/cora/date/AbstractFormatter.java +++ b/source/net/yacy/cora/date/AbstractFormatter.java @@ -25,13 +25,19 @@ package net.yacy.cora.date; import java.text.ParseException; +import java.util.Calendar; import java.util.Date; import java.util.TimeZone; public abstract class AbstractFormatter implements DateFormatter { - protected static final TimeZone TZ_GMT = TimeZone.getTimeZone("GMT"); - + public final static Calendar testCalendar = Calendar.getInstance(); // a calendar in the current time zone of the server + public final static Calendar UTCCalendar = Calendar.getInstance(); + public final static TimeZone UTCtimeZone = TimeZone.getTimeZone("UTC"); + static { + UTCCalendar.setTimeZone(UTCtimeZone); + } + // statics public final static long secondMillis = 1000; public final static long minuteMillis = 60 * secondMillis; @@ -45,7 +51,7 @@ public abstract class AbstractFormatter implements DateFormatter { protected String last_format; @Override - public abstract Date parse(String s) throws ParseException; + public abstract Calendar parse(String s, int timezoneOffset) throws ParseException; @Override public abstract String format(final Date date); @Override diff --git a/source/net/yacy/cora/date/DateFormatter.java b/source/net/yacy/cora/date/DateFormatter.java index 0e1e2e787..f929534d1 100644 --- a/source/net/yacy/cora/date/DateFormatter.java +++ b/source/net/yacy/cora/date/DateFormatter.java @@ -25,11 +25,12 @@ package net.yacy.cora.date; import java.text.ParseException; +import java.util.Calendar; import java.util.Date; public interface DateFormatter { - public Date parse(String s) throws ParseException; + public Calendar parse(String s, int timezoneOffset) throws ParseException; public String format(final Date date); public String format(); diff --git a/source/net/yacy/cora/date/GenericFormatter.java b/source/net/yacy/cora/date/GenericFormatter.java index e824f383d..16c6084d2 100644 --- a/source/net/yacy/cora/date/GenericFormatter.java +++ b/source/net/yacy/cora/date/GenericFormatter.java @@ -30,6 +30,7 @@ import java.text.SimpleDateFormat; import java.util.Calendar; import java.util.Date; import java.util.Locale; +import java.util.TimeZone; import net.yacy.cora.util.NumberTools; @@ -51,14 +52,11 @@ public class GenericFormatter extends AbstractFormatter implements DateFormatter public static final SimpleDateFormat FORMAT_ANSIC = new SimpleDateFormat(PATTERN_ANSIC, Locale.US); public static final SimpleDateFormat FORMAT_SIMPLE = new SimpleDateFormat(PATTERN_SIMPLE, Locale.US); - // find out time zone and DST offset - private static Calendar thisCalendar = Calendar.getInstance(); - static { // we want GMT times on the formats as well as they don't support any timezone - FORMAT_SHORT_DAY.setTimeZone(TZ_GMT); - FORMAT_SHORT_SECOND.setTimeZone(TZ_GMT); - FORMAT_SHORT_MILSEC.setTimeZone(TZ_GMT); + FORMAT_SHORT_DAY.setTimeZone(UTCtimeZone); + FORMAT_SHORT_SECOND.setTimeZone(UTCtimeZone); + FORMAT_SHORT_MILSEC.setTimeZone(UTCtimeZone); } public static final long time_second = 1000L; @@ -124,56 +122,55 @@ 
public class GenericFormatter extends AbstractFormatter implements DateFormatter * the String. */ @Override - public Date parse(final String timeString) throws ParseException { + public Calendar parse(final String timeString, final int timezoneOffset) throws ParseException { synchronized (this.dateFormat) { - return this.dateFormat.parse(timeString); + Calendar cal = Calendar.getInstance(UTCtimeZone); + cal.setTime(this.dateFormat.parse(timeString)); + cal.add(Calendar.MINUTE, timezoneOffset); // add a correction; i.e. for UTC+1 -60 minutes is added to patch a time given in UTC+1 to the actual time at UTC + return cal; } } - + /** * Like {@link #parseShortSecond(String)} using additional timezone information provided in an * offset String, like "+0100" for CET. + * @throws ParseException */ - public Date parse(final String timeString, final String UTCOffset) { + public Calendar parse(final String timeString, final String UTCOffset) throws ParseException { // FIXME: This method returns an incorrect date, check callers! // ex: de.anomic.server.serverDate.parseShortSecond("20070101120000", "+0200").toGMTString() // => 1 Jan 2007 13:00:00 GMT - if (timeString == null || timeString.isEmpty()) { return new Date(); } - if (UTCOffset == null || UTCOffset.isEmpty()) { return new Date(); } - try { - synchronized (this.dateFormat) { - return new Date(this.dateFormat.parse(timeString).getTime() - UTCDiff() + UTCDiff(UTCOffset)); - } - } catch (final Throwable e) { - //serverLog.logFinest("parseUniversalDate", e.getMessage() + ", remoteTimeString=[" + remoteTimeString + "]"); - return new Date(); - } + if (timeString == null || timeString.isEmpty()) { return Calendar.getInstance(UTCtimeZone); } + if (UTCOffset == null || UTCOffset.isEmpty()) { return Calendar.getInstance(UTCtimeZone); } + return parse(timeString, UTCDiff(UTCOffset)); } - private static long UTCDiff(final String diffString) { + private static int UTCDiff(final String diffString) { if (diffString.length() != 5) throw new IllegalArgumentException("UTC String malformed (wrong size):" + diffString); boolean ahead = true; if (diffString.length() > 0 && diffString.charAt(0) == '+') ahead = true; else if (diffString.length() > 0 && diffString.charAt(0) == '-') ahead = false; else throw new IllegalArgumentException("UTC String malformed (wrong sign):" + diffString); - final long oh = NumberTools.parseLongDecSubstring(diffString, 1, 3); - final long om = NumberTools.parseLongDecSubstring(diffString, 3); - return ((ahead) ? (long) 1 : (long) -1) * (oh * AbstractFormatter.hourMillis + om * AbstractFormatter.minuteMillis); + final int oh = NumberTools.parseIntDecSubstring(diffString, 1, 3); + final int om = NumberTools.parseIntDecSubstring(diffString, 3); + return (int) ((ahead) ? 
-1 : 1) * (oh * 60 + om); } - + + /** + * get the difference of this server's time zone to UTC/GMT in milliseconds + * @return + */ private static long UTCDiff() { // DST_OFFSET is dependent on the time of the Calendar, so it has to be updated // to get the correct current offset - synchronized (thisCalendar) { - thisCalendar.setTimeInMillis(System.currentTimeMillis()); - final long zoneOffsetHours = thisCalendar.get(Calendar.ZONE_OFFSET); - final long DSTOffsetHours = thisCalendar.get(Calendar.DST_OFFSET); + synchronized (testCalendar) { + testCalendar.setTimeInMillis(System.currentTimeMillis()); + final long zoneOffsetHours = testCalendar.get(Calendar.ZONE_OFFSET); + final long DSTOffsetHours = testCalendar.get(Calendar.DST_OFFSET); return zoneOffsetHours + DSTOffsetHours; } } - - private final static DecimalFormat D2 = new DecimalFormat("00"); - + public static String UTCDiffString() { // we express the UTC Difference in 5 digits: // SHHMM @@ -195,11 +192,9 @@ public class GenericFormatter extends AbstractFormatter implements DateFormatter return sb.toString(); } - public static long correctedUTCTime() { - return System.currentTimeMillis() - UTCDiff(); - } + private final static DecimalFormat D2 = new DecimalFormat("00"); - public static void main(final String[] args) { + public static void main(String[] args) { System.out.println(UTCDiffString()); } } diff --git a/source/net/yacy/cora/date/ISO8601Formatter.java b/source/net/yacy/cora/date/ISO8601Formatter.java index 27ff6f45f..e57dfbfa6 100644 --- a/source/net/yacy/cora/date/ISO8601Formatter.java +++ b/source/net/yacy/cora/date/ISO8601Formatter.java @@ -41,7 +41,7 @@ public class ISO8601Formatter extends AbstractFormatter implements DateFormatter private static final SimpleDateFormat FORMAT_ISO8601 = new SimpleDateFormat(PATTERN_ISO8601, Locale.US); static { - FORMAT_ISO8601.setTimeZone(TZ_GMT); + FORMAT_ISO8601.setTimeZone(AbstractFormatter.UTCtimeZone); } public static final ISO8601Formatter FORMATTER = new ISO8601Formatter(); @@ -72,7 +72,7 @@ public class ISO8601Formatter extends AbstractFormatter implements DateFormatter * @throws ParseException */ @Override - public Date parse(String s) throws ParseException { + public Calendar parse(String s, final int timezoneOffset) throws ParseException { // do some lazy checks here s = s.trim(); while (!s.isEmpty() && s.endsWith("?")) s = s.substring(0, s.length() - 1); // sometimes used if write is not sure about date @@ -87,7 +87,7 @@ public class ISO8601Formatter extends AbstractFormatter implements DateFormatter while (!s.isEmpty() && s.endsWith("?")) s = s.substring(0, s.length() - 1); // sometimes used if write is not sure about date // no go for exact parsing - final Calendar cal = Calendar.getInstance(TZ_GMT, Locale.US); + final Calendar cal = Calendar.getInstance(AbstractFormatter.UTCtimeZone, Locale.US); cal.clear(); // split 2007-12-19T10:20:30.789+0500 into its parts @@ -103,13 +103,13 @@ public class ISO8601Formatter extends AbstractFormatter implements DateFormatter if (t.nextToken().equals("-")) { cal.set(Calendar.MONTH, Integer.parseInt(t.nextToken()) - 1); } else { - return cal.getTime(); + return cal; } // day if (t.nextToken().equals("-")) { cal.set(Calendar.DAY_OF_MONTH, Integer.parseInt(t.nextToken())); } else { - return cal.getTime(); + return cal; } // The standard says: // if there is an hour there has to be a minute and a timezone token, too. 
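Because parse() now returns a Calendar rather than a Date, every call site in this patch unwraps the result with getTime() and supplies an offset (0 for server-generated strings such as "20061101"). A sketch of that migration pattern, including the fall-back-to-now behavior that BlogBoard and the bookmark importers show; the helper class is hypothetical:

```java
import java.text.ParseException;
import java.util.Date;
import net.yacy.cora.date.GenericFormatter;

// Hypothetical helper showing the caller migration applied throughout
// this patch: parse(String, int) returns a UTC-normalized Calendar,
// which callers unwrap with getTime().
public class ParseMigration {
    public static Date parseShortSecond(String s, int timezoneOffset) {
        try {
            return GenericFormatter.SHORT_SECOND_FORMATTER.parse(s, timezoneOffset).getTime();
        } catch (ParseException e) {
            return new Date(); // fall back to "now", as several patched callers do
        }
    }
}
```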
@@ -147,7 +147,7 @@ public class ISO8601Formatter extends AbstractFormatter implements DateFormatter sign = -1; } else { // no legal TZ offset found - return cal.getTime(); + return cal; } offset = sign * Integer.parseInt(t.nextToken()) * 10 * 3600; } @@ -168,8 +168,7 @@ public class ISO8601Formatter extends AbstractFormatter implements DateFormatter // in case we couldn't even parse a year if (!cal.isSet(Calendar.YEAR)) throw new ParseException("parseISO8601: Cannot parse '" + s + "'", 0); - Date d = cal.getTime(); - return d; + return cal; } diff --git a/source/net/yacy/cora/document/feed/RSSMessage.java b/source/net/yacy/cora/document/feed/RSSMessage.java index aea58547e..340d01e99 100644 --- a/source/net/yacy/cora/document/feed/RSSMessage.java +++ b/source/net/yacy/cora/document/feed/RSSMessage.java @@ -224,7 +224,7 @@ public class RSSMessage implements Hit, Comparable, Comparator hyperlinks, final boolean replace) { + private void enqueueEntries( + final byte[] initiator, + final String profileHandle, + final List hyperlinks, + final boolean replace, + final int timezoneOffset) { if (replace) { // delete old entries, if exists to force a re-load of the url (thats wanted here) Set hosthashes = new HashSet(); @@ -199,7 +208,7 @@ public final class CrawlStacker { int p = userInfo == null ? -1 : userInfo.indexOf(':'); String user = userInfo == null ? FTPClient.ANONYMOUS : userInfo.substring(0, p); String pw = userInfo == null || p == -1 ? "anomic" : userInfo.substring(p + 1); - enqueueEntriesFTP(initiator, profileHandle, url.getHost(), url.getPort(), user, pw, replace); + enqueueEntriesFTP(initiator, profileHandle, url.getHost(), url.getPort(), user, pw, replace, timezoneOffset); } else { // put entry on crawl stack enqueueEntry(new Request( @@ -209,13 +218,22 @@ public final class CrawlStacker { url.getNameProperty(), new Date(), profileHandle, - 0 + 0, + timezoneOffset )); } } } - public void enqueueEntriesFTP(final byte[] initiator, final String profileHandle, final String host, final int port, final String user, final String pw, final boolean replace) { + public void enqueueEntriesFTP( + final byte[] initiator, + final String profileHandle, + final String host, + final int port, + final String user, + final String pw, + final boolean replace, + final int timezoneOffset) { final CrawlQueues cq = this.nextQueue; new Thread() { @Override @@ -248,7 +266,8 @@ public final class CrawlStacker { MultiProtocolURL.unescape(entry.name), entry.date, profileHandle, - 0)); + 0, + timezoneOffset)); } } catch (final IOException e1) { ConcurrentLog.logException(e1); @@ -272,7 +291,7 @@ public final class CrawlStacker { "CRAWLING-ROOT", new Date(), pe.handle(), - 0)); + 0, 0)); } /** diff --git a/source/net/yacy/crawler/CrawlSwitchboard.java b/source/net/yacy/crawler/CrawlSwitchboard.java index 4472c59e0..fcce03c4b 100644 --- a/source/net/yacy/crawler/CrawlSwitchboard.java +++ b/source/net/yacy/crawler/CrawlSwitchboard.java @@ -296,7 +296,8 @@ public final class CrawlSwitchboard { CacheStrategy.IFFRESH, "robot_" + CRAWL_PROFILE_PROXY, ClientIdentification.yacyProxyAgentName, - null); + null, + 0); this.profilesActiveCrawls.put( UTF8.getBytes(this.defaultProxyProfile.handle()), this.defaultProxyProfile); @@ -327,7 +328,8 @@ public final class CrawlSwitchboard { CacheStrategy.IFFRESH, "robot_" + CRAWL_PROFILE_REMOTE, ClientIdentification.yacyInternetCrawlerAgentName, - null); + null, + 0); this.profilesActiveCrawls.put( UTF8.getBytes(this.defaultRemoteProfile.handle()), this.defaultRemoteProfile); @@ 
-358,7 +360,8 @@ public final class CrawlSwitchboard { CacheStrategy.IFEXIST, "robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, ClientIdentification.yacyIntranetCrawlerAgentName, - null); + null, + 0); this.profilesActiveCrawls.put( UTF8.getBytes(this.defaultTextSnippetLocalProfile.handle()), this.defaultTextSnippetLocalProfile); @@ -389,7 +392,8 @@ public final class CrawlSwitchboard { CacheStrategy.IFEXIST, "robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, ClientIdentification.yacyIntranetCrawlerAgentName, - null); + null, + 0); this.profilesActiveCrawls.put( UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()), this.defaultTextSnippetGlobalProfile); @@ -421,7 +425,8 @@ public final class CrawlSwitchboard { CacheStrategy.IFEXIST, "robot_" + CRAWL_PROFILE_GREEDY_LEARNING_TEXT, ClientIdentification.browserAgentName, - null); + null, + 0); this.profilesActiveCrawls.put( UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()), this.defaultTextSnippetGlobalProfile); @@ -452,7 +457,8 @@ public final class CrawlSwitchboard { CacheStrategy.IFEXIST, "robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, ClientIdentification.yacyIntranetCrawlerAgentName, - null); + null, + 0); this.profilesActiveCrawls.put( UTF8.getBytes(this.defaultMediaSnippetLocalProfile.handle()), this.defaultMediaSnippetLocalProfile); @@ -483,7 +489,8 @@ public final class CrawlSwitchboard { CacheStrategy.IFEXIST, "robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, ClientIdentification.yacyIntranetCrawlerAgentName, - null); + null, + 0); this.profilesActiveCrawls.put( UTF8.getBytes(this.defaultMediaSnippetGlobalProfile.handle()), this.defaultMediaSnippetGlobalProfile); @@ -514,7 +521,8 @@ public final class CrawlSwitchboard { CacheStrategy.NOCACHE, "robot_" + CRAWL_PROFILE_SURROGATE, ClientIdentification.yacyIntranetCrawlerAgentName, - null); + null, + 0); this.profilesActiveCrawls.put( UTF8.getBytes(this.defaultSurrogateProfile.handle()), this.defaultSurrogateProfile); @@ -548,7 +556,8 @@ public final class CrawlSwitchboard { CacheStrategy.NOCACHE, collection, ClientIdentification.yacyIntranetCrawlerAgentName, - null); + null, + 0); this.profilesActiveCrawls.put(UTF8.getBytes(genericPushProfile.handle()), genericPushProfile); this.defaultPushProfiles.put(collection, genericPushProfile); return genericPushProfile; diff --git a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java index f90b25a7f..5a87a2f10 100644 --- a/source/net/yacy/crawler/data/CrawlProfile.java +++ b/source/net/yacy/crawler/data/CrawlProfile.java @@ -80,6 +80,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M public static final String CACHE_STRAGEGY = "cacheStrategy"; public static final String COLLECTIONS = "collections"; public static final String SCRAPER = "scraper"; + public static final String TIMEZONEOFFSET = "timezoneOffset"; public static final String CRAWLER_URL_MUSTMATCH = "crawlerURLMustMatch"; public static final String CRAWLER_URL_MUSTNOTMATCH = "crawlerURLMustNotMatch"; public static final String CRAWLER_IP_MUSTMATCH = "crawlerIPMustMatch"; @@ -131,6 +132,9 @@ public class CrawlProfile extends ConcurrentHashMap implements M * @param xpstopw true if parent stop words shall be ignored * @param cacheStrategy determines if and how cache is used loading content * @param collections a comma-separated list of tags which are attached to index entries + * @param userAgentName the profile name of the user agent to be used + * @param scraper a scraper for vocabularies + * @param timezoneOffset the 
time offset in minutes for scraped dates in text without time zone */ public CrawlProfile( String name, @@ -155,7 +159,8 @@ public class CrawlProfile extends ConcurrentHashMap implements M final CacheStrategy cacheStrategy, final String collections, final String userAgentName, - final VocabularyScraper scraper) { + final VocabularyScraper scraper, + final int timezoneOffset) { super(40); if (name == null || name.isEmpty()) { throw new NullPointerException("name must not be null or empty"); @@ -198,6 +203,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M String jsonString = this.scraper.toString(); assert jsonString != null && jsonString.length() > 0 && jsonString.charAt(0) == '{' : "jsonString = " + jsonString; put(SCRAPER, jsonString); + put(TIMEZONEOFFSET, timezoneOffset); } /** @@ -623,6 +629,16 @@ public class CrawlProfile extends ConcurrentHashMap implements M return (r.equals(Boolean.TRUE.toString())); } + public int timezoneOffset() { + final String timezoneOffset = get(TIMEZONEOFFSET); + if (timezoneOffset == null) return 0; + try { + return Integer.parseInt(timezoneOffset); + } catch (NumberFormatException e) { + return 0; + } + } + /** * get a recrawl date for a given age in minutes * @param oldTimeMinutes diff --git a/source/net/yacy/crawler/data/CrawlQueues.java b/source/net/yacy/crawler/data/CrawlQueues.java index 62962e045..5a9b0c4a1 100644 --- a/source/net/yacy/crawler/data/CrawlQueues.java +++ b/source/net/yacy/crawler/data/CrawlQueues.java @@ -531,7 +531,8 @@ public class CrawlQueues { item.getDescriptions().size() > 0 ? item.getDescriptions().get(0) : "", loaddate, this.sb.crawler.defaultRemoteProfile.handle(), - 0 + 0, + this.sb.crawler.defaultRemoteProfile.timezoneOffset() )); } else { CrawlQueues.log.warn("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason); diff --git a/source/net/yacy/crawler/data/Snapshots.java b/source/net/yacy/crawler/data/Snapshots.java index 40e5fce30..abf8f981e 100644 --- a/source/net/yacy/crawler/data/Snapshots.java +++ b/source/net/yacy/crawler/data/Snapshots.java @@ -359,10 +359,10 @@ public class Snapshots { private static Date parseDate(String d) { try { - return GenericFormatter.SHORT_MINUTE_FORMATTER.parse(d); + return GenericFormatter.SHORT_MINUTE_FORMATTER.parse(d, 0).getTime(); } catch (ParseException e) { try { - return GenericFormatter.SHORT_DAY_FORMATTER.parse(d); + return GenericFormatter.SHORT_DAY_FORMATTER.parse(d, 0).getTime(); } catch (ParseException ee) { return null; } diff --git a/source/net/yacy/crawler/retrieval/Request.java b/source/net/yacy/crawler/retrieval/Request.java index 81bbaa96f..e02b2fdcb 100644 --- a/source/net/yacy/crawler/retrieval/Request.java +++ b/source/net/yacy/crawler/retrieval/Request.java @@ -92,7 +92,8 @@ public class Request extends WorkflowJob private Bitfield flags; private String statusMessage; private int initialHash; // to provide a object hash that does not change even if the url changes because of redirection - + private int timezoneOffset; + public Request() { // used only to create poison entries this.initiator = null; @@ -106,6 +107,7 @@ public class Request extends WorkflowJob this.statusMessage = null; this.initialHash = 0; this.status = 0; + this.timezoneOffset = 0; } /** @@ -115,7 +117,7 @@ public class Request extends WorkflowJob * @param referrerhash */ public Request(final DigestURL url, final byte[] referrerhash) { - this(null, url, referrerhash, null, null, null, 0); + this(null, url, referrerhash, null, null, null, 0, 0); } /** @@ -136,7 
+138,8 @@ public class Request extends WorkflowJob final String name, final Date appdate, final String profileHandle, - final int depth) { + final int depth, + final int timezoneOffset) { // create new entry and store it into database assert url != null; assert profileHandle == null || profileHandle.length() == Word.commonHashLength : profileHandle @@ -150,6 +153,7 @@ public class Request extends WorkflowJob this.appdate = (appdate == null) ? 0 : appdate.getTime(); this.profileHandle = profileHandle; // must not be null this.depth = depth; + this.timezoneOffset = timezoneOffset; this.flags = new Bitfield(rowdef.width(10)); this.statusMessage = "loaded(args)"; this.initialHash = url.hashCode(); @@ -271,6 +275,10 @@ public class Request extends WorkflowJob // crawl depth where the url appeared return this.depth; } + + public int timezoneOffset() { + return this.timezoneOffset; + } public String profileHandle() { // the handle of the crawl profile diff --git a/source/net/yacy/crawler/retrieval/Response.java b/source/net/yacy/crawler/retrieval/Response.java index 615465199..4e1acb6ef 100644 --- a/source/net/yacy/crawler/retrieval/Response.java +++ b/source/net/yacy/crawler/retrieval/Response.java @@ -28,7 +28,6 @@ package net.yacy.crawler.retrieval; import java.util.Date; -import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.encoding.UTF8; @@ -260,7 +259,7 @@ public class Response { if (docDate == null) docDate = this.responseHeader.date(); } if (docDate == null && this.request != null) docDate = this.request.appdate(); - if (docDate == null) docDate = new Date(GenericFormatter.correctedUTCTime()); + if (docDate == null) docDate = new Date(); return docDate; } @@ -372,7 +371,7 @@ public class Response { if (date == null) return "stale_no_date_given_in_response"; try { final long ttl = 1000 * NumberTools.parseLongDecSubstring(cacheControl, 8); // milliseconds to live - if (GenericFormatter.correctedUTCTime() - date.getTime() > ttl) { + if (System.currentTimeMillis() - date.getTime() > ttl) { //System.out.println("***not indexed because cache-control"); return "stale_expired"; } @@ -461,8 +460,8 @@ public class Response { if (!this.responseHeader.containsKey(HeaderFramework.LAST_MODIFIED)) { return false; } // parse date Date d1, d2; - d2 = this.responseHeader.lastModified(); if (d2 == null) { d2 = new Date(GenericFormatter.correctedUTCTime()); } - d1 = this.requestHeader.ifModifiedSince(); if (d1 == null) { d1 = new Date(GenericFormatter.correctedUTCTime()); } + d2 = this.responseHeader.lastModified(); if (d2 == null) { d2 = new Date(); } + d1 = this.requestHeader.ifModifiedSince(); if (d1 == null) { d1 = new Date(); } // finally, we shall treat the cache as stale if the modification time is after the if-.. 
time if (d2.after(d1)) { return false; } } @@ -501,9 +500,10 @@ public class Response { // -expires in cached response // the expires value gives us a very easy hint when the cache is stale final Date expires = this.responseHeader.expires(); + final Date now = new Date(); if (expires != null) { // System.out.println("EXPIRES-TEST: expires=" + expires + ", NOW=" + serverDate.correctedGMTDate() + ", url=" + url); - if (expires.before(new Date(GenericFormatter.correctedUTCTime()))) { return false; } + if (expires.before(now)) { return false; } } final Date lastModified = this.responseHeader.lastModified(); cacheControl = this.responseHeader.get(HeaderFramework.CACHE_CONTROL); @@ -517,13 +517,13 @@ public class Response { // file may only be treated as fresh for one more month, not more. Date date = this.responseHeader.date(); if (lastModified != null) { - if (date == null) { date = new Date(GenericFormatter.correctedUTCTime()); } + if (date == null) { date = now; } final long age = date.getTime() - lastModified.getTime(); if (age < 0) { return false; } // TTL (Time-To-Live) is age/10 = (d2.getTime() - d1.getTime()) / 10 // the actual living-time is serverDate.correctedGMTDate().getTime() - d2.getTime() // therefore the cache is stale, if serverDate.correctedGMTDate().getTime() - d2.getTime() > age/10 - if (GenericFormatter.correctedUTCTime() - date.getTime() > age / 10) { return false; } + if (now.getTime() - date.getTime() > age / 10) { return false; } } // -cache-control in cached response @@ -542,7 +542,7 @@ public class Response { if (date == null) { return false; } try { final long ttl = 1000 * NumberTools.parseLongDecSubstring(cacheControl, 8); // milliseconds to live - if (GenericFormatter.correctedUTCTime() - date.getTime() > ttl) { + if (now.getTime() - date.getTime() > ttl) { return false; } } catch (final Exception e) { @@ -626,12 +626,11 @@ public class Response { // -if-modified-since in request // if the page is fresh at the very moment we can index it final Date ifModifiedSince = this.ifModifiedSince(); + final Date now = new Date(); if ((ifModifiedSince != null) && (this.responseHeader.containsKey(HeaderFramework.LAST_MODIFIED))) { // parse date Date d = this.responseHeader.lastModified(); - if (d == null) { - d = new Date(GenericFormatter.correctedUTCTime()); - } + if (d == null) d = now; // finally, we shall treat the cache as stale if the modification time is after the if-.. time if (d.after(ifModifiedSince)) { //System.out.println("***not indexed because if-modified-since"); @@ -655,7 +654,7 @@ public class Response { // sometimes, the expires date is set to the past to prevent that a page is cached // we use that information to see if we should index it final Date expires = this.responseHeader.expires(); - if (expires != null && expires.before(new Date(GenericFormatter.correctedUTCTime()))) { + if (expires != null && expires.before(now)) { return "Stale_(Expired)"; } @@ -688,7 +687,7 @@ public class Response { } try { final long ttl = 1000 * NumberTools.parseLongDecSubstring(cacheControl,8); // milliseconds to live - if (GenericFormatter.correctedUTCTime() - date.getTime() > ttl) { + if (now.getTime() - date.getTime() > ttl) { //System.out.println("***not indexed because cache-control"); return "Stale_(expired_by_cache-control)"; } @@ -865,7 +864,7 @@ public class Response { final String supportError = TextParser.supports(url(), this.responseHeader == null ? 
null : this.responseHeader.mime()); if (supportError != null) throw new Parser.Failure("no parser support:" + supportError, url()); try { - return TextParser.parseSource(new AnchorURL(url()), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? "UTF-8" : this.responseHeader.getCharacterEncoding(), new VocabularyScraper(), this.request.depth(), this.content); + return TextParser.parseSource(new AnchorURL(url()), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? "UTF-8" : this.responseHeader.getCharacterEncoding(), new VocabularyScraper(), this.request.timezoneOffset(), this.request.depth(), this.content); } catch (final Exception e) { return null; } diff --git a/source/net/yacy/crawler/retrieval/SitemapImporter.java b/source/net/yacy/crawler/retrieval/SitemapImporter.java index 240f8239d..b28e13f11 100644 --- a/source/net/yacy/crawler/retrieval/SitemapImporter.java +++ b/source/net/yacy/crawler/retrieval/SitemapImporter.java @@ -108,7 +108,8 @@ public class SitemapImporter extends Thread { entry.url(), entry.lastmod(new Date()), this.crawlingProfile.handle(), - 0 + 0, + this.crawlingProfile.timezoneOffset() )); logger.info("New URL '" + entry.url() + "' added for loading."); } diff --git a/source/net/yacy/data/BlogBoard.java b/source/net/yacy/data/BlogBoard.java index c1ec79f15..f97f7c794 100644 --- a/source/net/yacy/data/BlogBoard.java +++ b/source/net/yacy/data/BlogBoard.java @@ -210,7 +210,7 @@ public class BlogBoard { } try { - date = GenericFormatter.SHORT_SECOND_FORMATTER.parse(StrDate); + date = GenericFormatter.SHORT_SECOND_FORMATTER.parse(StrDate, 0).getTime(); } catch (final ParseException e1) { date = new Date(); } @@ -404,7 +404,7 @@ public class BlogBoard { } return new Date(); } - return GenericFormatter.SHORT_SECOND_FORMATTER.parse(date); + return GenericFormatter.SHORT_SECOND_FORMATTER.parse(date, 0).getTime(); } catch (final ParseException ex) { return new Date(); } diff --git a/source/net/yacy/data/BookmarkHelper.java b/source/net/yacy/data/BookmarkHelper.java index c10c144c1..86f17ad90 100644 --- a/source/net/yacy/data/BookmarkHelper.java +++ b/source/net/yacy/data/BookmarkHelper.java @@ -139,7 +139,7 @@ public class BookmarkHelper { final Set tags=ListManager.string2set(tag); //this allow multiple default tags try { //load the links - final ContentScraper scraper = new ContentScraper(baseURL, 10000, new VocabularyScraper()); + final ContentScraper scraper = new ContentScraper(baseURL, 10000, new VocabularyScraper(), 0); //OutputStream os = new htmlFilterOutputStream(null, scraper, null, false); final Writer writer = new TransformerWriter(null, null, scraper, null, false); FileUtils.copy(input,writer); @@ -232,7 +232,7 @@ public class BookmarkHelper { Date parsedDate = null; try { - parsedDate = ISO8601Formatter.FORMATTER.parse(time); + parsedDate = ISO8601Formatter.FORMATTER.parse(time, 0).getTime(); } catch (final ParseException e) { parsedDate = new Date(); } diff --git a/source/net/yacy/data/ymark/YMarkAutoTagger.java b/source/net/yacy/data/ymark/YMarkAutoTagger.java index c80ff37a3..df5a2939d 100644 --- a/source/net/yacy/data/ymark/YMarkAutoTagger.java +++ b/source/net/yacy/data/ymark/YMarkAutoTagger.java @@ -87,7 +87,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle } //get words from document - final Map words = new Condenser(document, null, true, true, LibraryProvider.dymLib, false, false).words(); + final Map words = new Condenser(document, 
null, true, true, LibraryProvider.dymLib, false, false, 0).words(); // generate potential tags from document title, description and subject final int bufferSize = document.dc_title().length() + document.dc_description().length + document.dc_subject(' ').length() + 32; diff --git a/source/net/yacy/data/ymark/YMarkCrawlStart.java b/source/net/yacy/data/ymark/YMarkCrawlStart.java index b14c10dc9..562a9703f 100644 --- a/source/net/yacy/data/ymark/YMarkCrawlStart.java +++ b/source/net/yacy/data/ymark/YMarkCrawlStart.java @@ -190,7 +190,8 @@ public class YMarkCrawlStart extends HashMap{ CacheStrategy.IFFRESH, "robot_" + CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, ClientIdentification.yacyIntranetCrawlerAgentName, - null); // TODO: make this a default profile in CrawlSwitchboard + null, + 0); // TODO: make this a default profile in CrawlSwitchboard sb.crawler.putActive(pe.handle().getBytes(), pe); return sb.crawlStacker.stackCrawl(new Request( sb.peers.mySeed().hash.getBytes(), @@ -198,7 +199,7 @@ public class YMarkCrawlStart extends HashMap{ null, "CRAWLING-ROOT", new Date(), - pe.handle(), 0 + pe.handle(), 0, pe.timezoneOffset() )); } } diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java index 2f7c2ffb5..6cf125d17 100644 --- a/source/net/yacy/document/Condenser.java +++ b/source/net/yacy/document/Condenser.java @@ -97,7 +97,8 @@ public final class Condenser { final boolean indexMedia, final WordCache meaningLib, final boolean doAutotagging, - final boolean findDatesInContent + final boolean findDatesInContent, + final int timezoneOffset ) { Thread.currentThread().setName("condenser-" + document.dc_identifier()); // for debugging // if addMedia == true, then all the media links are also parsed and added to the words @@ -123,7 +124,7 @@ public final class Condenser { Map.Entry entry; if (indexText) { String text = document.getTextString(); - if (findDatesInContent) this.dates_in_content = DateDetection.parse(text); + if (findDatesInContent) this.dates_in_content = DateDetection.parse(text, timezoneOffset); createCondensement(document.dc_source(), text, meaningLib, doAutotagging, scraper); // the phrase counter: // phrase 0 are words taken from the URL diff --git a/source/net/yacy/document/DateDetection.java b/source/net/yacy/document/DateDetection.java index 9964aedfd..73662ac56 100644 --- a/source/net/yacy/document/DateDetection.java +++ b/source/net/yacy/document/DateDetection.java @@ -499,7 +499,7 @@ public class DateDetection { * @param text * @return a set of dates, ordered by time. first date in the ordered set is the oldest time. 
*/ - public static LinkedHashSet parse(String text) { + public static LinkedHashSet parse(String text, int timezoneOffset) { Long offset; if ((offset = specialDayOffset.get(text)) != null) { LinkedHashSet dates = new LinkedHashSet<>(); dates.add(new Date((System.currentTimeMillis() / AbstractFormatter.dayMillis) * AbstractFormatter.dayMillis + offset.longValue())); return dates; @@ -513,7 +513,7 @@ public class DateDetection { return dates; } - public static Date parseLine(String text) { + public static Date parseLine(final String text, final int timezoneOffset) { Date d = null; try {d = CONFORM.parse(text);} catch (ParseException e) {} //if (d == null) try {d = GenericFormatter.FORMAT_SHORT_DAY.parse(text);} catch (ParseException e) {} // did not work well and fired for wrong formats; do not use @@ -521,7 +521,7 @@ public class DateDetection { if (d == null) try {d = GenericFormatter.FORMAT_ANSIC.parse(text);} catch (ParseException e) {} if (d == null) { - Set dd = parse(text); + Set dd = parse(text, timezoneOffset); if (dd.size() >= 1) d = dd.iterator().next(); } return d; @@ -601,7 +601,7 @@ public class DateDetection { }; long t = System.currentTimeMillis(); for (String s: test) { - String parsed = parse(fill + " " + s + " " + fill).toString(); + String parsed = parse(fill + " " + s + " " + fill, 0).toString(); System.out.println("SOURCE: " + s); System.out.println("DATE : " + parsed); System.out.println(); diff --git a/source/net/yacy/document/Parser.java b/source/net/yacy/document/Parser.java index be7b49eba..b9139340a 100644 --- a/source/net/yacy/document/Parser.java +++ b/source/net/yacy/document/Parser.java @@ -59,6 +59,7 @@ public interface Parser { String mimeType, String charset, VocabularyScraper scraper, + int timezoneOffset, InputStream source ) throws Parser.Failure, InterruptedException; diff --git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java index 0898f3c35..191793ca0 100644 --- a/source/net/yacy/document/TextParser.java +++ b/source/net/yacy/document/TextParser.java @@ -167,6 +167,7 @@ public final class TextParser { final String mimeType, final String charset, final VocabularyScraper scraper, + final int timezoneOffset, final int depth, final File sourceFile ) throws InterruptedException, Parser.Failure { @@ -181,7 +182,7 @@ public final class TextParser { throw new Parser.Failure(errorMsg, location); } sourceStream = new BufferedInputStream(new FileInputStream(sourceFile)); - docs = parseSource(location, mimeType, charset, scraper, depth, sourceFile.length(), sourceStream); + docs = parseSource(location, mimeType, charset, scraper, timezoneOffset, depth, sourceFile.length(), sourceStream); } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e; @@ -199,6 +200,7 @@ public final class TextParser { String mimeType, final String charset, final VocabularyScraper scraper, + final int timezoneOffset, final int depth, final byte[] content ) throws Parser.Failure { @@ -214,7 +216,7 @@ public final class TextParser { } assert !idioms.isEmpty() : "no parsers applied for url " + location.toNormalform(true); - Document[] docs = parseSource(location, mimeType, idioms, charset, scraper, depth, content); + Document[] docs = parseSource(location, mimeType, idioms, charset, scraper, timezoneOffset, depth, content); return docs; } @@ -224,6 +226,7 @@ public final class TextParser { String mimeType, final String charset, final VocabularyScraper 
scraper, + final int timezoneOffset, final int depth, final long contentLength, final InputStream sourceStream @@ -244,7 +247,7 @@ public final class TextParser { // then we use only one stream-oriented parser. if (idioms.size() == 1 || contentLength > Integer.MAX_VALUE) { // use a specific stream-oriented parser - return parseSource(location, mimeType, idioms.iterator().next(), charset, scraper, sourceStream); + return parseSource(location, mimeType, idioms.iterator().next(), charset, scraper, timezoneOffset, sourceStream); } // in case that we know more parsers we first transform the content into a byte[] and use that as base @@ -255,7 +258,7 @@ public final class TextParser { } catch (final IOException e) { throw new Parser.Failure(e.getMessage(), location); } - Document[] docs = parseSource(location, mimeType, idioms, charset, scraper, depth, b); + Document[] docs = parseSource(location, mimeType, idioms, charset, scraper, timezoneOffset, depth, b); return docs; } @@ -266,6 +269,7 @@ public final class TextParser { final Parser parser, final String charset, final VocabularyScraper scraper, + final int timezoneOffset, final InputStream sourceStream ) throws Parser.Failure { if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing '" + location + "' from stream"); @@ -275,7 +279,7 @@ public final class TextParser { if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'."); try { - final Document[] docs = parser.parse(location, mimeType, documentCharset, scraper, sourceStream); + final Document[] docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, sourceStream); return docs; } catch (final Exception e) { throw new Parser.Failure("parser failed: " + parser.getName(), location); @@ -288,6 +292,7 @@ public final class TextParser { final Set parsers, final String charset, final VocabularyScraper scraper, + final int timezoneOffset, final int depth, final byte[] sourceArray ) throws Parser.Failure { @@ -310,7 +315,7 @@ public final class TextParser { bis = new ByteArrayInputStream(sourceArray); } try { - docs = parser.parse(location, mimeType, documentCharset, scraper, bis); + docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, bis); } catch (final Parser.Failure e) { failedParser.put(parser, e); //log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e); diff --git a/source/net/yacy/document/content/DCEntry.java b/source/net/yacy/document/content/DCEntry.java index 5c44d3dc7..68dbd095a 100644 --- a/source/net/yacy/document/content/DCEntry.java +++ b/source/net/yacy/document/content/DCEntry.java @@ -107,7 +107,7 @@ public class DCEntry extends MultiMapSolrParams { if (d == null) return null; if (d.isEmpty()) return null; try { - Date x = ISO8601Formatter.FORMATTER.parse(d); + Date x = ISO8601Formatter.FORMATTER.parse(d, 0).getTime(); Date now = new Date(); return x.after(now) ? 
now : x; } catch (final ParseException e) { diff --git a/source/net/yacy/document/importer/MediawikiImporter.java b/source/net/yacy/document/importer/MediawikiImporter.java index 9e6ba1116..b9557f803 100644 --- a/source/net/yacy/document/importer/MediawikiImporter.java +++ b/source/net/yacy/document/importer/MediawikiImporter.java @@ -524,7 +524,7 @@ public class MediawikiImporter extends Thread implements Importer { public void genDocument() throws Parser.Failure { try { this.url = new AnchorURL(this.urlStub + this.title); - final Document[] parsed = TextParser.parseSource(this.url, "text/html", "UTF-8", new VocabularyScraper(), 1, UTF8.getBytes(this.html)); + final Document[] parsed = TextParser.parseSource(this.url, "text/html", "UTF-8", new VocabularyScraper(), 0, 1, UTF8.getBytes(this.html)); this.document = Document.mergeDocuments(this.url, "text/html", parsed); // the wiki parser is not able to find the proper title in the source text, so it must be set here this.document.setTitle(this.title); diff --git a/source/net/yacy/document/importer/ResumptionToken.java b/source/net/yacy/document/importer/ResumptionToken.java index 785c12d26..25075410d 100644 --- a/source/net/yacy/document/importer/ResumptionToken.java +++ b/source/net/yacy/document/importer/ResumptionToken.java @@ -158,7 +158,7 @@ public class ResumptionToken extends TreeMap { final String d = get("expirationDate"); if (d == null) return null; try { - return ISO8601Formatter.FORMATTER.parse(d); + return ISO8601Formatter.FORMATTER.parse(d, 0).getTime(); } catch (final ParseException e) { ConcurrentLog.logException(e); return new Date(); diff --git a/source/net/yacy/document/parser/apkParser.java b/source/net/yacy/document/parser/apkParser.java index 0eacb05f6..6df35f26d 100644 --- a/source/net/yacy/document/parser/apkParser.java +++ b/source/net/yacy/document/parser/apkParser.java @@ -54,7 +54,13 @@ public class apkParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { /* * things to discover: diff --git a/source/net/yacy/document/parser/audioTagParser.java b/source/net/yacy/document/parser/audioTagParser.java index 73195c0a0..ed0a386aa 100644 --- a/source/net/yacy/document/parser/audioTagParser.java +++ b/source/net/yacy/document/parser/audioTagParser.java @@ -70,8 +70,13 @@ public class audioTagParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL location, final String mimeType, - final String charset, final VocabularyScraper scraper, final InputStream source) + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { String filename = location.getFileName(); diff --git a/source/net/yacy/document/parser/augment/AugmentParser.java b/source/net/yacy/document/parser/augment/AugmentParser.java index 6b78cf0d3..aa4dcf3df 100644 --- a/source/net/yacy/document/parser/augment/AugmentParser.java +++ b/source/net/yacy/document/parser/augment/AugmentParser.java @@ 
-38,13 +38,19 @@ public class AugmentParser extends AbstractParser implements Parser { } @Override - public Document[] parse(AnchorURL url, String mimeType, String charset, final VocabularyScraper scraper, InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { - Document[] htmlDocs = this.rdfaParser.parse(url, mimeType, charset, scraper, source); + Document[] htmlDocs = this.rdfaParser.parse(location, mimeType, charset, scraper, timezoneOffset, source); for (final Document doc : htmlDocs) { /* analyze(doc, url, mimeType, charset); // enrich document text */ - parseAndAugment(doc, url, mimeType, charset); // enrich document with additional tags + parseAndAugment(doc, location, mimeType, charset); // enrich document with additional tags } return htmlDocs; } diff --git a/source/net/yacy/document/parser/bzipParser.java b/source/net/yacy/document/parser/bzipParser.java index 4d2c9dd6f..4e16fbfce 100644 --- a/source/net/yacy/document/parser/bzipParser.java +++ b/source/net/yacy/document/parser/bzipParser.java @@ -57,8 +57,13 @@ public class bzipParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL location, final String mimeType, - final String charset, final VocabularyScraper scraper, final InputStream source) + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { File tempFile = null; @@ -95,7 +100,7 @@ public class bzipParser extends AbstractParser implements Parser { out.close(); // creating a new parser class to parse the unzipped content - docs = TextParser.parseSource(location, null, null, scraper, 999, tempFile); + docs = TextParser.parseSource(location, null, null, scraper, timezoneOffset, 999, tempFile); } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e; diff --git a/source/net/yacy/document/parser/csvParser.java b/source/net/yacy/document/parser/csvParser.java index 717aadf2b..25bba2fff 100644 --- a/source/net/yacy/document/parser/csvParser.java +++ b/source/net/yacy/document/parser/csvParser.java @@ -53,7 +53,13 @@ public class csvParser extends AbstractParser implements Parser { } @Override - public Document[] parse(AnchorURL location, String mimeType, String charset, final VocabularyScraper scraper, InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { // construct a document using all cells of the document // the first row is used as headline // all lines are artificially terminated by a '.' to separate them as sentence for the condenser. 
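The parser hunks above and below repeat one mechanical change: Parser.parse and TextParser.parseSource gain a timezoneOffset parameter between the VocabularyScraper and the input source. A hypothetical caller (not part of the patch) showing the widened byte[] variant of parseSource; the import paths are assumed to match the patched tree:

```java
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;

// Hypothetical helper illustrating the widened parseSource signature:
// timezoneOffset now sits between the scraper and the crawl depth.
public class ParseWithOffset {
    public static Document[] parse(AnchorURL url, byte[] content, int timezoneOffset)
            throws Parser.Failure {
        return TextParser.parseSource(
                url,                      // location of the document
                null,                     // mime type unknown; let detection decide
                "UTF-8",                  // charset hint
                new VocabularyScraper(),  // no vocabulary rules
                timezoneOffset,           // minutes added to zone-less dates to reach UTC
                0,                        // crawl depth
                content);                 // raw document bytes
    }
}
```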
diff --git a/source/net/yacy/document/parser/docParser.java b/source/net/yacy/document/parser/docParser.java index 6d3e74fd8..a33844382 100644 --- a/source/net/yacy/document/parser/docParser.java +++ b/source/net/yacy/document/parser/docParser.java @@ -59,8 +59,13 @@ public class docParser extends AbstractParser implements Parser { @SuppressWarnings("deprecation") @Override - public Document[] parse(final AnchorURL location, final String mimeType, - final String charset, final VocabularyScraper scraper, final InputStream source) + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { final WordExtractor extractor; diff --git a/source/net/yacy/document/parser/dwgParser.java b/source/net/yacy/document/parser/dwgParser.java index 66b902eeb..25c2d29b6 100644 --- a/source/net/yacy/document/parser/dwgParser.java +++ b/source/net/yacy/document/parser/dwgParser.java @@ -61,7 +61,13 @@ public class dwgParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { // check memory for parser if (!MemoryControl.request(200 * 1024 * 1024, true)) diff --git a/source/net/yacy/document/parser/genericParser.java b/source/net/yacy/document/parser/genericParser.java index 53e6e46cb..2ff09475d 100644 --- a/source/net/yacy/document/parser/genericParser.java +++ b/source/net/yacy/document/parser/genericParser.java @@ -46,8 +46,13 @@ public class genericParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL location, final String mimeType, - final String charset, final VocabularyScraper scraper, final InputStream source1) + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { String filename = location.getFileName(); final Document[] docs = new Document[]{new Document( diff --git a/source/net/yacy/document/parser/gzipParser.java b/source/net/yacy/document/parser/gzipParser.java index 5a57e219a..58f788f37 100644 --- a/source/net/yacy/document/parser/gzipParser.java +++ b/source/net/yacy/document/parser/gzipParser.java @@ -56,7 +56,13 @@ public class gzipParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { File tempFile = null; Document[] docs = null; @@ -80,7 +86,7 @@ public class gzipParser extends AbstractParser implements Parser { out.close(); // creating a new parser class to parse the unzipped content - docs = 
TextParser.parseSource(location, null, null, scraper, 999, tempFile); + docs = TextParser.parseSource(location, null, null, scraper, timezoneOffset, 999, tempFile); } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e; diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 244dad876..17f9362c7 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -188,6 +188,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { private AnchorURL canonical, publisher; private final int maxLinks; private final VocabularyScraper vocabularyScraper; + private final int timezoneOffset; private int breadcrumbs; @@ -213,7 +214,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { * @param classDetector a map from class names to vocabulary names to scrape content from the DOM with associated class name */ @SuppressWarnings("unchecked") - public ContentScraper(final DigestURL root, int maxLinks, final VocabularyScraper vocabularyScraper) { + public ContentScraper(final DigestURL root, int maxLinks, final VocabularyScraper vocabularyScraper, int timezoneOffset) { // the root value here will not be used to load the resource. // it is only the reference for relative links super(linkTags0, linkTags1); @@ -221,6 +222,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { this.root = root; this.maxLinks = maxLinks; this.vocabularyScraper = vocabularyScraper; + this.timezoneOffset = timezoneOffset; this.evaluationScores = new Evaluation(); this.rss = new SizeLimitedMap(maxLinks); this.css = new SizeLimitedMap(maxLinks); @@ -389,12 +391,12 @@ public class ContentScraper extends AbstractScraper implements Scraper { if (content != null) { if ("startDate".equals(itemprop)) try { // parse ISO 8601 date - Date startDate = ISO8601Formatter.FORMATTER.parse(content); + Date startDate = ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime(); this.startDates.add(startDate); } catch (ParseException e) {} if ("endDate".equals(itemprop)) try { // parse ISO 8601 date - Date endDate = ISO8601Formatter.FORMATTER.parse(content); + Date endDate = ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime(); this.endDates.add(endDate); } catch (ParseException e) {} } @@ -651,7 +653,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { // start a new scraper to parse links inside this text // parsing the content - final ContentScraper scraper = new ContentScraper(this.root, this.maxLinks, this.vocabularyScraper); + final ContentScraper scraper = new ContentScraper(this.root, this.maxLinks, this.vocabularyScraper, this.timezoneOffset); final TransformerWriter writer = new TransformerWriter(null, null, scraper, null, false); try { FileUtils.copy(new CharArrayReader(inlineHtml), writer); @@ -1003,19 +1005,19 @@ public class ContentScraper extends AbstractScraper implements Scraper { // content = this.metas.get("date"); - if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content);} catch (ParseException e) {} + if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();} catch (ParseException e) {} // content = this.metas.get("dc.date"); - if (content != null) try {return 
ISO8601Formatter.FORMATTER.parse(content);} catch (ParseException e) {} + if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();} catch (ParseException e) {} // content = this.metas.get("dc:date"); - if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content);} catch (ParseException e) {} + if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();} catch (ParseException e) {} // content = this.metas.get("last-modified"); - if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content);} catch (ParseException e) {} + if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();} catch (ParseException e) {} return new Date(); } @@ -1153,19 +1155,19 @@ } } - public static ContentScraper parseResource(final File file, final int maxLinks) throws IOException { + public static ContentScraper parseResource(final File file, final int maxLinks, final int timezoneOffset) throws IOException { // load page final byte[] page = FileUtils.read(file); if (page == null) throw new IOException("no content in file " + file.toString()); // scrape document to look up charset - final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page), "UTF-8", new VocabularyScraper(), new DigestURL("http://localhost"), null, false, maxLinks); + final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page), "UTF-8", new VocabularyScraper(), new DigestURL("http://localhost"), null, false, maxLinks, timezoneOffset); String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset()); htmlFilter.close(); if (charset == null) charset = Charset.defaultCharset().toString(); // scrape content - final ContentScraper scraper = new ContentScraper(new DigestURL("http://localhost"), maxLinks, new VocabularyScraper()); + final ContentScraper scraper = new ContentScraper(new DigestURL("http://localhost"), maxLinks, new VocabularyScraper(), timezoneOffset); final Writer writer = new TransformerWriter(null, null, scraper, null, false); FileUtils.copy(new ByteArrayInputStream(page), writer, Charset.forName(charset)); writer.close(); diff --git a/source/net/yacy/document/parser/html/ScraperInputStream.java b/source/net/yacy/document/parser/html/ScraperInputStream.java index b63a56cc4..ae681f97f 100644 --- a/source/net/yacy/document/parser/html/ScraperInputStream.java +++ b/source/net/yacy/document/parser/html/ScraperInputStream.java @@ -64,13 +64,14 @@ public class ScraperInputStream extends InputStream implements ScraperListener { final DigestURL rooturl, final Transformer transformer, final boolean passbyIfBinarySuspect, - final int maxLinks + final int maxLinks, + final int timezoneOffset ) { // create an input stream for buffering this.bufferedIn = new BufferedInputStream(inStream, (int) preBufferSize); this.bufferedIn.mark((int) preBufferSize); - final ContentScraper scraper = new ContentScraper(rooturl, maxLinks, vocabularyScraper); + final ContentScraper scraper = new ContentScraper(rooturl, maxLinks, vocabularyScraper, timezoneOffset); scraper.registerHtmlFilterEventListener(this); try { diff --git a/source/net/yacy/document/parser/htmlParser.java b/source/net/yacy/document/parser/htmlParser.java index db1cf3a23..654716e63 100644 --- a/source/net/yacy/document/parser/htmlParser.java +++
b/source/net/yacy/document/parser/htmlParser.java @@ -87,13 +87,15 @@ public class htmlParser extends AbstractParser implements Parser { public Document[] parse( final AnchorURL location, final String mimeType, - final String documentCharset, final VocabularyScraper vocscraper, + final String documentCharset, + final VocabularyScraper vocscraper, + final int timezoneOffset, final InputStream sourceStream) throws Parser.Failure, InterruptedException { try { // first get a document from the parsed html Charset[] detectedcharsetcontainer = new Charset[]{null}; - final ContentScraper scraper = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, sourceStream, maxLinks); + final ContentScraper scraper = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks); // parseToScraper also detects/corrects/sets charset from html content tag final Document document = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraper); @@ -151,7 +153,7 @@ public class htmlParser extends AbstractParser implements Parser { return ppd; } - public static ContentScraper parseToScraper(final DigestURL location, final String documentCharset, final VocabularyScraper vocabularyScraper, String input, int maxLinks) throws IOException { + public static ContentScraper parseToScraper(final DigestURL location, final String documentCharset, final VocabularyScraper vocabularyScraper, final int timezoneOffset, final String input, final int maxLinks) throws IOException { Charset[] detectedcharsetcontainer = new Charset[]{null}; InputStream sourceStream; try { @@ -161,7 +163,7 @@ public class htmlParser extends AbstractParser implements Parser { } ContentScraper scraper; try { - scraper = parseToScraper(location, documentCharset, vocabularyScraper, detectedcharsetcontainer, sourceStream, maxLinks); + scraper = parseToScraper(location, documentCharset, vocabularyScraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks); } catch (Failure e) { throw new IOException(e.getMessage()); } @@ -173,6 +175,7 @@ public class htmlParser extends AbstractParser implements Parser { final String documentCharset, final VocabularyScraper vocabularyScraper, Charset[] detectedcharsetcontainer, + final int timezoneOffset, InputStream sourceStream, final int maxLinks) throws Parser.Failure, IOException { @@ -188,7 +191,7 @@ public class htmlParser extends AbstractParser implements Parser { if (charset == null) { ScraperInputStream htmlFilter = null; try { - htmlFilter = new ScraperInputStream(sourceStream, documentCharset, vocabularyScraper, location, null, false, maxLinks); + htmlFilter = new ScraperInputStream(sourceStream, documentCharset, vocabularyScraper, location, null, false, maxLinks, timezoneOffset); sourceStream = htmlFilter; charset = htmlFilter.detectCharset(); } catch (final IOException e1) { @@ -222,7 +225,7 @@ public class htmlParser extends AbstractParser implements Parser { } // parsing the content - final ContentScraper scraper = new ContentScraper(location, maxLinks, vocabularyScraper); + final ContentScraper scraper = new ContentScraper(location, maxLinks, vocabularyScraper, timezoneOffset); final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false, Math.max(64, Math.min(4096, sourceStream.available()))); try { FileUtils.copy(sourceStream, writer, detectedcharsetcontainer[0]); @@ -324,7 +327,7 @@ public class htmlParser extends AbstractParser implements Parser { try { url = new 
AnchorURL(args[0]); final byte[] content = url.get(ClientIdentification.yacyInternetCrawlerAgent, null, null); - final Document[] document = new htmlParser().parse(url, "text/html", "utf-8", new VocabularyScraper(), new ByteArrayInputStream(content)); + final Document[] document = new htmlParser().parse(url, "text/html", "utf-8", new VocabularyScraper(), 0, new ByteArrayInputStream(content)); final String title = document[0].dc_title(); System.out.println(title); } catch (final MalformedURLException e) { diff --git a/source/net/yacy/document/parser/images/genericImageParser.java b/source/net/yacy/document/parser/images/genericImageParser.java index db08ac783..4f69b7eb6 100644 --- a/source/net/yacy/document/parser/images/genericImageParser.java +++ b/source/net/yacy/document/parser/images/genericImageParser.java @@ -93,8 +93,10 @@ public class genericImageParser extends AbstractParser implements Parser { public Document[] parse( final AnchorURL location, final String mimeType, - final String documentCharset, final VocabularyScraper scraper, - final InputStream sourceStream) throws Parser.Failure, InterruptedException { + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { ImageInfo ii = null; String title = null; @@ -108,7 +110,7 @@ public class genericImageParser extends AbstractParser implements Parser { if (mimeType.equals("image/bmp") || ext.equals("bmp")) { byte[] b; try { - b = FileUtils.read(sourceStream); + b = FileUtils.read(source); } catch (final IOException e) { ConcurrentLog.logException(e); throw new Parser.Failure(e.getMessage(), location); @@ -126,7 +128,7 @@ public class genericImageParser extends AbstractParser implements Parser { // a tutorial is at: http://www.drewnoakes.com/drewnoakes.com/code/exif/sampleUsage.html byte[] b; try { - b = FileUtils.read(sourceStream); + b = FileUtils.read(source); } catch (final IOException e) { ConcurrentLog.logException(e); throw new Parser.Failure(e.getMessage(), location); @@ -182,7 +184,7 @@ public class genericImageParser extends AbstractParser implements Parser { // just ignore } } else { - ii = parseJavaImage(location, sourceStream); + ii = parseJavaImage(location, source); } final HashSet languages = new HashSet(); @@ -315,7 +317,7 @@ public class genericImageParser extends AbstractParser implements Parser { AnchorURL uri; try { uri = new AnchorURL("http://localhost/" + image.getName()); - final Document[] document = parser.parse(uri, "image/" + MultiProtocolURL.getFileExtension(uri.getFileName()), "UTF-8", new VocabularyScraper(), new FileInputStream(image)); + final Document[] document = parser.parse(uri, "image/" + MultiProtocolURL.getFileExtension(uri.getFileName()), "UTF-8", new VocabularyScraper(), 0, new FileInputStream(image)); System.out.println(document[0].toString()); } catch (final MalformedURLException e) { e.printStackTrace(); diff --git a/source/net/yacy/document/parser/images/metadataImageParser.java b/source/net/yacy/document/parser/images/metadataImageParser.java index eef448faf..04b20b948 100644 --- a/source/net/yacy/document/parser/images/metadataImageParser.java +++ b/source/net/yacy/document/parser/images/metadataImageParser.java @@ -87,8 +87,10 @@ public class metadataImageParser extends AbstractParser implements Parser { public Document[] parse( final AnchorURL location, final String mimeType, - final String documentCharset, final VocabularyScraper scraper, - final InputStream sourceStream) throws 
Parser.Failure, InterruptedException { + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { String title = null; String author = null; @@ -99,7 +101,7 @@ public class metadataImageParser extends AbstractParser implements Parser { StringBuilder imgInfotxt = new StringBuilder(); try { - final Metadata metadata = ImageMetadataReader.readMetadata(new BufferedInputStream(sourceStream)); + final Metadata metadata = ImageMetadataReader.readMetadata(new BufferedInputStream(source)); final Iterator directories = metadata.getDirectories().iterator(); final HashMap props = new HashMap(); @@ -160,7 +162,7 @@ public class metadataImageParser extends AbstractParser implements Parser { return new Document[]{new Document( location, mimeType, - documentCharset, + charset, this, new HashSet(0), // languages keywords == null ? new String[]{} : keywords.split(keywords.indexOf(',') > 0 ? "," : " "), // keywords diff --git a/source/net/yacy/document/parser/linkScraperParser.java b/source/net/yacy/document/parser/linkScraperParser.java index 4c0abbdd4..f0ccbe4d9 100644 --- a/source/net/yacy/document/parser/linkScraperParser.java +++ b/source/net/yacy/document/parser/linkScraperParser.java @@ -59,11 +59,16 @@ public class linkScraperParser extends AbstractParser implements Parser { this.SUPPORTED_MIME_TYPES.add("text/sgml"); } @Override - public Document[] parse(final AnchorURL location, final String mimeType, - final String charset, final VocabularyScraper scraper, final InputStream source) + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { - Document[] htmlParserDocs = new htmlParser().parse(location, mimeType, charset, scraper, source); + Document[] htmlParserDocs = new htmlParser().parse(location, mimeType, charset, scraper, timezoneOffset, source); Document htmlParserDoc = htmlParserDocs == null ? 
null : Document.mergeDocuments(location, mimeType, htmlParserDocs); diff --git a/source/net/yacy/document/parser/mmParser.java b/source/net/yacy/document/parser/mmParser.java index 0781eea3c..686b9cddb 100644 --- a/source/net/yacy/document/parser/mmParser.java +++ b/source/net/yacy/document/parser/mmParser.java @@ -71,8 +71,13 @@ public class mmParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL location, final String mimeType, - final String charset, final VocabularyScraper scraper, final InputStream source) + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { final StringBuilder sb = new StringBuilder(); diff --git a/source/net/yacy/document/parser/odtParser.java b/source/net/yacy/document/parser/odtParser.java index 588d1432d..2f574f0c0 100644 --- a/source/net/yacy/document/parser/odtParser.java +++ b/source/net/yacy/document/parser/odtParser.java @@ -216,7 +216,13 @@ public class odtParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { File dest = null; try { // creating a tempfile diff --git a/source/net/yacy/document/parser/ooxmlParser.java b/source/net/yacy/document/parser/ooxmlParser.java index 6535c95ed..9072938f4 100644 --- a/source/net/yacy/document/parser/ooxmlParser.java +++ b/source/net/yacy/document/parser/ooxmlParser.java @@ -202,7 +202,13 @@ public class ooxmlParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { File dest = null; try { // creating a tempfile diff --git a/source/net/yacy/document/parser/pdfParser.java b/source/net/yacy/document/parser/pdfParser.java index 52df35bba..1a526a6f5 100644 --- a/source/net/yacy/document/parser/pdfParser.java +++ b/source/net/yacy/document/parser/pdfParser.java @@ -86,7 +86,13 @@ public class pdfParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { // check memory for parser if (!MemoryControl.request(200 * 1024 * 1024, false)) @@ -376,7 +382,7 @@ public class pdfParser extends AbstractParser implements Parser { final AbstractParser parser = new pdfParser(); Document 
document = null; try { - document = Document.mergeDocuments(null, "application/pdf", parser.parse(null, "application/pdf", null, new VocabularyScraper(), new FileInputStream(pdfFile))); + document = Document.mergeDocuments(null, "application/pdf", parser.parse(null, "application/pdf", null, new VocabularyScraper(), 0, new FileInputStream(pdfFile))); } catch (final Parser.Failure e) { System.err.println("Cannot parse file " + pdfFile.getAbsolutePath()); ConcurrentLog.logException(e); diff --git a/source/net/yacy/document/parser/pptParser.java b/source/net/yacy/document/parser/pptParser.java index 0f793b0f2..f05cf8dec 100644 --- a/source/net/yacy/document/parser/pptParser.java +++ b/source/net/yacy/document/parser/pptParser.java @@ -64,8 +64,13 @@ public class pptParser extends AbstractParser implements Parser { * all extracted information about the parsed document */ @Override - public Document[] parse(final AnchorURL location, final String mimeType, - final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { try { /* diff --git a/source/net/yacy/document/parser/psParser.java b/source/net/yacy/document/parser/psParser.java index 09cda757e..e25f6439c 100644 --- a/source/net/yacy/document/parser/psParser.java +++ b/source/net/yacy/document/parser/psParser.java @@ -258,8 +258,13 @@ public class psParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL location, final String mimeType, - final String charset, final VocabularyScraper scraper, final InputStream source) + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { File tempFile = null; diff --git a/source/net/yacy/document/parser/rdfParser.java b/source/net/yacy/document/parser/rdfParser.java index 6f3b6fee8..dba55415b 100644 --- a/source/net/yacy/document/parser/rdfParser.java +++ b/source/net/yacy/document/parser/rdfParser.java @@ -46,8 +46,13 @@ public class rdfParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL url, final String mimeType, - final String charset, final VocabularyScraper scraper, final InputStream source) + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Failure, InterruptedException { @@ -60,7 +65,7 @@ public class rdfParser extends AbstractParser implements Parser { Document doc; String all = "rdfdatasource"; - doc = new Document(url, mimeType, charset, null, null, null, singleList(""), "", + doc = new Document(location, mimeType, charset, null, null, null, singleList(""), "", "", null, new ArrayList(0), 0, 0, all, null, null, null, false, new Date()); docs.add(doc); diff --git a/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java b/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java index 2a36f962d..f95cca2ae 100644 --- a/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java +++ b/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java @@ -48,11 +48,16 @@ public class RDFaParser extends 
AbstractParser implements Parser { } @Override - public Document[] parse(AnchorURL url, String mimeType, - String charset, final VocabularyScraper scraper, InputStream source) throws Failure, + public Document[] parse( + final AnchorURL url, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Failure, InterruptedException { - Document[] htmlDocs = parseHtml(url, mimeType, charset, scraper, source); + Document[] htmlDocs = parseHtml(url, mimeType, charset, scraper, timezoneOffset, source); // TODO: current hardcoded restriction: apply rdfa parser only on selected sources. @@ -97,13 +102,18 @@ public class RDFaParser extends AbstractParser implements Parser { return doc; } - private Document[] parseHtml(AnchorURL url, String mimeType, - String charset, VocabularyScraper scraper, InputStream source) throws Failure, + private Document[] parseHtml( + final AnchorURL url, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Failure, InterruptedException { Document[] htmlDocs = null; try { - htmlDocs = this.hp.parse(url, mimeType, charset, scraper, source); + htmlDocs = this.hp.parse(url, mimeType, charset, scraper, timezoneOffset, source); source.reset(); } catch (final IOException e1) { @@ -180,7 +190,7 @@ public class RDFaParser extends AbstractParser implements Parser { if (aReader != null) { RDFaParser aParser = new RDFaParser(); try { - aParser.parse(new AnchorURL(args[0]), "", "", new VocabularyScraper(), aURL.openStream()); + aParser.parse(new AnchorURL(args[0]), "", "", new VocabularyScraper(), 0, aURL.openStream()); } catch (final FileNotFoundException e) { e.printStackTrace(); } catch (final IOException e) { diff --git a/source/net/yacy/document/parser/rssParser.java b/source/net/yacy/document/parser/rssParser.java index f58a14441..7005e85fe 100644 --- a/source/net/yacy/document/parser/rssParser.java +++ b/source/net/yacy/document/parser/rssParser.java @@ -59,14 +59,19 @@ public class rssParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL feedurl, final String mimeType, - final String charset, final VocabularyScraper scraper, final InputStream source) + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Failure, InterruptedException { RSSReader rssReader; try { rssReader = new RSSReader(RSSFeed.DEFAULT_MAXSIZE, source); } catch (final IOException e) { - throw new Parser.Failure("Load error:" + e.getMessage(), feedurl, e); + throw new Parser.Failure("Load error:" + e.getMessage(), location, e); } final RSSFeed feed = rssReader.getFeed(); diff --git a/source/net/yacy/document/parser/rtfParser.java b/source/net/yacy/document/parser/rtfParser.java index 06d7bd5ee..e6ea7d334 100644 --- a/source/net/yacy/document/parser/rtfParser.java +++ b/source/net/yacy/document/parser/rtfParser.java @@ -53,8 +53,13 @@ public class rtfParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL location, final String mimeType, - final String charset, final VocabularyScraper scraper, final InputStream source) + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final 
InputStream source) throws Parser.Failure, InterruptedException { try { diff --git a/source/net/yacy/document/parser/sevenzipParser.java b/source/net/yacy/document/parser/sevenzipParser.java index 5c22533aa..ddfdd8153 100644 --- a/source/net/yacy/document/parser/sevenzipParser.java +++ b/source/net/yacy/document/parser/sevenzipParser.java @@ -56,7 +56,12 @@ public class sevenzipParser extends AbstractParser implements Parser { this.SUPPORTED_MIME_TYPES.add("application/x-7z-compressed"); } - public Document parse(final AnchorURL location, final String mimeType, final String charset, final IInStream source) throws Parser.Failure, InterruptedException { + public Document parse( + final AnchorURL location, + final String mimeType, + final String charset, + final int timezoneOffset, + final IInStream source) throws Parser.Failure, InterruptedException { final Document doc = new Document( location, mimeType, @@ -83,7 +88,7 @@ public class sevenzipParser extends AbstractParser implements Parser { } catch (final IOException e) { throw new Parser.Failure("error opening 7zip archive: " + e.getMessage(), location); } - final SZParserExtractCallback aec = new SZParserExtractCallback(AbstractParser.log, archive, doc, location.getFile()); + final SZParserExtractCallback aec = new SZParserExtractCallback(AbstractParser.log, archive, doc, location.getFile(), timezoneOffset); AbstractParser.log.fine("processing archive contents..."); try { archive.Extract(null, -1, 0, aec); @@ -101,16 +106,27 @@ public class sevenzipParser extends AbstractParser implements Parser { } } - public Document parse(final AnchorURL location, final String mimeType, final String charset, final byte[] source) throws Parser.Failure, InterruptedException { - return parse(location, mimeType, charset, new ByteArrayIInStream(source)); + public Document parse( + final AnchorURL location, + final String mimeType, + final String charset, + final int timezoneOffset, + final byte[] source) throws Parser.Failure, InterruptedException { + return parse(location, mimeType, charset, timezoneOffset, new ByteArrayIInStream(source)); } @Override - public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { try { final ByteArrayOutputStream cfos = new ByteArrayOutputStream(); FileUtils.copy(source, cfos); - return new Document[]{parse(location, mimeType, charset, cfos.toByteArray())}; + return new Document[]{parse(location, mimeType, charset, timezoneOffset, cfos.toByteArray())}; } catch (final IOException e) { throw new Parser.Failure("error processing 7zip archive: " + e.getMessage(), location); } @@ -124,13 +140,19 @@ public class sevenzipParser extends AbstractParser implements Parser { private ByteArrayOutputStream cfos = null; private final Document doc; private final String prefix; + private final int timezoneOffset; - public SZParserExtractCallback(final ConcurrentLog logger, final IInArchive handler, - final Document doc, final String prefix) { + public SZParserExtractCallback( + final ConcurrentLog logger, + final IInArchive handler, + final Document doc, + final String prefix, + final int timezoneOffset) { super.Init(handler); this.log = logger; this.doc = doc; 
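// the construction-time offset is kept on this callback: every entry extracted from the 7zip archive is re-parsed through TextParser.parseSource with the same timezoneOffset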
this.prefix = prefix; + this.timezoneOffset = timezoneOffset; } @Override @@ -172,7 +194,7 @@ public class sevenzipParser extends AbstractParser implements Parser { // below for reversion of the effects final AnchorURL url = AnchorURL.newAnchor(this.doc.dc_source(), this.prefix + "/" + super.filePath); final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1)); - theDocs = TextParser.parseSource(url, mime, null, new VocabularyScraper(), this.doc.getDepth() + 1, this.cfos.toByteArray()); + theDocs = TextParser.parseSource(url, mime, null, new VocabularyScraper(), timezoneOffset, this.doc.getDepth() + 1, this.cfos.toByteArray()); this.doc.addSubDocuments(theDocs); } diff --git a/source/net/yacy/document/parser/sidAudioParser.java b/source/net/yacy/document/parser/sidAudioParser.java index 4f1cbf5c1..1eb216a3b 100644 --- a/source/net/yacy/document/parser/sidAudioParser.java +++ b/source/net/yacy/document/parser/sidAudioParser.java @@ -58,8 +58,13 @@ public class sidAudioParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL location, final String mimeType, - final String charset, final VocabularyScraper scraper, final InputStream source) + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { try { final int available = source.available(); diff --git a/source/net/yacy/document/parser/sitemapParser.java b/source/net/yacy/document/parser/sitemapParser.java index ecc5eb393..11742179f 100644 --- a/source/net/yacy/document/parser/sitemapParser.java +++ b/source/net/yacy/document/parser/sitemapParser.java @@ -70,8 +70,13 @@ public class sitemapParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL url, final String mimeType, - final String charset, final VocabularyScraper scraper, final InputStream source) + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Failure, InterruptedException { final List docs = new ArrayList(); SitemapReader sitemap = new SitemapReader(source, ClientIdentification.yacyInternetCrawlerAgent); @@ -83,7 +88,7 @@ public class sitemapParser extends AbstractParser implements Parser { uri = new DigestURL(item.loc); doc = new Document( uri, - TextParser.mimeOf(url), + TextParser.mimeOf(location), charset, this, null, @@ -224,7 +229,7 @@ public class sitemapParser extends AbstractParser implements Parser { public Date lastmod(final Date dflt) { try { - return ISO8601Formatter.FORMATTER.parse(this.lastmod); + return ISO8601Formatter.FORMATTER.parse(this.lastmod, 0).getTime(); } catch (final ParseException e) { return dflt; } @@ -245,7 +250,7 @@ public class sitemapParser extends AbstractParser implements Parser { public Date lastmod(final Date dflt) { try { - return ISO8601Formatter.FORMATTER.parse(this.lastmod); + return ISO8601Formatter.FORMATTER.parse(this.lastmod, 0).getTime(); } catch (final ParseException e) { return dflt; } diff --git a/source/net/yacy/document/parser/swfParser.java b/source/net/yacy/document/parser/swfParser.java index ac1c9c2ce..502782b3b 100644 --- a/source/net/yacy/document/parser/swfParser.java +++ b/source/net/yacy/document/parser/swfParser.java @@ -56,8 +56,13 @@ public class 
swfParser extends AbstractParser implements Parser { * all extracted information about the parsed document */ @Override - public Document[] parse(final AnchorURL location, final String mimeType, - final String charset, final VocabularyScraper scraper, final InputStream source) + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { diff --git a/source/net/yacy/document/parser/tarParser.java b/source/net/yacy/document/parser/tarParser.java index e9bdb96bc..52a84e296 100644 --- a/source/net/yacy/document/parser/tarParser.java +++ b/source/net/yacy/document/parser/tarParser.java @@ -62,16 +62,22 @@ public class tarParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL url, final String mimeType, final String charset, final VocabularyScraper scraper, InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + InputStream source) throws Parser.Failure, InterruptedException { final List<Document> docacc = new ArrayList<Document>(); Document[] subDocs = null; - final String ext = MultiProtocolURL.getFileExtension(url.getFileName()); + final String ext = MultiProtocolURL.getFileExtension(location.getFileName()); if (ext.equals("gz") || ext.equals("tgz")) { try { source = new GZIPInputStream(source); } catch (final IOException e) { - throw new Parser.Failure("tar parser: " + e.getMessage(), url); + throw new Parser.Failure("tar parser: " + e.getMessage(), location); } } TarEntry entry; @@ -91,7 +97,7 @@ public class tarParser extends AbstractParser implements Parser { try { tmp = FileUtils.createTempFile(this.getClass(), name); FileUtils.copy(tis, tmp, entry.getSize()); - subDocs = TextParser.parseSource(AnchorURL.newAnchor(url, "#" + name), mime, null, scraper, 999, tmp); + subDocs = TextParser.parseSource(AnchorURL.newAnchor(location, "#" + name), mime, null, scraper, timezoneOffset, 999, tmp); if (subDocs == null) continue; for (final Document d: subDocs) docacc.add(d); } catch (final Parser.Failure e) { diff --git a/source/net/yacy/document/parser/torrentParser.java b/source/net/yacy/document/parser/torrentParser.java index abe9caed4..3b096ebf1 100644 --- a/source/net/yacy/document/parser/torrentParser.java +++ b/source/net/yacy/document/parser/torrentParser.java @@ -57,7 +57,13 @@ public class torrentParser extends AbstractParser implements Parser { } @Override - public Document[] parse(AnchorURL location, String mimeType, String charset, final VocabularyScraper scraper, InputStream source) + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { byte[] b = null; try { @@ -120,8 +126,8 @@ public class torrentParser extends AbstractParser implements Parser { try { byte[] b = FileUtils.read(new File(args[0])); torrentParser parser = new torrentParser(); - Document[] d = parser.parse(new AnchorURL("http://localhost/test.torrent"), null, "UTF-8", new VocabularyScraper(), new ByteArrayInputStream(b)); - Condenser c = new Condenser(d[0], null, true, true, LibraryProvider.dymLib, false, false); + Document[] d = parser.parse(new
AnchorURL("http://localhost/test.torrent"), null, "UTF-8", new VocabularyScraper(), 0, new ByteArrayInputStream(b)); + Condenser c = new Condenser(d[0], null, true, true, LibraryProvider.dymLib, false, false, 0); Map<String, Word> w = c.words(); for (Map.Entry<String, Word> e: w.entrySet()) System.out.println("Word: " + e.getKey() + " - " + e.getValue().posInText); } catch (final IOException e) { diff --git a/source/net/yacy/document/parser/vcfParser.java b/source/net/yacy/document/parser/vcfParser.java index 107e89feb..f4c4120e2 100644 --- a/source/net/yacy/document/parser/vcfParser.java +++ b/source/net/yacy/document/parser/vcfParser.java @@ -66,7 +66,13 @@ public class vcfParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL url, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { try { @@ -201,7 +207,7 @@ public class vcfParser extends AbstractParser implements Parser { } else { if (AbstractParser.log.isFinest()) AbstractParser.log.finest("Invalid data in vcf file" + - "\n\tURL: " + url + + "\n\tURL: " + location + "\n\tLine: " + line + "\n\tLine-Nr: " + lineNr); } @@ -212,7 +218,7 @@ public class vcfParser extends AbstractParser implements Parser { final byte[] text = UTF8.getBytes(parsedDataText.toString()); final List<String> descriptions = new ArrayList<String>(1); descriptions.add("vCard"); return new Document[]{new Document( - url, // url of the source document + location, // url of the source document mimeType, // the document's mime type null, // charset this, @@ -234,7 +240,7 @@ public class vcfParser extends AbstractParser implements Parser { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e; - throw new Parser.Failure("Unexpected error while parsing vcf resource. " + e.getMessage(),url); + throw new Parser.Failure("Unexpected error while parsing vcf resource. " + e.getMessage(), location); } } diff --git a/source/net/yacy/document/parser/vsdParser.java b/source/net/yacy/document/parser/vsdParser.java index 9e53f1085..16290f363 100644 --- a/source/net/yacy/document/parser/vsdParser.java +++ b/source/net/yacy/document/parser/vsdParser.java @@ -67,7 +67,13 @@ public class vsdParser extends AbstractParser implements Parser { * all extracted information about the parsed document */ @Override - public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { Document theDoc = null; diff --git a/source/net/yacy/document/parser/xlsParser.java b/source/net/yacy/document/parser/xlsParser.java index 40c925493..cf178c85e 100644 --- a/source/net/yacy/document/parser/xlsParser.java +++ b/source/net/yacy/document/parser/xlsParser.java @@ -68,8 +68,13 @@ public class xlsParser extends AbstractParser implements Parser { * all extracted information about the parsed document */ @Override - public Document[] parse(final AnchorURL location, final String mimeType, - final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { return new XLSHSSFListener().parse(location, mimeType, charset, source); } diff --git a/source/net/yacy/document/parser/zipParser.java b/source/net/yacy/document/parser/zipParser.java index 2438354f1..a924a6e03 100644 --- a/source/net/yacy/document/parser/zipParser.java +++ b/source/net/yacy/document/parser/zipParser.java @@ -62,12 +62,17 @@ public class zipParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final AnchorURL url, final String mimeType, - final String charset, final VocabularyScraper scraper, final InputStream source) + public Document[] parse( + final AnchorURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { // check memory for parser if (!MemoryControl.request(200 * 1024 * 1024, false)) - throw new Parser.Failure("Not enough Memory available for zip parser: " + MemoryControl.available(), url); + throw new Parser.Failure("Not enough Memory available for zip parser: " + MemoryControl.available(), location); Document[] docs = null; final List<Document> docacc = new ArrayList<Document>(); @@ -88,9 +93,9 @@ public class zipParser extends AbstractParser implements Parser { try { tmp = FileUtils.createTempFile(this.getClass(), name); FileUtils.copy(zis, tmp, entry.getSize()); - final DigestURL virtualURL = DigestURL.newURL(url, "#" + name); + final DigestURL virtualURL = DigestURL.newURL(location, "#" + name); //this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false)); - docs = TextParser.parseSource(new AnchorURL(virtualURL), mime, null, scraper, 999, tmp); + docs = TextParser.parseSource(new AnchorURL(virtualURL), mime, null, scraper, timezoneOffset, 999, tmp); if (docs == null) continue; for (final Document d: docs) docacc.add(d); } catch (final
Parser.Failure e) { diff --git a/source/net/yacy/http/ProxyCacheHandler.java b/source/net/yacy/http/ProxyCacheHandler.java index d5417b641..50a200025 100644 --- a/source/net/yacy/http/ProxyCacheHandler.java +++ b/source/net/yacy/http/ProxyCacheHandler.java @@ -74,7 +74,8 @@ public class ProxyCacheHandler extends AbstractRemoteHandler implements Handler "", cachedResponseHeader.lastModified(), sb.crawler.defaultProxyProfile.handle(), - 0); + 0, + sb.crawler.defaultProxyProfile.timezoneOffset()); final Response cachedResponse = new Response( yacyRequest, diff --git a/source/net/yacy/http/ProxyHandler.java b/source/net/yacy/http/ProxyHandler.java index 2658e031a..d558aac0a 100644 --- a/source/net/yacy/http/ProxyHandler.java +++ b/source/net/yacy/http/ProxyHandler.java @@ -180,7 +180,8 @@ public class ProxyHandler extends AbstractRemoteHandler implements Handler { "", responseHeaderLegacy.lastModified(), sb.crawler.defaultProxyProfile.handle(), - 0); //sizeBeforeDelete < 0 ? 0 : sizeBeforeDelete); + 0, + sb.crawler.defaultProxyProfile.timezoneOffset()); //sizeBeforeDelete < 0 ? 0 : sizeBeforeDelete); final Response yacyResponse = new Response( yacyRequest, null, diff --git a/source/net/yacy/http/servlets/SolrSelectServlet.java b/source/net/yacy/http/servlets/SolrSelectServlet.java index 36fc7aa80..bba3de81f 100644 --- a/source/net/yacy/http/servlets/SolrSelectServlet.java +++ b/source/net/yacy/http/servlets/SolrSelectServlet.java @@ -137,7 +137,7 @@ public class SolrSelectServlet extends HttpServlet { if (!mmsp.getMap().containsKey(CommonParams.Q) && mmsp.getMap().containsKey(CommonParams.QUERY)) { querystring = mmsp.get(CommonParams.QUERY, ""); mmsp.getMap().remove(CommonParams.QUERY); - QueryModifier modifier = new QueryModifier(); + QueryModifier modifier = new QueryModifier(0); querystring = modifier.parse(querystring); modifier.apply(mmsp); QueryGoal qg = new QueryGoal(querystring); diff --git a/source/net/yacy/kelondro/blob/ArrayStack.java b/source/net/yacy/kelondro/blob/ArrayStack.java index bf2e1d781..f577cc119 100644 --- a/source/net/yacy/kelondro/blob/ArrayStack.java +++ b/source/net/yacy/kelondro/blob/ArrayStack.java @@ -172,7 +172,7 @@ public class ArrayStack implements BLOB { f.delete(); deletions = true; } else try { - d = GenericFormatter.SHORT_SECOND_FORMATTER.parse(file.substring(0, 14)); + d = GenericFormatter.SHORT_SECOND_FORMATTER.parse(file.substring(0, 14), 0).getTime(); f.renameTo(newBLOB(d)); deletions = true; } catch (final ParseException e) {continue;} @@ -188,7 +188,7 @@ public class ArrayStack implements BLOB { for (final String file : files) { if (file.length() >= 22 && file.charAt(this.prefix.length()) == '.' && file.endsWith(".blob")) { try { - d = my_SHORT_MILSEC_FORMATTER.parse(file.substring(this.prefix.length() + 1, this.prefix.length() + 18)); + d = my_SHORT_MILSEC_FORMATTER.parse(file.substring(this.prefix.length() + 1, this.prefix.length() + 18), 0).getTime(); time = d.getTime(); if (time > maxtime) maxtime = time; } catch (final ParseException e) {continue;} @@ -199,7 +199,7 @@ public class ArrayStack implements BLOB { for (final String file : files) { if (file.length() >= 22 && file.charAt(this.prefix.length()) == '.' 
&& file.endsWith(".blob")) { try { - d = my_SHORT_MILSEC_FORMATTER.parse(file.substring(this.prefix.length() + 1, this.prefix.length() + 18)); + d = my_SHORT_MILSEC_FORMATTER.parse(file.substring(this.prefix.length() + 1, this.prefix.length() + 18), 0).getTime(); f = new File(heapLocation, file); time = d.getTime(); try { @@ -253,7 +253,7 @@ public class ArrayStack implements BLOB { public synchronized void mountBLOB(final File location, final boolean full) throws IOException { Date d; try { - d = my_SHORT_MILSEC_FORMATTER.parse(location.getName().substring(this.prefix.length() + 1, this.prefix.length() + 18)); + d = my_SHORT_MILSEC_FORMATTER.parse(location.getName().substring(this.prefix.length() + 1, this.prefix.length() + 18), 0).getTime(); } catch (final ParseException e) { throw new IOException("date parse problem with file " + location.toString() + ": " + e.getMessage()); } diff --git a/source/net/yacy/kelondro/blob/BEncodedHeapBag.java b/source/net/yacy/kelondro/blob/BEncodedHeapBag.java index 3b7ae63c9..1c55cb8d3 100644 --- a/source/net/yacy/kelondro/blob/BEncodedHeapBag.java +++ b/source/net/yacy/kelondro/blob/BEncodedHeapBag.java @@ -95,7 +95,7 @@ public class BEncodedHeapBag extends AbstractMapStore implements MapStore { (element.length() == this.prefix.length() + 23)) { f = new File(this.baseDir, element); try { - d = GenericFormatter.SHORT_MILSEC_FORMATTER.parse(element.substring(this.prefix.length() + 1, this.prefix.length() + 18)); + d = GenericFormatter.SHORT_MILSEC_FORMATTER.parse(element.substring(this.prefix.length() + 1, this.prefix.length() + 18), 0).getTime(); } catch (final ParseException e) { ConcurrentLog.severe("BEncodedHeapBag", "", e); continue; @@ -203,7 +203,7 @@ public class BEncodedHeapBag extends AbstractMapStore implements MapStore { final String name = heap.getFile().getName(); long d; try { - d = GenericFormatter.SHORT_MILSEC_FORMATTER.parse(name.substring(this.prefix.length() + 1, this.prefix.length() + 18)).getTime(); + d = GenericFormatter.SHORT_MILSEC_FORMATTER.parse(name.substring(this.prefix.length() + 1, this.prefix.length() + 18), 0).getTime().getTime(); } catch (final ParseException e) { ConcurrentLog.severe("BEncodedHeapBag", "", e); d = 0; diff --git a/source/net/yacy/kelondro/blob/Tables.java b/source/net/yacy/kelondro/blob/Tables.java index 7dc399db8..bd9de5329 100644 --- a/source/net/yacy/kelondro/blob/Tables.java +++ b/source/net/yacy/kelondro/blob/Tables.java @@ -764,7 +764,7 @@ public class Tables implements Iterable { final byte[] r = this.get(colname); if (r == null) return dflt; try { - return my_SHORT_MILSEC_FORMATTER.parse(UTF8.String(r)); + return my_SHORT_MILSEC_FORMATTER.parse(UTF8.String(r), 0).getTime(); } catch (final ParseException e) { return dflt; } diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java index 17f2c772d..cbe6ccc52 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java @@ -107,17 +107,17 @@ public class URIMetadataNode extends SolrDocument { final GenericFormatter formatter = new GenericFormatter(GenericFormatter.FORMAT_SHORT_DAY, GenericFormatter.time_minute); try { - this.setField(CollectionSchema.last_modified.name(), formatter.parse(prop.getProperty("mod", "20000101"))); + this.setField(CollectionSchema.last_modified.name(), formatter.parse(prop.getProperty("mod", "20000101"), 0).getTime()); } catch (final ParseException e) { 
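// the abbreviated "mod" (last modified) property could not be parsed: fall back to the current date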
this.setField(CollectionSchema.last_modified.name(), new Date()); } try { - this.setField(CollectionSchema.load_date_dt.name(), formatter.parse(prop.getProperty("load", "20000101"))); + this.setField(CollectionSchema.load_date_dt.name(), formatter.parse(prop.getProperty("load", "20000101"), 0).getTime()); } catch (final ParseException e) { this.setField(CollectionSchema.load_date_dt.name(), new Date()); } try { - this.setField(CollectionSchema.fresh_date_dt.name(), formatter.parse(prop.getProperty("fresh", "20000101"))); + this.setField(CollectionSchema.fresh_date_dt.name(), formatter.parse(prop.getProperty("fresh", "20000101"), 0).getTime()); } catch (final ParseException e) { this.setField(CollectionSchema.fresh_date_dt.name(), new Date()); } diff --git a/source/net/yacy/kelondro/table/SplitTable.java b/source/net/yacy/kelondro/table/SplitTable.java index a70c0ff1f..ca8bbf90e 100644 --- a/source/net/yacy/kelondro/table/SplitTable.java +++ b/source/net/yacy/kelondro/table/SplitTable.java @@ -179,7 +179,7 @@ public class SplitTable implements Index, Iterable { (element.length() == this.prefix.length() + 24)) { f = new File(this.path, element); try { - d = GenericFormatter.SHORT_MILSEC_FORMATTER.parse(element.substring(this.prefix.length() + 1, this.prefix.length() + 18)); + d = GenericFormatter.SHORT_MILSEC_FORMATTER.parse(element.substring(this.prefix.length() + 1, this.prefix.length() + 18), 0).getTime(); } catch (final ParseException e) { ConcurrentLog.severe("SplitTable", "", e); continue; @@ -372,7 +372,7 @@ public class SplitTable implements Index, Iterable { final String name = new File(table.filename()).getName(); long d; try { - d = GenericFormatter.SHORT_MILSEC_FORMATTER.parse(name.substring(this.prefix.length() + 1, this.prefix.length() + 18)).getTime(); + d = GenericFormatter.SHORT_MILSEC_FORMATTER.parse(name.substring(this.prefix.length() + 1, this.prefix.length() + 18), 0).getTime().getTime(); } catch (final ParseException e) { ConcurrentLog.severe("SplitTable", "", e); d = 0; diff --git a/source/net/yacy/peers/NewsDB.java b/source/net/yacy/peers/NewsDB.java index f6926d512..e2dbbde26 100644 --- a/source/net/yacy/peers/NewsDB.java +++ b/source/net/yacy/peers/NewsDB.java @@ -46,6 +46,8 @@ package net.yacy.peers; import java.io.File; import java.io.IOException; +import java.text.ParseException; +import java.util.Calendar; import java.util.Date; import java.util.HashMap; import java.util.Iterator; @@ -164,10 +166,16 @@ public class NewsDB { private Record b2r(final Row.Entry b) { if (b == null) return null; + Calendar c; + try { + c = b.empty(2) ? null : my_SHORT_SECOND_FORMATTER.parse(b.getColASCII(2), 0); + } catch (ParseException e) { + c = null; + } return new NewsDB.Record( b.getPrimaryKeyASCII(), b.getColUTF8(1), - (b.empty(2)) ? null : my_SHORT_SECOND_FORMATTER.parse(b.getColASCII(2), GenericFormatter.UTCDiffString()), + c == null ? 
null : c.getTime(), (int) b.getColLong(3), MapTools.string2map(b.getColUTF8(4), ",") ); @@ -226,8 +234,8 @@ public class NewsDB { public class Record { private final String originator; // hash of originating peer - private final Date created; // Date when news was created by originator - private final Date received; // Date when news was received here at this peer + private Date created; // Date when news was created by originator + private Date received; // Date when news was received here at this peer private final String category; // keyword that addresses possible actions private int distributed; // counter that counts number of distributions of this news record private final Map attributes; // elements of the news for a special category @@ -238,8 +246,16 @@ public class NewsDB { if (this.attributes.toString().length() > NewsDB.this.attributesMaxLength) throw new IllegalArgumentException("attributes length (" + this.attributes.toString().length() + ") exceeds maximum (" + NewsDB.this.attributesMaxLength + ")"); this.category = (this.attributes.containsKey("cat")) ? this.attributes.get("cat") : ""; if (this.category.length() > NewsDB.categoryStringLength) throw new IllegalArgumentException("category length (" + this.category.length() + ") exceeds maximum (" + NewsDB.categoryStringLength + ")"); - this.received = (this.attributes.containsKey("rec")) ? my_SHORT_SECOND_FORMATTER.parse(this.attributes.get("rec"), GenericFormatter.UTCDiffString()) : new Date(); - this.created = (this.attributes.containsKey("cre")) ? my_SHORT_SECOND_FORMATTER.parse(this.attributes.get("cre"), GenericFormatter.UTCDiffString()) : new Date(); + try { + this.received = (this.attributes.containsKey("rec")) ? my_SHORT_SECOND_FORMATTER.parse(this.attributes.get("rec"), 0).getTime() : new Date(); + } catch (ParseException e) { + this.received = new Date(); + } + try { + this.created = (this.attributes.containsKey("cre")) ? my_SHORT_SECOND_FORMATTER.parse(this.attributes.get("cre"), 0).getTime() : new Date(); + } catch (ParseException e) { + this.created = new Date(); + } this.distributed = (this.attributes.containsKey("dis")) ? Integer.parseInt(this.attributes.get("dis")) : 0; this.originator = (this.attributes.containsKey("ori")) ? 
            this.originator = (this.attributes.containsKey("ori")) ? this.attributes.get("ori") : "";
            removeStandards();
@@ -262,7 +278,11 @@ public class NewsDB {
            if (attributes.toString().length() > NewsDB.this.attributesMaxLength) throw new IllegalArgumentException("attributes length (" + attributes.toString().length() + ") exceeds maximum (" + NewsDB.this.attributesMaxLength + ")");
            this.attributes = attributes;
            this.received = received;
-            this.created = my_SHORT_SECOND_FORMATTER.parse(id.substring(0, GenericFormatter.PATTERN_SHORT_SECOND.length()), GenericFormatter.UTCDiffString());
+            try {
+                this.created = my_SHORT_SECOND_FORMATTER.parse(id.substring(0, GenericFormatter.PATTERN_SHORT_SECOND.length()), 0).getTime();
+            } catch (ParseException e) {
+                this.created = new Date();
+            }
            this.category = category;
            this.distributed = distributed;
            this.originator = id.substring(GenericFormatter.PATTERN_SHORT_SECOND.length());
diff --git a/source/net/yacy/peers/Seed.java b/source/net/yacy/peers/Seed.java
index 64c5f9938..c7e44bf79 100644
--- a/source/net/yacy/peers/Seed.java
+++ b/source/net/yacy/peers/Seed.java
@@ -797,7 +797,7 @@ public class Seed implements Cloneable, Comparable, Comparator
        try {
            final GenericFormatter my_SHORT_SECOND_FORMATTER = new GenericFormatter(GenericFormatter.FORMAT_SHORT_SECOND, GenericFormatter.time_second); // use our own formatter to prevent concurrency locks with other processes
-            final long t = my_SHORT_SECOND_FORMATTER.parse(get(Seed.LASTSEEN, "20040101000000")).getTime();
+            final long t = my_SHORT_SECOND_FORMATTER.parse(get(Seed.LASTSEEN, "20040101000000"), 0).getTime().getTime();
            // getTime creates a UTC time number. But in this case java thinks, that the given
            // time string is a local time, which has a local UTC offset applied.
            // Therefore java subtracts the local UTC offset, to get a UTC number.
@@ -831,7 +831,7 @@ public class Seed implements Cloneable, Comparable, Comparator
        try {
            final GenericFormatter my_SHORT_SECOND_FORMATTER = new GenericFormatter(GenericFormatter.FORMAT_SHORT_SECOND, GenericFormatter.time_second); // use our own formatter to prevent concurrency locks with other processes
-            b = my_SHORT_SECOND_FORMATTER.parse(get(Seed.BDATE, "20040101000000")).getTime();
+            b = my_SHORT_SECOND_FORMATTER.parse(get(Seed.BDATE, "20040101000000"), 0).getTime().getTime();
        } catch (final ParseException e ) {
            b = System.currentTimeMillis();
        }
diff --git a/source/net/yacy/peers/graphics/WebStructureGraph.java b/source/net/yacy/peers/graphics/WebStructureGraph.java
index d6b7f3139..5c3bea554 100644
--- a/source/net/yacy/peers/graphics/WebStructureGraph.java
+++ b/source/net/yacy/peers/graphics/WebStructureGraph.java
@@ -503,7 +503,7 @@ public class WebStructureGraph {
                hr = new HostReference(
                    ASCII.getBytes(sentry.hosthash),
-                    GenericFormatter.SHORT_DAY_FORMATTER.parse(sentry.date).getTime(),
+                    GenericFormatter.SHORT_DAY_FORMATTER.parse(sentry.date, 0).getTime().getTime(),
                    refhosthashandcounter.getValue().intValue());
            } catch (final ParseException e ) {
                continue refloop;
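The .getTime().getTime() chains in Seed and WebStructureGraph follow mechanically from the new return type: the first call unwraps the Calendar to a Date, the second converts the Date to epoch milliseconds. Spelled out step by step (the formatter is taken from the diff; the timestamp value is illustrative):

    // parse(...) now yields a Calendar; two conversions reach epoch milliseconds.
    Calendar cal = GenericFormatter.SHORT_DAY_FORMATTER.parse("20160101", 0); // 0 = stored value is UTC
    Date date = cal.getTime();    // Calendar -> java.util.Date
    long millis = date.getTime(); // Date -> milliseconds since 1970-01-01T00:00:00Z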
diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java
index 84a01a08d..1da658f65 100644
--- a/source/net/yacy/repository/LoaderDispatcher.java
+++ b/source/net/yacy/repository/LoaderDispatcher.java
@@ -112,21 +112,24 @@ public final class LoaderDispatcher {
        final boolean forText,
        final boolean global
        ) {
+        CrawlProfile profile =
+            (forText) ?
+                ((global) ?
+                    this.sb.crawler.defaultTextSnippetGlobalProfile :
+                    this.sb.crawler.defaultTextSnippetLocalProfile)
+                :
+                ((global) ?
+                    this.sb.crawler.defaultMediaSnippetGlobalProfile :
+                    this.sb.crawler.defaultMediaSnippetLocalProfile);
        return new Request(
                    ASCII.getBytes(this.sb.peers.mySeed().hash),
                    url,
                    null,
                    "",
                    new Date(),
-                    (forText) ?
-                        ((global) ?
-                            this.sb.crawler.defaultTextSnippetGlobalProfile.handle() :
-                            this.sb.crawler.defaultTextSnippetLocalProfile.handle())
-                        :
-                        ((global) ?
-                            this.sb.crawler.defaultMediaSnippetGlobalProfile.handle() :
-                            this.sb.crawler.defaultMediaSnippetLocalProfile.handle()), // crawl profile
-                    0);
+                    profile.handle(),
+                    0,
+                    profile.timezoneOffset());
    }

    public void load(final DigestURL url, final CacheStrategy cacheStratgy, final int maxFileSize, final File targetFile, BlacklistType blacklistType, ClientIdentification.Agent agent) throws IOException {
@@ -407,7 +410,7 @@ public final class LoaderDispatcher {
     * @return a map from URLs to the anchor texts of the urls
     * @throws IOException
     */
-    public final Map loadLinks(final AnchorURL url, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
+    public final Map loadLinks(final AnchorURL url, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final ClientIdentification.Agent agent, final int timezoneOffset) throws IOException {
        final Response response = load(request(url, true, false), cacheStrategy, Integer.MAX_VALUE, blacklistType, agent);
        if (response == null) throw new IOException("response == null");
        final ResponseHeader responseHeader = response.getResponseHeader();
@@ -418,7 +421,7 @@ public final class LoaderDispatcher {
        final String supportError = TextParser.supports(url, responseHeader.mime());
        if (supportError != null) throw new IOException("no parser support: " + supportError);
        try {
-            documents = TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), response.profile().scraper(), response.depth(), response.getContent());
+            documents = TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), response.profile().scraper(), timezoneOffset, response.depth(), response.getContent());
            if (documents == null) throw new IOException("document == null");
        } catch (final Exception e) {
            throw new IOException("parser error: " + e.getMessage());
diff --git a/source/net/yacy/search/EventTracker.java b/source/net/yacy/search/EventTracker.java
index 2479e285c..bba8b335f 100644
--- a/source/net/yacy/search/EventTracker.java
+++ b/source/net/yacy/search/EventTracker.java
@@ -152,7 +152,7 @@ public class EventTracker {
    }
    public long getTime() {
        if (this.time instanceof String) try {
-            return GenericFormatter.SHORT_SECOND_FORMATTER.parse((String) this.time).getTime();
+            return GenericFormatter.SHORT_SECOND_FORMATTER.parse((String) this.time, 0).getTime().getTime();
        } catch (ParseException e) {
            return -1L;
        }
@@ -162,7 +162,7 @@ public class EventTracker {
    }
    public Date getDate() {
        if (this.time instanceof String) try {
-            return GenericFormatter.SHORT_SECOND_FORMATTER.parse((String) this.time);
+            return GenericFormatter.SHORT_SECOND_FORMATTER.parse((String) this.time, 0).getTime();
        } catch (ParseException e) {
            return null;
        }
        if (this.time instanceof Long) return new Date((Long) this.time);
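EventTracker events carry their time either as a compact String or as a Long epoch value, and the two accessors above normalize both cases. A merged sketch of that dispatch (the helper name is hypothetical; the formatter call matches the patched code):

    // Normalize an event time that is either a "yyyyMMddHHmmss" String or a Long.
    static long toEpochMillis(final Object time) {
        if (time instanceof Long) return (Long) time;
        if (time instanceof String) {
            try {
                // offset 0: tracker timestamps are recorded in UTC
                return GenericFormatter.SHORT_SECOND_FORMATTER.parse((String) time, 0).getTime().getTime();
            } catch (final ParseException e) {
                return -1L; // same sentinel getTime() uses for unparseable values
            }
        }
        return -1L;
    }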
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index ad9724d44..c1b29eb95 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -1942,7 +1942,8 @@ public final class Switchboard extends serverSwitch {
                            "",
                            surrogate.getDate(),
                            this.crawler.defaultSurrogateProfile.handle(),
-                            0);
+                            0,
+                            this.crawler.defaultSurrogateProfile.timezoneOffset());
                    response = new Response(request, null, null, this.crawler.defaultSurrogateProfile, false, null);
                    final IndexingQueueEntry queueEntry = new IndexingQueueEntry(response, new Document[] {document}, null);
@@ -2571,6 +2572,7 @@ public final class Switchboard extends serverSwitch {
                    response.getMimeType(),
                    response.getCharacterEncoding(),
                    response.profile().scraper(),
+                    response.profile().timezoneOffset(),
                    response.depth(),
                    response.getContent());
            if ( documents == null ) {
@@ -2673,7 +2675,8 @@ public final class Switchboard extends serverSwitch {
                            nextEntry.getValue(),
                            new Date(),
                            response.profile().handle(),
-                            nextdepth));
+                            nextdepth,
+                            response.profile().timezoneOffset()));
                } catch (final MalformedURLException e ) {
                    ConcurrentLog.logException(e);
                }
@@ -2754,7 +2757,8 @@ public final class Switchboard extends serverSwitch {
                    in.documents[i],
                    in.queueEntry.profile().scraper(),
                    in.queueEntry.profile().indexText(),
                    in.queueEntry.profile().indexMedia(),
                    LibraryProvider.dymLib, true,
-                    this.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.dates_in_content_dts));
+                    this.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.dates_in_content_dts),
+                    profile.timezoneOffset());

            // update image result list statistics
            // its good to do this concurrently here, because it needs a DNS lookup
@@ -3043,7 +3047,15 @@ public final class Switchboard extends serverSwitch {
                int p = userInfo == null ? -1 : userInfo.indexOf(':');
                String user = userInfo == null ? FTPClient.ANONYMOUS : userInfo.substring(0, p);
                String pw = userInfo == null || p == -1 ? "anomic" : userInfo.substring(p + 1);
-                this.crawlStacker.enqueueEntriesFTP(this.peers.mySeed().hash.getBytes(), profile.handle(), url.getHost(), url.getPort(), user, pw, false);
+                this.crawlStacker.enqueueEntriesFTP(
+                        this.peers.mySeed().hash.getBytes(),
+                        profile.handle(),
+                        url.getHost(),
+                        url.getPort(),
+                        user,
+                        pw,
+                        false,
+                        profile.timezoneOffset());
                return null;
            } catch (final Exception e) {
                // mist
@@ -3080,7 +3092,8 @@ public final class Switchboard extends serverSwitch {
                        "CRAWLING-ROOT",
                        new Date(),
                        profile.handle(),
-                        0
+                        0,
+                        profile.timezoneOffset()
                        ));
        if (reasonString != null) return reasonString;
@@ -3134,7 +3147,7 @@ public final class Switchboard extends serverSwitch {
     * @throws IOException
     * @throws Parser.Failure
     */
-    public void addToIndex(final Collection urls, final SearchEvent searchEvent, final String heuristicName, final Map collections, boolean doublecheck) {
+    public void addToIndex(final Collection urls, final SearchEvent searchEvent, final String heuristicName, final Map collections, final boolean doublecheck) {
        Map urlmap = new HashMap();
        for (DigestURL url: urls) urlmap.put(ASCII.String(url.hash()), url);
        if (searchEvent != null) {
@@ -3192,7 +3205,7 @@ public final class Switchboard extends serverSwitch {
            }
            final Condenser condenser = new Condenser(
                document, null, true, true, LibraryProvider.dymLib, true,
-                Switchboard.this.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.dates_in_content_dts));
+                Switchboard.this.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.dates_in_content_dts), searchEvent.query.timezoneOffset);
            ResultImages.registerImages(url, document, true);
            Switchboard.this.webStructure.generateCitationReference(url, document);
            storeDocumentIndex(
@@ -3546,7 +3559,7 @@ public final class Switchboard extends serverSwitch {
            final Map links;
            searchEvent.oneFeederStarted();
            try {
-                links = Switchboard.this.loader.loadLinks(url, CacheStrategy.NOCACHE, BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent);
+                links = Switchboard.this.loader.loadLinks(url, CacheStrategy.NOCACHE, BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent, searchEvent.query.timezoneOffset);
                if ( links != null ) {
                    final Iterator i = links.keySet().iterator();
                    while ( i.hasNext() ) {
@@ -3585,7 +3598,7 @@ public final class Switchboard extends serverSwitch {
            final Map links;
            DigestURL url;
            try {
-                links = Switchboard.this.loader.loadLinks(startUrl, CacheStrategy.IFFRESH, BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent);
+                links = Switchboard.this.loader.loadLinks(startUrl, CacheStrategy.IFFRESH, BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent, 0);
                if (links != null) {
                    if (links.size() < 1000) { // limit to 1000 to skip large index pages
                        final Iterator i = links.keySet().iterator();
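The recurring Switchboard pattern is that every Request is now constructed with the timezone offset of the crawl profile that caused it, so documents discovered later are parsed with the offset of whoever started the crawl. The pattern in isolation (constructor arity as in this patch; the factory name is hypothetical):

    // Every Request carries its profile's offset from creation onward.
    static Request requestFor(final byte[] initiator, final DigestURL url, final CrawlProfile profile) {
        return new Request(
                initiator,
                url,
                null,                       // no referrer hash
                "CRAWLING-ROOT",            // display name
                new Date(),                 // appearance date
                profile.handle(),           // crawl profile id
                0,                          // depth
                profile.timezoneOffset());  // new trailing parameter
    }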
diff --git a/source/net/yacy/search/index/DocumentIndex.java b/source/net/yacy/search/index/DocumentIndex.java
index a8ef16402..aa805c4b7 100644
--- a/source/net/yacy/search/index/DocumentIndex.java
+++ b/source/net/yacy/search/index/DocumentIndex.java
@@ -61,18 +61,27 @@ public class DocumentIndex extends Segment {
            } catch (final MalformedURLException e ) {
            }
    }

-    BlockingQueue queue; // a queue of document ID's
+    private BlockingQueue queue; // a queue of document ID's
    private final Worker[] worker;
-    CallbackListener callback;
+    private CallbackListener callback;
+    private int timezoneOffset;

    static final ThreadGroup workerThreadGroup = new ThreadGroup("workerThreadGroup");

-    public DocumentIndex(final File segmentPath, final File archivePath, final File collectionConfigurationPath, final File webgraphConfigurationPath, final CallbackListener callback, final int cachesize)
+    public DocumentIndex(
+            final File segmentPath,
+            final File archivePath,
+            final File collectionConfigurationPath,
+            final File webgraphConfigurationPath,
+            final CallbackListener callback,
+            final int cachesize,
+            final int timezoneOffset)
            throws IOException {
        super(new ConcurrentLog("DocumentIndex"), segmentPath, archivePath,
                collectionConfigurationPath == null ? null : new CollectionConfiguration(collectionConfigurationPath, true),
                webgraphConfigurationPath == null ? null : new WebgraphConfiguration(webgraphConfigurationPath, true)
                );
+        this.timezoneOffset = timezoneOffset;
        super.connectRWI(cachesize, targetFileSize * 4 - 1);
        super.connectCitation(cachesize, targetFileSize * 4 - 1);
        super.fulltext().connectLocalSolr();
@@ -99,7 +108,7 @@ public class DocumentIndex extends Segment {
            try {
                while ( (f = DocumentIndex.this.queue.take()) != poison ) {
                    try {
-                        resultRows = add(f);
+                        resultRows = add(f, DocumentIndex.this.timezoneOffset);
                        for ( final SolrInputDocument resultRow : resultRows ) {
                            if ( DocumentIndex.this.callback != null ) {
                                if ( resultRow == null ) {
@@ -132,7 +141,7 @@ public class DocumentIndex extends Segment {
        this.queue.clear();
    }

-    private SolrInputDocument[] add(final AnchorURL url) throws IOException {
+    private SolrInputDocument[] add(final AnchorURL url, final int timezoneOffset) throws IOException {
        if ( url == null ) {
            throw new IOException("file = null");
        }
@@ -150,7 +159,7 @@ public class DocumentIndex extends Segment {
            length = -1;
        }
        try {
-            documents = TextParser.parseSource(url, null, null, new VocabularyScraper(), 0, length, url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent, null, null));
+            documents = TextParser.parseSource(url, null, null, new VocabularyScraper(), timezoneOffset, 0, length, url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent, null, null));
        } catch (final Exception e ) {
            throw new IOException("cannot parse " + url.toNormalform(false) + ": " + e.getMessage());
        }
@@ -159,7 +168,7 @@ public class DocumentIndex extends Segment {
        int c = 0;
        for ( final Document document : documents ) {
            if (document == null) continue;
-            final Condenser condenser = new Condenser(document, null, true, true, LibraryProvider.dymLib, true, true);
+            final Condenser condenser = new Condenser(document, null, true, true, LibraryProvider.dymLib, true, true, 0);
            rows[c++] = super.storeDocument(
                    url,
diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java
index b5bd460e2..236be7537 100644
--- a/source/net/yacy/search/index/Segment.java
+++ b/source/net/yacy/search/index/Segment.java
@@ -761,7 +761,7 @@ public class Segment {
        }
        // get the word set
        Set words = null;
-        words = new Condenser(document, null, true, true, null, false, false).words().keySet();
+        words = new Condenser(document, null, true, true, null, false, false, 0).words().keySet();

        // delete all word references
        int count = 0;
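Condenser likewise gains a trailing timezoneOffset parameter, used when date detection finds zone-less dates in document text. A hedged call sketch, with argument meanings inferred from the call sites in this patch:

    // Arguments as used above: document, vocabulary scraper, index text?, index media?,
    // meaning library, do autotagging?, detect dates in content?, timezone offset.
    final Condenser condenser = new Condenser(
            document, null, true, true, LibraryProvider.dymLib, true,
            true,  // dates_in_content_dts is enabled in the schema
            0);    // offset 0: treat detected zone-less dates as UTC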
diff --git a/source/net/yacy/search/query/AccessTracker.java b/source/net/yacy/search/query/AccessTracker.java
index 07d379873..b050ee4ae 100644
--- a/source/net/yacy/search/query/AccessTracker.java
+++ b/source/net/yacy/search/query/AccessTracker.java
@@ -315,7 +315,7 @@ public class AccessTracker {
        byte[] b = new byte[GenericFormatter.PATTERN_SHORT_SECOND.length()];
        raf.readFully(b);
        try {
-            return GenericFormatter.SHORT_SECOND_FORMATTER.parse(UTF8.String(b));
+            return GenericFormatter.SHORT_SECOND_FORMATTER.parse(UTF8.String(b), 0).getTime();
        } catch (ParseException e) {
            throw new IOException(e.getMessage());
        }
@@ -326,8 +326,8 @@ public class AccessTracker {
        String file = args[0];
        Date from;
        try {
-            from = GenericFormatter.SHORT_SECOND_FORMATTER.parse(args[1]);
-            Date to = GenericFormatter.SHORT_SECOND_FORMATTER.parse(args[2]);
+            from = GenericFormatter.SHORT_SECOND_FORMATTER.parse(args[1], 0).getTime();
+            Date to = GenericFormatter.SHORT_SECOND_FORMATTER.parse(args[2], 0).getTime();
            List dump = readLog(new File(file), from, to);
            for (EventTracker.Event s: dump) System.out.println(s.toString());
        } catch (ParseException e) {
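AccessTracker.main, the small log dump reader, now interprets its from/to arguments at offset 0, i.e. as UTC timestamps in the PATTERN_SHORT_SECOND format. A hedged invocation sketch (the log path is hypothetical):

    // Dump all tracked events between two UTC timestamps.
    public static void demo() throws Exception {
        AccessTracker.main(new String[] {
                "DATA/LOG/queries.log", // dump file (hypothetical path)
                "20160101000000",       // from: 2016-01-01 00:00:00 UTC
                "20160102000000"        // to:   2016-01-02 00:00:00 UTC
        });
    }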
diff --git a/source/net/yacy/search/query/QueryModifier.java b/source/net/yacy/search/query/QueryModifier.java
index e7daf4acb..0cb0f6942 100644
--- a/source/net/yacy/search/query/QueryModifier.java
+++ b/source/net/yacy/search/query/QueryModifier.java
@@ -41,8 +41,10 @@ public class QueryModifier {
    private final StringBuilder modifier;
    public String sitehost, sitehash, filetype, protocol, language, author, collection, on, from, to;
+    public int timezoneOffset;

-    public QueryModifier() {
+    public QueryModifier(final int timezoneOffset) {
+        this.timezoneOffset = timezoneOffset;
        this.sitehash = null;
        this.sitehost = null;
        this.filetype = null;
@@ -274,19 +276,19 @@ public class QueryModifier {
        if (fq.indexOf(CollectionSchema.dates_in_content_dts.getSolrFieldName()) < 0) {

            if (this.on != null && this.on.length() > 0) {
-                fq.append(" AND ").append(QueryModifier.parseOnExpression(this.on));
+                fq.append(" AND ").append(QueryModifier.parseOnExpression(this.on, this.timezoneOffset));
            }

            if (this.from != null && this.from.length() > 0 && (this.to == null || this.to.equals("*"))) {
-                fq.append(" AND ").append(QueryModifier.parseFromToExpression(this.from, null));
+                fq.append(" AND ").append(QueryModifier.parseFromToExpression(this.from, null, this.timezoneOffset));
            }

            if ((this.from == null || this.from.equals("*")) && this.to != null && this.to.length() > 0) {
-                fq.append(" AND ").append(QueryModifier.parseFromToExpression(null, this.to));
+                fq.append(" AND ").append(QueryModifier.parseFromToExpression(null, this.to, this.timezoneOffset));
            }

            if (this.from != null && this.from.length() > 0 && this.to != null && this.to.length() > 0) {
-                fq.append(" AND ").append(QueryModifier.parseFromToExpression(this.from, this.to));
+                fq.append(" AND ").append(QueryModifier.parseFromToExpression(this.from, this.to, this.timezoneOffset));
            }
        }
@@ -348,9 +350,9 @@ public class QueryModifier {
        return fq.toString();
    }

-    public static String parseOnExpression(String onDescription) {
+    public static String parseOnExpression(final String onDescription, final int timezoneOffset) {
        assert onDescription != null;
-        Date onDate = DateDetection.parseLine(onDescription);
+        Date onDate = DateDetection.parseLine(onDescription, timezoneOffset);
        StringBuilder filterQuery = new StringBuilder(20);
        if (onDate != null) {
            @SuppressWarnings({ "deprecation", "static-access" })
@@ -360,9 +362,9 @@ public class QueryModifier {
        return filterQuery.toString();
    }

-    public static String parseFromToExpression(String from, String to) {
-        Date fromDate = from == null || from.equals("*") ? null : DateDetection.parseLine(from);
-        Date toDate = to == null || to.equals("*") ? null : DateDetection.parseLine(to);
+    public static String parseFromToExpression(final String from, final String to, final int timezoneOffset) {
+        Date fromDate = from == null || from.equals("*") ? null : DateDetection.parseLine(from, timezoneOffset);
+        Date toDate = to == null || to.equals("*") ? null : DateDetection.parseLine(to, timezoneOffset);
        StringBuilder filterQuery = new StringBuilder(20);
        if (fromDate != null && toDate != null) {
            @SuppressWarnings({ "deprecation", "static-access" })
diff --git a/source/net/yacy/search/query/QueryParams.java b/source/net/yacy/search/query/QueryParams.java
index d99d524b9..5adfbc0dc 100644
--- a/source/net/yacy/search/query/QueryParams.java
+++ b/source/net/yacy/search/query/QueryParams.java
@@ -70,7 +70,6 @@ import org.apache.lucene.util.automaton.Automata;
 import org.apache.lucene.util.automaton.Automaton;
 import org.apache.solr.client.solrj.SolrQuery;
 import org.apache.solr.client.solrj.SolrQuery.SortClause;
-import org.apache.solr.common.params.CommonParams;
 import org.apache.solr.common.params.DisMaxParams;
 import org.apache.solr.common.params.FacetParams;
 import org.apache.solr.schema.TrieDateField;
@@ -146,6 +145,7 @@ public final class QueryParams {
    public LinkedHashSet facetfields;
    private SolrQuery cachedQuery;
    private CollectionConfiguration solrSchema;
+    public final int timezoneOffset;

    public QueryParams(
        final QueryGoal queryGoal,
@@ -154,6 +154,7 @@ public final class QueryParams {
        final String prefer,
        final ContentDomain contentdom,
        final String language,
+        final int timezoneOffset,
        final Collection metatags,
        final CacheStrategy snippetCacheStrategy,
        final int itemsPerPage,
@@ -183,6 +184,7 @@ public final class QueryParams {
        this.ranking = ranking;
        this.maxDistance = maxDistance;
        this.contentdom = contentdom;
+        this.timezoneOffset = timezoneOffset;
        this.itemsPerPage = Math.min((specialRights) ? 10000 : 1000, itemsPerPage);
        this.offset = Math.max(0, Math.min((specialRights) ? 10000 - this.itemsPerPage : 1000 - this.itemsPerPage, offset));
        try {
@@ -527,19 +529,19 @@ public final class QueryParams {
        if (this.solrSchema.contains(CollectionSchema.dates_in_content_dts)) {

            if (this.modifier.on != null && this.modifier.on.length() > 0) {
-                fqs.add(QueryModifier.parseOnExpression(this.modifier.on));
+                fqs.add(QueryModifier.parseOnExpression(this.modifier.on, this.timezoneOffset));
            }

            if (this.modifier.from != null && this.modifier.from.length() > 0 && (this.modifier.to == null || this.modifier.to.equals("*"))) {
-                fqs.add(QueryModifier.parseFromToExpression(this.modifier.from, null));
+                fqs.add(QueryModifier.parseFromToExpression(this.modifier.from, null, this.timezoneOffset));
            }

            if ((this.modifier.from == null || this.modifier.from.equals("*")) && this.modifier.to != null && this.modifier.to.length() > 0) {
-                fqs.add(QueryModifier.parseFromToExpression(null, this.modifier.to));
+                fqs.add(QueryModifier.parseFromToExpression(null, this.modifier.to, this.timezoneOffset));
            }

            if (this.modifier.from != null && this.modifier.from.length() > 0 && this.modifier.to != null && this.modifier.to.length() > 0) {
-                fqs.add(QueryModifier.parseFromToExpression(this.modifier.from, this.modifier.to));
+                fqs.add(QueryModifier.parseFromToExpression(this.modifier.from, this.modifier.to, this.timezoneOffset));
            }
        }
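With QueryModifier and QueryParams both carrying the offset, a date-restricted query is evaluated in the client's local calendar and translated to the UTC values stored in the index. A usage sketch against the signatures in this patch (the date syntax accepted by DateDetection.parseLine is an assumption):

    // A client at UTC+1 reports offset -60 (minutes west of UTC, the JavaScript convention).
    final QueryModifier modifier = new QueryModifier(-60);
    // Builds a Solr filter query over dates_in_content_dts covering the
    // client-local day 2016-01-01, converted to the UTC range stored in the index.
    final String fq = QueryModifier.parseOnExpression("2016-01-01", -60);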
diff --git a/source/net/yacy/server/http/HTTPDProxyHandler.java b/source/net/yacy/server/http/HTTPDProxyHandler.java
index 3463a0552..533b37dc3 100644
--- a/source/net/yacy/server/http/HTTPDProxyHandler.java
+++ b/source/net/yacy/server/http/HTTPDProxyHandler.java
@@ -358,7 +358,8 @@ public final class HTTPDProxyHandler {
                    "",
                    cachedResponseHeader.lastModified(),
                    sb.crawler.defaultProxyProfile.handle(),
-                    0);
+                    0,
+                    sb.crawler.defaultProxyProfile.timezoneOffset());
            final Response response = new Response(
                    request,
                    requestHeader,
@@ -473,8 +474,8 @@ public final class HTTPDProxyHandler {
                    "",
                    responseHeader.lastModified(),
                    sb.crawler.defaultProxyProfile.handle(),
-                    0);
-
+                    0,
+                    sb.crawler.defaultProxyProfile.timezoneOffset());
            // handle incoming cookies
            handleIncomingCookies(responseHeader, host, ip);
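A closing sanity check for the whole patch: at offset 0 the new contract should round-trip, i.e. parsing a compact UTC timestamp and formatting the resulting Date must reproduce the input. A sketch, assuming GenericFormatter.format(Date) remains the inverse of parse as in the pre-patch API:

    // Round-trip at offset 0 (UTC): parse -> Calendar -> Date -> format.
    final GenericFormatter f = new GenericFormatter(GenericFormatter.FORMAT_SHORT_SECOND, GenericFormatter.time_second);
    final String s = "20160102030405";
    final Date d = f.parse(s, 0).getTime();
    assert f.format(d).equals(s); // must hold if offset 0 means "already UTC"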