enhanced timezone management for indexed data

To support the new time parser and search functions in YaCy, high-precision detection of the date and time within a day is necessary. This requires detecting both the time zone of the document content and the time zone of the user who performs a search. The time zone of the search request is determined automatically from the browser's time zone offset, which is attached to the search request automatically and invisibly to the user. The time zone of web page content cannot be detected automatically and must therefore be an attribute of the crawl start. The advanced crawl start now provides an input field to set the time zone as an offset number in minutes. All parsers must be passed a time zone offset, which required a change of the parser Java API. Many other changes correct the previously wrong handling of dates in YaCy, which added a correction based on the time zone of the server. Now no correction is added and all dates in YaCy are in the UTC/GMT time zone, a normalized time zone for all peers.
Branch: pull/2/head
Author: Michael Peter Christen, 10 years ago
Parent: 702c30e619
Commit: fed26f33a8
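To make the sign convention concrete: JavaScript's Date.getTimezoneOffset(), which the forms in this commit use, returns the minutes that must be added to local time to reach UTC, so zones east of UTC yield negative values. The following sketch is not part of the commit (date and names are illustrative); it mirrors the cal.add(Calendar.MINUTE, timezoneOffset) correction that GenericFormatter.parse() applies in the diff below:

    import java.util.Calendar;
    import java.util.TimeZone;

    public class UtcNormalizationSketch {
        public static void main(String[] args) {
            // A browser in CET (UTC+1) reports getTimezoneOffset() = -60.
            int timezoneOffset = -60;
            // "20150101120000" parsed as if it were UTC ...
            Calendar cal = Calendar.getInstance(TimeZone.getTimeZone("UTC"));
            cal.clear();
            cal.set(2015, Calendar.JANUARY, 1, 12, 0, 0);
            // ... then corrected by the offset: 12:00 CET becomes 11:00 UTC.
            cal.add(Calendar.MINUTE, timezoneOffset);
            System.out.println(cal.getTime());
        }
    }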

@@ -513,7 +513,7 @@
 </dl>
 </fieldset>
 <fieldset>
-<legend>Index Administration</legend>
+<legend>Index Attributes</legend>
 <dl>
 <dt>Indexing</dt>
 <dd>
@@ -561,6 +561,17 @@
 <input name="collection" id="collection" type="text" size="60" maxlength="100" value="#[collection]#" #(collectionEnabled)#disabled="disabled"::#(/collectionEnabled)# />
 </dd>
+<dt><label for="collection">Time Zone Offset</label></dt>
+<dd>
+<span class="info" style="float:right"><img src="env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
+The time zone is required when the parser detects a date in the crawled web page. Content can be searched with the on: - modifier which
+requires also a time zone when a query is made. To normalize all given dates, the date is stored in UTC time zone. To get the right offset
+from dates without time zones to UTC, this offset must be given here. The offset is given in minutes;
+Time zone offsets for locations east of UTC must be negative; offsets for zones west of UTC must be positve.
+</span></span>
+<input id="timezoneOffset" type="text" size="4" maxlength="4" name="timezoneOffset" value=""><script>document.getElementById("timezoneOffset").value = new Date().getTimezoneOffset();</script>
+</dd>
 </dl>
 </fieldset>
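The visible offset field above is prefilled client-side with new Date().getTimezoneOffset(). For testing without a browser, the value a browser would submit can be reproduced in Java; a minimal sketch, not part of the commit, with a hypothetical helper name:

    import java.util.TimeZone;

    public class BrowserOffsetSketch {
        // Mirrors JavaScript's Date.getTimezoneOffset(): minutes to add to
        // local time to reach UTC, so zones east of UTC come out negative.
        static int browserStyleOffset(TimeZone tz) {
            return -tz.getOffset(System.currentTimeMillis()) / 60000;
        }

        public static void main(String[] args) {
            System.out.println(browserStyleOffset(TimeZone.getTimeZone("Europe/Berlin")));    // -60 in winter, -120 in summer
            System.out.println(browserStyleOffset(TimeZone.getTimeZone("America/New_York"))); // 300 in winter, 240 in summer
        }
    }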

@@ -91,6 +91,7 @@
 <input type="hidden" name="indexText" id="indexText" value="on" />
 <input type="hidden" name="indexMedia" id="indexMedia" value="on" />
 <input type="hidden" name="intention" id="intention" value="" />
+<input id="timezoneOffset" type="hidden" name="timezoneOffset" value=""><script>document.getElementById("timezoneOffset").value = new Date().getTimezoneOffset();</script>
 <input type="submit" name="crawlingstart" value="Start New Crawl" class="btn btn-primary"/>
 </dd>
 </dl>

@@ -470,6 +470,8 @@ public class Crawler_p {
 }
 }
+int timezoneOffset = post.getInt("timezoneOffset", 0);
 // prepare a new crawling profile
 final CrawlProfile profile;
 byte[] handle;
@@ -502,7 +504,8 @@
 cachePolicy,
 collection,
 agentName,
-new VocabularyScraper(vocabulary_scraper));
+new VocabularyScraper(vocabulary_scraper),
+timezoneOffset);
 handle = ASCII.getBytes(profile.handle());
 // before we fire up a new crawl, we make sure that another crawl with the same name is not running
@@ -585,7 +588,7 @@
 try {
 // check if the crawl filter works correctly
 Pattern.compile(newcrawlingMustMatch);
-final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new VocabularyScraper());
+final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new VocabularyScraper(), timezoneOffset);
 final Writer writer = new TransformerWriter(null, null, scraper, null, false);
 if (crawlingFile != null && crawlingFile.exists()) {
 FileUtils.copy(new FileInputStream(crawlingFile), writer);
@@ -605,7 +608,7 @@
 }
 sb.crawler.putActive(handle, profile);
-sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks);
+sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks, profile.timezoneOffset());
 } catch (final PatternSyntaxException e) {
 prop.put("info", "4"); // crawlfilter does not match url
 prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);

@@ -161,7 +161,8 @@ public class HostBrowser {
 sb.peers.mySeed().hash.getBytes(),
 url, null, load, new Date(),
 sb.crawler.defaultProxyProfile.handle(),
-0
+0,
+sb.crawler.defaultProxyProfile.timezoneOffset()
 ));
 prop.putHTML("result", reasonString == null ? ("added url to indexer: " + load) : ("not indexed url '" + load + "': " + reasonString));
 if (wait) waitloop: for (int i = 0; i < 30; i++) {

@@ -637,11 +637,12 @@ public class IndexControlRWIs_p {
 final QueryGoal qg = new QueryGoal(queryhashes, null);
 final QueryParams query = new QueryParams(
 qg,
-new QueryModifier(),
+new QueryModifier(0),
 Integer.MAX_VALUE,
 "",
 ContentDomain.ALL,
 "", //lang
+0, //timezoneOffset
 null,
 CacheStrategy.IFFRESH,
 1000, 0, //count, offset

@@ -74,7 +74,7 @@ public class NetworkHistory {
 while (rowi.hasNext()) {
 Row row = rowi.next();
 String d = ASCII.String(row.getPK());
-Date date = GenericFormatter.SHORT_MINUTE_FORMATTER.parse(d);
+Date date = GenericFormatter.SHORT_MINUTE_FORMATTER.parse(d, 0).getTime();
 if (date.getTime() < timelimit) break;
 statrow = new HashMap<>();
 for (String key: columns) {

@@ -128,7 +128,8 @@ public class QuickCrawlLink_p {
 final byte[] urlhash = crawlingStartURL.hash();
 indexSegment.fulltext().remove(urlhash);
 sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
+int timezoneOffset = post.getInt("timezoneOffset", 0);
 // create crawling profile
 CrawlProfile pe = null;
 try {
@@ -156,7 +157,8 @@ public class QuickCrawlLink_p {
 CacheStrategy.IFFRESH,
 collection,
 ClientIdentification.yacyIntranetCrawlerAgentName,
-null);
+null,
+timezoneOffset);
 sb.crawler.putActive(pe.handle().getBytes(), pe);
 } catch (final Exception e) {
 // mist
@@ -175,7 +177,8 @@ public class QuickCrawlLink_p {
 (title==null)?"CRAWLING-ROOT":title,
 new Date(),
 pe.handle(),
-0
+0,
+pe.timezoneOffset()
 ));
 // validate rejection reason

@@ -39,7 +39,7 @@ public class get {
 Date parsedDate = null;
 try {
-parsedDate = ISO8601Formatter.FORMATTER.parse(date);
+parsedDate = ISO8601Formatter.FORMATTER.parse(date, 0).getTime();
 } catch (final ParseException e) {
 parsedDate = new Date();
 }

@@ -103,7 +103,8 @@ public class push_p {
 "", // the name of the document to crawl
 new Date(), // current date
 profile.handle(), // the name of the prefetch profile. This must not be null!
-0); // forkfactor sum of anchors of all ancestors
+0, // forkfactor sum of anchors of all ancestors
+profile.timezoneOffset());
 Response response = new Response(
 request,
 requestHeader,

@@ -75,8 +75,8 @@ public final class timeline_p {
 // get a time period
 Date fromDate = new Date(0);
 Date toDate = new Date();
-try {fromDate = GenericFormatter.SHORT_SECOND_FORMATTER.parse(post.get("from", "20031215182700"));} catch (ParseException e) {}
-try {toDate = GenericFormatter.SHORT_SECOND_FORMATTER.parse(post.get("to", GenericFormatter.SHORT_SECOND_FORMATTER.format(new Date())));} catch (ParseException e) {}
+try {fromDate = GenericFormatter.SHORT_SECOND_FORMATTER.parse(post.get("from", "20031215182700"), 0).getTime();} catch (ParseException e) {}
+try {toDate = GenericFormatter.SHORT_SECOND_FORMATTER.parse(post.get("to", GenericFormatter.SHORT_SECOND_FORMATTER.format(new Date())), 0).getTime();} catch (ParseException e) {}
 // get latest dump;
 AccessTracker.dumpLog();

@@ -80,6 +80,7 @@
 <input type="hidden" name="resource" value="global" />
 <input type="hidden" name="prefermaskfilter" value="" />
 <input type="hidden" name="maximumRecords" value="#[maximumRecords]#" />
+<input id="timezoneOffset" type="hidden" name="timezoneOffset" value=""><script>document.getElementById("timezoneOffset").value = new Date().getTimezoneOffset();</script>
 </fieldset>
 ::
 </fieldset>

@@ -78,7 +78,8 @@ public class rct_p {
 "REMOTE-CRAWLING",
 loaddate,
 sb.crawler.defaultRemoteProfile.handle(),
-0));
+0,
+sb.crawler.defaultRemoteProfile.timezoneOffset()));
 } else {
 env.getLog().warn("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason);
 }

@@ -118,7 +118,8 @@ public final class search {
 final String prefer = post.get("prefer", "");
 final String contentdom = post.get("contentdom", "all");
 final String filter = post.get("filter", ".*"); // a filter on the url
-QueryModifier modifier = new QueryModifier();
+final int timezoneOffset = post.getInt("timezoneOffset", 0);
+QueryModifier modifier = new QueryModifier(timezoneOffset);
 modifier.sitehost = post.get("sitehost", ""); if (modifier.sitehost.isEmpty()) modifier.sitehost = null;
 modifier.sitehash = post.get("sitehash", ""); if (modifier.sitehash.isEmpty()) modifier.sitehash = null;
 modifier.author = post.get("author", ""); if (modifier.author.isEmpty()) modifier.author = null;
@@ -232,6 +233,7 @@ public final class search {
 prefer,
 ContentDomain.contentdomParser(contentdom),
 language,
+timezoneOffset,
 new HashSet<Tagging.Metatag>(),
 null, // no snippet computation
 count,
@@ -297,6 +299,7 @@ public final class search {
 prefer,
 ContentDomain.contentdomParser(contentdom),
 language,
+timezoneOffset,
 new HashSet<Tagging.Metatag>(),
 null, // no snippet computation
 count,

@@ -55,7 +55,7 @@ public final class transferURL {
 public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) {
 final long start = System.currentTimeMillis();
 long freshdate = 0;
-try {freshdate = GenericFormatter.SHORT_DAY_FORMATTER.parse("20061101").getTime();} catch (final ParseException e1) {}
+try {freshdate = GenericFormatter.SHORT_DAY_FORMATTER.parse("20061101", 0).getTime().getTime();} catch (final ParseException e1) {}
 // return variable that accumulates replacements
 final Switchboard sb = (Switchboard) env;

@@ -108,6 +108,7 @@ Use the RSS search result format to add static searches to your RSS reader, if y
 <input type="hidden" name="depth" value="#[depth]#" />
 <input type="hidden" name="constraint" value="#[constraint]#" />
 <input type="hidden" name="meanCount" value="#[meanCount]#" />
+<input id="timezoneOffset" type="hidden" name="timezoneOffset" value=""><script>document.getElementById("timezoneOffset").value = new Date().getTimezoneOffset();</script>
 </form>
 <!-- type the number of results and navigation bar -->

@@ -214,6 +214,9 @@ public class yacysearch {
 prop.setOutgoingHeader(outgoingHeader);
 }
+// time zone
+int timezoneOffset = post.getInt("timezoneOffset", 0);
 // collect search attributes
 int itemsPerPage =
@@ -359,7 +362,7 @@ public class yacysearch {
 }
 final RankingProfile ranking = sb.getRanking();
-final QueryModifier modifier = new QueryModifier();
+final QueryModifier modifier = new QueryModifier(timezoneOffset);
 querystring = modifier.parse(querystring);
 if (modifier.sitehost != null && modifier.sitehost.length() > 0 && querystring.length() == 0) querystring = "*"; // allow to search for all documents on a host
@@ -643,6 +646,7 @@ public class yacysearch {
 prefermask,
 contentdom,
 language,
+timezoneOffset,
 metatags,
 snippetFetchStrategy,
 itemsPerPage,

@@ -390,9 +390,9 @@ public class yacysearchtrailer {
 navigatorIterator = theSearch.dateNavigator.iterator(); // this iterator is different as it iterates by the key order (which is a date order)
 int i = 0, pos = 0, neg = 0;
 long dx = -1;
-Date fromconstraint = theSearch.getQuery().modifier.from == null ? null : DateDetection.parseLine(theSearch.getQuery().modifier.from);
+Date fromconstraint = theSearch.getQuery().modifier.from == null ? null : DateDetection.parseLine(theSearch.getQuery().modifier.from, theSearch.getQuery().timezoneOffset);
 if (fromconstraint == null) fromconstraint = new Date(System.currentTimeMillis() - AbstractFormatter.normalyearMillis);
-Date toconstraint = theSearch.getQuery().modifier.to == null ? null : DateDetection.parseLine(theSearch.getQuery().modifier.to);
+Date toconstraint = theSearch.getQuery().modifier.to == null ? null : DateDetection.parseLine(theSearch.getQuery().modifier.to, theSearch.getQuery().timezoneOffset);
 if (toconstraint == null) toconstraint = new Date(System.currentTimeMillis() + AbstractFormatter.normalyearMillis);
 while (i < QueryParams.FACETS_DATE_MAXCOUNT && navigatorIterator.hasNext()) {
 name = navigatorIterator.next().trim();

@@ -25,13 +25,19 @@
 package net.yacy.cora.date;
 import java.text.ParseException;
+import java.util.Calendar;
 import java.util.Date;
 import java.util.TimeZone;
 public abstract class AbstractFormatter implements DateFormatter {
-protected static final TimeZone TZ_GMT = TimeZone.getTimeZone("GMT");
+public final static Calendar testCalendar = Calendar.getInstance(); // a calendar in the current time zone of the server
+public final static Calendar UTCCalendar = Calendar.getInstance();
+public final static TimeZone UTCtimeZone = TimeZone.getTimeZone("UTC");
+static {
+UTCCalendar.setTimeZone(UTCtimeZone);
+}
 // statics
 public final static long secondMillis = 1000;
 public final static long minuteMillis = 60 * secondMillis;
@@ -45,7 +51,7 @@ public abstract class AbstractFormatter implements DateFormatter {
 protected String last_format;
 @Override
-public abstract Date parse(String s) throws ParseException;
+public abstract Calendar parse(String s, int timezoneOffset) throws ParseException;
 @Override
 public abstract String format(final Date date);
 @Override

@@ -25,11 +25,12 @@
 package net.yacy.cora.date;
 import java.text.ParseException;
+import java.util.Calendar;
 import java.util.Date;
 public interface DateFormatter {
-public Date parse(String s) throws ParseException;
+public Calendar parse(String s, int timezoneOffset) throws ParseException;
 public String format(final Date date);
 public String format();

@@ -30,6 +30,7 @@ import java.text.SimpleDateFormat;
 import java.util.Calendar;
 import java.util.Date;
 import java.util.Locale;
+import java.util.TimeZone;
 import net.yacy.cora.util.NumberTools;
@@ -51,14 +52,11 @@ public class GenericFormatter extends AbstractFormatter implements DateFormatter
 public static final SimpleDateFormat FORMAT_ANSIC = new SimpleDateFormat(PATTERN_ANSIC, Locale.US);
 public static final SimpleDateFormat FORMAT_SIMPLE = new SimpleDateFormat(PATTERN_SIMPLE, Locale.US);
-// find out time zone and DST offset
-private static Calendar thisCalendar = Calendar.getInstance();
 static {
 // we want GMT times on the formats as well as they don't support any timezone
-FORMAT_SHORT_DAY.setTimeZone(TZ_GMT);
-FORMAT_SHORT_SECOND.setTimeZone(TZ_GMT);
-FORMAT_SHORT_MILSEC.setTimeZone(TZ_GMT);
+FORMAT_SHORT_DAY.setTimeZone(UTCtimeZone);
+FORMAT_SHORT_SECOND.setTimeZone(UTCtimeZone);
+FORMAT_SHORT_MILSEC.setTimeZone(UTCtimeZone);
 }
 public static final long time_second = 1000L;
@@ -124,56 +122,55 @@ public class GenericFormatter extends AbstractFormatter implements DateFormatter
 * the String.
 */
 @Override
-public Date parse(final String timeString) throws ParseException {
+public Calendar parse(final String timeString, final int timezoneOffset) throws ParseException {
 synchronized (this.dateFormat) {
-return this.dateFormat.parse(timeString);
+Calendar cal = Calendar.getInstance(UTCtimeZone);
+cal.setTime(this.dateFormat.parse(timeString));
+cal.add(Calendar.MINUTE, timezoneOffset); // add a correction; i.e. for UTC+1 -60 minutes is added to patch a time given in UTC+1 to the actual time at UTC
+return cal;
 }
 }
 /**
 * Like {@link #parseShortSecond(String)} using additional timezone information provided in an
 * offset String, like "+0100" for CET.
+* @throws ParseException
 */
-public Date parse(final String timeString, final String UTCOffset) {
+public Calendar parse(final String timeString, final String UTCOffset) throws ParseException {
 // FIXME: This method returns an incorrect date, check callers!
 // ex: de.anomic.server.serverDate.parseShortSecond("20070101120000", "+0200").toGMTString()
 // => 1 Jan 2007 13:00:00 GMT
-if (timeString == null || timeString.isEmpty()) { return new Date(); }
-if (UTCOffset == null || UTCOffset.isEmpty()) { return new Date(); }
+if (timeString == null || timeString.isEmpty()) { return Calendar.getInstance(UTCtimeZone); }
+if (UTCOffset == null || UTCOffset.isEmpty()) { return Calendar.getInstance(UTCtimeZone); }
-try {
-synchronized (this.dateFormat) {
-return new Date(this.dateFormat.parse(timeString).getTime() - UTCDiff() + UTCDiff(UTCOffset));
-}
-} catch (final Throwable e) {
-//serverLog.logFinest("parseUniversalDate", e.getMessage() + ", remoteTimeString=[" + remoteTimeString + "]");
-return new Date();
-}
+return parse(timeString, UTCDiff(UTCOffset));
 }
-private static long UTCDiff(final String diffString) {
+private static int UTCDiff(final String diffString) {
 if (diffString.length() != 5) throw new IllegalArgumentException("UTC String malformed (wrong size):" + diffString);
 boolean ahead = true;
 if (diffString.length() > 0 && diffString.charAt(0) == '+') ahead = true;
 else if (diffString.length() > 0 && diffString.charAt(0) == '-') ahead = false;
 else throw new IllegalArgumentException("UTC String malformed (wrong sign):" + diffString);
-final long oh = NumberTools.parseLongDecSubstring(diffString, 1, 3);
-final long om = NumberTools.parseLongDecSubstring(diffString, 3);
-return ((ahead) ? (long) 1 : (long) -1) * (oh * AbstractFormatter.hourMillis + om * AbstractFormatter.minuteMillis);
+final int oh = NumberTools.parseIntDecSubstring(diffString, 1, 3);
+final int om = NumberTools.parseIntDecSubstring(diffString, 3);
+return (int) ((ahead) ? 1 : -1 * (oh * AbstractFormatter.hourMillis + om * AbstractFormatter.minuteMillis));
 }
+/**
+* get the difference of this servers time zone to UTC/GMT in milliseconds
+* @return
+*/
 private static long UTCDiff() {
 // DST_OFFSET is dependent on the time of the Calendar, so it has to be updated
 // to get the correct current offset
-synchronized (thisCalendar) {
-thisCalendar.setTimeInMillis(System.currentTimeMillis());
-final long zoneOffsetHours = thisCalendar.get(Calendar.ZONE_OFFSET);
-final long DSTOffsetHours = thisCalendar.get(Calendar.DST_OFFSET);
+synchronized (testCalendar) {
+testCalendar.setTimeInMillis(System.currentTimeMillis());
+final long zoneOffsetHours = testCalendar.get(Calendar.ZONE_OFFSET);
+final long DSTOffsetHours = testCalendar.get(Calendar.DST_OFFSET);
 return zoneOffsetHours + DSTOffsetHours;
 }
 }
-private final static DecimalFormat D2 = new DecimalFormat("00");
 public static String UTCDiffString() {
 // we express the UTC Difference in 5 digits:
 // SHHMM
@@ -195,11 +192,9 @@ public class GenericFormatter extends AbstractFormatter implements DateFormatter
 return sb.toString();
 }
-public static long correctedUTCTime() {
-return System.currentTimeMillis() - UTCDiff();
-}
-public static void main(final String[] args) {
+private final static DecimalFormat D2 = new DecimalFormat("00");
+public static void main(String[] args) {
 System.out.println(UTCDiffString());
 }
 }
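A hedged usage sketch of the changed formatter API (timestamp and offset are hypothetical; the snippet assumes the YaCy classes above on the classpath): parse(String, int) now returns a Calendar pinned to UTC, so call sites that need a Date append .getTime(), which is exactly the pattern applied throughout this commit:

    import java.text.ParseException;
    import java.util.Calendar;
    import java.util.Date;
    import net.yacy.cora.date.GenericFormatter;

    public class ParseApiSketch {
        public static void main(String[] args) {
            try {
                // "20150101120000" was written by a peer in UTC+1, so offset -60
                // moves it to 2015-01-01 11:00:00 UTC.
                Calendar cal = GenericFormatter.SHORT_SECOND_FORMATTER.parse("20150101120000", -60);
                Date utc = cal.getTime();
                System.out.println(utc);
            } catch (ParseException e) {
                e.printStackTrace();
            }
        }
    }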

@@ -41,7 +41,7 @@ public class ISO8601Formatter extends AbstractFormatter implements DateFormatter
 private static final SimpleDateFormat FORMAT_ISO8601 = new SimpleDateFormat(PATTERN_ISO8601, Locale.US);
 static {
-FORMAT_ISO8601.setTimeZone(TZ_GMT);
+FORMAT_ISO8601.setTimeZone(AbstractFormatter.UTCtimeZone);
 }
 public static final ISO8601Formatter FORMATTER = new ISO8601Formatter();
@@ -72,7 +72,7 @@ public class ISO8601Formatter extends AbstractFormatter implements DateFormatter
 * @throws ParseException
 */
 @Override
-public Date parse(String s) throws ParseException {
+public Calendar parse(String s, final int timezoneOffset) throws ParseException {
 // do some lazy checks here
 s = s.trim();
 while (!s.isEmpty() && s.endsWith("?")) s = s.substring(0, s.length() - 1); // sometimes used if write is not sure about date
@@ -87,7 +87,7 @@ public class ISO8601Formatter extends AbstractFormatter implements DateFormatter
 while (!s.isEmpty() && s.endsWith("?")) s = s.substring(0, s.length() - 1); // sometimes used if write is not sure about date
 // no go for exact parsing
-final Calendar cal = Calendar.getInstance(TZ_GMT, Locale.US);
+final Calendar cal = Calendar.getInstance(AbstractFormatter.UTCtimeZone, Locale.US);
 cal.clear();
 // split 2007-12-19T10:20:30.789+0500 into its parts
@@ -103,13 +103,13 @@ public class ISO8601Formatter extends AbstractFormatter implements DateFormatter
 if (t.nextToken().equals("-")) {
 cal.set(Calendar.MONTH, Integer.parseInt(t.nextToken()) - 1);
 } else {
-return cal.getTime();
+return cal;
 }
 // day
 if (t.nextToken().equals("-")) {
 cal.set(Calendar.DAY_OF_MONTH, Integer.parseInt(t.nextToken()));
 } else {
-return cal.getTime();
+return cal;
 }
 // The standard says:
 // if there is an hour there has to be a minute and a timezone token, too.
@@ -147,7 +147,7 @@ public class ISO8601Formatter extends AbstractFormatter implements DateFormatter
 sign = -1;
 } else {
 // no legal TZ offset found
-return cal.getTime();
+return cal;
 }
 offset = sign * Integer.parseInt(t.nextToken()) * 10 * 3600;
 }
@@ -168,8 +168,7 @@ public class ISO8601Formatter extends AbstractFormatter implements DateFormatter
 // in case we couldn't even parse a year
 if (!cal.isSet(Calendar.YEAR))
 throw new ParseException("parseISO8601: Cannot parse '" + s + "'", 0);
-Date d = cal.getTime();
-return d;
+return cal;
 }

@@ -224,7 +224,7 @@ public class RSSMessage implements Hit, Comparable<RSSMessage>, Comparator<RSSMe
 date = HeaderFramework.FORMAT_RFC1123.parse(dateString);
 } catch (final ParseException e) {
 try {
-date = GenericFormatter.SHORT_SECOND_FORMATTER.parse(dateString);
+date = GenericFormatter.SHORT_SECOND_FORMATTER.parse(dateString, 0).getTime();
 } catch (final ParseException e1) {
 date = HeaderFramework.parseHTTPDate(dateString); // returns null on parse error
 }

@@ -183,11 +183,12 @@ public class FederateSearchManager {
 Bitfield filter = new Bitfield();
 final QueryParams query = new QueryParams(
 qg,
-new QueryModifier(),
+new QueryModifier(0),
 Integer.MAX_VALUE,
 "",
 Classification.ContentDomain.ALL,
 "", //lang
+0, //timezoneOffset
 null,
 CacheStrategy.IFFRESH,
 100, 0, //count, offset

@@ -151,17 +151,26 @@ public final class CrawlStacker {
 if (CrawlStacker.log.isFinest()) CrawlStacker.log.finest("ENQUEUE " + entry.url() + ", referer=" + entry.referrerhash() + ", initiator=" + ((entry.initiator() == null) ? "" : ASCII.String(entry.initiator())) + ", name=" + entry.name() + ", appdate=" + entry.appdate() + ", depth=" + entry.depth());
 this.requestQueue.enQueue(entry);
 }
-public void enqueueEntriesAsynchronous(final byte[] initiator, final String profileHandle, final List<AnchorURL> hyperlinks) {
+public void enqueueEntriesAsynchronous(
+final byte[] initiator,
+final String profileHandle,
+final List<AnchorURL> hyperlinks,
+final int timezoneOffset) {
 new Thread() {
 @Override
 public void run() {
 Thread.currentThread().setName("enqueueEntriesAsynchronous");
-enqueueEntries(initiator, profileHandle, hyperlinks, true);
+enqueueEntries(initiator, profileHandle, hyperlinks, true, timezoneOffset);
 }
 }.start();
 }
-private void enqueueEntries(final byte[] initiator, final String profileHandle, final List<AnchorURL> hyperlinks, final boolean replace) {
+private void enqueueEntries(
+final byte[] initiator,
+final String profileHandle,
+final List<AnchorURL> hyperlinks,
+final boolean replace,
+final int timezoneOffset) {
 if (replace) {
 // delete old entries, if exists to force a re-load of the url (thats wanted here)
 Set<String> hosthashes = new HashSet<String>();
@@ -199,7 +208,7 @@ public final class CrawlStacker {
 int p = userInfo == null ? -1 : userInfo.indexOf(':');
 String user = userInfo == null ? FTPClient.ANONYMOUS : userInfo.substring(0, p);
 String pw = userInfo == null || p == -1 ? "anomic" : userInfo.substring(p + 1);
-enqueueEntriesFTP(initiator, profileHandle, url.getHost(), url.getPort(), user, pw, replace);
+enqueueEntriesFTP(initiator, profileHandle, url.getHost(), url.getPort(), user, pw, replace, timezoneOffset);
 } else {
 // put entry on crawl stack
 enqueueEntry(new Request(
@@ -209,13 +218,22 @@
 url.getNameProperty(),
 new Date(),
 profileHandle,
-0
+0,
+timezoneOffset
 ));
 }
 }
 }
-public void enqueueEntriesFTP(final byte[] initiator, final String profileHandle, final String host, final int port, final String user, final String pw, final boolean replace) {
+public void enqueueEntriesFTP(
+final byte[] initiator,
+final String profileHandle,
+final String host,
+final int port,
+final String user,
+final String pw,
+final boolean replace,
+final int timezoneOffset) {
 final CrawlQueues cq = this.nextQueue;
 new Thread() {
 @Override
@@ -248,7 +266,8 @@
 MultiProtocolURL.unescape(entry.name),
 entry.date,
 profileHandle,
-0));
+0,
+timezoneOffset));
 }
 } catch (final IOException e1) {
 ConcurrentLog.logException(e1);
@@ -272,7 +291,7 @@
 "CRAWLING-ROOT",
 new Date(),
 pe.handle(),
-0));
+0, 0));
 }
 /**

@@ -296,7 +296,8 @@ public final class CrawlSwitchboard {
 CacheStrategy.IFFRESH,
 "robot_" + CRAWL_PROFILE_PROXY,
 ClientIdentification.yacyProxyAgentName,
-null);
+null,
+0);
 this.profilesActiveCrawls.put(
 UTF8.getBytes(this.defaultProxyProfile.handle()),
 this.defaultProxyProfile);
@@ -327,7 +328,8 @@ public final class CrawlSwitchboard {
 CacheStrategy.IFFRESH,
 "robot_" + CRAWL_PROFILE_REMOTE,
 ClientIdentification.yacyInternetCrawlerAgentName,
-null);
+null,
+0);
 this.profilesActiveCrawls.put(
 UTF8.getBytes(this.defaultRemoteProfile.handle()),
 this.defaultRemoteProfile);
@@ -358,7 +360,8 @@ public final class CrawlSwitchboard {
 CacheStrategy.IFEXIST,
 "robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_TEXT,
 ClientIdentification.yacyIntranetCrawlerAgentName,
-null);
+null,
+0);
 this.profilesActiveCrawls.put(
 UTF8.getBytes(this.defaultTextSnippetLocalProfile.handle()),
 this.defaultTextSnippetLocalProfile);
@@ -389,7 +392,8 @@ public final class CrawlSwitchboard {
 CacheStrategy.IFEXIST,
 "robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT,
 ClientIdentification.yacyIntranetCrawlerAgentName,
-null);
+null,
+0);
 this.profilesActiveCrawls.put(
 UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()),
 this.defaultTextSnippetGlobalProfile);
@@ -421,7 +425,8 @@ public final class CrawlSwitchboard {
 CacheStrategy.IFEXIST,
 "robot_" + CRAWL_PROFILE_GREEDY_LEARNING_TEXT,
 ClientIdentification.browserAgentName,
-null);
+null,
+0);
 this.profilesActiveCrawls.put(
 UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()),
 this.defaultTextSnippetGlobalProfile);
@@ -452,7 +457,8 @@ public final class CrawlSwitchboard {
 CacheStrategy.IFEXIST,
 "robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA,
 ClientIdentification.yacyIntranetCrawlerAgentName,
-null);
+null,
+0);
 this.profilesActiveCrawls.put(
 UTF8.getBytes(this.defaultMediaSnippetLocalProfile.handle()),
 this.defaultMediaSnippetLocalProfile);
@@ -483,7 +489,8 @@ public final class CrawlSwitchboard {
 CacheStrategy.IFEXIST,
 "robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
 ClientIdentification.yacyIntranetCrawlerAgentName,
-null);
+null,
+0);
 this.profilesActiveCrawls.put(
 UTF8.getBytes(this.defaultMediaSnippetGlobalProfile.handle()),
 this.defaultMediaSnippetGlobalProfile);
@@ -514,7 +521,8 @@ public final class CrawlSwitchboard {
 CacheStrategy.NOCACHE,
 "robot_" + CRAWL_PROFILE_SURROGATE,
 ClientIdentification.yacyIntranetCrawlerAgentName,
-null);
+null,
+0);
 this.profilesActiveCrawls.put(
 UTF8.getBytes(this.defaultSurrogateProfile.handle()),
 this.defaultSurrogateProfile);
@@ -548,7 +556,8 @@ public final class CrawlSwitchboard {
 CacheStrategy.NOCACHE,
 collection,
 ClientIdentification.yacyIntranetCrawlerAgentName,
-null);
+null,
+0);
 this.profilesActiveCrawls.put(UTF8.getBytes(genericPushProfile.handle()), genericPushProfile);
 this.defaultPushProfiles.put(collection, genericPushProfile);
 return genericPushProfile;

@@ -80,6 +80,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
 public static final String CACHE_STRAGEGY = "cacheStrategy";
 public static final String COLLECTIONS = "collections";
 public static final String SCRAPER = "scraper";
+public static final String TIMEZONEOFFSET = "timezoneOffset";
 public static final String CRAWLER_URL_MUSTMATCH = "crawlerURLMustMatch";
 public static final String CRAWLER_URL_MUSTNOTMATCH = "crawlerURLMustNotMatch";
 public static final String CRAWLER_IP_MUSTMATCH = "crawlerIPMustMatch";
@@ -131,6 +132,9 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
 * @param xpstopw true if parent stop words shall be ignored
 * @param cacheStrategy determines if and how cache is used loading content
 * @param collections a comma-separated list of tags which are attached to index entries
+* @param userAgentName the profile name of the user agent to be used
+* @param scraper a scraper for vocabularies
+* @param timezoneOffset the time offset in minutes for scraped dates in text without time zone
 */
 public CrawlProfile(
 String name,
@@ -155,7 +159,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
 final CacheStrategy cacheStrategy,
 final String collections,
 final String userAgentName,
-final VocabularyScraper scraper) {
+final VocabularyScraper scraper,
+final int timezoneOffset) {
 super(40);
 if (name == null || name.isEmpty()) {
 throw new NullPointerException("name must not be null or empty");
@@ -198,6 +203,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
 String jsonString = this.scraper.toString();
 assert jsonString != null && jsonString.length() > 0 && jsonString.charAt(0) == '{' : "jsonString = " + jsonString;
 put(SCRAPER, jsonString);
+put(TIMEZONEOFFSET, timezoneOffset);
 }
 /**
@@ -623,6 +629,16 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
 return (r.equals(Boolean.TRUE.toString()));
 }
+public int timezoneOffset() {
+final String timezoneOffset = get(TIMEZONEOFFSET);
+if (timezoneOffset == null) return 0;
+try {
+return Integer.parseInt(timezoneOffset);
+} catch (NumberFormatException e) {
+return 0;
+}
+}
 /**
 * get a recrawl date for a given age in minutes
 * @param oldTimeMinutes

@@ -531,7 +531,8 @@ public class CrawlQueues {
 item.getDescriptions().size() > 0 ? item.getDescriptions().get(0) : "",
 loaddate,
 this.sb.crawler.defaultRemoteProfile.handle(),
-0
+0,
+this.sb.crawler.defaultRemoteProfile.timezoneOffset()
 ));
 } else {
 CrawlQueues.log.warn("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason);

@@ -359,10 +359,10 @@ public class Snapshots {
 private static Date parseDate(String d) {
 try {
-return GenericFormatter.SHORT_MINUTE_FORMATTER.parse(d);
+return GenericFormatter.SHORT_MINUTE_FORMATTER.parse(d, 0).getTime();
 } catch (ParseException e) {
 try {
-return GenericFormatter.SHORT_DAY_FORMATTER.parse(d);
+return GenericFormatter.SHORT_DAY_FORMATTER.parse(d, 0).getTime();
 } catch (ParseException ee) {
 return null;
 }

@@ -92,7 +92,8 @@ public class Request extends WorkflowJob
 private Bitfield flags;
 private String statusMessage;
 private int initialHash; // to provide a object hash that does not change even if the url changes because of redirection
+private int timezoneOffset;
 public Request() {
 // used only to create poison entries
 this.initiator = null;
@@ -106,6 +107,7 @@ public class Request extends WorkflowJob
 this.statusMessage = null;
 this.initialHash = 0;
 this.status = 0;
+this.timezoneOffset = 0;
 }
 /**
@@ -115,7 +117,7 @@ public class Request extends WorkflowJob
 * @param referrerhash
 */
 public Request(final DigestURL url, final byte[] referrerhash) {
-this(null, url, referrerhash, null, null, null, 0);
+this(null, url, referrerhash, null, null, null, 0, 0);
 }
 /**
@@ -136,7 +138,8 @@ public class Request extends WorkflowJob
 final String name,
 final Date appdate,
 final String profileHandle,
-final int depth) {
+final int depth,
+final int timezoneOffset) {
 // create new entry and store it into database
 assert url != null;
 assert profileHandle == null || profileHandle.length() == Word.commonHashLength : profileHandle
@@ -150,6 +153,7 @@ public class Request extends WorkflowJob
 this.appdate = (appdate == null) ? 0 : appdate.getTime();
 this.profileHandle = profileHandle; // must not be null
 this.depth = depth;
+this.timezoneOffset = timezoneOffset;
 this.flags = new Bitfield(rowdef.width(10));
 this.statusMessage = "loaded(args)";
 this.initialHash = url.hashCode();
@@ -271,6 +275,10 @@ public class Request extends WorkflowJob
 // crawl depth where the url appeared
 return this.depth;
 }
+public int timezoneOffset() {
+return this.timezoneOffset;
+}
 public String profileHandle() {
 // the handle of the crawl profile
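For the extended Request signature, a construction sketch (all identifiers other than the signature itself are placeholders; import paths follow the package names visible in this diff, except DigestURL, which is an assumption):

    import java.util.Date;
    import net.yacy.cora.document.id.DigestURL;
    import net.yacy.crawler.data.CrawlProfile;
    import net.yacy.crawler.retrieval.Request;

    public class RequestSketch {
        // Builds a crawl request that carries the profile's content time zone,
        // mirroring the enqueue call sites touched by this commit.
        static Request rootRequest(byte[] initiator, DigestURL url, CrawlProfile profile) {
            return new Request(
                    initiator,                  // initiator peer hash
                    url,                        // URL to be crawled
                    null,                       // no referrer hash
                    "CRAWLING-ROOT",            // link name
                    new Date(),                 // appearance date
                    profile.handle(),           // crawl profile handle
                    0,                          // crawl depth
                    profile.timezoneOffset());  // offset in minutes for dates in the content
        }
    }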

@@ -28,7 +28,6 @@ package net.yacy.crawler.retrieval;
 import java.util.Date;
-import net.yacy.cora.date.GenericFormatter;
 import net.yacy.cora.document.analysis.Classification;
 import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.document.encoding.UTF8;
@@ -260,7 +259,7 @@ public class Response {
 if (docDate == null) docDate = this.responseHeader.date();
 }
 if (docDate == null && this.request != null) docDate = this.request.appdate();
-if (docDate == null) docDate = new Date(GenericFormatter.correctedUTCTime());
+if (docDate == null) docDate = new Date();
 return docDate;
 }
@@ -372,7 +371,7 @@ public class Response {
 if (date == null) return "stale_no_date_given_in_response";
 try {
 final long ttl = 1000 * NumberTools.parseLongDecSubstring(cacheControl, 8); // milliseconds to live
-if (GenericFormatter.correctedUTCTime() - date.getTime() > ttl) {
+if (System.currentTimeMillis() - date.getTime() > ttl) {
 //System.out.println("***not indexed because cache-control");
 return "stale_expired";
 }
@@ -461,8 +460,8 @@ public class Response {
 if (!this.responseHeader.containsKey(HeaderFramework.LAST_MODIFIED)) { return false; }
 // parse date
 Date d1, d2;
-d2 = this.responseHeader.lastModified(); if (d2 == null) { d2 = new Date(GenericFormatter.correctedUTCTime()); }
-d1 = this.requestHeader.ifModifiedSince(); if (d1 == null) { d1 = new Date(GenericFormatter.correctedUTCTime()); }
+d2 = this.responseHeader.lastModified(); if (d2 == null) { d2 = new Date(); }
+d1 = this.requestHeader.ifModifiedSince(); if (d1 == null) { d1 = new Date(); }
 // finally, we shall treat the cache as stale if the modification time is after the if-.. time
 if (d2.after(d1)) { return false; }
 }
@@ -501,9 +500,10 @@ public class Response {
 // -expires in cached response
 // the expires value gives us a very easy hint when the cache is stale
 final Date expires = this.responseHeader.expires();
+final Date now = new Date();
 if (expires != null) {
 // System.out.println("EXPIRES-TEST: expires=" + expires + ", NOW=" + serverDate.correctedGMTDate() + ", url=" + url);
-if (expires.before(new Date(GenericFormatter.correctedUTCTime()))) { return false; }
+if (expires.before(now)) { return false; }
 }
 final Date lastModified = this.responseHeader.lastModified();
 cacheControl = this.responseHeader.get(HeaderFramework.CACHE_CONTROL);
@@ -517,13 +517,13 @@ public class Response {
 // file may only be treated as fresh for one more month, not more.
 Date date = this.responseHeader.date();
 if (lastModified != null) {
-if (date == null) { date = new Date(GenericFormatter.correctedUTCTime()); }
+if (date == null) { date = now; }
 final long age = date.getTime() - lastModified.getTime();
 if (age < 0) { return false; }
 // TTL (Time-To-Live) is age/10 = (d2.getTime() - d1.getTime()) / 10
 // the actual living-time is serverDate.correctedGMTDate().getTime() - d2.getTime()
 // therefore the cache is stale, if serverDate.correctedGMTDate().getTime() - d2.getTime() > age/10
-if (GenericFormatter.correctedUTCTime() - date.getTime() > age / 10) { return false; }
+if (now.getTime() - date.getTime() > age / 10) { return false; }
 }
 // -cache-control in cached response
@@ -542,7 +542,7 @@ public class Response {
 if (date == null) { return false; }
 try {
 final long ttl = 1000 * NumberTools.parseLongDecSubstring(cacheControl, 8); // milliseconds to live
-if (GenericFormatter.correctedUTCTime() - date.getTime() > ttl) {
+if (now.getTime() - date.getTime() > ttl) {
 return false;
 }
 } catch (final Exception e) {
@@ -626,12 +626,11 @@ public class Response {
 // -if-modified-since in request
 // if the page is fresh at the very moment we can index it
 final Date ifModifiedSince = this.ifModifiedSince();
+final Date now = new Date();
 if ((ifModifiedSince != null) && (this.responseHeader.containsKey(HeaderFramework.LAST_MODIFIED))) {
 // parse date
 Date d = this.responseHeader.lastModified();
-if (d == null) {
-d = new Date(GenericFormatter.correctedUTCTime());
-}
+if (d == null) d = now;
 // finally, we shall treat the cache as stale if the modification time is after the if-.. time
 if (d.after(ifModifiedSince)) {
 //System.out.println("***not indexed because if-modified-since");
@@ -655,7 +654,7 @@ public class Response {
 // sometimes, the expires date is set to the past to prevent that a page is cached
 // we use that information to see if we should index it
 final Date expires = this.responseHeader.expires();
-if (expires != null && expires.before(new Date(GenericFormatter.correctedUTCTime()))) {
+if (expires != null && expires.before(now)) {
 return "Stale_(Expired)";
 }
@@ -688,7 +687,7 @@ public class Response {
 }
 try {
 final long ttl = 1000 * NumberTools.parseLongDecSubstring(cacheControl,8); // milliseconds to live
-if (GenericFormatter.correctedUTCTime() - date.getTime() > ttl) {
+if (now.getTime() - date.getTime() > ttl) {
 //System.out.println("***not indexed because cache-control");
 return "Stale_(expired_by_cache-control)";
 }
@@ -865,7 +864,7 @@ public class Response {
 final String supportError = TextParser.supports(url(), this.responseHeader == null ? null : this.responseHeader.mime());
 if (supportError != null) throw new Parser.Failure("no parser support:" + supportError, url());
 try {
-return TextParser.parseSource(new AnchorURL(url()), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? "UTF-8" : this.responseHeader.getCharacterEncoding(), new VocabularyScraper(), this.request.depth(), this.content);
+return TextParser.parseSource(new AnchorURL(url()), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? "UTF-8" : this.responseHeader.getCharacterEncoding(), new VocabularyScraper(), this.request.timezoneOffset(), this.request.depth(), this.content);
 } catch (final Exception e) {
 return null;
 }

@@ -108,7 +108,8 @@ public class SitemapImporter extends Thread {
 entry.url(),
 entry.lastmod(new Date()),
 this.crawlingProfile.handle(),
-0
+0,
+this.crawlingProfile.timezoneOffset()
 ));
 logger.info("New URL '" + entry.url() + "' added for loading.");
 }

@ -210,7 +210,7 @@ public class BlogBoard {
} }
try { try {
date = GenericFormatter.SHORT_SECOND_FORMATTER.parse(StrDate); date = GenericFormatter.SHORT_SECOND_FORMATTER.parse(StrDate, 0).getTime();
} catch (final ParseException e1) { } catch (final ParseException e1) {
date = new Date(); date = new Date();
} }
@ -404,7 +404,7 @@ public class BlogBoard {
} }
return new Date(); return new Date();
} }
return GenericFormatter.SHORT_SECOND_FORMATTER.parse(date); return GenericFormatter.SHORT_SECOND_FORMATTER.parse(date, 0).getTime();
} catch (final ParseException ex) { } catch (final ParseException ex) {
return new Date(); return new Date();
} }
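GenericFormatter.SHORT_SECOND_FORMATTER.parse(...) now takes the text plus an offset in minutes and apparently returns a Calendar, hence the chained .getTime() at the call sites above. A hedged, plain-java.text sketch of what such an overload has to do: read the wall-clock text, then shift by the offset (negative east of UTC, matching JavaScript's Date.getTimezoneOffset(); the pattern string is an assumption about the short-second form) to arrive at normalized UTC:

    import java.text.ParseException;
    import java.text.SimpleDateFormat;
    import java.util.Calendar;
    import java.util.TimeZone;

    public class OffsetParseSketch {
        /** Parses "yyyyMMddHHmmss" wall-clock text into a UTC calendar. */
        public static Calendar parse(String text, int timezoneOffset) throws ParseException {
            SimpleDateFormat shortSecond = new SimpleDateFormat("yyyyMMddHHmmss");
            shortSecond.setTimeZone(TimeZone.getTimeZone("UTC"));
            long asIfUtc = shortSecond.parse(text).getTime(); // wall clock read as UTC
            Calendar cal = Calendar.getInstance(TimeZone.getTimeZone("UTC"));
            cal.setTimeInMillis(asIfUtc + timezoneOffset * 60000L); // shift to real UTC
            return cal; // callers chain .getTime() for a java.util.Date
        }
    }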

@ -139,7 +139,7 @@ public class BookmarkHelper {
final Set<String> tags=ListManager.string2set(tag); //this allows multiple default tags final Set<String> tags=ListManager.string2set(tag); //this allows multiple default tags
try { try {
//load the links //load the links
final ContentScraper scraper = new ContentScraper(baseURL, 10000, new VocabularyScraper()); final ContentScraper scraper = new ContentScraper(baseURL, 10000, new VocabularyScraper(), 0);
//OutputStream os = new htmlFilterOutputStream(null, scraper, null, false); //OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
final Writer writer = new TransformerWriter(null, null, scraper, null, false); final Writer writer = new TransformerWriter(null, null, scraper, null, false);
FileUtils.copy(input,writer); FileUtils.copy(input,writer);
@ -232,7 +232,7 @@ public class BookmarkHelper {
Date parsedDate = null; Date parsedDate = null;
try { try {
parsedDate = ISO8601Formatter.FORMATTER.parse(time); parsedDate = ISO8601Formatter.FORMATTER.parse(time, 0).getTime();
} catch (final ParseException e) { } catch (final ParseException e) {
parsedDate = new Date(); parsedDate = new Date();
} }

@ -87,7 +87,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
} }
//get words from document //get words from document
final Map<String, Word> words = new Condenser(document, null, true, true, LibraryProvider.dymLib, false, false).words(); final Map<String, Word> words = new Condenser(document, null, true, true, LibraryProvider.dymLib, false, false, 0).words();
// generate potential tags from document title, description and subject // generate potential tags from document title, description and subject
final int bufferSize = document.dc_title().length() + document.dc_description().length + document.dc_subject(' ').length() + 32; final int bufferSize = document.dc_title().length() + document.dc_description().length + document.dc_subject(' ').length() + 32;
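The Condenser call above gains a trailing timezoneOffset argument, here 0, since bookmark auto-tagging has no crawl context and wall-clock dates are taken as already UTC. A call-site sketch with the argument positions spelled out; the labels are inferred from the Condenser constructor hunk further below:

    // identifiers besides the comments appear in this diff
    final Map<String, Word> words = new Condenser(
            document, null,
            true,                  // indexText
            true,                  // indexMedia
            LibraryProvider.dymLib,
            false,                 // doAutotagging
            false,                 // findDatesInContent
            0                      // timezoneOffset: no crawl context, assume UTC
        ).words();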

@ -190,7 +190,8 @@ public class YMarkCrawlStart extends HashMap<String,String>{
CacheStrategy.IFFRESH, CacheStrategy.IFFRESH,
"robot_" + CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, "robot_" + CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName, ClientIdentification.yacyIntranetCrawlerAgentName,
null); // TODO: make this a default profile in CrawlSwitchboard null,
0); // TODO: make this a default profile in CrawlSwitchboard
sb.crawler.putActive(pe.handle().getBytes(), pe); sb.crawler.putActive(pe.handle().getBytes(), pe);
return sb.crawlStacker.stackCrawl(new Request( return sb.crawlStacker.stackCrawl(new Request(
sb.peers.mySeed().hash.getBytes(), sb.peers.mySeed().hash.getBytes(),
@ -198,7 +199,7 @@ public class YMarkCrawlStart extends HashMap<String,String>{
null, null,
"CRAWLING-ROOT", "CRAWLING-ROOT",
new Date(), new Date(),
pe.handle(), 0 pe.handle(), 0, pe.timezoneOffset()
)); ));
} }
} }

@ -97,7 +97,8 @@ public final class Condenser {
final boolean indexMedia, final boolean indexMedia,
final WordCache meaningLib, final WordCache meaningLib,
final boolean doAutotagging, final boolean doAutotagging,
final boolean findDatesInContent final boolean findDatesInContent,
final int timezoneOffset
) { ) {
Thread.currentThread().setName("condenser-" + document.dc_identifier()); // for debugging Thread.currentThread().setName("condenser-" + document.dc_identifier()); // for debugging
// if addMedia == true, then all the media links are also parsed and added to the words // if addMedia == true, then all the media links are also parsed and added to the words
@ -123,7 +124,7 @@ public final class Condenser {
Map.Entry<AnchorURL, String> entry; Map.Entry<AnchorURL, String> entry;
if (indexText) { if (indexText) {
String text = document.getTextString(); String text = document.getTextString();
if (findDatesInContent) this.dates_in_content = DateDetection.parse(text); if (findDatesInContent) this.dates_in_content = DateDetection.parse(text, timezoneOffset);
createCondensement(document.dc_source(), text, meaningLib, doAutotagging, scraper); createCondensement(document.dc_source(), text, meaningLib, doAutotagging, scraper);
// the phrase counter: // the phrase counter:
// phrase 0 are words taken from the URL // phrase 0 are words taken from the URL

@ -499,7 +499,7 @@ public class DateDetection {
* @param text * @param text
* @return a set of dates, ordered by time. first date in the ordered set is the oldest time. * @return a set of dates, ordered by time. first date in the ordered set is the oldest time.
*/ */
public static LinkedHashSet<Date> parse(String text) { public static LinkedHashSet<Date> parse(String text, int timezoneOffset) {
Long offset; Long offset;
if ((offset = specialDayOffset.get(text)) != null) { if ((offset = specialDayOffset.get(text)) != null) {
LinkedHashSet<Date> dates = new LinkedHashSet<>(); dates.add(new Date((System.currentTimeMillis() / AbstractFormatter.dayMillis) * AbstractFormatter.dayMillis + offset.longValue())); return dates; LinkedHashSet<Date> dates = new LinkedHashSet<>(); dates.add(new Date((System.currentTimeMillis() / AbstractFormatter.dayMillis) * AbstractFormatter.dayMillis + offset.longValue())); return dates;
@ -513,7 +513,7 @@ public class DateDetection {
return dates; return dates;
} }
public static Date parseLine(String text) { public static Date parseLine(final String text, final int timezoneOffset) {
Date d = null; Date d = null;
try {d = CONFORM.parse(text);} catch (ParseException e) {} try {d = CONFORM.parse(text);} catch (ParseException e) {}
//if (d == null) try {d = GenericFormatter.FORMAT_SHORT_DAY.parse(text);} catch (ParseException e) {} // did not work well and fired for wrong formats; do not use //if (d == null) try {d = GenericFormatter.FORMAT_SHORT_DAY.parse(text);} catch (ParseException e) {} // did not work well and fired for wrong formats; do not use
@ -521,7 +521,7 @@ public class DateDetection {
if (d == null) try {d = GenericFormatter.FORMAT_ANSIC.parse(text);} catch (ParseException e) {} if (d == null) try {d = GenericFormatter.FORMAT_ANSIC.parse(text);} catch (ParseException e) {}
if (d == null) { if (d == null) {
Set<Date> dd = parse(text); Set<Date> dd = parse(text, timezoneOffset);
if (dd.size() >= 1) d = dd.iterator().next(); if (dd.size() >= 1) d = dd.iterator().next();
} }
return d; return d;
@ -601,7 +601,7 @@ public class DateDetection {
}; };
long t = System.currentTimeMillis(); long t = System.currentTimeMillis();
for (String s: test) { for (String s: test) {
String parsed = parse(fill + " " + s + " " + fill).toString(); String parsed = parse(fill + " " + s + " " + fill, 0).toString();
System.out.println("SOURCE: " + s); System.out.println("SOURCE: " + s);
System.out.println("DATE : " + parsed); System.out.println("DATE : " + parsed);
System.out.println(); System.out.println();
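DateDetection.parse(...) and parseLine(...) now carry the offset so every date found in page content lands in UTC. A toy, self-contained stand-in for the idea; YaCy's detector covers far more shapes (month names, day words, and so on) than this single ISO pattern, and it orders results by time rather than by position:

    import java.text.ParseException;
    import java.text.SimpleDateFormat;
    import java.util.Date;
    import java.util.LinkedHashSet;
    import java.util.TimeZone;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class DateDetectionSketch {
        private static final Pattern ISO_DAY = Pattern.compile("\\b(\\d{4}-\\d{2}-\\d{2})\\b");

        public static LinkedHashSet<Date> parse(String text, int timezoneOffset) {
            SimpleDateFormat day = new SimpleDateFormat("yyyy-MM-dd");
            day.setTimeZone(TimeZone.getTimeZone("UTC"));
            LinkedHashSet<Date> dates = new LinkedHashSet<>(); // insertion order here
            Matcher m = ISO_DAY.matcher(text);
            while (m.find()) {
                try {
                    Date d = day.parse(m.group(1));
                    dates.add(new Date(d.getTime() + timezoneOffset * 60000L)); // to UTC
                } catch (ParseException e) { /* skip non-dates */ }
            }
            return dates;
        }

        public static void main(String[] args) {
            // content authored in a UTC+2 zone: JS-style offset is -120
            System.out.println(parse("event on 2015-06-01, follow-up 2015-06-15", -120));
        }
    }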

@ -59,6 +59,7 @@ public interface Parser {
String mimeType, String mimeType,
String charset, String charset,
VocabularyScraper scraper, VocabularyScraper scraper,
int timezoneOffset,
InputStream source InputStream source
) throws Parser.Failure, InterruptedException; ) throws Parser.Failure, InterruptedException;
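This interface change is the reason every parser hunk below follows the same mechanical pattern. A skeleton of a conforming implementation; it uses only types visible in this diff, would compile only inside the YaCy source tree, and the class name and super(...) constructor call are assumptions:

    public class noopParser extends AbstractParser implements Parser {

        public noopParser() {
            super("No-op Parser"); // assumed AbstractParser(name) constructor
        }

        @Override
        public Document[] parse(
                final AnchorURL location,
                final String mimeType,
                final String charset,
                final VocabularyScraper scraper,
                final int timezoneOffset, // minutes from UTC, set at crawl start
                final InputStream source) throws Parser.Failure, InterruptedException {
            // a real parser forwards timezoneOffset to every date parse and to any
            // nested TextParser.parseSource(...) call, as the hunks below all do
            throw new Parser.Failure("no content handled", location);
        }
    }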

@ -167,6 +167,7 @@ public final class TextParser {
final String mimeType, final String mimeType,
final String charset, final String charset,
final VocabularyScraper scraper, final VocabularyScraper scraper,
final int timezoneOffset,
final int depth, final int depth,
final File sourceFile final File sourceFile
) throws InterruptedException, Parser.Failure { ) throws InterruptedException, Parser.Failure {
@ -181,7 +182,7 @@ public final class TextParser {
throw new Parser.Failure(errorMsg, location); throw new Parser.Failure(errorMsg, location);
} }
sourceStream = new BufferedInputStream(new FileInputStream(sourceFile)); sourceStream = new BufferedInputStream(new FileInputStream(sourceFile));
docs = parseSource(location, mimeType, charset, scraper, depth, sourceFile.length(), sourceStream); docs = parseSource(location, mimeType, charset, scraper, timezoneOffset, depth, sourceFile.length(), sourceStream);
} catch (final Exception e) { } catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e;
@ -199,6 +200,7 @@ public final class TextParser {
String mimeType, String mimeType,
final String charset, final String charset,
final VocabularyScraper scraper, final VocabularyScraper scraper,
final int timezoneOffset,
final int depth, final int depth,
final byte[] content final byte[] content
) throws Parser.Failure { ) throws Parser.Failure {
@ -214,7 +216,7 @@ public final class TextParser {
} }
assert !idioms.isEmpty() : "no parsers applied for url " + location.toNormalform(true); assert !idioms.isEmpty() : "no parsers applied for url " + location.toNormalform(true);
Document[] docs = parseSource(location, mimeType, idioms, charset, scraper, depth, content); Document[] docs = parseSource(location, mimeType, idioms, charset, scraper, timezoneOffset, depth, content);
return docs; return docs;
} }
@ -224,6 +226,7 @@ public final class TextParser {
String mimeType, String mimeType,
final String charset, final String charset,
final VocabularyScraper scraper, final VocabularyScraper scraper,
final int timezoneOffset,
final int depth, final int depth,
final long contentLength, final long contentLength,
final InputStream sourceStream final InputStream sourceStream
@ -244,7 +247,7 @@ public final class TextParser {
// then we use only one stream-oriented parser. // then we use only one stream-oriented parser.
if (idioms.size() == 1 || contentLength > Integer.MAX_VALUE) { if (idioms.size() == 1 || contentLength > Integer.MAX_VALUE) {
// use a specific stream-oriented parser // use a specific stream-oriented parser
return parseSource(location, mimeType, idioms.iterator().next(), charset, scraper, sourceStream); return parseSource(location, mimeType, idioms.iterator().next(), charset, scraper, timezoneOffset, sourceStream);
} }
// in case that we know more parsers we first transform the content into a byte[] and use that as base // in case that we know more parsers we first transform the content into a byte[] and use that as base
@ -255,7 +258,7 @@ public final class TextParser {
} catch (final IOException e) { } catch (final IOException e) {
throw new Parser.Failure(e.getMessage(), location); throw new Parser.Failure(e.getMessage(), location);
} }
Document[] docs = parseSource(location, mimeType, idioms, charset, scraper, depth, b); Document[] docs = parseSource(location, mimeType, idioms, charset, scraper, timezoneOffset, depth, b);
return docs; return docs;
} }
@ -266,6 +269,7 @@ public final class TextParser {
final Parser parser, final Parser parser,
final String charset, final String charset,
final VocabularyScraper scraper, final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream sourceStream final InputStream sourceStream
) throws Parser.Failure { ) throws Parser.Failure {
if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing '" + location + "' from stream"); if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing '" + location + "' from stream");
@ -275,7 +279,7 @@ public final class TextParser {
if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'."); if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'.");
try { try {
final Document[] docs = parser.parse(location, mimeType, documentCharset, scraper, sourceStream); final Document[] docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, sourceStream);
return docs; return docs;
} catch (final Exception e) { } catch (final Exception e) {
throw new Parser.Failure("parser failed: " + parser.getName(), location); throw new Parser.Failure("parser failed: " + parser.getName(), location);
@ -288,6 +292,7 @@ public final class TextParser {
final Set<Parser> parsers, final Set<Parser> parsers,
final String charset, final String charset,
final VocabularyScraper scraper, final VocabularyScraper scraper,
final int timezoneOffset,
final int depth, final int depth,
final byte[] sourceArray final byte[] sourceArray
) throws Parser.Failure { ) throws Parser.Failure {
@ -310,7 +315,7 @@ public final class TextParser {
bis = new ByteArrayInputStream(sourceArray); bis = new ByteArrayInputStream(sourceArray);
} }
try { try {
docs = parser.parse(location, mimeType, documentCharset, scraper, bis); docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, bis);
} catch (final Parser.Failure e) { } catch (final Parser.Failure e) {
failedParser.put(parser, e); failedParser.put(parser, e);
//log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e); //log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);
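All four parseSource overloads above thread the offset through unchanged until it reaches a concrete parser. A call-site sketch (compare the MediawikiImporter hunk below); the surrounding variables are assumed:

    final Document[] docs = TextParser.parseSource(
            url,                     // AnchorURL of the content
            "text/html",             // mime hint, may be null
            "UTF-8",                 // charset hint, may be null
            new VocabularyScraper(),
            timezoneOffset,          // minutes from UTC; 0 when unknown
            depth,                   // crawl depth
            contentBytes);           // raw document bytes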

@ -107,7 +107,7 @@ public class DCEntry extends MultiMapSolrParams {
if (d == null) return null; if (d == null) return null;
if (d.isEmpty()) return null; if (d.isEmpty()) return null;
try { try {
Date x = ISO8601Formatter.FORMATTER.parse(d); Date x = ISO8601Formatter.FORMATTER.parse(d, 0).getTime();
Date now = new Date(); Date now = new Date();
return x.after(now) ? now : x; return x.after(now) ? now : x;
} catch (final ParseException e) { } catch (final ParseException e) {
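The DCEntry change combines two steps: parse with the offset, then clamp dates that lie in the future down to now, so no document is indexed with a future date. The same logic in an isolated, runnable form; the catch-block fallback is an assumption, since the hunk above elides it:

    import java.text.ParseException;
    import java.text.SimpleDateFormat;
    import java.util.Date;
    import java.util.TimeZone;

    public class ClampSketch {
        public static Date parseClamped(String iso, int timezoneOffset) {
            SimpleDateFormat f = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
            f.setTimeZone(TimeZone.getTimeZone("UTC"));
            try {
                Date x = new Date(f.parse(iso).getTime() + timezoneOffset * 60000L);
                Date now = new Date();
                return x.after(now) ? now : x; // never index a future date
            } catch (ParseException e) {
                return null; // assumed fallback; the real catch body is elided above
            }
        }
    }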

@ -524,7 +524,7 @@ public class MediawikiImporter extends Thread implements Importer {
public void genDocument() throws Parser.Failure { public void genDocument() throws Parser.Failure {
try { try {
this.url = new AnchorURL(this.urlStub + this.title); this.url = new AnchorURL(this.urlStub + this.title);
final Document[] parsed = TextParser.parseSource(this.url, "text/html", "UTF-8", new VocabularyScraper(), 1, UTF8.getBytes(this.html)); final Document[] parsed = TextParser.parseSource(this.url, "text/html", "UTF-8", new VocabularyScraper(), 0, 1, UTF8.getBytes(this.html));
this.document = Document.mergeDocuments(this.url, "text/html", parsed); this.document = Document.mergeDocuments(this.url, "text/html", parsed);
// the wiki parser is not able to find the proper title in the source text, so it must be set here // the wiki parser is not able to find the proper title in the source text, so it must be set here
this.document.setTitle(this.title); this.document.setTitle(this.title);

@ -158,7 +158,7 @@ public class ResumptionToken extends TreeMap<String, String> {
final String d = get("expirationDate"); final String d = get("expirationDate");
if (d == null) return null; if (d == null) return null;
try { try {
return ISO8601Formatter.FORMATTER.parse(d); return ISO8601Formatter.FORMATTER.parse(d, 0).getTime();
} catch (final ParseException e) { } catch (final ParseException e) {
ConcurrentLog.logException(e); ConcurrentLog.logException(e);
return new Date(); return new Date();

@ -54,7 +54,13 @@ public class apkParser extends AbstractParser implements Parser {
} }
@Override @Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException { public Document[] parse(
final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException {
/* /*
* things to discover: * things to discover:

@ -70,8 +70,13 @@ public class audioTagParser extends AbstractParser implements Parser {
} }
@Override @Override
public Document[] parse(final AnchorURL location, final String mimeType, public Document[] parse(
final String charset, final VocabularyScraper scraper, final InputStream source) final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source)
throws Parser.Failure, InterruptedException { throws Parser.Failure, InterruptedException {
String filename = location.getFileName(); String filename = location.getFileName();

@ -38,13 +38,19 @@ public class AugmentParser extends AbstractParser implements Parser {
} }
@Override @Override
public Document[] parse(AnchorURL url, String mimeType, String charset, final VocabularyScraper scraper, InputStream source) throws Parser.Failure, InterruptedException { public Document[] parse(
final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException {
Document[] htmlDocs = this.rdfaParser.parse(url, mimeType, charset, scraper, source); Document[] htmlDocs = this.rdfaParser.parse(location, mimeType, charset, scraper, timezoneOffset, source);
for (final Document doc : htmlDocs) { for (final Document doc : htmlDocs) {
/* analyze(doc, url, mimeType, charset); // enrich document text */ /* analyze(doc, url, mimeType, charset); // enrich document text */
parseAndAugment(doc, url, mimeType, charset); // enrich document with additional tags parseAndAugment(doc, location, mimeType, charset); // enrich document with additional tags
} }
return htmlDocs; return htmlDocs;
} }

@ -57,8 +57,13 @@ public class bzipParser extends AbstractParser implements Parser {
} }
@Override @Override
public Document[] parse(final AnchorURL location, final String mimeType, public Document[] parse(
final String charset, final VocabularyScraper scraper, final InputStream source) final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source)
throws Parser.Failure, InterruptedException { throws Parser.Failure, InterruptedException {
File tempFile = null; File tempFile = null;
@ -95,7 +100,7 @@ public class bzipParser extends AbstractParser implements Parser {
out.close(); out.close();
// creating a new parser class to parse the unzipped content // creating a new parser class to parse the unzipped content
docs = TextParser.parseSource(location, null, null, scraper, 999, tempFile); docs = TextParser.parseSource(location, null, null, scraper, timezoneOffset, 999, tempFile);
} catch (final Exception e) { } catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e;
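The bzip parser (and the gzip parser below) shares one pattern: decompress to a temp file, then re-enter TextParser with the same offset, so dates inside archive members normalize exactly like top-level documents. The essential line, isolated from the hunk above:

    // inside the decompressing parser, after writing tempFile:
    // the received timezoneOffset is passed on verbatim; 999 is the depth
    // sentinel these parsers already use
    docs = TextParser.parseSource(location, null, null, scraper,
                                  timezoneOffset, 999, tempFile);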

@ -53,7 +53,13 @@ public class csvParser extends AbstractParser implements Parser {
} }
@Override @Override
public Document[] parse(AnchorURL location, String mimeType, String charset, final VocabularyScraper scraper, InputStream source) throws Parser.Failure, InterruptedException { public Document[] parse(
final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException {
// construct a document using all cells of the document // construct a document using all cells of the document
// the first row is used as headline // the first row is used as headline
// all lines are artificially terminated by a '.' to separate them as sentences for the condenser. // all lines are artificially terminated by a '.' to separate them as sentences for the condenser.

@ -59,8 +59,13 @@ public class docParser extends AbstractParser implements Parser {
@SuppressWarnings("deprecation") @SuppressWarnings("deprecation")
@Override @Override
public Document[] parse(final AnchorURL location, final String mimeType, public Document[] parse(
final String charset, final VocabularyScraper scraper, final InputStream source) final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source)
throws Parser.Failure, InterruptedException { throws Parser.Failure, InterruptedException {
final WordExtractor extractor; final WordExtractor extractor;

@ -61,7 +61,13 @@ public class dwgParser extends AbstractParser implements Parser {
} }
@Override @Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException { public Document[] parse(
final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException {
// check memory for parser // check memory for parser
if (!MemoryControl.request(200 * 1024 * 1024, true)) if (!MemoryControl.request(200 * 1024 * 1024, true))

@ -46,8 +46,13 @@ public class genericParser extends AbstractParser implements Parser {
} }
@Override @Override
public Document[] parse(final AnchorURL location, final String mimeType, public Document[] parse(
final String charset, final VocabularyScraper scraper, final InputStream source1) final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source)
throws Parser.Failure, InterruptedException { throws Parser.Failure, InterruptedException {
String filename = location.getFileName(); String filename = location.getFileName();
final Document[] docs = new Document[]{new Document( final Document[] docs = new Document[]{new Document(

@ -56,7 +56,13 @@ public class gzipParser extends AbstractParser implements Parser {
} }
@Override @Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException { public Document[] parse(
final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException {
File tempFile = null; File tempFile = null;
Document[] docs = null; Document[] docs = null;
@ -80,7 +86,7 @@ public class gzipParser extends AbstractParser implements Parser {
out.close(); out.close();
// creating a new parser class to parse the unzipped content // creating a new parser class to parse the unzipped content
docs = TextParser.parseSource(location, null, null, scraper, 999, tempFile); docs = TextParser.parseSource(location, null, null, scraper, timezoneOffset, 999, tempFile);
} catch (final Exception e) { } catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e;

@ -188,6 +188,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private AnchorURL canonical, publisher; private AnchorURL canonical, publisher;
private final int maxLinks; private final int maxLinks;
private final VocabularyScraper vocabularyScraper; private final VocabularyScraper vocabularyScraper;
private final int timezoneOffset;
private int breadcrumbs; private int breadcrumbs;
@ -213,7 +214,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
* @param classDetector a map from class names to vocabulary names to scrape content from the DOM with associated class name * @param classDetector a map from class names to vocabulary names to scrape content from the DOM with associated class name
*/ */
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
public ContentScraper(final DigestURL root, int maxLinks, final VocabularyScraper vocabularyScraper) { public ContentScraper(final DigestURL root, int maxLinks, final VocabularyScraper vocabularyScraper, int timezoneOffset) {
// the root value here will not be used to load the resource. // the root value here will not be used to load the resource.
// it is only the reference for relative links // it is only the reference for relative links
super(linkTags0, linkTags1); super(linkTags0, linkTags1);
@ -221,6 +222,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.root = root; this.root = root;
this.maxLinks = maxLinks; this.maxLinks = maxLinks;
this.vocabularyScraper = vocabularyScraper; this.vocabularyScraper = vocabularyScraper;
this.timezoneOffset = timezoneOffset;
this.evaluationScores = new Evaluation(); this.evaluationScores = new Evaluation();
this.rss = new SizeLimitedMap<DigestURL, String>(maxLinks); this.rss = new SizeLimitedMap<DigestURL, String>(maxLinks);
this.css = new SizeLimitedMap<DigestURL, String>(maxLinks); this.css = new SizeLimitedMap<DigestURL, String>(maxLinks);
@ -389,12 +391,12 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (content != null) { if (content != null) {
if ("startDate".equals(itemprop)) try { if ("startDate".equals(itemprop)) try {
// parse ISO 8601 date // parse ISO 8601 date
Date startDate = ISO8601Formatter.FORMATTER.parse(content); Date startDate = ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();
this.startDates.add(startDate); this.startDates.add(startDate);
} catch (ParseException e) {} } catch (ParseException e) {}
if ("endDate".equals(itemprop)) try { if ("endDate".equals(itemprop)) try {
// parse ISO 8601 date // parse ISO 8601 date
Date endDate = ISO8601Formatter.FORMATTER.parse(content); Date endDate = ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();
this.endDates.add(endDate); this.endDates.add(endDate);
} catch (ParseException e) {} } catch (ParseException e) {}
} }
@ -651,7 +653,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
// start a new scraper to parse links inside this text // start a new scraper to parse links inside this text
// parsing the content // parsing the content
final ContentScraper scraper = new ContentScraper(this.root, this.maxLinks, this.vocabularyScraper); final ContentScraper scraper = new ContentScraper(this.root, this.maxLinks, this.vocabularyScraper, this.timezoneOffset);
final TransformerWriter writer = new TransformerWriter(null, null, scraper, null, false); final TransformerWriter writer = new TransformerWriter(null, null, scraper, null, false);
try { try {
FileUtils.copy(new CharArrayReader(inlineHtml), writer); FileUtils.copy(new CharArrayReader(inlineHtml), writer);
@ -1003,19 +1005,19 @@ public class ContentScraper extends AbstractScraper implements Scraper {
// <meta name="date" content="YYYY-MM-DD..." /> // <meta name="date" content="YYYY-MM-DD..." />
content = this.metas.get("date"); content = this.metas.get("date");
if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content);} catch (ParseException e) {} if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();} catch (ParseException e) {}
// <meta name="DC.date" content="YYYY-MM-DD" /> // <meta name="DC.date" content="YYYY-MM-DD" />
content = this.metas.get("dc.date"); content = this.metas.get("dc.date");
if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content);} catch (ParseException e) {} if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();} catch (ParseException e) {}
// <meta name="DC:date" content="YYYY-MM-DD" /> // <meta name="DC:date" content="YYYY-MM-DD" />
content = this.metas.get("dc:date"); content = this.metas.get("dc:date");
if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content);} catch (ParseException e) {} if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();} catch (ParseException e) {}
// <meta http-equiv="last-modified" content="YYYY-MM-DD" /> // <meta http-equiv="last-modified" content="YYYY-MM-DD" />
content = this.metas.get("last-modified"); content = this.metas.get("last-modified");
if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content);} catch (ParseException e) {} if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();} catch (ParseException e) {}
return new Date(); return new Date();
} }
@ -1153,19 +1155,19 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} }
} }
public static ContentScraper parseResource(final File file, final int maxLinks) throws IOException { public static ContentScraper parseResource(final File file, final int maxLinks, final int timezoneOffset) throws IOException {
// load page // load page
final byte[] page = FileUtils.read(file); final byte[] page = FileUtils.read(file);
if (page == null) throw new IOException("no content in file " + file.toString()); if (page == null) throw new IOException("no content in file " + file.toString());
// scrape document to look up charset // scrape document to look up charset
final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page), "UTF-8", new VocabularyScraper(), new DigestURL("http://localhost"), null, false, maxLinks); final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page), "UTF-8", new VocabularyScraper(), new DigestURL("http://localhost"), null, false, maxLinks, timezoneOffset);
String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset()); String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset());
htmlFilter.close(); htmlFilter.close();
if (charset == null) charset = Charset.defaultCharset().toString(); if (charset == null) charset = Charset.defaultCharset().toString();
// scrape content // scrape content
final ContentScraper scraper = new ContentScraper(new DigestURL("http://localhost"), maxLinks, new VocabularyScraper()); final ContentScraper scraper = new ContentScraper(new DigestURL("http://localhost"), maxLinks, new VocabularyScraper(), timezoneOffset);
final Writer writer = new TransformerWriter(null, null, scraper, null, false); final Writer writer = new TransformerWriter(null, null, scraper, null, false);
FileUtils.copy(new ByteArrayInputStream(page), writer, Charset.forName(charset)); FileUtils.copy(new ByteArrayInputStream(page), writer, Charset.forName(charset));
writer.close(); writer.close();
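ContentScraper now fixes the offset per instance, so nested scrapers for inline HTML inherit it unchanged and every meta/itemprop date in a page normalizes consistently. Construction and feeding, condensed from the parseResource body above (the localhost root URL is the placeholder parseResource itself uses):

    final ContentScraper scraper = new ContentScraper(
            new DigestURL("http://localhost"), // reference for relative links only
            maxLinks,
            new VocabularyScraper(),
            timezoneOffset);                   // minutes from UTC, fixed per instance
    final Writer writer = new TransformerWriter(null, null, scraper, null, false);
    FileUtils.copy(new ByteArrayInputStream(page), writer, Charset.forName(charset));
    writer.close();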

@ -64,13 +64,14 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
final DigestURL rooturl, final DigestURL rooturl,
final Transformer transformer, final Transformer transformer,
final boolean passbyIfBinarySuspect, final boolean passbyIfBinarySuspect,
final int maxLinks final int maxLinks,
final int timezoneOffset
) { ) {
// create an input stream for buffering // create an input stream for buffering
this.bufferedIn = new BufferedInputStream(inStream, (int) preBufferSize); this.bufferedIn = new BufferedInputStream(inStream, (int) preBufferSize);
this.bufferedIn.mark((int) preBufferSize); this.bufferedIn.mark((int) preBufferSize);
final ContentScraper scraper = new ContentScraper(rooturl, maxLinks, vocabularyScraper); final ContentScraper scraper = new ContentScraper(rooturl, maxLinks, vocabularyScraper, timezoneOffset);
scraper.registerHtmlFilterEventListener(this); scraper.registerHtmlFilterEventListener(this);
try { try {

@ -87,13 +87,15 @@ public class htmlParser extends AbstractParser implements Parser {
public Document[] parse( public Document[] parse(
final AnchorURL location, final AnchorURL location,
final String mimeType, final String mimeType,
final String documentCharset, final VocabularyScraper vocscraper, final String documentCharset,
final VocabularyScraper vocscraper,
final int timezoneOffset,
final InputStream sourceStream) throws Parser.Failure, InterruptedException { final InputStream sourceStream) throws Parser.Failure, InterruptedException {
try { try {
// first get a document from the parsed html // first get a document from the parsed html
Charset[] detectedcharsetcontainer = new Charset[]{null}; Charset[] detectedcharsetcontainer = new Charset[]{null};
final ContentScraper scraper = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, sourceStream, maxLinks); final ContentScraper scraper = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks);
// parseToScraper also detects/corrects/sets charset from html content tag // parseToScraper also detects/corrects/sets charset from html content tag
final Document document = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraper); final Document document = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraper);
@ -151,7 +153,7 @@ public class htmlParser extends AbstractParser implements Parser {
return ppd; return ppd;
} }
public static ContentScraper parseToScraper(final DigestURL location, final String documentCharset, final VocabularyScraper vocabularyScraper, String input, int maxLinks) throws IOException { public static ContentScraper parseToScraper(final DigestURL location, final String documentCharset, final VocabularyScraper vocabularyScraper, final int timezoneOffset, final String input, final int maxLinks) throws IOException {
Charset[] detectedcharsetcontainer = new Charset[]{null}; Charset[] detectedcharsetcontainer = new Charset[]{null};
InputStream sourceStream; InputStream sourceStream;
try { try {
@ -161,7 +163,7 @@ public class htmlParser extends AbstractParser implements Parser {
} }
ContentScraper scraper; ContentScraper scraper;
try { try {
scraper = parseToScraper(location, documentCharset, vocabularyScraper, detectedcharsetcontainer, sourceStream, maxLinks); scraper = parseToScraper(location, documentCharset, vocabularyScraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks);
} catch (Failure e) { } catch (Failure e) {
throw new IOException(e.getMessage()); throw new IOException(e.getMessage());
} }
@ -173,6 +175,7 @@ public class htmlParser extends AbstractParser implements Parser {
final String documentCharset, final String documentCharset,
final VocabularyScraper vocabularyScraper, final VocabularyScraper vocabularyScraper,
Charset[] detectedcharsetcontainer, Charset[] detectedcharsetcontainer,
final int timezoneOffset,
InputStream sourceStream, InputStream sourceStream,
final int maxLinks) throws Parser.Failure, IOException { final int maxLinks) throws Parser.Failure, IOException {
@ -188,7 +191,7 @@ public class htmlParser extends AbstractParser implements Parser {
if (charset == null) { if (charset == null) {
ScraperInputStream htmlFilter = null; ScraperInputStream htmlFilter = null;
try { try {
htmlFilter = new ScraperInputStream(sourceStream, documentCharset, vocabularyScraper, location, null, false, maxLinks); htmlFilter = new ScraperInputStream(sourceStream, documentCharset, vocabularyScraper, location, null, false, maxLinks, timezoneOffset);
sourceStream = htmlFilter; sourceStream = htmlFilter;
charset = htmlFilter.detectCharset(); charset = htmlFilter.detectCharset();
} catch (final IOException e1) { } catch (final IOException e1) {
@ -222,7 +225,7 @@ public class htmlParser extends AbstractParser implements Parser {
} }
// parsing the content // parsing the content
final ContentScraper scraper = new ContentScraper(location, maxLinks, vocabularyScraper); final ContentScraper scraper = new ContentScraper(location, maxLinks, vocabularyScraper, timezoneOffset);
final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false, Math.max(64, Math.min(4096, sourceStream.available()))); final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false, Math.max(64, Math.min(4096, sourceStream.available())));
try { try {
FileUtils.copy(sourceStream, writer, detectedcharsetcontainer[0]); FileUtils.copy(sourceStream, writer, detectedcharsetcontainer[0]);
@ -324,7 +327,7 @@ public class htmlParser extends AbstractParser implements Parser {
try { try {
url = new AnchorURL(args[0]); url = new AnchorURL(args[0]);
final byte[] content = url.get(ClientIdentification.yacyInternetCrawlerAgent, null, null); final byte[] content = url.get(ClientIdentification.yacyInternetCrawlerAgent, null, null);
final Document[] document = new htmlParser().parse(url, "text/html", "utf-8", new VocabularyScraper(), new ByteArrayInputStream(content)); final Document[] document = new htmlParser().parse(url, "text/html", "utf-8", new VocabularyScraper(), 0, new ByteArrayInputStream(content));
final String title = document[0].dc_title(); final String title = document[0].dc_title();
System.out.println(title); System.out.println(title);
} catch (final MalformedURLException e) { } catch (final MalformedURLException e) {
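For tests and tools, the String-input parseToScraper overload above now also needs the offset. A usage sketch; the URL and HTML snippet are placeholders, and the call throws IOException:

    final ContentScraper scraper = htmlParser.parseToScraper(
            new DigestURL("http://localhost/test.html"),
            "UTF-8",
            new VocabularyScraper(),
            0,      // timezoneOffset: treat dates in the snippet as UTC
            "<html><head><meta name=\"date\" content=\"2015-06-01\"/></head><body></body></html>",
            100);   // maxLinks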

@ -93,8 +93,10 @@ public class genericImageParser extends AbstractParser implements Parser {
public Document[] parse( public Document[] parse(
final AnchorURL location, final AnchorURL location,
final String mimeType, final String mimeType,
final String documentCharset, final VocabularyScraper scraper, final String charset,
final InputStream sourceStream) throws Parser.Failure, InterruptedException { final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException {
ImageInfo ii = null; ImageInfo ii = null;
String title = null; String title = null;
@ -108,7 +110,7 @@ public class genericImageParser extends AbstractParser implements Parser {
if (mimeType.equals("image/bmp") || ext.equals("bmp")) { if (mimeType.equals("image/bmp") || ext.equals("bmp")) {
byte[] b; byte[] b;
try { try {
b = FileUtils.read(sourceStream); b = FileUtils.read(source);
} catch (final IOException e) { } catch (final IOException e) {
ConcurrentLog.logException(e); ConcurrentLog.logException(e);
throw new Parser.Failure(e.getMessage(), location); throw new Parser.Failure(e.getMessage(), location);
@ -126,7 +128,7 @@ public class genericImageParser extends AbstractParser implements Parser {
// a tutorial is at: http://www.drewnoakes.com/drewnoakes.com/code/exif/sampleUsage.html // a tutorial is at: http://www.drewnoakes.com/drewnoakes.com/code/exif/sampleUsage.html
byte[] b; byte[] b;
try { try {
b = FileUtils.read(sourceStream); b = FileUtils.read(source);
} catch (final IOException e) { } catch (final IOException e) {
ConcurrentLog.logException(e); ConcurrentLog.logException(e);
throw new Parser.Failure(e.getMessage(), location); throw new Parser.Failure(e.getMessage(), location);
@ -182,7 +184,7 @@ public class genericImageParser extends AbstractParser implements Parser {
// just ignore // just ignore
} }
} else { } else {
ii = parseJavaImage(location, sourceStream); ii = parseJavaImage(location, source);
} }
final HashSet<String> languages = new HashSet<String>(); final HashSet<String> languages = new HashSet<String>();
@ -315,7 +317,7 @@ public class genericImageParser extends AbstractParser implements Parser {
AnchorURL uri; AnchorURL uri;
try { try {
uri = new AnchorURL("http://localhost/" + image.getName()); uri = new AnchorURL("http://localhost/" + image.getName());
final Document[] document = parser.parse(uri, "image/" + MultiProtocolURL.getFileExtension(uri.getFileName()), "UTF-8", new VocabularyScraper(), new FileInputStream(image)); final Document[] document = parser.parse(uri, "image/" + MultiProtocolURL.getFileExtension(uri.getFileName()), "UTF-8", new VocabularyScraper(), 0, new FileInputStream(image));
System.out.println(document[0].toString()); System.out.println(document[0].toString());
} catch (final MalformedURLException e) { } catch (final MalformedURLException e) {
e.printStackTrace(); e.printStackTrace();

@ -87,8 +87,10 @@ public class metadataImageParser extends AbstractParser implements Parser {
public Document[] parse( public Document[] parse(
final AnchorURL location, final AnchorURL location,
final String mimeType, final String mimeType,
final String documentCharset, final VocabularyScraper scraper, final String charset,
final InputStream sourceStream) throws Parser.Failure, InterruptedException { final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException {
String title = null; String title = null;
String author = null; String author = null;
@ -99,7 +101,7 @@ public class metadataImageParser extends AbstractParser implements Parser {
StringBuilder imgInfotxt = new StringBuilder(); StringBuilder imgInfotxt = new StringBuilder();
try { try {
final Metadata metadata = ImageMetadataReader.readMetadata(new BufferedInputStream(sourceStream)); final Metadata metadata = ImageMetadataReader.readMetadata(new BufferedInputStream(source));
final Iterator<Directory> directories = metadata.getDirectories().iterator(); final Iterator<Directory> directories = metadata.getDirectories().iterator();
final HashMap<String, String> props = new HashMap<String, String>(); final HashMap<String, String> props = new HashMap<String, String>();
@ -160,7 +162,7 @@ public class metadataImageParser extends AbstractParser implements Parser {
return new Document[]{new Document( return new Document[]{new Document(
location, location,
mimeType, mimeType,
documentCharset, charset,
this, this,
new HashSet<String>(0), // languages new HashSet<String>(0), // languages
keywords == null ? new String[]{} : keywords.split(keywords.indexOf(',') > 0 ? "," : " "), // keywords keywords == null ? new String[]{} : keywords.split(keywords.indexOf(',') > 0 ? "," : " "), // keywords

@ -59,11 +59,16 @@ public class linkScraperParser extends AbstractParser implements Parser {
this.SUPPORTED_MIME_TYPES.add("text/sgml"); this.SUPPORTED_MIME_TYPES.add("text/sgml");
} }
@Override @Override
public Document[] parse(final AnchorURL location, final String mimeType, public Document[] parse(
final String charset, final VocabularyScraper scraper, final InputStream source) final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source)
throws Parser.Failure, InterruptedException { throws Parser.Failure, InterruptedException {
Document[] htmlParserDocs = new htmlParser().parse(location, mimeType, charset, scraper, source); Document[] htmlParserDocs = new htmlParser().parse(location, mimeType, charset, scraper, timezoneOffset, source);
Document htmlParserDoc = htmlParserDocs == null ? null : Document.mergeDocuments(location, mimeType, htmlParserDocs); Document htmlParserDoc = htmlParserDocs == null ? null : Document.mergeDocuments(location, mimeType, htmlParserDocs);

@ -71,8 +71,13 @@ public class mmParser extends AbstractParser implements Parser {
} }
@Override @Override
public Document[] parse(final AnchorURL location, final String mimeType, public Document[] parse(
final String charset, final VocabularyScraper scraper, final InputStream source) final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source)
throws Parser.Failure, InterruptedException throws Parser.Failure, InterruptedException
{ {
final StringBuilder sb = new StringBuilder(); final StringBuilder sb = new StringBuilder();

@ -216,7 +216,13 @@ public class odtParser extends AbstractParser implements Parser {
} }
@Override @Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException { public Document[] parse(
final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException {
File dest = null; File dest = null;
try { try {
// creating a tempfile // creating a tempfile

@ -202,7 +202,13 @@ public class ooxmlParser extends AbstractParser implements Parser {
} }
@Override @Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException { public Document[] parse(
final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException {
File dest = null; File dest = null;
try { try {
// creating a tempfile // creating a tempfile

@ -86,7 +86,13 @@ public class pdfParser extends AbstractParser implements Parser {
} }
@Override @Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException { public Document[] parse(
final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException {
// check memory for parser // check memory for parser
if (!MemoryControl.request(200 * 1024 * 1024, false)) if (!MemoryControl.request(200 * 1024 * 1024, false))
@ -376,7 +382,7 @@ public class pdfParser extends AbstractParser implements Parser {
final AbstractParser parser = new pdfParser(); final AbstractParser parser = new pdfParser();
Document document = null; Document document = null;
try { try {
document = Document.mergeDocuments(null, "application/pdf", parser.parse(null, "application/pdf", null, new VocabularyScraper(), new FileInputStream(pdfFile))); document = Document.mergeDocuments(null, "application/pdf", parser.parse(null, "application/pdf", null, new VocabularyScraper(), 0, new FileInputStream(pdfFile)));
} catch (final Parser.Failure e) { } catch (final Parser.Failure e) {
System.err.println("Cannot parse file " + pdfFile.getAbsolutePath()); System.err.println("Cannot parse file " + pdfFile.getAbsolutePath());
ConcurrentLog.logException(e); ConcurrentLog.logException(e);

@ -64,8 +64,13 @@ public class pptParser extends AbstractParser implements Parser {
* all extracted information about the parsed document * all extracted information about the parsed document
*/ */
@Override @Override
public Document[] parse(final AnchorURL location, final String mimeType, public Document[] parse(
final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Parser.Failure,
InterruptedException { InterruptedException {
try { try {
/* /*

@ -258,8 +258,13 @@ public class psParser extends AbstractParser implements Parser {
} }
@Override @Override
public Document[] parse(final AnchorURL location, final String mimeType, public Document[] parse(
final String charset, final VocabularyScraper scraper, final InputStream source) final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source)
throws Parser.Failure, InterruptedException { throws Parser.Failure, InterruptedException {
File tempFile = null; File tempFile = null;

@ -46,8 +46,13 @@ public class rdfParser extends AbstractParser implements Parser {
} }
@Override @Override
public Document[] parse(final AnchorURL url, final String mimeType, public Document[] parse(
final String charset, final VocabularyScraper scraper, final InputStream source) final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source)
throws Failure, InterruptedException { throws Failure, InterruptedException {
@ -60,7 +65,7 @@ public class rdfParser extends AbstractParser implements Parser {
Document doc; Document doc;
String all = "rdfdatasource"; String all = "rdfdatasource";
doc = new Document(url, mimeType, charset, null, null, null, singleList(""), "", doc = new Document(location, mimeType, charset, null, null, null, singleList(""), "",
"", null, new ArrayList<String>(0), 0, 0, all, null, null, null, false, new Date()); "", null, new ArrayList<String>(0), 0, 0, all, null, null, null, false, new Date());
docs.add(doc); docs.add(doc);

@ -48,11 +48,16 @@ public class RDFaParser extends AbstractParser implements Parser {
} }
@Override @Override
public Document[] parse(AnchorURL url, String mimeType, public Document[] parse(
String charset, final VocabularyScraper scraper, InputStream source) throws Failure, final AnchorURL url,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Failure,
InterruptedException { InterruptedException {
Document[] htmlDocs = parseHtml(url, mimeType, charset, scraper, source); Document[] htmlDocs = parseHtml(url, mimeType, charset, scraper, timezoneOffset, source);
// TODO: current hardcoded restriction: apply rdfa parser only on selected sources. // TODO: current hardcoded restriction: apply rdfa parser only on selected sources.
@ -97,13 +102,18 @@ public class RDFaParser extends AbstractParser implements Parser {
return doc; return doc;
} }
private Document[] parseHtml(AnchorURL url, String mimeType, private Document[] parseHtml(
String charset, VocabularyScraper scraper, InputStream source) throws Failure, final AnchorURL url,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Failure,
InterruptedException { InterruptedException {
Document[] htmlDocs = null; Document[] htmlDocs = null;
try { try {
htmlDocs = this.hp.parse(url, mimeType, charset, scraper, source); htmlDocs = this.hp.parse(url, mimeType, charset, scraper, timezoneOffset, source);
source.reset(); source.reset();
} catch (final IOException e1) { } catch (final IOException e1) {
@ -180,7 +190,7 @@ public class RDFaParser extends AbstractParser implements Parser {
if (aReader != null) { if (aReader != null) {
RDFaParser aParser = new RDFaParser(); RDFaParser aParser = new RDFaParser();
try { try {
aParser.parse(new AnchorURL(args[0]), "", "", new VocabularyScraper(), aURL.openStream()); aParser.parse(new AnchorURL(args[0]), "", "", new VocabularyScraper(), 0, aURL.openStream());
} catch (final FileNotFoundException e) { } catch (final FileNotFoundException e) {
e.printStackTrace(); e.printStackTrace();
} catch (final IOException e) { } catch (final IOException e) {

@ -59,14 +59,19 @@ public class rssParser extends AbstractParser implements Parser {
} }
@Override @Override
public Document[] parse(final AnchorURL feedurl, final String mimeType, public Document[] parse(
final String charset, final VocabularyScraper scraper, final InputStream source) final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source)
throws Failure, InterruptedException { throws Failure, InterruptedException {
RSSReader rssReader; RSSReader rssReader;
try { try {
rssReader = new RSSReader(RSSFeed.DEFAULT_MAXSIZE, source); rssReader = new RSSReader(RSSFeed.DEFAULT_MAXSIZE, source);
} catch (final IOException e) { } catch (final IOException e) {
throw new Parser.Failure("Load error:" + e.getMessage(), feedurl, e); throw new Parser.Failure("Load error:" + e.getMessage(), location, e);
} }
final RSSFeed feed = rssReader.getFeed(); final RSSFeed feed = rssReader.getFeed();

@ -53,8 +53,13 @@ public class rtfParser extends AbstractParser implements Parser {
} }
@Override @Override
public Document[] parse(final AnchorURL location, final String mimeType, public Document[] parse(
final String charset, final VocabularyScraper scraper, final InputStream source) final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source)
throws Parser.Failure, InterruptedException { throws Parser.Failure, InterruptedException {
try { try {

@@ -56,7 +56,12 @@ public class sevenzipParser extends AbstractParser implements Parser {
         this.SUPPORTED_MIME_TYPES.add("application/x-7z-compressed");
     }
 
-    public Document parse(final AnchorURL location, final String mimeType, final String charset, final IInStream source) throws Parser.Failure, InterruptedException {
+    public Document parse(
+            final AnchorURL location,
+            final String mimeType,
+            final String charset,
+            final int timezoneOffset,
+            final IInStream source) throws Parser.Failure, InterruptedException {
         final Document doc = new Document(
                 location,
                 mimeType,
@@ -83,7 +88,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
         } catch (final IOException e) {
             throw new Parser.Failure("error opening 7zip archive: " + e.getMessage(), location);
         }
-        final SZParserExtractCallback aec = new SZParserExtractCallback(AbstractParser.log, archive, doc, location.getFile());
+        final SZParserExtractCallback aec = new SZParserExtractCallback(AbstractParser.log, archive, doc, location.getFile(), timezoneOffset);
         AbstractParser.log.fine("processing archive contents...");
         try {
             archive.Extract(null, -1, 0, aec);
@@ -101,16 +106,27 @@ public class sevenzipParser extends AbstractParser implements Parser {
         }
     }
 
-    public Document parse(final AnchorURL location, final String mimeType, final String charset, final byte[] source) throws Parser.Failure, InterruptedException {
-        return parse(location, mimeType, charset, new ByteArrayIInStream(source));
+    public Document parse(
+            final AnchorURL location,
+            final String mimeType,
+            final String charset,
+            final int timezoneOffset,
+            final byte[] source) throws Parser.Failure, InterruptedException {
+        return parse(location, mimeType, charset, timezoneOffset, new ByteArrayIInStream(source));
     }
 
     @Override
-    public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException {
+    public Document[] parse(
+            final AnchorURL location,
+            final String mimeType,
+            final String charset,
+            final VocabularyScraper scraper,
+            final int timezoneOffset,
+            final InputStream source) throws Parser.Failure, InterruptedException {
         try {
             final ByteArrayOutputStream cfos = new ByteArrayOutputStream();
             FileUtils.copy(source, cfos);
-            return new Document[]{parse(location, mimeType, charset, cfos.toByteArray())};
+            return new Document[]{parse(location, mimeType, charset, timezoneOffset, cfos.toByteArray())};
         } catch (final IOException e) {
             throw new Parser.Failure("error processing 7zip archive: " + e.getMessage(), location);
         }
@@ -124,13 +140,19 @@ public class sevenzipParser extends AbstractParser implements Parser {
         private ByteArrayOutputStream cfos = null;
         private final Document doc;
         private final String prefix;
+        private final int timezoneOffset;
 
-        public SZParserExtractCallback(final ConcurrentLog logger, final IInArchive handler,
-                final Document doc, final String prefix) {
+        public SZParserExtractCallback(
+                final ConcurrentLog logger,
+                final IInArchive handler,
+                final Document doc,
+                final String prefix,
+                final int timezoneOffset) {
             super.Init(handler);
             this.log = logger;
             this.doc = doc;
             this.prefix = prefix;
+            this.timezoneOffset = timezoneOffset;
         }
 
         @Override
@@ -172,7 +194,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
             // below for reversion of the effects
             final AnchorURL url = AnchorURL.newAnchor(this.doc.dc_source(), this.prefix + "/" + super.filePath);
             final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1));
-            theDocs = TextParser.parseSource(url, mime, null, new VocabularyScraper(), this.doc.getDepth() + 1, this.cfos.toByteArray());
+            theDocs = TextParser.parseSource(url, mime, null, new VocabularyScraper(), timezoneOffset, this.doc.getDepth() + 1, this.cfos.toByteArray());
             this.doc.addSubDocuments(theDocs);
         }

@@ -58,8 +58,13 @@ public class sidAudioParser extends AbstractParser implements Parser {
     }
 
     @Override
-    public Document[] parse(final AnchorURL location, final String mimeType,
-            final String charset, final VocabularyScraper scraper, final InputStream source)
+    public Document[] parse(
+            final AnchorURL location,
+            final String mimeType,
+            final String charset,
+            final VocabularyScraper scraper,
+            final int timezoneOffset,
+            final InputStream source)
             throws Parser.Failure, InterruptedException {
         try {
             final int available = source.available();

@@ -70,8 +70,13 @@ public class sitemapParser extends AbstractParser implements Parser {
     }
 
     @Override
-    public Document[] parse(final AnchorURL url, final String mimeType,
-            final String charset, final VocabularyScraper scraper, final InputStream source)
+    public Document[] parse(
+            final AnchorURL location,
+            final String mimeType,
+            final String charset,
+            final VocabularyScraper scraper,
+            final int timezoneOffset,
+            final InputStream source)
             throws Failure, InterruptedException {
         final List<Document> docs = new ArrayList<Document>();
         SitemapReader sitemap = new SitemapReader(source, ClientIdentification.yacyInternetCrawlerAgent);
@@ -83,7 +88,7 @@ public class sitemapParser extends AbstractParser implements Parser {
                 uri = new DigestURL(item.loc);
                 doc = new Document(
                         uri,
-                        TextParser.mimeOf(url),
+                        TextParser.mimeOf(location),
                         charset,
                         this,
                         null,
@@ -224,7 +229,7 @@ public class sitemapParser extends AbstractParser implements Parser {
         public Date lastmod(final Date dflt) {
             try {
-                return ISO8601Formatter.FORMATTER.parse(this.lastmod);
+                return ISO8601Formatter.FORMATTER.parse(this.lastmod, 0).getTime();
             } catch (final ParseException e) {
                 return dflt;
             }
@@ -245,7 +250,7 @@ public class sitemapParser extends AbstractParser implements Parser {
         public Date lastmod(final Date dflt) {
             try {
-                return ISO8601Formatter.FORMATTER.parse(this.lastmod);
+                return ISO8601Formatter.FORMATTER.parse(this.lastmod, 0).getTime();
             } catch (final ParseException e) {
                 return dflt;
             }

@@ -56,8 +56,13 @@ public class swfParser extends AbstractParser implements Parser {
      * all extracted information about the parsed document
      */
     @Override
-    public Document[] parse(final AnchorURL location, final String mimeType,
-            final String charset, final VocabularyScraper scraper, final InputStream source)
+    public Document[] parse(
+            final AnchorURL location,
+            final String mimeType,
+            final String charset,
+            final VocabularyScraper scraper,
+            final int timezoneOffset,
+            final InputStream source)
             throws Parser.Failure, InterruptedException
     {

@@ -62,16 +62,22 @@ public class tarParser extends AbstractParser implements Parser {
     }
 
     @Override
-    public Document[] parse(final AnchorURL url, final String mimeType, final String charset, final VocabularyScraper scraper, InputStream source) throws Parser.Failure, InterruptedException {
+    public Document[] parse(
+            final AnchorURL location,
+            final String mimeType,
+            final String charset,
+            final VocabularyScraper scraper,
+            final int timezoneOffset,
+            InputStream source) throws Parser.Failure, InterruptedException {
         final List<Document> docacc = new ArrayList<Document>();
         Document[] subDocs = null;
-        final String ext = MultiProtocolURL.getFileExtension(url.getFileName());
+        final String ext = MultiProtocolURL.getFileExtension(location.getFileName());
         if (ext.equals("gz") || ext.equals("tgz")) {
             try {
                 source = new GZIPInputStream(source);
             } catch (final IOException e) {
-                throw new Parser.Failure("tar parser: " + e.getMessage(), url);
+                throw new Parser.Failure("tar parser: " + e.getMessage(), location);
             }
         }
         TarEntry entry;
@@ -91,7 +97,7 @@ public class tarParser extends AbstractParser implements Parser {
                 try {
                     tmp = FileUtils.createTempFile(this.getClass(), name);
                     FileUtils.copy(tis, tmp, entry.getSize());
-                    subDocs = TextParser.parseSource(AnchorURL.newAnchor(url, "#" + name), mime, null, scraper, 999, tmp);
+                    subDocs = TextParser.parseSource(AnchorURL.newAnchor(location, "#" + name), mime, null, scraper, timezoneOffset, 999, tmp);
                     if (subDocs == null) continue;
                     for (final Document d: subDocs) docacc.add(d);
                 } catch (final Parser.Failure e) {

@@ -57,7 +57,13 @@ public class torrentParser extends AbstractParser implements Parser {
     }
 
     @Override
-    public Document[] parse(AnchorURL location, String mimeType, String charset, final VocabularyScraper scraper, InputStream source)
+    public Document[] parse(
+            final AnchorURL location,
+            final String mimeType,
+            final String charset,
+            final VocabularyScraper scraper,
+            final int timezoneOffset,
+            final InputStream source)
             throws Parser.Failure, InterruptedException {
         byte[] b = null;
         try {
@@ -120,8 +126,8 @@ public class torrentParser extends AbstractParser implements Parser {
         try {
             byte[] b = FileUtils.read(new File(args[0]));
             torrentParser parser = new torrentParser();
-            Document[] d = parser.parse(new AnchorURL("http://localhost/test.torrent"), null, "UTF-8", new VocabularyScraper(), new ByteArrayInputStream(b));
-            Condenser c = new Condenser(d[0], null, true, true, LibraryProvider.dymLib, false, false);
+            Document[] d = parser.parse(new AnchorURL("http://localhost/test.torrent"), null, "UTF-8", new VocabularyScraper(), 0, new ByteArrayInputStream(b));
+            Condenser c = new Condenser(d[0], null, true, true, LibraryProvider.dymLib, false, false, 0);
             Map<String, Word> w = c.words();
             for (Map.Entry<String, Word> e: w.entrySet()) System.out.println("Word: " + e.getKey() + " - " + e.getValue().posInText);
         } catch (final IOException e) {

@@ -66,7 +66,13 @@ public class vcfParser extends AbstractParser implements Parser {
     }
 
     @Override
-    public Document[] parse(final AnchorURL url, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source)
+    public Document[] parse(
+            final AnchorURL location,
+            final String mimeType,
+            final String charset,
+            final VocabularyScraper scraper,
+            final int timezoneOffset,
+            final InputStream source)
             throws Parser.Failure, InterruptedException {
         try {
@@ -201,7 +207,7 @@ public class vcfParser extends AbstractParser implements Parser {
                     } else {
                         if (AbstractParser.log.isFinest()) AbstractParser.log.finest("Invalid data in vcf file" +
-                                "\n\tURL: " + url +
+                                "\n\tURL: " + location +
                                 "\n\tLine: " + line +
                                 "\n\tLine-Nr: " + lineNr);
                     }
@@ -212,7 +218,7 @@ public class vcfParser extends AbstractParser implements Parser {
             final byte[] text = UTF8.getBytes(parsedDataText.toString());
             final List<String> descriptions = new ArrayList<String>(1); descriptions.add("vCard");
             return new Document[]{new Document(
-                    url, // url of the source document
+                    location, // url of the source document
                     mimeType, // the documents mime type
                     null, // charset
                     this,
@@ -234,7 +240,7 @@ public class vcfParser extends AbstractParser implements Parser {
             if (e instanceof InterruptedException) throw (InterruptedException) e;
             if (e instanceof Parser.Failure) throw (Parser.Failure) e;
 
-            throw new Parser.Failure("Unexpected error while parsing vcf resource. " + e.getMessage(),url);
+            throw new Parser.Failure("Unexpected error while parsing vcf resource. " + e.getMessage(), location);
         }
     }

@@ -67,7 +67,13 @@ public class vsdParser extends AbstractParser implements Parser {
      * all extracted information about the parsed document
      */
     @Override
-    public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source)
+    public Document[] parse(
+            final AnchorURL location,
+            final String mimeType,
+            final String charset,
+            final VocabularyScraper scraper,
+            final int timezoneOffset,
+            final InputStream source)
             throws Parser.Failure, InterruptedException {
         Document theDoc = null;

@@ -68,8 +68,13 @@ public class xlsParser extends AbstractParser implements Parser {
      * all extracted information about the parsed document
      */
     @Override
-    public Document[] parse(final AnchorURL location, final String mimeType,
-            final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure,
+    public Document[] parse(
+            final AnchorURL location,
+            final String mimeType,
+            final String charset,
+            final VocabularyScraper scraper,
+            final int timezoneOffset,
+            final InputStream source) throws Parser.Failure,
             InterruptedException {
         return new XLSHSSFListener().parse(location, mimeType, charset, source);
     }

@@ -62,12 +62,17 @@ public class zipParser extends AbstractParser implements Parser {
     }
 
     @Override
-    public Document[] parse(final AnchorURL url, final String mimeType,
-            final String charset, final VocabularyScraper scraper, final InputStream source)
+    public Document[] parse(
+            final AnchorURL location,
+            final String mimeType,
+            final String charset,
+            final VocabularyScraper scraper,
+            final int timezoneOffset,
+            final InputStream source)
             throws Parser.Failure, InterruptedException {
         // check memory for parser
         if (!MemoryControl.request(200 * 1024 * 1024, false))
-            throw new Parser.Failure("Not enough Memory available for zip parser: " + MemoryControl.available(), url);
+            throw new Parser.Failure("Not enough Memory available for zip parser: " + MemoryControl.available(), location);
         Document[] docs = null;
         final List<Document> docacc = new ArrayList<Document>();
@@ -88,9 +93,9 @@ public class zipParser extends AbstractParser implements Parser {
                 try {
                     tmp = FileUtils.createTempFile(this.getClass(), name);
                     FileUtils.copy(zis, tmp, entry.getSize());
-                    final DigestURL virtualURL = DigestURL.newURL(url, "#" + name);
+                    final DigestURL virtualURL = DigestURL.newURL(location, "#" + name);
                     //this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false));
-                    docs = TextParser.parseSource(new AnchorURL(virtualURL), mime, null, scraper, 999, tmp);
+                    docs = TextParser.parseSource(new AnchorURL(virtualURL), mime, null, scraper, timezoneOffset, 999, tmp);
                     if (docs == null) continue;
                     for (final Document d: docs) docacc.add(d);
                 } catch (final Parser.Failure e) {

@@ -74,7 +74,8 @@ public class ProxyCacheHandler extends AbstractRemoteHandler implements Handler
                 "",
                 cachedResponseHeader.lastModified(),
                 sb.crawler.defaultProxyProfile.handle(),
-                0);
+                0,
+                sb.crawler.defaultProxyProfile.timezoneOffset());
 
             final Response cachedResponse = new Response(
                 yacyRequest,

@@ -180,7 +180,8 @@ public class ProxyHandler extends AbstractRemoteHandler implements Handler {
                 "",
                 responseHeaderLegacy.lastModified(),
                 sb.crawler.defaultProxyProfile.handle(),
-                0); //sizeBeforeDelete < 0 ? 0 : sizeBeforeDelete);
+                0,
+                sb.crawler.defaultProxyProfile.timezoneOffset()); //sizeBeforeDelete < 0 ? 0 : sizeBeforeDelete);
         final Response yacyResponse = new Response(
                 yacyRequest,
                 null,

@@ -137,7 +137,7 @@ public class SolrSelectServlet extends HttpServlet {
         if (!mmsp.getMap().containsKey(CommonParams.Q) && mmsp.getMap().containsKey(CommonParams.QUERY)) {
             querystring = mmsp.get(CommonParams.QUERY, "");
             mmsp.getMap().remove(CommonParams.QUERY);
-            QueryModifier modifier = new QueryModifier();
+            QueryModifier modifier = new QueryModifier(0);
             querystring = modifier.parse(querystring);
             modifier.apply(mmsp);
             QueryGoal qg = new QueryGoal(querystring);

@@ -172,7 +172,7 @@ public class ArrayStack implements BLOB {
                     f.delete();
                     deletions = true;
                 } else try {
-                    d = GenericFormatter.SHORT_SECOND_FORMATTER.parse(file.substring(0, 14));
+                    d = GenericFormatter.SHORT_SECOND_FORMATTER.parse(file.substring(0, 14), 0).getTime();
                     f.renameTo(newBLOB(d));
                     deletions = true;
                 } catch (final ParseException e) {continue;}
@@ -188,7 +188,7 @@ public class ArrayStack implements BLOB {
         for (final String file : files) {
             if (file.length() >= 22 && file.charAt(this.prefix.length()) == '.' && file.endsWith(".blob")) {
                 try {
-                    d = my_SHORT_MILSEC_FORMATTER.parse(file.substring(this.prefix.length() + 1, this.prefix.length() + 18));
+                    d = my_SHORT_MILSEC_FORMATTER.parse(file.substring(this.prefix.length() + 1, this.prefix.length() + 18), 0).getTime();
                     time = d.getTime();
                     if (time > maxtime) maxtime = time;
                 } catch (final ParseException e) {continue;}
@@ -199,7 +199,7 @@ public class ArrayStack implements BLOB {
         for (final String file : files) {
             if (file.length() >= 22 && file.charAt(this.prefix.length()) == '.' && file.endsWith(".blob")) {
                 try {
-                    d = my_SHORT_MILSEC_FORMATTER.parse(file.substring(this.prefix.length() + 1, this.prefix.length() + 18));
+                    d = my_SHORT_MILSEC_FORMATTER.parse(file.substring(this.prefix.length() + 1, this.prefix.length() + 18), 0).getTime();
                     f = new File(heapLocation, file);
                     time = d.getTime();
                     try {
@@ -253,7 +253,7 @@ public class ArrayStack implements BLOB {
     public synchronized void mountBLOB(final File location, final boolean full) throws IOException {
         Date d;
         try {
-            d = my_SHORT_MILSEC_FORMATTER.parse(location.getName().substring(this.prefix.length() + 1, this.prefix.length() + 18));
+            d = my_SHORT_MILSEC_FORMATTER.parse(location.getName().substring(this.prefix.length() + 1, this.prefix.length() + 18), 0).getTime();
         } catch (final ParseException e) {
             throw new IOException("date parse problem with file " + location.toString() + ": " + e.getMessage());
         }
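
The storage-layer hunks from here on are another single mechanical rewrite: GenericFormatter.parse (like ISO8601Formatter.parse above) now takes the timezone offset in minutes as a second argument and returns a Calendar rather than a Date, so call sites append .getTime(), or .getTime().getTime() where epoch milliseconds are needed. A minimal sketch of the new contract; the timestamp literal is illustrative and the import assumes the YaCy package layout:

    import java.text.ParseException;
    import java.util.Calendar;
    import java.util.Date;
    import net.yacy.cora.date.GenericFormatter;

    try {
        // offset 0: the 14-digit timestamp string is read as UTC
        Calendar cal = GenericFormatter.SHORT_SECOND_FORMATTER.parse("20150101120000", 0);
        Date utc = cal.getTime();    // normalized UTC Date, as stored everywhere in YaCy
        long millis = utc.getTime(); // epoch milliseconds, hence the .getTime().getTime() chains
    } catch (final ParseException e) {
        // malformed timestamp: the callers here either skip the entry or fall back to a default
    }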

@@ -95,7 +95,7 @@ public class BEncodedHeapBag extends AbstractMapStore implements MapStore {
                 (element.length() == this.prefix.length() + 23)) {
                 f = new File(this.baseDir, element);
                 try {
-                    d = GenericFormatter.SHORT_MILSEC_FORMATTER.parse(element.substring(this.prefix.length() + 1, this.prefix.length() + 18));
+                    d = GenericFormatter.SHORT_MILSEC_FORMATTER.parse(element.substring(this.prefix.length() + 1, this.prefix.length() + 18), 0).getTime();
                 } catch (final ParseException e) {
                     ConcurrentLog.severe("BEncodedHeapBag", "", e);
                     continue;
@@ -203,7 +203,7 @@ public class BEncodedHeapBag extends AbstractMapStore implements MapStore {
         final String name = heap.getFile().getName();
         long d;
         try {
-            d = GenericFormatter.SHORT_MILSEC_FORMATTER.parse(name.substring(this.prefix.length() + 1, this.prefix.length() + 18)).getTime();
+            d = GenericFormatter.SHORT_MILSEC_FORMATTER.parse(name.substring(this.prefix.length() + 1, this.prefix.length() + 18), 0).getTime().getTime();
         } catch (final ParseException e) {
             ConcurrentLog.severe("BEncodedHeapBag", "", e);
             d = 0;

@@ -764,7 +764,7 @@ public class Tables implements Iterable<String> {
             final byte[] r = this.get(colname);
             if (r == null) return dflt;
             try {
-                return my_SHORT_MILSEC_FORMATTER.parse(UTF8.String(r));
+                return my_SHORT_MILSEC_FORMATTER.parse(UTF8.String(r), 0).getTime();
             } catch (final ParseException e) {
                 return dflt;
             }

@@ -107,17 +107,17 @@ public class URIMetadataNode extends SolrDocument {
         final GenericFormatter formatter = new GenericFormatter(GenericFormatter.FORMAT_SHORT_DAY, GenericFormatter.time_minute);
         try {
-            this.setField(CollectionSchema.last_modified.name(), formatter.parse(prop.getProperty("mod", "20000101")));
+            this.setField(CollectionSchema.last_modified.name(), formatter.parse(prop.getProperty("mod", "20000101"), 0).getTime());
         } catch (final ParseException e) {
             this.setField(CollectionSchema.last_modified.name(), new Date());
         }
         try {
-            this.setField(CollectionSchema.load_date_dt.name(), formatter.parse(prop.getProperty("load", "20000101")));
+            this.setField(CollectionSchema.load_date_dt.name(), formatter.parse(prop.getProperty("load", "20000101"), 0).getTime());
         } catch (final ParseException e) {
             this.setField(CollectionSchema.load_date_dt.name(), new Date());
         }
         try {
-            this.setField(CollectionSchema.fresh_date_dt.name(), formatter.parse(prop.getProperty("fresh", "20000101")));
+            this.setField(CollectionSchema.fresh_date_dt.name(), formatter.parse(prop.getProperty("fresh", "20000101"), 0).getTime());
         } catch (final ParseException e) {
             this.setField(CollectionSchema.fresh_date_dt.name(), new Date());
         }

@@ -179,7 +179,7 @@ public class SplitTable implements Index, Iterable<Row.Entry> {
                 (element.length() == this.prefix.length() + 24)) {
                 f = new File(this.path, element);
                 try {
-                    d = GenericFormatter.SHORT_MILSEC_FORMATTER.parse(element.substring(this.prefix.length() + 1, this.prefix.length() + 18));
+                    d = GenericFormatter.SHORT_MILSEC_FORMATTER.parse(element.substring(this.prefix.length() + 1, this.prefix.length() + 18), 0).getTime();
                 } catch (final ParseException e) {
                     ConcurrentLog.severe("SplitTable", "", e);
                     continue;
@@ -372,7 +372,7 @@ public class SplitTable implements Index, Iterable<Row.Entry> {
         final String name = new File(table.filename()).getName();
         long d;
         try {
-            d = GenericFormatter.SHORT_MILSEC_FORMATTER.parse(name.substring(this.prefix.length() + 1, this.prefix.length() + 18)).getTime();
+            d = GenericFormatter.SHORT_MILSEC_FORMATTER.parse(name.substring(this.prefix.length() + 1, this.prefix.length() + 18), 0).getTime().getTime();
         } catch (final ParseException e) {
             ConcurrentLog.severe("SplitTable", "", e);
             d = 0;

@@ -46,6 +46,8 @@ package net.yacy.peers;
 
 import java.io.File;
 import java.io.IOException;
+import java.text.ParseException;
+import java.util.Calendar;
 import java.util.Date;
 import java.util.HashMap;
 import java.util.Iterator;
@@ -164,10 +166,16 @@ public class NewsDB {
     private Record b2r(final Row.Entry b) {
         if (b == null) return null;
+        Calendar c;
+        try {
+            c = b.empty(2) ? null : my_SHORT_SECOND_FORMATTER.parse(b.getColASCII(2), 0);
+        } catch (ParseException e) {
+            c = null;
+        }
         return new NewsDB.Record(
                 b.getPrimaryKeyASCII(),
                 b.getColUTF8(1),
-                (b.empty(2)) ? null : my_SHORT_SECOND_FORMATTER.parse(b.getColASCII(2), GenericFormatter.UTCDiffString()),
+                c == null ? null : c.getTime(),
                 (int) b.getColLong(3),
                 MapTools.string2map(b.getColUTF8(4), ",")
         );
@@ -226,8 +234,8 @@ public class NewsDB {
     public class Record {
 
         private final String originator; // hash of originating peer
-        private final Date created;      // Date when news was created by originator
-        private final Date received;     // Date when news was received here at this peer
+        private Date created;            // Date when news was created by originator
+        private Date received;           // Date when news was received here at this peer
         private final String category;   // keyword that addresses possible actions
         private int distributed;         // counter that counts number of distributions of this news record
         private final Map<String, String> attributes; // elements of the news for a special category
@@ -238,8 +246,16 @@ public class NewsDB {
             if (this.attributes.toString().length() > NewsDB.this.attributesMaxLength) throw new IllegalArgumentException("attributes length (" + this.attributes.toString().length() + ") exceeds maximum (" + NewsDB.this.attributesMaxLength + ")");
             this.category = (this.attributes.containsKey("cat")) ? this.attributes.get("cat") : "";
             if (this.category.length() > NewsDB.categoryStringLength) throw new IllegalArgumentException("category length (" + this.category.length() + ") exceeds maximum (" + NewsDB.categoryStringLength + ")");
-            this.received = (this.attributes.containsKey("rec")) ? my_SHORT_SECOND_FORMATTER.parse(this.attributes.get("rec"), GenericFormatter.UTCDiffString()) : new Date();
-            this.created = (this.attributes.containsKey("cre")) ? my_SHORT_SECOND_FORMATTER.parse(this.attributes.get("cre"), GenericFormatter.UTCDiffString()) : new Date();
+            try {
+                this.received = (this.attributes.containsKey("rec")) ? my_SHORT_SECOND_FORMATTER.parse(this.attributes.get("rec"), 0).getTime() : new Date();
+            } catch (ParseException e) {
+                this.received = new Date();
+            }
+            try {
+                this.created = (this.attributes.containsKey("cre")) ? my_SHORT_SECOND_FORMATTER.parse(this.attributes.get("cre"), 0).getTime() : new Date();
+            } catch (ParseException e) {
+                this.created = new Date();
+            }
             this.distributed = (this.attributes.containsKey("dis")) ? Integer.parseInt(this.attributes.get("dis")) : 0;
             this.originator = (this.attributes.containsKey("ori")) ? this.attributes.get("ori") : "";
             removeStandards();
@@ -262,7 +278,11 @@ public class NewsDB {
             if (attributes.toString().length() > NewsDB.this.attributesMaxLength) throw new IllegalArgumentException("attributes length (" + attributes.toString().length() + ") exceeds maximum (" + NewsDB.this.attributesMaxLength + ")");
             this.attributes = attributes;
             this.received = received;
-            this.created = my_SHORT_SECOND_FORMATTER.parse(id.substring(0, GenericFormatter.PATTERN_SHORT_SECOND.length()), GenericFormatter.UTCDiffString());
+            try {
+                this.created = my_SHORT_SECOND_FORMATTER.parse(id.substring(0, GenericFormatter.PATTERN_SHORT_SECOND.length()), 0).getTime();
+            } catch (ParseException e) {
+                this.created = new Date();
+            }
             this.category = category;
             this.distributed = distributed;
             this.originator = id.substring(GenericFormatter.PATTERN_SHORT_SECOND.length());
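
Because the offset-aware parse declares ParseException, NewsDB can no longer initialize its date fields in a single expression: created and received lose their final modifier, and every parse is wrapped in a try/catch that substitutes the current time. The recurring fallback, condensed into a hypothetical helper for illustration (not part of the commit; my_SHORT_SECOND_FORMATTER is the per-class formatter used above):

    import java.text.ParseException;
    import java.util.Date;

    // Hypothetical helper mirroring the fallback pattern in NewsDB.Record.
    private Date parseOrNow(final String timestamp) {
        try {
            // offset 0: peers exchange news timestamps as UTC
            return my_SHORT_SECOND_FORMATTER.parse(timestamp, 0).getTime();
        } catch (final ParseException e) {
            return new Date(); // unparseable date: fall back to "now" instead of failing
        }
    }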

@@ -797,7 +797,7 @@ public class Seed implements Cloneable, Comparable<Seed>, Comparator<Seed>
         try {
             final GenericFormatter my_SHORT_SECOND_FORMATTER =
                 new GenericFormatter(GenericFormatter.FORMAT_SHORT_SECOND, GenericFormatter.time_second); // use our own formatter to prevent concurrency locks with other processes
-            final long t = my_SHORT_SECOND_FORMATTER.parse(get(Seed.LASTSEEN, "20040101000000")).getTime();
+            final long t = my_SHORT_SECOND_FORMATTER.parse(get(Seed.LASTSEEN, "20040101000000"), 0).getTime().getTime();
             // getTime creates a UTC time number. But in this case java thinks, that the given
             // time string is a local time, which has a local UTC offset applied.
             // Therefore java subtracts the local UTC offset, to get a UTC number.
@@ -831,7 +831,7 @@ public class Seed implements Cloneable, Comparable<Seed>, Comparator<Seed>
         try {
             final GenericFormatter my_SHORT_SECOND_FORMATTER =
                 new GenericFormatter(GenericFormatter.FORMAT_SHORT_SECOND, GenericFormatter.time_second); // use our own formatter to prevent concurrency locks with other processes
-            b = my_SHORT_SECOND_FORMATTER.parse(get(Seed.BDATE, "20040101000000")).getTime();
+            b = my_SHORT_SECOND_FORMATTER.parse(get(Seed.BDATE, "20040101000000"), 0).getTime().getTime();
         } catch (final ParseException e ) {
             b = System.currentTimeMillis();
         }

@@ -503,7 +503,7 @@ public class WebStructureGraph {
                 hr =
                     new HostReference(
                         ASCII.getBytes(sentry.hosthash),
-                        GenericFormatter.SHORT_DAY_FORMATTER.parse(sentry.date).getTime(),
+                        GenericFormatter.SHORT_DAY_FORMATTER.parse(sentry.date, 0).getTime().getTime(),
                         refhosthashandcounter.getValue().intValue());
             } catch (final ParseException e ) {
                 continue refloop;

@@ -112,21 +112,24 @@ public final class LoaderDispatcher {
             final boolean forText,
             final boolean global
             ) {
+        CrawlProfile profile =
+                (forText) ?
+                    ((global) ?
+                        this.sb.crawler.defaultTextSnippetGlobalProfile :
+                        this.sb.crawler.defaultTextSnippetLocalProfile)
+                    :
+                    ((global) ?
+                        this.sb.crawler.defaultMediaSnippetGlobalProfile :
+                        this.sb.crawler.defaultMediaSnippetLocalProfile);
         return new Request(
                 ASCII.getBytes(this.sb.peers.mySeed().hash),
                 url,
                 null,
                 "",
                 new Date(),
-                (forText) ?
-                    ((global) ?
-                        this.sb.crawler.defaultTextSnippetGlobalProfile.handle() :
-                        this.sb.crawler.defaultTextSnippetLocalProfile.handle())
-                    :
-                    ((global) ?
-                        this.sb.crawler.defaultMediaSnippetGlobalProfile.handle() :
-                        this.sb.crawler.defaultMediaSnippetLocalProfile.handle()), // crawl profile
-                0);
+                profile.handle(),
+                0,
+                profile.timezoneOffset());
     }
 
     public void load(final DigestURL url, final CacheStrategy cacheStratgy, final int maxFileSize, final File targetFile, BlacklistType blacklistType, ClientIdentification.Agent agent) throws IOException {
@@ -407,7 +410,7 @@ public final class LoaderDispatcher {
      * @return a map from URLs to the anchor texts of the urls
      * @throws IOException
      */
-    public final Map<AnchorURL, String> loadLinks(final AnchorURL url, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
+    public final Map<AnchorURL, String> loadLinks(final AnchorURL url, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final ClientIdentification.Agent agent, final int timezoneOffset) throws IOException {
         final Response response = load(request(url, true, false), cacheStrategy, Integer.MAX_VALUE, blacklistType, agent);
         if (response == null) throw new IOException("response == null");
         final ResponseHeader responseHeader = response.getResponseHeader();
@@ -418,7 +421,7 @@ public final class LoaderDispatcher {
         final String supportError = TextParser.supports(url, responseHeader.mime());
         if (supportError != null) throw new IOException("no parser support: " + supportError);
         try {
-            documents = TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), response.profile().scraper(), response.depth(), response.getContent());
+            documents = TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), response.profile().scraper(), timezoneOffset, response.depth(), response.getContent());
             if (documents == null) throw new IOException("document == null");
         } catch (final Exception e) {
             throw new IOException("parser error: " + e.getMessage());
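
Note the small refactoring in request(...) above: the nested ternary used to select only the profile handle, but since the Request now also needs the profile's timezone offset, the selected CrawlProfile is hoisted into a local variable and queried twice. A condensed sketch of the resulting shape (names taken from the hunk; the wrapper method itself is illustrative):

    // Resolve the snippet crawl profile once, then reuse it for both values.
    Request snippetRequest(final DigestURL url, final boolean forText, final boolean global) {
        final CrawlProfile profile = forText
                ? (global ? this.sb.crawler.defaultTextSnippetGlobalProfile
                          : this.sb.crawler.defaultTextSnippetLocalProfile)
                : (global ? this.sb.crawler.defaultMediaSnippetGlobalProfile
                          : this.sb.crawler.defaultMediaSnippetLocalProfile);
        return new Request(
                ASCII.getBytes(this.sb.peers.mySeed().hash), url, null, "", new Date(),
                profile.handle(),           // crawl profile handle, as before
                0,                          // depth
                profile.timezoneOffset());  // new: offset recorded at crawl start
    }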

@@ -152,7 +152,7 @@ public class EventTracker {
         }
         public long getTime() {
             if (this.time instanceof String) try {
-                return GenericFormatter.SHORT_SECOND_FORMATTER.parse((String) this.time).getTime();
+                return GenericFormatter.SHORT_SECOND_FORMATTER.parse((String) this.time, 0).getTime().getTime();
             } catch (ParseException e) {
                 return -1L;
             }
@@ -162,7 +162,7 @@ public class EventTracker {
         }
         public Date getDate() {
             if (this.time instanceof String) try {
-                return GenericFormatter.SHORT_SECOND_FORMATTER.parse((String) this.time);
+                return GenericFormatter.SHORT_SECOND_FORMATTER.parse((String) this.time, 0).getTime();
             } catch (ParseException e) {
                 return null;
             }
             if (this.time instanceof Long) return new Date((Long) this.time);

@@ -1942,7 +1942,8 @@ public final class Switchboard extends serverSwitch {
                         "",
                         surrogate.getDate(),
                         this.crawler.defaultSurrogateProfile.handle(),
-                        0);
+                        0,
+                        this.crawler.defaultSurrogateProfile.timezoneOffset());
                 response = new Response(request, null, null, this.crawler.defaultSurrogateProfile, false, null);
                 final IndexingQueueEntry queueEntry =
                     new IndexingQueueEntry(response, new Document[] {document}, null);
@@ -2571,6 +2572,7 @@ public final class Switchboard extends serverSwitch {
                         response.getMimeType(),
                         response.getCharacterEncoding(),
                         response.profile().scraper(),
+                        response.profile().timezoneOffset(),
                         response.depth(),
                         response.getContent());
             if ( documents == null ) {
@@ -2673,7 +2675,8 @@ public final class Switchboard extends serverSwitch {
                             nextEntry.getValue(),
                             new Date(),
                             response.profile().handle(),
-                            nextdepth));
+                            nextdepth,
+                            response.profile().timezoneOffset()));
                 } catch (final MalformedURLException e ) {
                     ConcurrentLog.logException(e);
                 }
@@ -2754,7 +2757,8 @@ public final class Switchboard extends serverSwitch {
                     in.documents[i], in.queueEntry.profile().scraper(), in.queueEntry.profile().indexText(),
                     in.queueEntry.profile().indexMedia(),
                     LibraryProvider.dymLib, true,
-                    this.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.dates_in_content_dts));
+                    this.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.dates_in_content_dts),
+                    profile.timezoneOffset());
 
             // update image result list statistics
             // its good to do this concurrently here, because it needs a DNS lookup
@@ -3043,7 +3047,15 @@ public final class Switchboard extends serverSwitch {
                 int p = userInfo == null ? -1 : userInfo.indexOf(':');
                 String user = userInfo == null ? FTPClient.ANONYMOUS : userInfo.substring(0, p);
                 String pw = userInfo == null || p == -1 ? "anomic" : userInfo.substring(p + 1);
-                this.crawlStacker.enqueueEntriesFTP(this.peers.mySeed().hash.getBytes(), profile.handle(), url.getHost(), url.getPort(), user, pw, false);
+                this.crawlStacker.enqueueEntriesFTP(
+                        this.peers.mySeed().hash.getBytes(),
+                        profile.handle(),
+                        url.getHost(),
+                        url.getPort(),
+                        user,
+                        pw,
+                        false,
+                        profile.timezoneOffset());
                 return null;
             } catch (final Exception e) {
                 // mist
@@ -3080,7 +3092,8 @@ public final class Switchboard extends serverSwitch {
                     "CRAWLING-ROOT",
                     new Date(),
                     profile.handle(),
-                    0
+                    0,
+                    profile.timezoneOffset()
                     ));
 
         if (reasonString != null) return reasonString;
@@ -3134,7 +3147,7 @@ public final class Switchboard extends serverSwitch {
      * @throws IOException
      * @throws Parser.Failure
      */
-    public void addToIndex(final Collection<DigestURL> urls, final SearchEvent searchEvent, final String heuristicName, final Map<String, Pattern> collections, boolean doublecheck) {
+    public void addToIndex(final Collection<DigestURL> urls, final SearchEvent searchEvent, final String heuristicName, final Map<String, Pattern> collections, final boolean doublecheck) {
         Map<String, DigestURL> urlmap = new HashMap<String, DigestURL>();
         for (DigestURL url: urls) urlmap.put(ASCII.String(url.hash()), url);
         if (searchEvent != null) {
@@ -3192,7 +3205,7 @@ public final class Switchboard extends serverSwitch {
                     }
                     final Condenser condenser = new Condenser(
                             document, null, true, true, LibraryProvider.dymLib, true,
-                            Switchboard.this.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.dates_in_content_dts));
+                            Switchboard.this.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.dates_in_content_dts), searchEvent.query.timezoneOffset);
                     ResultImages.registerImages(url, document, true);
                     Switchboard.this.webStructure.generateCitationReference(url, document);
                     storeDocumentIndex(
@@ -3546,7 +3559,7 @@ public final class Switchboard extends serverSwitch {
                     final Map<AnchorURL, String> links;
                     searchEvent.oneFeederStarted();
                     try {
-                        links = Switchboard.this.loader.loadLinks(url, CacheStrategy.NOCACHE, BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent);
+                        links = Switchboard.this.loader.loadLinks(url, CacheStrategy.NOCACHE, BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent, searchEvent.query.timezoneOffset);
                         if ( links != null ) {
                             final Iterator<AnchorURL> i = links.keySet().iterator();
                             while ( i.hasNext() ) {
@@ -3585,7 +3598,7 @@ public final class Switchboard extends serverSwitch {
                 final Map<AnchorURL, String> links;
                 DigestURL url;
                 try {
-                    links = Switchboard.this.loader.loadLinks(startUrl, CacheStrategy.IFFRESH, BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent);
+                    links = Switchboard.this.loader.loadLinks(startUrl, CacheStrategy.IFFRESH, BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent, 0);
                     if (links != null) {
                         if (links.size() < 1000) { // limit to 1000 to skip large index pages
                             final Iterator<AnchorURL> i = links.keySet().iterator();

@@ -61,18 +61,27 @@ public class DocumentIndex extends Segment {
         } catch (final MalformedURLException e ) {
         }
     }
 
-    BlockingQueue<AnchorURL> queue; // a queue of document ID's
+    private BlockingQueue<AnchorURL> queue; // a queue of document ID's
     private final Worker[] worker;
-    CallbackListener callback;
+    private CallbackListener callback;
+    private int timezoneOffset;
 
     static final ThreadGroup workerThreadGroup = new ThreadGroup("workerThreadGroup");
 
-    public DocumentIndex(final File segmentPath, final File archivePath, final File collectionConfigurationPath, final File webgraphConfigurationPath, final CallbackListener callback, final int cachesize)
+    public DocumentIndex(
+            final File segmentPath,
+            final File archivePath,
+            final File collectionConfigurationPath,
+            final File webgraphConfigurationPath,
+            final CallbackListener callback,
+            final int cachesize,
+            final int timezoneOffset)
             throws IOException {
         super(new ConcurrentLog("DocumentIndex"), segmentPath, archivePath,
                 collectionConfigurationPath == null ? null : new CollectionConfiguration(collectionConfigurationPath, true),
                 webgraphConfigurationPath == null ? null : new WebgraphConfiguration(webgraphConfigurationPath, true)
                 );
+        this.timezoneOffset = timezoneOffset;
         super.connectRWI(cachesize, targetFileSize * 4 - 1);
         super.connectCitation(cachesize, targetFileSize * 4 - 1);
         super.fulltext().connectLocalSolr();
@@ -99,7 +108,7 @@ public class DocumentIndex extends Segment {
             try {
                 while ( (f = DocumentIndex.this.queue.take()) != poison ) {
                     try {
-                        resultRows = add(f);
+                        resultRows = add(f, DocumentIndex.this.timezoneOffset);
                         for ( final SolrInputDocument resultRow : resultRows ) {
                             if ( DocumentIndex.this.callback != null ) {
                                 if ( resultRow == null ) {
@@ -132,7 +141,7 @@ public class DocumentIndex extends Segment {
         this.queue.clear();
     }
 
-    private SolrInputDocument[] add(final AnchorURL url) throws IOException {
+    private SolrInputDocument[] add(final AnchorURL url, final int timezoneOffset) throws IOException {
         if ( url == null ) {
             throw new IOException("file = null");
         }
@@ -150,7 +159,7 @@ public class DocumentIndex extends Segment {
             length = -1;
         }
         try {
-            documents = TextParser.parseSource(url, null, null, new VocabularyScraper(), 0, length, url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent, null, null));
+            documents = TextParser.parseSource(url, null, null, new VocabularyScraper(), timezoneOffset, 0, length, url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent, null, null));
         } catch (final Exception e ) {
             throw new IOException("cannot parse " + url.toNormalform(false) + ": " + e.getMessage());
         }
@@ -159,7 +168,7 @@ public class DocumentIndex extends Segment {
         int c = 0;
         for ( final Document document : documents ) {
             if (document == null) continue;
-            final Condenser condenser = new Condenser(document, null, true, true, LibraryProvider.dymLib, true, true);
+            final Condenser condenser = new Condenser(document, null, true, true, LibraryProvider.dymLib, true, true, 0);
             rows[c++] =
                 super.storeDocument(
                     url,

@@ -761,7 +761,7 @@ public class Segment {
         }
 
         // get the word set
         Set<String> words = null;
-        words = new Condenser(document, null, true, true, null, false, false).words().keySet();
+        words = new Condenser(document, null, true, true, null, false, false, 0).words().keySet();
 
         // delete all word references
         int count = 0;

@@ -315,7 +315,7 @@ public class AccessTracker {
         byte[] b = new byte[GenericFormatter.PATTERN_SHORT_SECOND.length()];
         raf.readFully(b);
         try {
-            return GenericFormatter.SHORT_SECOND_FORMATTER.parse(UTF8.String(b));
+            return GenericFormatter.SHORT_SECOND_FORMATTER.parse(UTF8.String(b), 0).getTime();
         } catch (ParseException e) {
             throw new IOException(e.getMessage());
         }
@@ -326,8 +326,8 @@ public class AccessTracker {
         String file = args[0];
         Date from;
         try {
-            from = GenericFormatter.SHORT_SECOND_FORMATTER.parse(args[1]);
-            Date to = GenericFormatter.SHORT_SECOND_FORMATTER.parse(args[2]);
+            from = GenericFormatter.SHORT_SECOND_FORMATTER.parse(args[1], 0).getTime();
+            Date to = GenericFormatter.SHORT_SECOND_FORMATTER.parse(args[2], 0).getTime();
             List<EventTracker.Event> dump = readLog(new File(file), from, to);
             for (EventTracker.Event s: dump) System.out.println(s.toString());
         } catch (ParseException e) {

@@ -41,8 +41,10 @@ public class QueryModifier {
 
     private final StringBuilder modifier;
     public String sitehost, sitehash, filetype, protocol, language, author, collection, on, from, to;
+    public int timezoneOffset;
 
-    public QueryModifier() {
+    public QueryModifier(final int timezoneOffset) {
+        this.timezoneOffset = timezoneOffset;
         this.sitehash = null;
         this.sitehost = null;
         this.filetype = null;
@@ -274,19 +276,19 @@ public class QueryModifier {
         if (fq.indexOf(CollectionSchema.dates_in_content_dts.getSolrFieldName()) < 0) {
 
             if (this.on != null && this.on.length() > 0) {
-                fq.append(" AND ").append(QueryModifier.parseOnExpression(this.on));
+                fq.append(" AND ").append(QueryModifier.parseOnExpression(this.on, this.timezoneOffset));
             }
 
             if (this.from != null && this.from.length() > 0 && (this.to == null || this.to.equals("*"))) {
-                fq.append(" AND ").append(QueryModifier.parseFromToExpression(this.from, null));
+                fq.append(" AND ").append(QueryModifier.parseFromToExpression(this.from, null, this.timezoneOffset));
             }
 
             if ((this.from == null || this.from.equals("*")) && this.to != null && this.to.length() > 0) {
-                fq.append(" AND ").append(QueryModifier.parseFromToExpression(null, this.to));
+                fq.append(" AND ").append(QueryModifier.parseFromToExpression(null, this.to, this.timezoneOffset));
            }
 
             if (this.from != null && this.from.length() > 0 && this.to != null && this.to.length() > 0) {
-                fq.append(" AND ").append(QueryModifier.parseFromToExpression(this.from, this.to));
+                fq.append(" AND ").append(QueryModifier.parseFromToExpression(this.from, this.to, this.timezoneOffset));
             }
         }
@@ -348,9 +350,9 @@ public class QueryModifier {
         return fq.toString();
     }
 
-    public static String parseOnExpression(String onDescription) {
+    public static String parseOnExpression(final String onDescription, final int timezoneOffset) {
         assert onDescription != null;
-        Date onDate = DateDetection.parseLine(onDescription);
+        Date onDate = DateDetection.parseLine(onDescription, timezoneOffset);
         StringBuilder filterQuery = new StringBuilder(20);
         if (onDate != null) {
             @SuppressWarnings({ "deprecation", "static-access" })
@@ -360,9 +362,9 @@ public class QueryModifier {
         return filterQuery.toString();
     }
 
-    public static String parseFromToExpression(String from, String to) {
-        Date fromDate = from == null || from.equals("*") ? null : DateDetection.parseLine(from);
-        Date toDate = to == null || to.equals("*") ? null : DateDetection.parseLine(to);
+    public static String parseFromToExpression(final String from, final String to, final int timezoneOffset) {
+        Date fromDate = from == null || from.equals("*") ? null : DateDetection.parseLine(from, timezoneOffset);
+        Date toDate = to == null || to.equals("*") ? null : DateDetection.parseLine(to, timezoneOffset);
         StringBuilder filterQuery = new StringBuilder(20);
         if (fromDate != null && toDate != null) {
             @SuppressWarnings({ "deprecation", "static-access" })
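
With the offset carried in QueryModifier, date modifiers are resolved against the searcher's local time before being turned into a UTC filter query. A minimal usage sketch, assuming the public API shown above; the query text and offset value are illustrative (a browser in the UTC+1 zone reports -60 via JavaScript's Date.getTimezoneOffset()):

    // Sketch: parse a query with an on: modifier using the searcher's offset.
    final QueryModifier modifier = new QueryModifier(-60);
    final String remaining = modifier.parse("castle on:2015-03-01");
    // remaining is the query without the modifier; modifier.on holds the date expression.
    // The modifier becomes a Solr range filter on dates_in_content_dts:
    final String fq = QueryModifier.parseOnExpression(modifier.on, modifier.timezoneOffset);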

@@ -70,7 +70,6 @@ import org.apache.lucene.util.automaton.Automata;
 import org.apache.lucene.util.automaton.Automaton;
 import org.apache.solr.client.solrj.SolrQuery;
 import org.apache.solr.client.solrj.SolrQuery.SortClause;
-import org.apache.solr.common.params.CommonParams;
 import org.apache.solr.common.params.DisMaxParams;
 import org.apache.solr.common.params.FacetParams;
 import org.apache.solr.schema.TrieDateField;
@@ -146,6 +145,7 @@ public final class QueryParams {
     public LinkedHashSet<String> facetfields;
     private SolrQuery cachedQuery;
     private CollectionConfiguration solrSchema;
+    public final int timezoneOffset;
 
     public QueryParams(
@@ -154,6 +154,7 @@ public final class QueryParams {
             final String prefer,
             final ContentDomain contentdom,
             final String language,
+            final int timezoneOffset,
             final Collection<Tagging.Metatag> metatags,
             final CacheStrategy snippetCacheStrategy,
             final int itemsPerPage,
@@ -183,6 +184,7 @@ public final class QueryParams {
         this.ranking = ranking;
         this.maxDistance = maxDistance;
         this.contentdom = contentdom;
+        this.timezoneOffset = timezoneOffset;
         this.itemsPerPage = Math.min((specialRights) ? 10000 : 1000, itemsPerPage);
         this.offset = Math.max(0, Math.min((specialRights) ? 10000 - this.itemsPerPage : 1000 - this.itemsPerPage, offset));
         try {
@@ -527,19 +529,19 @@ public final class QueryParams {
         if (this.solrSchema.contains(CollectionSchema.dates_in_content_dts)) {
 
             if (this.modifier.on != null && this.modifier.on.length() > 0) {
-                fqs.add(QueryModifier.parseOnExpression(this.modifier.on));
+                fqs.add(QueryModifier.parseOnExpression(this.modifier.on, this.timezoneOffset));
             }
 
             if (this.modifier.from != null && this.modifier.from.length() > 0 && (this.modifier.to == null || this.modifier.to.equals("*"))) {
-                fqs.add(QueryModifier.parseFromToExpression(this.modifier.from, null));
+                fqs.add(QueryModifier.parseFromToExpression(this.modifier.from, null, this.timezoneOffset));
             }
 
             if ((this.modifier.from == null || this.modifier.from.equals("*")) && this.modifier.to != null && this.modifier.to.length() > 0) {
-                fqs.add(QueryModifier.parseFromToExpression(null, this.modifier.to));
+                fqs.add(QueryModifier.parseFromToExpression(null, this.modifier.to, this.timezoneOffset));
             }
 
             if (this.modifier.from != null && this.modifier.from.length() > 0 && this.modifier.to != null && this.modifier.to.length() > 0) {
-                fqs.add(QueryModifier.parseFromToExpression(this.modifier.from, this.modifier.to));
+                fqs.add(QueryModifier.parseFromToExpression(this.modifier.from, this.modifier.to, this.timezoneOffset));
             }
         }

@@ -358,7 +358,8 @@ public final class HTTPDProxyHandler {
                     "",
                     cachedResponseHeader.lastModified(),
                     sb.crawler.defaultProxyProfile.handle(),
-                    0);
+                    0,
+                    sb.crawler.defaultProxyProfile.timezoneOffset());
             final Response response = new Response(
                     request,
                     requestHeader,
@@ -473,8 +474,8 @@ public final class HTTPDProxyHandler {
                     "",
                     responseHeader.lastModified(),
                     sb.crawler.defaultProxyProfile.handle(),
-                    0);
+                    0,
+                    sb.crawler.defaultProxyProfile.timezoneOffset());
 
             // handle incoming cookies
             handleIncomingCookies(responseHeader, host, ip);
