enhanced timezone management for indexed data:

to support the new time parser and search functions in YaCy, a high
precision detection of date and time of day is necessary. That requires
that both the time zone of the document content and the time zone of
the user doing a search are detected. The time zone of the search
request is determined automatically from the browser's time zone
offset, which is delivered with the search request automatically and
invisibly to the user. The time zone for the content of web pages
cannot be detected automatically and must therefore be an attribute of
crawl starts. The advanced crawl start now provides an input field to
set the time zone as an offset number in minutes. All parsers must be
passed a time zone offset, so this required a change of the parser Java
API. A number of other changes correct the previously wrong handling of
dates in YaCy, which added a correction based on the time zone of the
server. Now no correction is added and all dates in YaCy are stored in
the UTC/GMT time zone, a normalized time zone for all peers.
pull/2/head
Michael Peter Christen 10 years ago
parent 702c30e619
commit fed26f33a8
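
For illustration (not part of this commit): the normalization described above can be reproduced with plain Java classes. The sketch below assumes the browser's sign convention for the offset, i.e. Date.getTimezoneOffset() reports -60 for UTC+1, and a time stamp pattern like the short-second format used elsewhere in this change.

    import java.text.ParseException;
    import java.text.SimpleDateFormat;
    import java.util.Calendar;
    import java.util.Date;
    import java.util.Locale;
    import java.util.TimeZone;

    public class TimezoneOffsetDemo {
        // parse a time stamp that carries no zone information and normalize it to UTC,
        // given the zone only as an offset in minutes (browser convention: UTC+1 => -60)
        public static Date parseToUTC(final String s, final int timezoneOffsetMinutes) throws ParseException {
            final SimpleDateFormat f = new SimpleDateFormat("yyyyMMddHHmmss", Locale.US);
            f.setTimeZone(TimeZone.getTimeZone("UTC")); // read the digits as if they were UTC
            final Calendar cal = Calendar.getInstance(TimeZone.getTimeZone("UTC"));
            cal.setTime(f.parse(s));
            cal.add(Calendar.MINUTE, timezoneOffsetMinutes); // -60 turns 12:00 in UTC+1 into 11:00 UTC
            return cal.getTime();
        }

        public static void main(final String[] args) throws ParseException {
            // 2015-03-01 12:00 local time in UTC+1 becomes 2015-03-01 11:00 UTC
            System.out.println(parseToUTC("20150301120000", -60));
        }
    }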

@ -513,7 +513,7 @@
</dl>
</fieldset>
<fieldset>
<legend>Index Administration</legend>
<legend>Index Attributes</legend>
<dl>
<dt>Indexing</dt>
<dd>
@ -561,6 +561,17 @@
<input name="collection" id="collection" type="text" size="60" maxlength="100" value="#[collection]#" #(collectionEnabled)#disabled="disabled"::#(/collectionEnabled)# />
</dd>
<dt><label for="collection">Time Zone Offset</label></dt>
<dd>
<span class="info" style="float:right"><img src="env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
The time zone is required when the parser detects a date in a crawled web page. Such content can be searched with the on: modifier, which
also requires a time zone when the query is made. To normalize all given dates, every date is stored in the UTC time zone. To compute the correct
offset from dates without a time zone to UTC, this offset must be given here, in minutes.
Time zone offsets for locations east of UTC must be negative; offsets for zones west of UTC must be positive.
</span></span>
<input id="timezoneOffset" type="text" size="4" maxlength="4" name="timezoneOffset" value=""><script>document.getElementById("timezoneOffset").value = new Date().getTimezoneOffset();</script>
</dd>
</dl>
</fieldset>
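
For reference (illustrative, not part of the diff): the sign convention the form relies on, where zones east of UTC yield negative values, is the one used by JavaScript's Date.getTimezoneOffset(). A minimal Java sketch producing the same value for the server's default zone:

    import java.util.TimeZone;

    public class BrowserLikeOffset {
        // minutes to add to local time to reach UTC, like JavaScript's getTimezoneOffset():
        // CET (UTC+1) yields -60, EST (UTC-5) yields 300
        public static int timezoneOffsetMinutes() {
            final TimeZone tz = TimeZone.getDefault();
            return -(tz.getOffset(System.currentTimeMillis()) / 60000);
        }
    }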

@ -91,6 +91,7 @@
<input type="hidden" name="indexText" id="indexText" value="on" />
<input type="hidden" name="indexMedia" id="indexMedia" value="on" />
<input type="hidden" name="intention" id="intention" value="" />
<input id="timezoneOffset" type="hidden" name="timezoneOffset" value=""><script>document.getElementById("timezoneOffset").value = new Date().getTimezoneOffset();</script>
<input type="submit" name="crawlingstart" value="Start New Crawl" class="btn btn-primary"/>
</dd>
</dl>

@ -470,6 +470,8 @@ public class Crawler_p {
}
}
int timezoneOffset = post.getInt("timezoneOffset", 0);
// prepare a new crawling profile
final CrawlProfile profile;
byte[] handle;
@ -502,7 +504,8 @@ public class Crawler_p {
cachePolicy,
collection,
agentName,
new VocabularyScraper(vocabulary_scraper));
new VocabularyScraper(vocabulary_scraper),
timezoneOffset);
handle = ASCII.getBytes(profile.handle());
// before we fire up a new crawl, we make sure that another crawl with the same name is not running
@ -585,7 +588,7 @@ public class Crawler_p {
try {
// check if the crawl filter works correctly
Pattern.compile(newcrawlingMustMatch);
final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new VocabularyScraper());
final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new VocabularyScraper(), timezoneOffset);
final Writer writer = new TransformerWriter(null, null, scraper, null, false);
if (crawlingFile != null && crawlingFile.exists()) {
FileUtils.copy(new FileInputStream(crawlingFile), writer);
@ -605,7 +608,7 @@ public class Crawler_p {
}
sb.crawler.putActive(handle, profile);
sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks);
sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks, profile.timezoneOffset());
} catch (final PatternSyntaxException e) {
prop.put("info", "4"); // crawlfilter does not match url
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);

@ -161,7 +161,8 @@ public class HostBrowser {
sb.peers.mySeed().hash.getBytes(),
url, null, load, new Date(),
sb.crawler.defaultProxyProfile.handle(),
0
0,
sb.crawler.defaultProxyProfile.timezoneOffset()
));
prop.putHTML("result", reasonString == null ? ("added url to indexer: " + load) : ("not indexed url '" + load + "': " + reasonString));
if (wait) waitloop: for (int i = 0; i < 30; i++) {

@ -637,11 +637,12 @@ public class IndexControlRWIs_p {
final QueryGoal qg = new QueryGoal(queryhashes, null);
final QueryParams query = new QueryParams(
qg,
new QueryModifier(),
new QueryModifier(0),
Integer.MAX_VALUE,
"",
ContentDomain.ALL,
"", //lang
0, //timezoneOffset
null,
CacheStrategy.IFFRESH,
1000, 0, //count, offset

@ -74,7 +74,7 @@ public class NetworkHistory {
while (rowi.hasNext()) {
Row row = rowi.next();
String d = ASCII.String(row.getPK());
Date date = GenericFormatter.SHORT_MINUTE_FORMATTER.parse(d);
Date date = GenericFormatter.SHORT_MINUTE_FORMATTER.parse(d, 0).getTime();
if (date.getTime() < timelimit) break;
statrow = new HashMap<>();
for (String key: columns) {

@ -128,6 +128,7 @@ public class QuickCrawlLink_p {
final byte[] urlhash = crawlingStartURL.hash();
indexSegment.fulltext().remove(urlhash);
sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
int timezoneOffset = post.getInt("timezoneOffset", 0);
// create crawling profile
CrawlProfile pe = null;
@ -156,7 +157,8 @@ public class QuickCrawlLink_p {
CacheStrategy.IFFRESH,
collection,
ClientIdentification.yacyIntranetCrawlerAgentName,
null);
null,
timezoneOffset);
sb.crawler.putActive(pe.handle().getBytes(), pe);
} catch (final Exception e) {
// mist
@ -175,7 +177,8 @@ public class QuickCrawlLink_p {
(title==null)?"CRAWLING-ROOT":title,
new Date(),
pe.handle(),
0
0,
pe.timezoneOffset()
));
// validate rejection reason

@ -39,7 +39,7 @@ public class get {
Date parsedDate = null;
try {
parsedDate = ISO8601Formatter.FORMATTER.parse(date);
parsedDate = ISO8601Formatter.FORMATTER.parse(date, 0).getTime();
} catch (final ParseException e) {
parsedDate = new Date();
}

@ -103,7 +103,8 @@ public class push_p {
"", // the name of the document to crawl
new Date(), // current date
profile.handle(), // the name of the prefetch profile. This must not be null!
0); // forkfactor sum of anchors of all ancestors
0, // forkfactor sum of anchors of all ancestors
profile.timezoneOffset());
Response response = new Response(
request,
requestHeader,

@ -75,8 +75,8 @@ public final class timeline_p {
// get a time period
Date fromDate = new Date(0);
Date toDate = new Date();
try {fromDate = GenericFormatter.SHORT_SECOND_FORMATTER.parse(post.get("from", "20031215182700"));} catch (ParseException e) {}
try {toDate = GenericFormatter.SHORT_SECOND_FORMATTER.parse(post.get("to", GenericFormatter.SHORT_SECOND_FORMATTER.format(new Date())));} catch (ParseException e) {}
try {fromDate = GenericFormatter.SHORT_SECOND_FORMATTER.parse(post.get("from", "20031215182700"), 0).getTime();} catch (ParseException e) {}
try {toDate = GenericFormatter.SHORT_SECOND_FORMATTER.parse(post.get("to", GenericFormatter.SHORT_SECOND_FORMATTER.format(new Date())), 0).getTime();} catch (ParseException e) {}
// get latest dump;
AccessTracker.dumpLog();

@ -80,6 +80,7 @@
<input type="hidden" name="resource" value="global" />
<input type="hidden" name="prefermaskfilter" value="" />
<input type="hidden" name="maximumRecords" value="#[maximumRecords]#" />
<input id="timezoneOffset" type="hidden" name="timezoneOffset" value=""><script>document.getElementById("timezoneOffset").value = new Date().getTimezoneOffset();</script>
</fieldset>
::
</fieldset>

@ -78,7 +78,8 @@ public class rct_p {
"REMOTE-CRAWLING",
loaddate,
sb.crawler.defaultRemoteProfile.handle(),
0));
0,
sb.crawler.defaultRemoteProfile.timezoneOffset()));
} else {
env.getLog().warn("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason);
}

@ -118,7 +118,8 @@ public final class search {
final String prefer = post.get("prefer", "");
final String contentdom = post.get("contentdom", "all");
final String filter = post.get("filter", ".*"); // a filter on the url
QueryModifier modifier = new QueryModifier();
final int timezoneOffset = post.getInt("timezoneOffset", 0);
QueryModifier modifier = new QueryModifier(timezoneOffset);
modifier.sitehost = post.get("sitehost", ""); if (modifier.sitehost.isEmpty()) modifier.sitehost = null;
modifier.sitehash = post.get("sitehash", ""); if (modifier.sitehash.isEmpty()) modifier.sitehash = null;
modifier.author = post.get("author", ""); if (modifier.author.isEmpty()) modifier.author = null;
@ -232,6 +233,7 @@ public final class search {
prefer,
ContentDomain.contentdomParser(contentdom),
language,
timezoneOffset,
new HashSet<Tagging.Metatag>(),
null, // no snippet computation
count,
@ -297,6 +299,7 @@ public final class search {
prefer,
ContentDomain.contentdomParser(contentdom),
language,
timezoneOffset,
new HashSet<Tagging.Metatag>(),
null, // no snippet computation
count,

@ -55,7 +55,7 @@ public final class transferURL {
public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) {
final long start = System.currentTimeMillis();
long freshdate = 0;
try {freshdate = GenericFormatter.SHORT_DAY_FORMATTER.parse("20061101").getTime();} catch (final ParseException e1) {}
try {freshdate = GenericFormatter.SHORT_DAY_FORMATTER.parse("20061101", 0).getTime().getTime();} catch (final ParseException e1) {}
// return variable that accumulates replacements
final Switchboard sb = (Switchboard) env;

@ -108,6 +108,7 @@ Use the RSS search result format to add static searches to your RSS reader, if y
<input type="hidden" name="depth" value="#[depth]#" />
<input type="hidden" name="constraint" value="#[constraint]#" />
<input type="hidden" name="meanCount" value="#[meanCount]#" />
<input id="timezoneOffset" type="hidden" name="timezoneOffset" value=""><script>document.getElementById("timezoneOffset").value = new Date().getTimezoneOffset();</script>
</form>
<!-- type the number of results and navigation bar -->

@ -214,6 +214,9 @@ public class yacysearch {
prop.setOutgoingHeader(outgoingHeader);
}
// time zone
int timezoneOffset = post.getInt("timezoneOffset", 0);
// collect search attributes
int itemsPerPage =
@ -359,7 +362,7 @@ public class yacysearch {
}
final RankingProfile ranking = sb.getRanking();
final QueryModifier modifier = new QueryModifier();
final QueryModifier modifier = new QueryModifier(timezoneOffset);
querystring = modifier.parse(querystring);
if (modifier.sitehost != null && modifier.sitehost.length() > 0 && querystring.length() == 0) querystring = "*"; // allow to search for all documents on a host
@ -643,6 +646,7 @@ public class yacysearch {
prefermask,
contentdom,
language,
timezoneOffset,
metatags,
snippetFetchStrategy,
itemsPerPage,

@ -390,9 +390,9 @@ public class yacysearchtrailer {
navigatorIterator = theSearch.dateNavigator.iterator(); // this iterator is different as it iterates by the key order (which is a date order)
int i = 0, pos = 0, neg = 0;
long dx = -1;
Date fromconstraint = theSearch.getQuery().modifier.from == null ? null : DateDetection.parseLine(theSearch.getQuery().modifier.from);
Date fromconstraint = theSearch.getQuery().modifier.from == null ? null : DateDetection.parseLine(theSearch.getQuery().modifier.from, theSearch.getQuery().timezoneOffset);
if (fromconstraint == null) fromconstraint = new Date(System.currentTimeMillis() - AbstractFormatter.normalyearMillis);
Date toconstraint = theSearch.getQuery().modifier.to == null ? null : DateDetection.parseLine(theSearch.getQuery().modifier.to);
Date toconstraint = theSearch.getQuery().modifier.to == null ? null : DateDetection.parseLine(theSearch.getQuery().modifier.to, theSearch.getQuery().timezoneOffset);
if (toconstraint == null) toconstraint = new Date(System.currentTimeMillis() + AbstractFormatter.normalyearMillis);
while (i < QueryParams.FACETS_DATE_MAXCOUNT && navigatorIterator.hasNext()) {
name = navigatorIterator.next().trim();
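
Illustrative usage of the date navigator constraints above (not part of the diff), assuming the modifier string contains a date in one of the formats DateDetection.parseLine() recognizes:

    // resolve a "from:" search modifier to a UTC Date using the searcher's browser offset
    final int timezoneOffset = -120; // as posted by the browser of a UTC+2 user
    Date fromconstraint = DateDetection.parseLine("2015/05/01", timezoneOffset);
    if (fromconstraint == null) {
        // fall back to the one-year window used above
        fromconstraint = new Date(System.currentTimeMillis() - AbstractFormatter.normalyearMillis);
    }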

@ -25,12 +25,18 @@
package net.yacy.cora.date;
import java.text.ParseException;
import java.util.Calendar;
import java.util.Date;
import java.util.TimeZone;
public abstract class AbstractFormatter implements DateFormatter {
protected static final TimeZone TZ_GMT = TimeZone.getTimeZone("GMT");
public final static Calendar testCalendar = Calendar.getInstance(); // a calendar in the current time zone of the server
public final static Calendar UTCCalendar = Calendar.getInstance();
public final static TimeZone UTCtimeZone = TimeZone.getTimeZone("UTC");
static {
UTCCalendar.setTimeZone(UTCtimeZone);
}
// statics
public final static long secondMillis = 1000;
@ -45,7 +51,7 @@ public abstract class AbstractFormatter implements DateFormatter {
protected String last_format;
@Override
public abstract Date parse(String s) throws ParseException;
public abstract Calendar parse(String s, int timezoneOffset) throws ParseException;
@Override
public abstract String format(final Date date);
@Override

@ -25,11 +25,12 @@
package net.yacy.cora.date;
import java.text.ParseException;
import java.util.Calendar;
import java.util.Date;
public interface DateFormatter {
public Date parse(String s) throws ParseException;
public Calendar parse(String s, int timezoneOffset) throws ParseException;
public String format(final Date date);
public String format();

@ -30,6 +30,7 @@ import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.Locale;
import java.util.TimeZone;
import net.yacy.cora.util.NumberTools;
@ -51,14 +52,11 @@ public class GenericFormatter extends AbstractFormatter implements DateFormatter
public static final SimpleDateFormat FORMAT_ANSIC = new SimpleDateFormat(PATTERN_ANSIC, Locale.US);
public static final SimpleDateFormat FORMAT_SIMPLE = new SimpleDateFormat(PATTERN_SIMPLE, Locale.US);
// find out time zone and DST offset
private static Calendar thisCalendar = Calendar.getInstance();
static {
// we want GMT times on the formats as well as they don't support any timezone
FORMAT_SHORT_DAY.setTimeZone(TZ_GMT);
FORMAT_SHORT_SECOND.setTimeZone(TZ_GMT);
FORMAT_SHORT_MILSEC.setTimeZone(TZ_GMT);
FORMAT_SHORT_DAY.setTimeZone(UTCtimeZone);
FORMAT_SHORT_SECOND.setTimeZone(UTCtimeZone);
FORMAT_SHORT_MILSEC.setTimeZone(UTCtimeZone);
}
public static final long time_second = 1000L;
@ -124,56 +122,55 @@ public class GenericFormatter extends AbstractFormatter implements DateFormatter
* the String.
*/
@Override
public Date parse(final String timeString) throws ParseException {
public Calendar parse(final String timeString, final int timezoneOffset) throws ParseException {
synchronized (this.dateFormat) {
return this.dateFormat.parse(timeString);
Calendar cal = Calendar.getInstance(UTCtimeZone);
cal.setTime(this.dateFormat.parse(timeString));
cal.add(Calendar.MINUTE, timezoneOffset); // apply the correction: for UTC+1 the browser reports -60, so 60 minutes are subtracted to map the given local time to UTC
return cal;
}
}
/**
* Like {@link #parseShortSecond(String)} using additional timezone information provided in an
* offset String, like "+0100" for CET.
* @throws ParseException
*/
public Date parse(final String timeString, final String UTCOffset) {
public Calendar parse(final String timeString, final String UTCOffset) throws ParseException {
// FIXME: This method returns an incorrect date, check callers!
// ex: de.anomic.server.serverDate.parseShortSecond("20070101120000", "+0200").toGMTString()
// => 1 Jan 2007 13:00:00 GMT
if (timeString == null || timeString.isEmpty()) { return new Date(); }
if (UTCOffset == null || UTCOffset.isEmpty()) { return new Date(); }
try {
synchronized (this.dateFormat) {
return new Date(this.dateFormat.parse(timeString).getTime() - UTCDiff() + UTCDiff(UTCOffset));
}
} catch (final Throwable e) {
//serverLog.logFinest("parseUniversalDate", e.getMessage() + ", remoteTimeString=[" + remoteTimeString + "]");
return new Date();
}
if (timeString == null || timeString.isEmpty()) { return Calendar.getInstance(UTCtimeZone); }
if (UTCOffset == null || UTCOffset.isEmpty()) { return Calendar.getInstance(UTCtimeZone); }
return parse(timeString, UTCDiff(UTCOffset));
}
private static long UTCDiff(final String diffString) {
private static int UTCDiff(final String diffString) {
if (diffString.length() != 5) throw new IllegalArgumentException("UTC String malformed (wrong size):" + diffString);
boolean ahead = true;
if (diffString.length() > 0 && diffString.charAt(0) == '+') ahead = true;
else if (diffString.length() > 0 && diffString.charAt(0) == '-') ahead = false;
else throw new IllegalArgumentException("UTC String malformed (wrong sign):" + diffString);
final long oh = NumberTools.parseLongDecSubstring(diffString, 1, 3);
final long om = NumberTools.parseLongDecSubstring(diffString, 3);
return ((ahead) ? (long) 1 : (long) -1) * (oh * AbstractFormatter.hourMillis + om * AbstractFormatter.minuteMillis);
final int oh = NumberTools.parseIntDecSubstring(diffString, 1, 3);
final int om = NumberTools.parseIntDecSubstring(diffString, 3);
return (int) (((ahead) ? 1 : -1) * (oh * AbstractFormatter.hourMillis + om * AbstractFormatter.minuteMillis));
}
/**
* get the difference of this servers time zone to UTC/GMT in milliseconds
* @return
*/
private static long UTCDiff() {
// DST_OFFSET is dependent on the time of the Calendar, so it has to be updated
// to get the correct current offset
synchronized (thisCalendar) {
thisCalendar.setTimeInMillis(System.currentTimeMillis());
final long zoneOffsetHours = thisCalendar.get(Calendar.ZONE_OFFSET);
final long DSTOffsetHours = thisCalendar.get(Calendar.DST_OFFSET);
synchronized (testCalendar) {
testCalendar.setTimeInMillis(System.currentTimeMillis());
final long zoneOffsetHours = testCalendar.get(Calendar.ZONE_OFFSET);
final long DSTOffsetHours = testCalendar.get(Calendar.DST_OFFSET);
return zoneOffsetHours + DSTOffsetHours;
}
}
private final static DecimalFormat D2 = new DecimalFormat("00");
public static String UTCDiffString() {
// we express the UTC Difference in 5 digits:
// SHHMM
@ -195,11 +192,9 @@ public class GenericFormatter extends AbstractFormatter implements DateFormatter
return sb.toString();
}
public static long correctedUTCTime() {
return System.currentTimeMillis() - UTCDiff();
}
private final static DecimalFormat D2 = new DecimalFormat("00");
public static void main(final String[] args) {
public static void main(String[] args) {
System.out.println(UTCDiffString());
}
}
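
A brief usage sketch of the changed parse signature (illustrative; SHORT_SECOND_FORMATTER and the getTime() call are taken from other call sites in this commit):

    try {
        // "20150301120000" written down by a UTC+2 source (browser-style offset -120)
        // becomes the normalized instant 2015-03-01 10:00:00 UTC
        final Date utc = GenericFormatter.SHORT_SECOND_FORMATTER.parse("20150301120000", -120).getTime();
    } catch (final ParseException e) {
        // malformed time string
    }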

@ -41,7 +41,7 @@ public class ISO8601Formatter extends AbstractFormatter implements DateFormatter
private static final SimpleDateFormat FORMAT_ISO8601 = new SimpleDateFormat(PATTERN_ISO8601, Locale.US);
static {
FORMAT_ISO8601.setTimeZone(TZ_GMT);
FORMAT_ISO8601.setTimeZone(AbstractFormatter.UTCtimeZone);
}
public static final ISO8601Formatter FORMATTER = new ISO8601Formatter();
@ -72,7 +72,7 @@ public class ISO8601Formatter extends AbstractFormatter implements DateFormatter
* @throws ParseException
*/
@Override
public Date parse(String s) throws ParseException {
public Calendar parse(String s, final int timezoneOffset) throws ParseException {
// do some lazy checks here
s = s.trim();
while (!s.isEmpty() && s.endsWith("?")) s = s.substring(0, s.length() - 1); // sometimes used if write is not sure about date
@ -87,7 +87,7 @@ public class ISO8601Formatter extends AbstractFormatter implements DateFormatter
while (!s.isEmpty() && s.endsWith("?")) s = s.substring(0, s.length() - 1); // sometimes used if write is not sure about date
// no go for exact parsing
final Calendar cal = Calendar.getInstance(TZ_GMT, Locale.US);
final Calendar cal = Calendar.getInstance(AbstractFormatter.UTCtimeZone, Locale.US);
cal.clear();
// split 2007-12-19T10:20:30.789+0500 into its parts
@ -103,13 +103,13 @@ public class ISO8601Formatter extends AbstractFormatter implements DateFormatter
if (t.nextToken().equals("-")) {
cal.set(Calendar.MONTH, Integer.parseInt(t.nextToken()) - 1);
} else {
return cal.getTime();
return cal;
}
// day
if (t.nextToken().equals("-")) {
cal.set(Calendar.DAY_OF_MONTH, Integer.parseInt(t.nextToken()));
} else {
return cal.getTime();
return cal;
}
// The standard says:
// if there is an hour there has to be a minute and a timezone token, too.
@ -147,7 +147,7 @@ public class ISO8601Formatter extends AbstractFormatter implements DateFormatter
sign = -1;
} else {
// no legal TZ offset found
return cal.getTime();
return cal;
}
offset = sign * Integer.parseInt(t.nextToken()) * 10 * 3600;
}
@ -168,8 +168,7 @@ public class ISO8601Formatter extends AbstractFormatter implements DateFormatter
// in case we couldn't even parse a year
if (!cal.isSet(Calendar.YEAR))
throw new ParseException("parseISO8601: Cannot parse '" + s + "'", 0);
Date d = cal.getTime();
return d;
return cal;
}

@ -224,7 +224,7 @@ public class RSSMessage implements Hit, Comparable<RSSMessage>, Comparator<RSSMe
date = HeaderFramework.FORMAT_RFC1123.parse(dateString);
} catch (final ParseException e) {
try {
date = GenericFormatter.SHORT_SECOND_FORMATTER.parse(dateString);
date = GenericFormatter.SHORT_SECOND_FORMATTER.parse(dateString, 0).getTime();
} catch (final ParseException e1) {
date = HeaderFramework.parseHTTPDate(dateString); // returns null on parse error
}

@ -183,11 +183,12 @@ public class FederateSearchManager {
Bitfield filter = new Bitfield();
final QueryParams query = new QueryParams(
qg,
new QueryModifier(),
new QueryModifier(0),
Integer.MAX_VALUE,
"",
Classification.ContentDomain.ALL,
"", //lang
0, //timezoneOffset
null,
CacheStrategy.IFFRESH,
100, 0, //count, offset

@ -151,17 +151,26 @@ public final class CrawlStacker {
if (CrawlStacker.log.isFinest()) CrawlStacker.log.finest("ENQUEUE " + entry.url() + ", referer=" + entry.referrerhash() + ", initiator=" + ((entry.initiator() == null) ? "" : ASCII.String(entry.initiator())) + ", name=" + entry.name() + ", appdate=" + entry.appdate() + ", depth=" + entry.depth());
this.requestQueue.enQueue(entry);
}
public void enqueueEntriesAsynchronous(final byte[] initiator, final String profileHandle, final List<AnchorURL> hyperlinks) {
public void enqueueEntriesAsynchronous(
final byte[] initiator,
final String profileHandle,
final List<AnchorURL> hyperlinks,
final int timezoneOffset) {
new Thread() {
@Override
public void run() {
Thread.currentThread().setName("enqueueEntriesAsynchronous");
enqueueEntries(initiator, profileHandle, hyperlinks, true);
enqueueEntries(initiator, profileHandle, hyperlinks, true, timezoneOffset);
}
}.start();
}
private void enqueueEntries(final byte[] initiator, final String profileHandle, final List<AnchorURL> hyperlinks, final boolean replace) {
private void enqueueEntries(
final byte[] initiator,
final String profileHandle,
final List<AnchorURL> hyperlinks,
final boolean replace,
final int timezoneOffset) {
if (replace) {
// delete old entries, if exists to force a re-load of the url (thats wanted here)
Set<String> hosthashes = new HashSet<String>();
@ -199,7 +208,7 @@ public final class CrawlStacker {
int p = userInfo == null ? -1 : userInfo.indexOf(':');
String user = userInfo == null ? FTPClient.ANONYMOUS : userInfo.substring(0, p);
String pw = userInfo == null || p == -1 ? "anomic" : userInfo.substring(p + 1);
enqueueEntriesFTP(initiator, profileHandle, url.getHost(), url.getPort(), user, pw, replace);
enqueueEntriesFTP(initiator, profileHandle, url.getHost(), url.getPort(), user, pw, replace, timezoneOffset);
} else {
// put entry on crawl stack
enqueueEntry(new Request(
@ -209,13 +218,22 @@ public final class CrawlStacker {
url.getNameProperty(),
new Date(),
profileHandle,
0
0,
timezoneOffset
));
}
}
}
public void enqueueEntriesFTP(final byte[] initiator, final String profileHandle, final String host, final int port, final String user, final String pw, final boolean replace) {
public void enqueueEntriesFTP(
final byte[] initiator,
final String profileHandle,
final String host,
final int port,
final String user,
final String pw,
final boolean replace,
final int timezoneOffset) {
final CrawlQueues cq = this.nextQueue;
new Thread() {
@Override
@ -248,7 +266,8 @@ public final class CrawlStacker {
MultiProtocolURL.unescape(entry.name),
entry.date,
profileHandle,
0));
0,
timezoneOffset));
}
} catch (final IOException e1) {
ConcurrentLog.logException(e1);
@ -272,7 +291,7 @@ public final class CrawlStacker {
"CRAWLING-ROOT",
new Date(),
pe.handle(),
0));
0, 0));
}
/**

@ -296,7 +296,8 @@ public final class CrawlSwitchboard {
CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_PROXY,
ClientIdentification.yacyProxyAgentName,
null);
null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultProxyProfile.handle()),
this.defaultProxyProfile);
@ -327,7 +328,8 @@ public final class CrawlSwitchboard {
CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_REMOTE,
ClientIdentification.yacyInternetCrawlerAgentName,
null);
null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultRemoteProfile.handle()),
this.defaultRemoteProfile);
@ -358,7 +360,8 @@ public final class CrawlSwitchboard {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_TEXT,
ClientIdentification.yacyIntranetCrawlerAgentName,
null);
null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultTextSnippetLocalProfile.handle()),
this.defaultTextSnippetLocalProfile);
@ -389,7 +392,8 @@ public final class CrawlSwitchboard {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT,
ClientIdentification.yacyIntranetCrawlerAgentName,
null);
null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()),
this.defaultTextSnippetGlobalProfile);
@ -421,7 +425,8 @@ public final class CrawlSwitchboard {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_GREEDY_LEARNING_TEXT,
ClientIdentification.browserAgentName,
null);
null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()),
this.defaultTextSnippetGlobalProfile);
@ -452,7 +457,8 @@ public final class CrawlSwitchboard {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName,
null);
null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultMediaSnippetLocalProfile.handle()),
this.defaultMediaSnippetLocalProfile);
@ -483,7 +489,8 @@ public final class CrawlSwitchboard {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName,
null);
null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultMediaSnippetGlobalProfile.handle()),
this.defaultMediaSnippetGlobalProfile);
@ -514,7 +521,8 @@ public final class CrawlSwitchboard {
CacheStrategy.NOCACHE,
"robot_" + CRAWL_PROFILE_SURROGATE,
ClientIdentification.yacyIntranetCrawlerAgentName,
null);
null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultSurrogateProfile.handle()),
this.defaultSurrogateProfile);
@ -548,7 +556,8 @@ public final class CrawlSwitchboard {
CacheStrategy.NOCACHE,
collection,
ClientIdentification.yacyIntranetCrawlerAgentName,
null);
null,
0);
this.profilesActiveCrawls.put(UTF8.getBytes(genericPushProfile.handle()), genericPushProfile);
this.defaultPushProfiles.put(collection, genericPushProfile);
return genericPushProfile;

@ -80,6 +80,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public static final String CACHE_STRAGEGY = "cacheStrategy";
public static final String COLLECTIONS = "collections";
public static final String SCRAPER = "scraper";
public static final String TIMEZONEOFFSET = "timezoneOffset";
public static final String CRAWLER_URL_MUSTMATCH = "crawlerURLMustMatch";
public static final String CRAWLER_URL_MUSTNOTMATCH = "crawlerURLMustNotMatch";
public static final String CRAWLER_IP_MUSTMATCH = "crawlerIPMustMatch";
@ -131,6 +132,9 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
* @param xpstopw true if parent stop words shall be ignored
* @param cacheStrategy determines if and how cache is used loading content
* @param collections a comma-separated list of tags which are attached to index entries
* @param userAgentName the profile name of the user agent to be used
* @param scraper a scraper for vocabularies
* @param timezoneOffset the time offset in minutes for scraped dates in text without time zone
*/
public CrawlProfile(
String name,
@ -155,7 +159,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
final CacheStrategy cacheStrategy,
final String collections,
final String userAgentName,
final VocabularyScraper scraper) {
final VocabularyScraper scraper,
final int timezoneOffset) {
super(40);
if (name == null || name.isEmpty()) {
throw new NullPointerException("name must not be null or empty");
@ -198,6 +203,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
String jsonString = this.scraper.toString();
assert jsonString != null && jsonString.length() > 0 && jsonString.charAt(0) == '{' : "jsonString = " + jsonString;
put(SCRAPER, jsonString);
put(TIMEZONEOFFSET, timezoneOffset);
}
/**
@ -623,6 +629,16 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return (r.equals(Boolean.TRUE.toString()));
}
public int timezoneOffset() {
final String timezoneOffset = get(TIMEZONEOFFSET);
if (timezoneOffset == null) return 0;
try {
return Integer.parseInt(timezoneOffset);
} catch (NumberFormatException e) {
return 0;
}
}
/**
* get a recrawl date for a given age in minutes
* @param oldTimeMinutes

@ -531,7 +531,8 @@ public class CrawlQueues {
item.getDescriptions().size() > 0 ? item.getDescriptions().get(0) : "",
loaddate,
this.sb.crawler.defaultRemoteProfile.handle(),
0
0,
this.sb.crawler.defaultRemoteProfile.timezoneOffset()
));
} else {
CrawlQueues.log.warn("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason);

@ -359,10 +359,10 @@ public class Snapshots {
private static Date parseDate(String d) {
try {
return GenericFormatter.SHORT_MINUTE_FORMATTER.parse(d);
return GenericFormatter.SHORT_MINUTE_FORMATTER.parse(d, 0).getTime();
} catch (ParseException e) {
try {
return GenericFormatter.SHORT_DAY_FORMATTER.parse(d);
return GenericFormatter.SHORT_DAY_FORMATTER.parse(d, 0).getTime();
} catch (ParseException ee) {
return null;
}

@ -92,6 +92,7 @@ public class Request extends WorkflowJob
private Bitfield flags;
private String statusMessage;
private int initialHash; // to provide a object hash that does not change even if the url changes because of redirection
private int timezoneOffset;
public Request() {
// used only to create poison entries
@ -106,6 +107,7 @@ public class Request extends WorkflowJob
this.statusMessage = null;
this.initialHash = 0;
this.status = 0;
this.timezoneOffset = 0;
}
/**
@ -115,7 +117,7 @@ public class Request extends WorkflowJob
* @param referrerhash
*/
public Request(final DigestURL url, final byte[] referrerhash) {
this(null, url, referrerhash, null, null, null, 0);
this(null, url, referrerhash, null, null, null, 0, 0);
}
/**
@ -136,7 +138,8 @@ public class Request extends WorkflowJob
final String name,
final Date appdate,
final String profileHandle,
final int depth) {
final int depth,
final int timezoneOffset) {
// create new entry and store it into database
assert url != null;
assert profileHandle == null || profileHandle.length() == Word.commonHashLength : profileHandle
@ -150,6 +153,7 @@ public class Request extends WorkflowJob
this.appdate = (appdate == null) ? 0 : appdate.getTime();
this.profileHandle = profileHandle; // must not be null
this.depth = depth;
this.timezoneOffset = timezoneOffset;
this.flags = new Bitfield(rowdef.width(10));
this.statusMessage = "loaded(args)";
this.initialHash = url.hashCode();
@ -272,6 +276,10 @@ public class Request extends WorkflowJob
return this.depth;
}
public int timezoneOffset() {
return this.timezoneOffset;
}
public String profileHandle() {
// the handle of the crawl profile
assert this.profileHandle == null || this.profileHandle.length() == Word.commonHashLength : this.profileHandle + " != " + Word.commonHashLength;

@ -28,7 +28,6 @@ package net.yacy.crawler.retrieval;
import java.util.Date;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
@ -260,7 +259,7 @@ public class Response {
if (docDate == null) docDate = this.responseHeader.date();
}
if (docDate == null && this.request != null) docDate = this.request.appdate();
if (docDate == null) docDate = new Date(GenericFormatter.correctedUTCTime());
if (docDate == null) docDate = new Date();
return docDate;
}
@ -372,7 +371,7 @@ public class Response {
if (date == null) return "stale_no_date_given_in_response";
try {
final long ttl = 1000 * NumberTools.parseLongDecSubstring(cacheControl, 8); // milliseconds to live
if (GenericFormatter.correctedUTCTime() - date.getTime() > ttl) {
if (System.currentTimeMillis() - date.getTime() > ttl) {
//System.out.println("***not indexed because cache-control");
return "stale_expired";
}
@ -461,8 +460,8 @@ public class Response {
if (!this.responseHeader.containsKey(HeaderFramework.LAST_MODIFIED)) { return false; }
// parse date
Date d1, d2;
d2 = this.responseHeader.lastModified(); if (d2 == null) { d2 = new Date(GenericFormatter.correctedUTCTime()); }
d1 = this.requestHeader.ifModifiedSince(); if (d1 == null) { d1 = new Date(GenericFormatter.correctedUTCTime()); }
d2 = this.responseHeader.lastModified(); if (d2 == null) { d2 = new Date(); }
d1 = this.requestHeader.ifModifiedSince(); if (d1 == null) { d1 = new Date(); }
// finally, we shall treat the cache as stale if the modification time is after the if-.. time
if (d2.after(d1)) { return false; }
}
@ -501,9 +500,10 @@ public class Response {
// -expires in cached response
// the expires value gives us a very easy hint when the cache is stale
final Date expires = this.responseHeader.expires();
final Date now = new Date();
if (expires != null) {
// System.out.println("EXPIRES-TEST: expires=" + expires + ", NOW=" + serverDate.correctedGMTDate() + ", url=" + url);
if (expires.before(new Date(GenericFormatter.correctedUTCTime()))) { return false; }
if (expires.before(now)) { return false; }
}
final Date lastModified = this.responseHeader.lastModified();
cacheControl = this.responseHeader.get(HeaderFramework.CACHE_CONTROL);
@ -517,13 +517,13 @@ public class Response {
// file may only be treated as fresh for one more month, not more.
Date date = this.responseHeader.date();
if (lastModified != null) {
if (date == null) { date = new Date(GenericFormatter.correctedUTCTime()); }
if (date == null) { date = now; }
final long age = date.getTime() - lastModified.getTime();
if (age < 0) { return false; }
// TTL (Time-To-Live) is age/10 = (d2.getTime() - d1.getTime()) / 10
// the actual living-time is serverDate.correctedGMTDate().getTime() - d2.getTime()
// therefore the cache is stale, if serverDate.correctedGMTDate().getTime() - d2.getTime() > age/10
if (GenericFormatter.correctedUTCTime() - date.getTime() > age / 10) { return false; }
if (now.getTime() - date.getTime() > age / 10) { return false; }
}
// -cache-control in cached response
@ -542,7 +542,7 @@ public class Response {
if (date == null) { return false; }
try {
final long ttl = 1000 * NumberTools.parseLongDecSubstring(cacheControl, 8); // milliseconds to live
if (GenericFormatter.correctedUTCTime() - date.getTime() > ttl) {
if (now.getTime() - date.getTime() > ttl) {
return false;
}
} catch (final Exception e) {
@ -626,12 +626,11 @@ public class Response {
// -if-modified-since in request
// if the page is fresh at the very moment we can index it
final Date ifModifiedSince = this.ifModifiedSince();
final Date now = new Date();
if ((ifModifiedSince != null) && (this.responseHeader.containsKey(HeaderFramework.LAST_MODIFIED))) {
// parse date
Date d = this.responseHeader.lastModified();
if (d == null) {
d = new Date(GenericFormatter.correctedUTCTime());
}
if (d == null) d = now;
// finally, we shall treat the cache as stale if the modification time is after the if-.. time
if (d.after(ifModifiedSince)) {
//System.out.println("***not indexed because if-modified-since");
@ -655,7 +654,7 @@ public class Response {
// sometimes, the expires date is set to the past to prevent that a page is cached
// we use that information to see if we should index it
final Date expires = this.responseHeader.expires();
if (expires != null && expires.before(new Date(GenericFormatter.correctedUTCTime()))) {
if (expires != null && expires.before(now)) {
return "Stale_(Expired)";
}
@ -688,7 +687,7 @@ public class Response {
}
try {
final long ttl = 1000 * NumberTools.parseLongDecSubstring(cacheControl,8); // milliseconds to live
if (GenericFormatter.correctedUTCTime() - date.getTime() > ttl) {
if (now.getTime() - date.getTime() > ttl) {
//System.out.println("***not indexed because cache-control");
return "Stale_(expired_by_cache-control)";
}
@ -865,7 +864,7 @@ public class Response {
final String supportError = TextParser.supports(url(), this.responseHeader == null ? null : this.responseHeader.mime());
if (supportError != null) throw new Parser.Failure("no parser support:" + supportError, url());
try {
return TextParser.parseSource(new AnchorURL(url()), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? "UTF-8" : this.responseHeader.getCharacterEncoding(), new VocabularyScraper(), this.request.depth(), this.content);
return TextParser.parseSource(new AnchorURL(url()), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? "UTF-8" : this.responseHeader.getCharacterEncoding(), new VocabularyScraper(), this.request.timezoneOffset(), this.request.depth(), this.content);
} catch (final Exception e) {
return null;
}

@ -108,7 +108,8 @@ public class SitemapImporter extends Thread {
entry.url(),
entry.lastmod(new Date()),
this.crawlingProfile.handle(),
0
0,
this.crawlingProfile.timezoneOffset()
));
logger.info("New URL '" + entry.url() + "' added for loading.");
}

@ -210,7 +210,7 @@ public class BlogBoard {
}
try {
date = GenericFormatter.SHORT_SECOND_FORMATTER.parse(StrDate);
date = GenericFormatter.SHORT_SECOND_FORMATTER.parse(StrDate, 0).getTime();
} catch (final ParseException e1) {
date = new Date();
}
@ -404,7 +404,7 @@ public class BlogBoard {
}
return new Date();
}
return GenericFormatter.SHORT_SECOND_FORMATTER.parse(date);
return GenericFormatter.SHORT_SECOND_FORMATTER.parse(date, 0).getTime();
} catch (final ParseException ex) {
return new Date();
}

@ -139,7 +139,7 @@ public class BookmarkHelper {
final Set<String> tags=ListManager.string2set(tag); //this allow multiple default tags
try {
//load the links
final ContentScraper scraper = new ContentScraper(baseURL, 10000, new VocabularyScraper());
final ContentScraper scraper = new ContentScraper(baseURL, 10000, new VocabularyScraper(), 0);
//OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
final Writer writer = new TransformerWriter(null, null, scraper, null, false);
FileUtils.copy(input,writer);
@ -232,7 +232,7 @@ public class BookmarkHelper {
Date parsedDate = null;
try {
parsedDate = ISO8601Formatter.FORMATTER.parse(time);
parsedDate = ISO8601Formatter.FORMATTER.parse(time, 0).getTime();
} catch (final ParseException e) {
parsedDate = new Date();
}

@ -87,7 +87,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
}
//get words from document
final Map<String, Word> words = new Condenser(document, null, true, true, LibraryProvider.dymLib, false, false).words();
final Map<String, Word> words = new Condenser(document, null, true, true, LibraryProvider.dymLib, false, false, 0).words();
// generate potential tags from document title, description and subject
final int bufferSize = document.dc_title().length() + document.dc_description().length + document.dc_subject(' ').length() + 32;

@ -190,7 +190,8 @@ public class YMarkCrawlStart extends HashMap<String,String>{
CacheStrategy.IFFRESH,
"robot_" + CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName,
null); // TODO: make this a default profile in CrawlSwitchboard
null,
0); // TODO: make this a default profile in CrawlSwitchboard
sb.crawler.putActive(pe.handle().getBytes(), pe);
return sb.crawlStacker.stackCrawl(new Request(
sb.peers.mySeed().hash.getBytes(),
@ -198,7 +199,7 @@ public class YMarkCrawlStart extends HashMap<String,String>{
null,
"CRAWLING-ROOT",
new Date(),
pe.handle(), 0
pe.handle(), 0, pe.timezoneOffset()
));
}
}

@ -97,7 +97,8 @@ public final class Condenser {
final boolean indexMedia,
final WordCache meaningLib,
final boolean doAutotagging,
final boolean findDatesInContent
final boolean findDatesInContent,
final int timezoneOffset
) {
Thread.currentThread().setName("condenser-" + document.dc_identifier()); // for debugging
// if addMedia == true, then all the media links are also parsed and added to the words
@ -123,7 +124,7 @@ public final class Condenser {
Map.Entry<AnchorURL, String> entry;
if (indexText) {
String text = document.getTextString();
if (findDatesInContent) this.dates_in_content = DateDetection.parse(text);
if (findDatesInContent) this.dates_in_content = DateDetection.parse(text, timezoneOffset);
createCondensement(document.dc_source(), text, meaningLib, doAutotagging, scraper);
// the phrase counter:
// phrase 0 are words taken from the URL

@ -499,7 +499,7 @@ public class DateDetection {
* @param text
* @return a set of dates, ordered by time. first date in the ordered set is the oldest time.
*/
public static LinkedHashSet<Date> parse(String text) {
public static LinkedHashSet<Date> parse(String text, int timezoneOffset) {
Long offset;
if ((offset = specialDayOffset.get(text)) != null) {
LinkedHashSet<Date> dates = new LinkedHashSet<>(); dates.add(new Date((System.currentTimeMillis() / AbstractFormatter.dayMillis) * AbstractFormatter.dayMillis + offset.longValue())); return dates;
@ -513,7 +513,7 @@ public class DateDetection {
return dates;
}
public static Date parseLine(String text) {
public static Date parseLine(final String text, final int timezoneOffset) {
Date d = null;
try {d = CONFORM.parse(text);} catch (ParseException e) {}
//if (d == null) try {d = GenericFormatter.FORMAT_SHORT_DAY.parse(text);} catch (ParseException e) {} // did not work well and fired for wrong formats; do not use
@ -521,7 +521,7 @@ public class DateDetection {
if (d == null) try {d = GenericFormatter.FORMAT_ANSIC.parse(text);} catch (ParseException e) {}
if (d == null) {
Set<Date> dd = parse(text);
Set<Date> dd = parse(text, timezoneOffset);
if (dd.size() >= 1) d = dd.iterator().next();
}
return d;
@ -601,7 +601,7 @@ public class DateDetection {
};
long t = System.currentTimeMillis();
for (String s: test) {
String parsed = parse(fill + " " + s + " " + fill).toString();
String parsed = parse(fill + " " + s + " " + fill, 0).toString();
System.out.println("SOURCE: " + s);
System.out.println("DATE : " + parsed);
System.out.println();

@ -59,6 +59,7 @@ public interface Parser {
String mimeType,
String charset,
VocabularyScraper scraper,
int timezoneOffset,
InputStream source
) throws Parser.Failure, InterruptedException;
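
Every parser implementation now receives the offset and can forward it when interpreting dates found in a document; a hypothetical fragment from inside such a parse() implementation (illustrative only, dateString is an assumed variable holding e.g. a meta-tag date value):

    // normalize a zone-less date string from the document body using the crawl profile's offset
    final Date docDate = ISO8601Formatter.FORMATTER.parse(dateString, timezoneOffset).getTime();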

@ -167,6 +167,7 @@ public final class TextParser {
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
final File sourceFile
) throws InterruptedException, Parser.Failure {
@ -181,7 +182,7 @@ public final class TextParser {
throw new Parser.Failure(errorMsg, location);
}
sourceStream = new BufferedInputStream(new FileInputStream(sourceFile));
docs = parseSource(location, mimeType, charset, scraper, depth, sourceFile.length(), sourceStream);
docs = parseSource(location, mimeType, charset, scraper, timezoneOffset, depth, sourceFile.length(), sourceStream);
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e;
@ -199,6 +200,7 @@ public final class TextParser {
String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
final byte[] content
) throws Parser.Failure {
@ -214,7 +216,7 @@ public final class TextParser {
}
assert !idioms.isEmpty() : "no parsers applied for url " + location.toNormalform(true);
Document[] docs = parseSource(location, mimeType, idioms, charset, scraper, depth, content);
Document[] docs = parseSource(location, mimeType, idioms, charset, scraper, timezoneOffset, depth, content);
return docs;
}
@ -224,6 +226,7 @@ public final class TextParser {
String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
final long contentLength,
final InputStream sourceStream
@ -244,7 +247,7 @@ public final class TextParser {
// then we use only one stream-oriented parser.
if (idioms.size() == 1 || contentLength > Integer.MAX_VALUE) {
// use a specific stream-oriented parser
return parseSource(location, mimeType, idioms.iterator().next(), charset, scraper, sourceStream);
return parseSource(location, mimeType, idioms.iterator().next(), charset, scraper, timezoneOffset, sourceStream);
}
// in case that we know more parsers we first transform the content into a byte[] and use that as base
@ -255,7 +258,7 @@ public final class TextParser {
} catch (final IOException e) {
throw new Parser.Failure(e.getMessage(), location);
}
Document[] docs = parseSource(location, mimeType, idioms, charset, scraper, depth, b);
Document[] docs = parseSource(location, mimeType, idioms, charset, scraper, timezoneOffset, depth, b);
return docs;
}
@ -266,6 +269,7 @@ public final class TextParser {
final Parser parser,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream sourceStream
) throws Parser.Failure {
if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing '" + location + "' from stream");
@ -275,7 +279,7 @@ public final class TextParser {
if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'.");
try {
final Document[] docs = parser.parse(location, mimeType, documentCharset, scraper, sourceStream);
final Document[] docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, sourceStream);
return docs;
} catch (final Exception e) {
throw new Parser.Failure("parser failed: " + parser.getName(), location);
@ -288,6 +292,7 @@ public final class TextParser {
final Set<Parser> parsers,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
final byte[] sourceArray
) throws Parser.Failure {
@ -310,7 +315,7 @@ public final class TextParser {
bis = new ByteArrayInputStream(sourceArray);
}
try {
docs = parser.parse(location, mimeType, documentCharset, scraper, bis);
docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, bis);
} catch (final Parser.Failure e) {
failedParser.put(parser, e);
//log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);

@ -107,7 +107,7 @@ public class DCEntry extends MultiMapSolrParams {
if (d == null) return null;
if (d.isEmpty()) return null;
try {
Date x = ISO8601Formatter.FORMATTER.parse(d);
Date x = ISO8601Formatter.FORMATTER.parse(d, 0).getTime();
Date now = new Date();
return x.after(now) ? now : x;
} catch (final ParseException e) {

@ -524,7 +524,7 @@ public class MediawikiImporter extends Thread implements Importer {
public void genDocument() throws Parser.Failure {
try {
this.url = new AnchorURL(this.urlStub + this.title);
final Document[] parsed = TextParser.parseSource(this.url, "text/html", "UTF-8", new VocabularyScraper(), 1, UTF8.getBytes(this.html));
final Document[] parsed = TextParser.parseSource(this.url, "text/html", "UTF-8", new VocabularyScraper(), 0, 1, UTF8.getBytes(this.html));
this.document = Document.mergeDocuments(this.url, "text/html", parsed);
// the wiki parser is not able to find the proper title in the source text, so it must be set here
this.document.setTitle(this.title);

@ -158,7 +158,7 @@ public class ResumptionToken extends TreeMap<String, String> {
final String d = get("expirationDate");
if (d == null) return null;
try {
return ISO8601Formatter.FORMATTER.parse(d);
return ISO8601Formatter.FORMATTER.parse(d, 0).getTime();
} catch (final ParseException e) {
ConcurrentLog.logException(e);
return new Date();

@ -54,7 +54,13 @@ public class apkParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException {
public Document[] parse(
final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException {
/*
* things to discover:

@ -70,8 +70,13 @@ public class audioTagParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final VocabularyScraper scraper, final InputStream source)
public Document[] parse(
final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source)
throws Parser.Failure, InterruptedException {
String filename = location.getFileName();

@ -38,13 +38,19 @@ public class AugmentParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(AnchorURL url, String mimeType, String charset, final VocabularyScraper scraper, InputStream source) throws Parser.Failure, InterruptedException {
public Document[] parse(
final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException {
Document[] htmlDocs = this.rdfaParser.parse(url, mimeType, charset, scraper, source);
Document[] htmlDocs = this.rdfaParser.parse(location, mimeType, charset, scraper, timezoneOffset, source);
for (final Document doc : htmlDocs) {
/* analyze(doc, url, mimeType, charset); // enrich document text */
parseAndAugment(doc, url, mimeType, charset); // enrich document with additional tags
parseAndAugment(doc, location, mimeType, charset); // enrich document with additional tags
}
return htmlDocs;
}

@ -57,8 +57,13 @@ public class bzipParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final VocabularyScraper scraper, final InputStream source)
public Document[] parse(
final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source)
throws Parser.Failure, InterruptedException {
File tempFile = null;
@ -95,7 +100,7 @@ public class bzipParser extends AbstractParser implements Parser {
out.close();
// creating a new parser class to parse the unzipped content
docs = TextParser.parseSource(location, null, null, scraper, 999, tempFile);
docs = TextParser.parseSource(location, null, null, scraper, timezoneOffset, 999, tempFile);
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e;

@ -53,7 +53,13 @@ public class csvParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(AnchorURL location, String mimeType, String charset, final VocabularyScraper scraper, InputStream source) throws Parser.Failure, InterruptedException {
public Document[] parse(
final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException {
// construct a document using all cells of the document
// the first row is used as headline
// all lines are artificially terminated by a '.' to separate them as sentence for the condenser.

@ -59,8 +59,13 @@ public class docParser extends AbstractParser implements Parser {
@SuppressWarnings("deprecation")
@Override
public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final VocabularyScraper scraper, final InputStream source)
public Document[] parse(
final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source)
throws Parser.Failure, InterruptedException {
final WordExtractor extractor;

@ -61,7 +61,13 @@ public class dwgParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException {
public Document[] parse(
final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException {
// check memory for parser
if (!MemoryControl.request(200 * 1024 * 1024, true))

@ -46,8 +46,13 @@ public class genericParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final VocabularyScraper scraper, final InputStream source1)
public Document[] parse(
final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source)
throws Parser.Failure, InterruptedException {
String filename = location.getFileName();
final Document[] docs = new Document[]{new Document(

@ -56,7 +56,13 @@ public class gzipParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException {
public Document[] parse(
final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException {
File tempFile = null;
Document[] docs = null;
@ -80,7 +86,7 @@ public class gzipParser extends AbstractParser implements Parser {
out.close();
// creating a new parser class to parse the unzipped content
docs = TextParser.parseSource(location, null, null, scraper, 999, tempFile);
docs = TextParser.parseSource(location, null, null, scraper, timezoneOffset, 999, tempFile);
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e;

@ -188,6 +188,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private AnchorURL canonical, publisher;
private final int maxLinks;
private final VocabularyScraper vocabularyScraper;
private final int timezoneOffset;
private int breadcrumbs;
@ -213,7 +214,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
* @param classDetector a map from class names to vocabulary names to scrape content from the DOM with associated class name
*/
@SuppressWarnings("unchecked")
public ContentScraper(final DigestURL root, int maxLinks, final VocabularyScraper vocabularyScraper) {
public ContentScraper(final DigestURL root, int maxLinks, final VocabularyScraper vocabularyScraper, int timezoneOffset) {
// the root value here will not be used to load the resource.
// it is only the reference for relative links
super(linkTags0, linkTags1);
@ -221,6 +222,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.root = root;
this.maxLinks = maxLinks;
this.vocabularyScraper = vocabularyScraper;
this.timezoneOffset = timezoneOffset;
this.evaluationScores = new Evaluation();
this.rss = new SizeLimitedMap<DigestURL, String>(maxLinks);
this.css = new SizeLimitedMap<DigestURL, String>(maxLinks);
@ -389,12 +391,12 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (content != null) {
if ("startDate".equals(itemprop)) try {
// parse ISO 8601 date
Date startDate = ISO8601Formatter.FORMATTER.parse(content);
Date startDate = ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();
this.startDates.add(startDate);
} catch (ParseException e) {}
if ("endDate".equals(itemprop)) try {
// parse ISO 8601 date
Date endDate = ISO8601Formatter.FORMATTER.parse(content);
Date endDate = ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();
this.endDates.add(endDate);
} catch (ParseException e) {}
}
@ -651,7 +653,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
// start a new scraper to parse links inside this text
// parsing the content
final ContentScraper scraper = new ContentScraper(this.root, this.maxLinks, this.vocabularyScraper);
final ContentScraper scraper = new ContentScraper(this.root, this.maxLinks, this.vocabularyScraper, this.timezoneOffset);
final TransformerWriter writer = new TransformerWriter(null, null, scraper, null, false);
try {
FileUtils.copy(new CharArrayReader(inlineHtml), writer);
@ -1003,19 +1005,19 @@ public class ContentScraper extends AbstractScraper implements Scraper {
// <meta name="date" content="YYYY-MM-DD..." />
content = this.metas.get("date");
if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content);} catch (ParseException e) {}
if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();} catch (ParseException e) {}
// <meta name="DC.date" content="YYYY-MM-DD" />
content = this.metas.get("dc.date");
if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content);} catch (ParseException e) {}
if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();} catch (ParseException e) {}
// <meta name="DC:date" content="YYYY-MM-DD" />
content = this.metas.get("dc:date");
if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content);} catch (ParseException e) {}
if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();} catch (ParseException e) {}
// <meta http-equiv="last-modified" content="YYYY-MM-DD" />
content = this.metas.get("last-modified");
if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content);} catch (ParseException e) {}
if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();} catch (ParseException e) {}
return new Date();
}
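ContentScraper now hands its timezoneOffset to every ISO8601Formatter.parse call, so zone-less dates found in meta tags or microdata are interpreted with the crawl profile's offset instead of the server's local zone, and stored normalized to UTC. A standalone sketch of that normalization with java.time; this is not YaCy's ISO8601Formatter, and it assumes the browser getTimezoneOffset() sign convention (UTC minus local, in minutes).

import java.time.LocalDateTime;
import java.time.OffsetDateTime;
import java.time.ZoneOffset;
import java.time.format.DateTimeParseException;
import java.util.Date;

public class IsoDateSketch {
    public static Date parse(String text, int timezoneOffsetMinutes) {
        try {
            // e.g. "2015-02-08T12:00:00+01:00" -- an explicit zone wins, the offset is ignored
            return Date.from(OffsetDateTime.parse(text).toInstant());
        } catch (DateTimeParseException e) {
            // e.g. "2015-02-08T12:00:00" -- shift by the given offset to reach UTC
            LocalDateTime local = LocalDateTime.parse(text);
            return Date.from(local.plusMinutes(timezoneOffsetMinutes).toInstant(ZoneOffset.UTC));
        }
    }
}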
@ -1153,19 +1155,19 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
}
public static ContentScraper parseResource(final File file, final int maxLinks) throws IOException {
public static ContentScraper parseResource(final File file, final int maxLinks, final int timezoneOffset) throws IOException {
// load page
final byte[] page = FileUtils.read(file);
if (page == null) throw new IOException("no content in file " + file.toString());
// scrape document to look up charset
final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page), "UTF-8", new VocabularyScraper(), new DigestURL("http://localhost"), null, false, maxLinks);
final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page), "UTF-8", new VocabularyScraper(), new DigestURL("http://localhost"), null, false, maxLinks, timezoneOffset);
String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset());
htmlFilter.close();
if (charset == null) charset = Charset.defaultCharset().toString();
// scrape content
final ContentScraper scraper = new ContentScraper(new DigestURL("http://localhost"), maxLinks, new VocabularyScraper());
final ContentScraper scraper = new ContentScraper(new DigestURL("http://localhost"), maxLinks, new VocabularyScraper(), timezoneOffset);
final Writer writer = new TransformerWriter(null, null, scraper, null, false);
FileUtils.copy(new ByteArrayInputStream(page), writer, Charset.forName(charset));
writer.close();

@ -64,13 +64,14 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
final DigestURL rooturl,
final Transformer transformer,
final boolean passbyIfBinarySuspect,
final int maxLinks
final int maxLinks,
final int timezoneOffset
) {
// create a input stream for buffereing
this.bufferedIn = new BufferedInputStream(inStream, (int) preBufferSize);
this.bufferedIn.mark((int) preBufferSize);
final ContentScraper scraper = new ContentScraper(rooturl, maxLinks, vocabularyScraper);
final ContentScraper scraper = new ContentScraper(rooturl, maxLinks, vocabularyScraper, timezoneOffset);
scraper.registerHtmlFilterEventListener(this);
try {

@ -87,13 +87,15 @@ public class htmlParser extends AbstractParser implements Parser {
public Document[] parse(
final AnchorURL location,
final String mimeType,
final String documentCharset, final VocabularyScraper vocscraper,
final String documentCharset,
final VocabularyScraper vocscraper,
final int timezoneOffset,
final InputStream sourceStream) throws Parser.Failure, InterruptedException {
try {
// first get a document from the parsed html
Charset[] detectedcharsetcontainer = new Charset[]{null};
final ContentScraper scraper = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, sourceStream, maxLinks);
final ContentScraper scraper = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks);
// parseToScraper also detects/corrects/sets charset from html content tag
final Document document = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraper);
@ -151,7 +153,7 @@ public class htmlParser extends AbstractParser implements Parser {
return ppd;
}
public static ContentScraper parseToScraper(final DigestURL location, final String documentCharset, final VocabularyScraper vocabularyScraper, String input, int maxLinks) throws IOException {
public static ContentScraper parseToScraper(final DigestURL location, final String documentCharset, final VocabularyScraper vocabularyScraper, final int timezoneOffset, final String input, final int maxLinks) throws IOException {
Charset[] detectedcharsetcontainer = new Charset[]{null};
InputStream sourceStream;
try {
@ -161,7 +163,7 @@ public class htmlParser extends AbstractParser implements Parser {
}
ContentScraper scraper;
try {
scraper = parseToScraper(location, documentCharset, vocabularyScraper, detectedcharsetcontainer, sourceStream, maxLinks);
scraper = parseToScraper(location, documentCharset, vocabularyScraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks);
} catch (Failure e) {
throw new IOException(e.getMessage());
}
@ -173,6 +175,7 @@ public class htmlParser extends AbstractParser implements Parser {
final String documentCharset,
final VocabularyScraper vocabularyScraper,
Charset[] detectedcharsetcontainer,
final int timezoneOffset,
InputStream sourceStream,
final int maxLinks) throws Parser.Failure, IOException {
@ -188,7 +191,7 @@ public class htmlParser extends AbstractParser implements Parser {
if (charset == null) {
ScraperInputStream htmlFilter = null;
try {
htmlFilter = new ScraperInputStream(sourceStream, documentCharset, vocabularyScraper, location, null, false, maxLinks);
htmlFilter = new ScraperInputStream(sourceStream, documentCharset, vocabularyScraper, location, null, false, maxLinks, timezoneOffset);
sourceStream = htmlFilter;
charset = htmlFilter.detectCharset();
} catch (final IOException e1) {
@ -222,7 +225,7 @@ public class htmlParser extends AbstractParser implements Parser {
}
// parsing the content
final ContentScraper scraper = new ContentScraper(location, maxLinks, vocabularyScraper);
final ContentScraper scraper = new ContentScraper(location, maxLinks, vocabularyScraper, timezoneOffset);
final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false, Math.max(64, Math.min(4096, sourceStream.available())));
try {
FileUtils.copy(sourceStream, writer, detectedcharsetcontainer[0]);
@ -324,7 +327,7 @@ public class htmlParser extends AbstractParser implements Parser {
try {
url = new AnchorURL(args[0]);
final byte[] content = url.get(ClientIdentification.yacyInternetCrawlerAgent, null, null);
final Document[] document = new htmlParser().parse(url, "text/html", "utf-8", new VocabularyScraper(), new ByteArrayInputStream(content));
final Document[] document = new htmlParser().parse(url, "text/html", "utf-8", new VocabularyScraper(), 0, new ByteArrayInputStream(content));
final String title = document[0].dc_title();
System.out.println(title);
} catch (final MalformedURLException e) {

@ -93,8 +93,10 @@ public class genericImageParser extends AbstractParser implements Parser {
public Document[] parse(
final AnchorURL location,
final String mimeType,
final String documentCharset, final VocabularyScraper scraper,
final InputStream sourceStream) throws Parser.Failure, InterruptedException {
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException {
ImageInfo ii = null;
String title = null;
@ -108,7 +110,7 @@ public class genericImageParser extends AbstractParser implements Parser {
if (mimeType.equals("image/bmp") || ext.equals("bmp")) {
byte[] b;
try {
b = FileUtils.read(sourceStream);
b = FileUtils.read(source);
} catch (final IOException e) {
ConcurrentLog.logException(e);
throw new Parser.Failure(e.getMessage(), location);
@ -126,7 +128,7 @@ public class genericImageParser extends AbstractParser implements Parser {
// a tutorial is at: http://www.drewnoakes.com/drewnoakes.com/code/exif/sampleUsage.html
byte[] b;
try {
b = FileUtils.read(sourceStream);
b = FileUtils.read(source);
} catch (final IOException e) {
ConcurrentLog.logException(e);
throw new Parser.Failure(e.getMessage(), location);
@ -182,7 +184,7 @@ public class genericImageParser extends AbstractParser implements Parser {
// just ignore
}
} else {
ii = parseJavaImage(location, sourceStream);
ii = parseJavaImage(location, source);
}
final HashSet<String> languages = new HashSet<String>();
@ -315,7 +317,7 @@ public class genericImageParser extends AbstractParser implements Parser {
AnchorURL uri;
try {
uri = new AnchorURL("http://localhost/" + image.getName());
final Document[] document = parser.parse(uri, "image/" + MultiProtocolURL.getFileExtension(uri.getFileName()), "UTF-8", new VocabularyScraper(), new FileInputStream(image));
final Document[] document = parser.parse(uri, "image/" + MultiProtocolURL.getFileExtension(uri.getFileName()), "UTF-8", new VocabularyScraper(), 0, new FileInputStream(image));
System.out.println(document[0].toString());
} catch (final MalformedURLException e) {
e.printStackTrace();

@ -87,8 +87,10 @@ public class metadataImageParser extends AbstractParser implements Parser {
public Document[] parse(
final AnchorURL location,
final String mimeType,
final String documentCharset, final VocabularyScraper scraper,
final InputStream sourceStream) throws Parser.Failure, InterruptedException {
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException {
String title = null;
String author = null;
@ -99,7 +101,7 @@ public class metadataImageParser extends AbstractParser implements Parser {
StringBuilder imgInfotxt = new StringBuilder();
try {
final Metadata metadata = ImageMetadataReader.readMetadata(new BufferedInputStream(sourceStream));
final Metadata metadata = ImageMetadataReader.readMetadata(new BufferedInputStream(source));
final Iterator<Directory> directories = metadata.getDirectories().iterator();
final HashMap<String, String> props = new HashMap<String, String>();
@ -160,7 +162,7 @@ public class metadataImageParser extends AbstractParser implements Parser {
return new Document[]{new Document(
location,
mimeType,
documentCharset,
charset,
this,
new HashSet<String>(0), // languages
keywords == null ? new String[]{} : keywords.split(keywords.indexOf(',') > 0 ? "," : " "), // keywords

@ -59,11 +59,16 @@ public class linkScraperParser extends AbstractParser implements Parser {
this.SUPPORTED_MIME_TYPES.add("text/sgml");
}
@Override
public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final VocabularyScraper scraper, final InputStream source)
public Document[] parse(
final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source)
throws Parser.Failure, InterruptedException {
Document[] htmlParserDocs = new htmlParser().parse(location, mimeType, charset, scraper, source);
Document[] htmlParserDocs = new htmlParser().parse(location, mimeType, charset, scraper, timezoneOffset, source);
Document htmlParserDoc = htmlParserDocs == null ? null : Document.mergeDocuments(location, mimeType, htmlParserDocs);

@ -71,8 +71,13 @@ public class mmParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final VocabularyScraper scraper, final InputStream source)
public Document[] parse(
final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source)
throws Parser.Failure, InterruptedException
{
final StringBuilder sb = new StringBuilder();

@ -216,7 +216,13 @@ public class odtParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException {
public Document[] parse(
final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException {
File dest = null;
try {
// creating a tempfile

@ -202,7 +202,13 @@ public class ooxmlParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException {
public Document[] parse(
final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException {
File dest = null;
try {
// creating a tempfile

@ -86,7 +86,13 @@ public class pdfParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException {
public Document[] parse(
final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException {
// check memory for parser
if (!MemoryControl.request(200 * 1024 * 1024, false))
@ -376,7 +382,7 @@ public class pdfParser extends AbstractParser implements Parser {
final AbstractParser parser = new pdfParser();
Document document = null;
try {
document = Document.mergeDocuments(null, "application/pdf", parser.parse(null, "application/pdf", null, new VocabularyScraper(), new FileInputStream(pdfFile)));
document = Document.mergeDocuments(null, "application/pdf", parser.parse(null, "application/pdf", null, new VocabularyScraper(), 0, new FileInputStream(pdfFile)));
} catch (final Parser.Failure e) {
System.err.println("Cannot parse file " + pdfFile.getAbsolutePath());
ConcurrentLog.logException(e);

@ -64,8 +64,13 @@ public class pptParser extends AbstractParser implements Parser {
* all extracted information about the parsed document
*/
@Override
public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure,
public Document[] parse(
final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Parser.Failure,
InterruptedException {
try {
/*

@ -258,8 +258,13 @@ public class psParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final VocabularyScraper scraper, final InputStream source)
public Document[] parse(
final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source)
throws Parser.Failure, InterruptedException {
File tempFile = null;

@ -46,8 +46,13 @@ public class rdfParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final AnchorURL url, final String mimeType,
final String charset, final VocabularyScraper scraper, final InputStream source)
public Document[] parse(
final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source)
throws Failure, InterruptedException {
@ -60,7 +65,7 @@ public class rdfParser extends AbstractParser implements Parser {
Document doc;
String all = "rdfdatasource";
doc = new Document(url, mimeType, charset, null, null, null, singleList(""), "",
doc = new Document(location, mimeType, charset, null, null, null, singleList(""), "",
"", null, new ArrayList<String>(0), 0, 0, all, null, null, null, false, new Date());
docs.add(doc);

@ -48,11 +48,16 @@ public class RDFaParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(AnchorURL url, String mimeType,
String charset, final VocabularyScraper scraper, InputStream source) throws Failure,
public Document[] parse(
final AnchorURL url,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Failure,
InterruptedException {
Document[] htmlDocs = parseHtml(url, mimeType, charset, scraper, source);
Document[] htmlDocs = parseHtml(url, mimeType, charset, scraper, timezoneOffset, source);
// TODO: current hardcoded restriction: apply rdfa parser only on selected sources.
@ -97,13 +102,18 @@ public class RDFaParser extends AbstractParser implements Parser {
return doc;
}
private Document[] parseHtml(AnchorURL url, String mimeType,
String charset, VocabularyScraper scraper, InputStream source) throws Failure,
private Document[] parseHtml(
final AnchorURL url,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Failure,
InterruptedException {
Document[] htmlDocs = null;
try {
htmlDocs = this.hp.parse(url, mimeType, charset, scraper, source);
htmlDocs = this.hp.parse(url, mimeType, charset, scraper, timezoneOffset, source);
source.reset();
} catch (final IOException e1) {
@ -180,7 +190,7 @@ public class RDFaParser extends AbstractParser implements Parser {
if (aReader != null) {
RDFaParser aParser = new RDFaParser();
try {
aParser.parse(new AnchorURL(args[0]), "", "", new VocabularyScraper(), aURL.openStream());
aParser.parse(new AnchorURL(args[0]), "", "", new VocabularyScraper(), 0, aURL.openStream());
} catch (final FileNotFoundException e) {
e.printStackTrace();
} catch (final IOException e) {

@ -59,14 +59,19 @@ public class rssParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final AnchorURL feedurl, final String mimeType,
final String charset, final VocabularyScraper scraper, final InputStream source)
public Document[] parse(
final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source)
throws Failure, InterruptedException {
RSSReader rssReader;
try {
rssReader = new RSSReader(RSSFeed.DEFAULT_MAXSIZE, source);
} catch (final IOException e) {
throw new Parser.Failure("Load error:" + e.getMessage(), feedurl, e);
throw new Parser.Failure("Load error:" + e.getMessage(), location, e);
}
final RSSFeed feed = rssReader.getFeed();

@ -53,8 +53,13 @@ public class rtfParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final VocabularyScraper scraper, final InputStream source)
public Document[] parse(
final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source)
throws Parser.Failure, InterruptedException {
try {

@ -56,7 +56,12 @@ public class sevenzipParser extends AbstractParser implements Parser {
this.SUPPORTED_MIME_TYPES.add("application/x-7z-compressed");
}
public Document parse(final AnchorURL location, final String mimeType, final String charset, final IInStream source) throws Parser.Failure, InterruptedException {
public Document parse(
final AnchorURL location,
final String mimeType,
final String charset,
final int timezoneOffset,
final IInStream source) throws Parser.Failure, InterruptedException {
final Document doc = new Document(
location,
mimeType,
@ -83,7 +88,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
} catch (final IOException e) {
throw new Parser.Failure("error opening 7zip archive: " + e.getMessage(), location);
}
final SZParserExtractCallback aec = new SZParserExtractCallback(AbstractParser.log, archive, doc, location.getFile());
final SZParserExtractCallback aec = new SZParserExtractCallback(AbstractParser.log, archive, doc, location.getFile(), timezoneOffset);
AbstractParser.log.fine("processing archive contents...");
try {
archive.Extract(null, -1, 0, aec);
@ -101,16 +106,27 @@ public class sevenzipParser extends AbstractParser implements Parser {
}
}
public Document parse(final AnchorURL location, final String mimeType, final String charset, final byte[] source) throws Parser.Failure, InterruptedException {
return parse(location, mimeType, charset, new ByteArrayIInStream(source));
public Document parse(
final AnchorURL location,
final String mimeType,
final String charset,
final int timezoneOffset,
final byte[] source) throws Parser.Failure, InterruptedException {
return parse(location, mimeType, charset, timezoneOffset, new ByteArrayIInStream(source));
}
@Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException {
public Document[] parse(
final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException {
try {
final ByteArrayOutputStream cfos = new ByteArrayOutputStream();
FileUtils.copy(source, cfos);
return new Document[]{parse(location, mimeType, charset, cfos.toByteArray())};
return new Document[]{parse(location, mimeType, charset, timezoneOffset, cfos.toByteArray())};
} catch (final IOException e) {
throw new Parser.Failure("error processing 7zip archive: " + e.getMessage(), location);
}
@ -124,13 +140,19 @@ public class sevenzipParser extends AbstractParser implements Parser {
private ByteArrayOutputStream cfos = null;
private final Document doc;
private final String prefix;
private final int timezoneOffset;
public SZParserExtractCallback(final ConcurrentLog logger, final IInArchive handler,
final Document doc, final String prefix) {
public SZParserExtractCallback(
final ConcurrentLog logger,
final IInArchive handler,
final Document doc,
final String prefix,
final int timezoneOffset) {
super.Init(handler);
this.log = logger;
this.doc = doc;
this.prefix = prefix;
this.timezoneOffset = timezoneOffset;
}
@Override
@ -172,7 +194,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
// below for reversion of the effects
final AnchorURL url = AnchorURL.newAnchor(this.doc.dc_source(), this.prefix + "/" + super.filePath);
final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1));
theDocs = TextParser.parseSource(url, mime, null, new VocabularyScraper(), this.doc.getDepth() + 1, this.cfos.toByteArray());
theDocs = TextParser.parseSource(url, mime, null, new VocabularyScraper(), timezoneOffset, this.doc.getDepth() + 1, this.cfos.toByteArray());
this.doc.addSubDocuments(theDocs);
}

@ -58,8 +58,13 @@ public class sidAudioParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final VocabularyScraper scraper, final InputStream source)
public Document[] parse(
final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source)
throws Parser.Failure, InterruptedException {
try {
final int available = source.available();

@ -70,8 +70,13 @@ public class sitemapParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final AnchorURL url, final String mimeType,
final String charset, final VocabularyScraper scraper, final InputStream source)
public Document[] parse(
final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source)
throws Failure, InterruptedException {
final List<Document> docs = new ArrayList<Document>();
SitemapReader sitemap = new SitemapReader(source, ClientIdentification.yacyInternetCrawlerAgent);
@ -83,7 +88,7 @@ public class sitemapParser extends AbstractParser implements Parser {
uri = new DigestURL(item.loc);
doc = new Document(
uri,
TextParser.mimeOf(url),
TextParser.mimeOf(location),
charset,
this,
null,
@ -224,7 +229,7 @@ public class sitemapParser extends AbstractParser implements Parser {
public Date lastmod(final Date dflt) {
try {
return ISO8601Formatter.FORMATTER.parse(this.lastmod);
return ISO8601Formatter.FORMATTER.parse(this.lastmod, 0).getTime();
} catch (final ParseException e) {
return dflt;
}
@ -245,7 +250,7 @@ public class sitemapParser extends AbstractParser implements Parser {
public Date lastmod(final Date dflt) {
try {
return ISO8601Formatter.FORMATTER.parse(this.lastmod);
return ISO8601Formatter.FORMATTER.parse(this.lastmod, 0).getTime();
} catch (final ParseException e) {
return dflt;
}

@ -56,8 +56,13 @@ public class swfParser extends AbstractParser implements Parser {
* all extracted information about the parsed document
*/
@Override
public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final VocabularyScraper scraper, final InputStream source)
public Document[] parse(
final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source)
throws Parser.Failure, InterruptedException
{

@ -62,16 +62,22 @@ public class tarParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final AnchorURL url, final String mimeType, final String charset, final VocabularyScraper scraper, InputStream source) throws Parser.Failure, InterruptedException {
public Document[] parse(
final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
InputStream source) throws Parser.Failure, InterruptedException {
final List<Document> docacc = new ArrayList<Document>();
Document[] subDocs = null;
final String ext = MultiProtocolURL.getFileExtension(url.getFileName());
final String ext = MultiProtocolURL.getFileExtension(location.getFileName());
if (ext.equals("gz") || ext.equals("tgz")) {
try {
source = new GZIPInputStream(source);
} catch (final IOException e) {
throw new Parser.Failure("tar parser: " + e.getMessage(), url);
throw new Parser.Failure("tar parser: " + e.getMessage(), location);
}
}
TarEntry entry;
@ -91,7 +97,7 @@ public class tarParser extends AbstractParser implements Parser {
try {
tmp = FileUtils.createTempFile(this.getClass(), name);
FileUtils.copy(tis, tmp, entry.getSize());
subDocs = TextParser.parseSource(AnchorURL.newAnchor(url, "#" + name), mime, null, scraper, 999, tmp);
subDocs = TextParser.parseSource(AnchorURL.newAnchor(location, "#" + name), mime, null, scraper, timezoneOffset, 999, tmp);
if (subDocs == null) continue;
for (final Document d: subDocs) docacc.add(d);
} catch (final Parser.Failure e) {

@ -57,7 +57,13 @@ public class torrentParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(AnchorURL location, String mimeType, String charset, final VocabularyScraper scraper, InputStream source)
public Document[] parse(
final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source)
throws Parser.Failure, InterruptedException {
byte[] b = null;
try {
@ -120,8 +126,8 @@ public class torrentParser extends AbstractParser implements Parser {
try {
byte[] b = FileUtils.read(new File(args[0]));
torrentParser parser = new torrentParser();
Document[] d = parser.parse(new AnchorURL("http://localhost/test.torrent"), null, "UTF-8", new VocabularyScraper(), new ByteArrayInputStream(b));
Condenser c = new Condenser(d[0], null, true, true, LibraryProvider.dymLib, false, false);
Document[] d = parser.parse(new AnchorURL("http://localhost/test.torrent"), null, "UTF-8", new VocabularyScraper(), 0, new ByteArrayInputStream(b));
Condenser c = new Condenser(d[0], null, true, true, LibraryProvider.dymLib, false, false, 0);
Map<String, Word> w = c.words();
for (Map.Entry<String, Word> e: w.entrySet()) System.out.println("Word: " + e.getKey() + " - " + e.getValue().posInText);
} catch (final IOException e) {

@ -66,7 +66,13 @@ public class vcfParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final AnchorURL url, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source)
public Document[] parse(
final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source)
throws Parser.Failure, InterruptedException {
try {
@ -201,7 +207,7 @@ public class vcfParser extends AbstractParser implements Parser {
} else {
if (AbstractParser.log.isFinest()) AbstractParser.log.finest("Invalid data in vcf file" +
"\n\tURL: " + url +
"\n\tURL: " + location +
"\n\tLine: " + line +
"\n\tLine-Nr: " + lineNr);
}
@ -212,7 +218,7 @@ public class vcfParser extends AbstractParser implements Parser {
final byte[] text = UTF8.getBytes(parsedDataText.toString());
final List<String> descriptions = new ArrayList<String>(1); descriptions.add("vCard");
return new Document[]{new Document(
url, // url of the source document
location, // url of the source document
mimeType, // the documents mime type
null, // charset
this,
@ -234,7 +240,7 @@ public class vcfParser extends AbstractParser implements Parser {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e;
throw new Parser.Failure("Unexpected error while parsing vcf resource. " + e.getMessage(),url);
throw new Parser.Failure("Unexpected error while parsing vcf resource. " + e.getMessage(), location);
}
}

@ -67,7 +67,13 @@ public class vsdParser extends AbstractParser implements Parser {
* all extracted information about the parsed document
*/
@Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source)
public Document[] parse(
final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source)
throws Parser.Failure, InterruptedException {
Document theDoc = null;

@ -68,8 +68,13 @@ public class xlsParser extends AbstractParser implements Parser {
* all extracted information about the parsed document
*/
@Override
public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure,
public Document[] parse(
final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Parser.Failure,
InterruptedException {
return new XLSHSSFListener().parse(location, mimeType, charset, source);
}

@ -62,12 +62,17 @@ public class zipParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final AnchorURL url, final String mimeType,
final String charset, final VocabularyScraper scraper, final InputStream source)
public Document[] parse(
final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source)
throws Parser.Failure, InterruptedException {
// check memory for parser
if (!MemoryControl.request(200 * 1024 * 1024, false))
throw new Parser.Failure("Not enough Memory available for zip parser: " + MemoryControl.available(), url);
throw new Parser.Failure("Not enough Memory available for zip parser: " + MemoryControl.available(), location);
Document[] docs = null;
final List<Document> docacc = new ArrayList<Document>();
@ -88,9 +93,9 @@ public class zipParser extends AbstractParser implements Parser {
try {
tmp = FileUtils.createTempFile(this.getClass(), name);
FileUtils.copy(zis, tmp, entry.getSize());
final DigestURL virtualURL = DigestURL.newURL(url, "#" + name);
final DigestURL virtualURL = DigestURL.newURL(location, "#" + name);
//this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false));
docs = TextParser.parseSource(new AnchorURL(virtualURL), mime, null, scraper, 999, tmp);
docs = TextParser.parseSource(new AnchorURL(virtualURL), mime, null, scraper, timezoneOffset, 999, tmp);
if (docs == null) continue;
for (final Document d: docs) docacc.add(d);
} catch (final Parser.Failure e) {

@ -74,7 +74,8 @@ public class ProxyCacheHandler extends AbstractRemoteHandler implements Handler
"",
cachedResponseHeader.lastModified(),
sb.crawler.defaultProxyProfile.handle(),
0);
0,
sb.crawler.defaultProxyProfile.timezoneOffset());
final Response cachedResponse = new Response(
yacyRequest,

@ -180,7 +180,8 @@ public class ProxyHandler extends AbstractRemoteHandler implements Handler {
"",
responseHeaderLegacy.lastModified(),
sb.crawler.defaultProxyProfile.handle(),
0); //sizeBeforeDelete < 0 ? 0 : sizeBeforeDelete);
0,
sb.crawler.defaultProxyProfile.timezoneOffset()); //sizeBeforeDelete < 0 ? 0 : sizeBeforeDelete);
final Response yacyResponse = new Response(
yacyRequest,
null,

@ -137,7 +137,7 @@ public class SolrSelectServlet extends HttpServlet {
if (!mmsp.getMap().containsKey(CommonParams.Q) && mmsp.getMap().containsKey(CommonParams.QUERY)) {
querystring = mmsp.get(CommonParams.QUERY, "");
mmsp.getMap().remove(CommonParams.QUERY);
QueryModifier modifier = new QueryModifier();
QueryModifier modifier = new QueryModifier(0);
querystring = modifier.parse(querystring);
modifier.apply(mmsp);
QueryGoal qg = new QueryGoal(querystring);

@ -172,7 +172,7 @@ public class ArrayStack implements BLOB {
f.delete();
deletions = true;
} else try {
d = GenericFormatter.SHORT_SECOND_FORMATTER.parse(file.substring(0, 14));
d = GenericFormatter.SHORT_SECOND_FORMATTER.parse(file.substring(0, 14), 0).getTime();
f.renameTo(newBLOB(d));
deletions = true;
} catch (final ParseException e) {continue;}
@ -188,7 +188,7 @@ public class ArrayStack implements BLOB {
for (final String file : files) {
if (file.length() >= 22 && file.charAt(this.prefix.length()) == '.' && file.endsWith(".blob")) {
try {
d = my_SHORT_MILSEC_FORMATTER.parse(file.substring(this.prefix.length() + 1, this.prefix.length() + 18));
d = my_SHORT_MILSEC_FORMATTER.parse(file.substring(this.prefix.length() + 1, this.prefix.length() + 18), 0).getTime();
time = d.getTime();
if (time > maxtime) maxtime = time;
} catch (final ParseException e) {continue;}
@ -199,7 +199,7 @@ public class ArrayStack implements BLOB {
for (final String file : files) {
if (file.length() >= 22 && file.charAt(this.prefix.length()) == '.' && file.endsWith(".blob")) {
try {
d = my_SHORT_MILSEC_FORMATTER.parse(file.substring(this.prefix.length() + 1, this.prefix.length() + 18));
d = my_SHORT_MILSEC_FORMATTER.parse(file.substring(this.prefix.length() + 1, this.prefix.length() + 18), 0).getTime();
f = new File(heapLocation, file);
time = d.getTime();
try {
@ -253,7 +253,7 @@ public class ArrayStack implements BLOB {
public synchronized void mountBLOB(final File location, final boolean full) throws IOException {
Date d;
try {
d = my_SHORT_MILSEC_FORMATTER.parse(location.getName().substring(this.prefix.length() + 1, this.prefix.length() + 18));
d = my_SHORT_MILSEC_FORMATTER.parse(location.getName().substring(this.prefix.length() + 1, this.prefix.length() + 18), 0).getTime();
} catch (final ParseException e) {
throw new IOException("date parse problem with file " + location.toString() + ": " + e.getMessage());
}

@ -95,7 +95,7 @@ public class BEncodedHeapBag extends AbstractMapStore implements MapStore {
(element.length() == this.prefix.length() + 23)) {
f = new File(this.baseDir, element);
try {
d = GenericFormatter.SHORT_MILSEC_FORMATTER.parse(element.substring(this.prefix.length() + 1, this.prefix.length() + 18));
d = GenericFormatter.SHORT_MILSEC_FORMATTER.parse(element.substring(this.prefix.length() + 1, this.prefix.length() + 18), 0).getTime();
} catch (final ParseException e) {
ConcurrentLog.severe("BEncodedHeapBag", "", e);
continue;
@ -203,7 +203,7 @@ public class BEncodedHeapBag extends AbstractMapStore implements MapStore {
final String name = heap.getFile().getName();
long d;
try {
d = GenericFormatter.SHORT_MILSEC_FORMATTER.parse(name.substring(this.prefix.length() + 1, this.prefix.length() + 18)).getTime();
d = GenericFormatter.SHORT_MILSEC_FORMATTER.parse(name.substring(this.prefix.length() + 1, this.prefix.length() + 18), 0).getTime().getTime();
} catch (final ParseException e) {
ConcurrentLog.severe("BEncodedHeapBag", "", e);
d = 0;

@ -764,7 +764,7 @@ public class Tables implements Iterable<String> {
final byte[] r = this.get(colname);
if (r == null) return dflt;
try {
return my_SHORT_MILSEC_FORMATTER.parse(UTF8.String(r));
return my_SHORT_MILSEC_FORMATTER.parse(UTF8.String(r), 0).getTime();
} catch (final ParseException e) {
return dflt;
}
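Behind all of these call sites is the changed formatter return type: parse(text, timezoneOffset) now yields a Calendar rather than a Date (the NewsDB hunk below assigns the result to a Calendar), so callers append .getTime() where a Date is stored and .getTime().getTime() where epoch milliseconds are needed. A minimal sketch of the two access patterns:

import java.util.Calendar;
import java.util.Date;
import java.util.TimeZone;

public class CalendarChainSketch {
    public static void main(String[] args) {
        // stands in for GenericFormatter/ISO8601Formatter parse(text, 0), which now yields a Calendar
        Calendar c = Calendar.getInstance(TimeZone.getTimeZone("UTC"));
        Date asDate   = c.getTime();            // where a Date is stored (Solr fields, Tables)
        long asMillis = c.getTime().getTime();  // where epoch millis are compared (BLOB file names)
        System.out.println(asDate + " / " + asMillis);
    }
}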

@ -107,17 +107,17 @@ public class URIMetadataNode extends SolrDocument {
final GenericFormatter formatter = new GenericFormatter(GenericFormatter.FORMAT_SHORT_DAY, GenericFormatter.time_minute);
try {
this.setField(CollectionSchema.last_modified.name(), formatter.parse(prop.getProperty("mod", "20000101")));
this.setField(CollectionSchema.last_modified.name(), formatter.parse(prop.getProperty("mod", "20000101"), 0).getTime());
} catch (final ParseException e) {
this.setField(CollectionSchema.last_modified.name(), new Date());
}
try {
this.setField(CollectionSchema.load_date_dt.name(), formatter.parse(prop.getProperty("load", "20000101")));
this.setField(CollectionSchema.load_date_dt.name(), formatter.parse(prop.getProperty("load", "20000101"), 0).getTime());
} catch (final ParseException e) {
this.setField(CollectionSchema.load_date_dt.name(), new Date());
}
try {
this.setField(CollectionSchema.fresh_date_dt.name(), formatter.parse(prop.getProperty("fresh", "20000101")));
this.setField(CollectionSchema.fresh_date_dt.name(), formatter.parse(prop.getProperty("fresh", "20000101"), 0).getTime());
} catch (final ParseException e) {
this.setField(CollectionSchema.fresh_date_dt.name(), new Date());
}

@ -179,7 +179,7 @@ public class SplitTable implements Index, Iterable<Row.Entry> {
(element.length() == this.prefix.length() + 24)) {
f = new File(this.path, element);
try {
d = GenericFormatter.SHORT_MILSEC_FORMATTER.parse(element.substring(this.prefix.length() + 1, this.prefix.length() + 18));
d = GenericFormatter.SHORT_MILSEC_FORMATTER.parse(element.substring(this.prefix.length() + 1, this.prefix.length() + 18), 0).getTime();
} catch (final ParseException e) {
ConcurrentLog.severe("SplitTable", "", e);
continue;
@ -372,7 +372,7 @@ public class SplitTable implements Index, Iterable<Row.Entry> {
final String name = new File(table.filename()).getName();
long d;
try {
d = GenericFormatter.SHORT_MILSEC_FORMATTER.parse(name.substring(this.prefix.length() + 1, this.prefix.length() + 18)).getTime();
d = GenericFormatter.SHORT_MILSEC_FORMATTER.parse(name.substring(this.prefix.length() + 1, this.prefix.length() + 18), 0).getTime().getTime();
} catch (final ParseException e) {
ConcurrentLog.severe("SplitTable", "", e);
d = 0;

@ -46,6 +46,8 @@ package net.yacy.peers;
import java.io.File;
import java.io.IOException;
import java.text.ParseException;
import java.util.Calendar;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
@ -164,10 +166,16 @@ public class NewsDB {
private Record b2r(final Row.Entry b) {
if (b == null) return null;
Calendar c;
try {
c = b.empty(2) ? null : my_SHORT_SECOND_FORMATTER.parse(b.getColASCII(2), 0);
} catch (ParseException e) {
c = null;
}
return new NewsDB.Record(
b.getPrimaryKeyASCII(),
b.getColUTF8(1),
(b.empty(2)) ? null : my_SHORT_SECOND_FORMATTER.parse(b.getColASCII(2), GenericFormatter.UTCDiffString()),
c == null ? null : c.getTime(),
(int) b.getColLong(3),
MapTools.string2map(b.getColUTF8(4), ",")
);
@ -226,8 +234,8 @@ public class NewsDB {
public class Record {
private final String originator; // hash of originating peer
private final Date created; // Date when news was created by originator
private final Date received; // Date when news was received here at this peer
private Date created; // Date when news was created by originator
private Date received; // Date when news was received here at this peer
private final String category; // keyword that addresses possible actions
private int distributed; // counter that counts number of distributions of this news record
private final Map<String, String> attributes; // elements of the news for a special category
@ -238,8 +246,16 @@ public class NewsDB {
if (this.attributes.toString().length() > NewsDB.this.attributesMaxLength) throw new IllegalArgumentException("attributes length (" + this.attributes.toString().length() + ") exceeds maximum (" + NewsDB.this.attributesMaxLength + ")");
this.category = (this.attributes.containsKey("cat")) ? this.attributes.get("cat") : "";
if (this.category.length() > NewsDB.categoryStringLength) throw new IllegalArgumentException("category length (" + this.category.length() + ") exceeds maximum (" + NewsDB.categoryStringLength + ")");
this.received = (this.attributes.containsKey("rec")) ? my_SHORT_SECOND_FORMATTER.parse(this.attributes.get("rec"), GenericFormatter.UTCDiffString()) : new Date();
this.created = (this.attributes.containsKey("cre")) ? my_SHORT_SECOND_FORMATTER.parse(this.attributes.get("cre"), GenericFormatter.UTCDiffString()) : new Date();
try {
this.received = (this.attributes.containsKey("rec")) ? my_SHORT_SECOND_FORMATTER.parse(this.attributes.get("rec"), 0).getTime() : new Date();
} catch (ParseException e) {
this.received = new Date();
}
try {
this.created = (this.attributes.containsKey("cre")) ? my_SHORT_SECOND_FORMATTER.parse(this.attributes.get("cre"), 0).getTime() : new Date();
} catch (ParseException e) {
this.created = new Date();
}
this.distributed = (this.attributes.containsKey("dis")) ? Integer.parseInt(this.attributes.get("dis")) : 0;
this.originator = (this.attributes.containsKey("ori")) ? this.attributes.get("ori") : "";
removeStandards();
@ -262,7 +278,11 @@ public class NewsDB {
if (attributes.toString().length() > NewsDB.this.attributesMaxLength) throw new IllegalArgumentException("attributes length (" + attributes.toString().length() + ") exceeds maximum (" + NewsDB.this.attributesMaxLength + ")");
this.attributes = attributes;
this.received = received;
this.created = my_SHORT_SECOND_FORMATTER.parse(id.substring(0, GenericFormatter.PATTERN_SHORT_SECOND.length()), GenericFormatter.UTCDiffString());
try {
this.created = my_SHORT_SECOND_FORMATTER.parse(id.substring(0, GenericFormatter.PATTERN_SHORT_SECOND.length()), 0).getTime();
} catch (ParseException e) {
this.created = new Date();
}
this.category = category;
this.distributed = distributed;
this.originator = id.substring(GenericFormatter.PATTERN_SHORT_SECOND.length());
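The Record fields created and received lose their final modifier only because the reworked parse call can throw ParseException; each block falls back to the current time. A compact standalone sketch of that parse-or-now fallback, with SimpleDateFormat standing in for GenericFormatter's yyyyMMddHHmmss short-second pattern:

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.TimeZone;

public class ParseOrNowSketch {
    static Date parseOrNow(String stamp) {
        if (stamp == null) return new Date();
        SimpleDateFormat shortSecond = new SimpleDateFormat("yyyyMMddHHmmss");
        shortSecond.setTimeZone(TimeZone.getTimeZone("UTC")); // all stored dates are UTC
        try {
            return shortSecond.parse(stamp);
        } catch (ParseException e) {
            return new Date(); // malformed stamp: fall back to "now"
        }
    }
}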

@ -797,7 +797,7 @@ public class Seed implements Cloneable, Comparable<Seed>, Comparator<Seed>
try {
final GenericFormatter my_SHORT_SECOND_FORMATTER =
new GenericFormatter(GenericFormatter.FORMAT_SHORT_SECOND, GenericFormatter.time_second); // use our own formatter to prevent concurrency locks with other processes
final long t = my_SHORT_SECOND_FORMATTER.parse(get(Seed.LASTSEEN, "20040101000000")).getTime();
final long t = my_SHORT_SECOND_FORMATTER.parse(get(Seed.LASTSEEN, "20040101000000"), 0).getTime().getTime();
// getTime creates a UTC time number. But in this case java thinks, that the given
// time string is a local time, which has a local UTC offset applied.
// Therefore java subtracts the local UTC offset, to get a UTC number.
@ -831,7 +831,7 @@ public class Seed implements Cloneable, Comparable<Seed>, Comparator<Seed>
try {
final GenericFormatter my_SHORT_SECOND_FORMATTER =
new GenericFormatter(GenericFormatter.FORMAT_SHORT_SECOND, GenericFormatter.time_second); // use our own formatter to prevent concurrency locks with other processes
b = my_SHORT_SECOND_FORMATTER.parse(get(Seed.BDATE, "20040101000000")).getTime();
b = my_SHORT_SECOND_FORMATTER.parse(get(Seed.BDATE, "20040101000000"), 0).getTime().getTime();
} catch (final ParseException e ) {
b = System.currentTimeMillis();
}

@ -503,7 +503,7 @@ public class WebStructureGraph {
hr =
new HostReference(
ASCII.getBytes(sentry.hosthash),
GenericFormatter.SHORT_DAY_FORMATTER.parse(sentry.date).getTime(),
GenericFormatter.SHORT_DAY_FORMATTER.parse(sentry.date, 0).getTime().getTime(),
refhosthashandcounter.getValue().intValue());
} catch (final ParseException e ) {
continue refloop;

@ -112,21 +112,24 @@ public final class LoaderDispatcher {
final boolean forText,
final boolean global
) {
CrawlProfile profile =
(forText) ?
((global) ?
this.sb.crawler.defaultTextSnippetGlobalProfile :
this.sb.crawler.defaultTextSnippetLocalProfile)
:
((global) ?
this.sb.crawler.defaultMediaSnippetGlobalProfile :
this.sb.crawler.defaultMediaSnippetLocalProfile);
return new Request(
ASCII.getBytes(this.sb.peers.mySeed().hash),
url,
null,
"",
new Date(),
(forText) ?
((global) ?
this.sb.crawler.defaultTextSnippetGlobalProfile.handle() :
this.sb.crawler.defaultTextSnippetLocalProfile.handle())
:
((global) ?
this.sb.crawler.defaultMediaSnippetGlobalProfile.handle() :
this.sb.crawler.defaultMediaSnippetLocalProfile.handle()), // crawl profile
0);
profile.handle(),
0,
profile.timezoneOffset());
}
public void load(final DigestURL url, final CacheStrategy cacheStratgy, final int maxFileSize, final File targetFile, BlacklistType blacklistType, ClientIdentification.Agent agent) throws IOException {
@ -407,7 +410,7 @@ public final class LoaderDispatcher {
* @return a map from URLs to the anchor texts of the urls
* @throws IOException
*/
public final Map<AnchorURL, String> loadLinks(final AnchorURL url, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
public final Map<AnchorURL, String> loadLinks(final AnchorURL url, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final ClientIdentification.Agent agent, final int timezoneOffset) throws IOException {
final Response response = load(request(url, true, false), cacheStrategy, Integer.MAX_VALUE, blacklistType, agent);
if (response == null) throw new IOException("response == null");
final ResponseHeader responseHeader = response.getResponseHeader();
@ -418,7 +421,7 @@ public final class LoaderDispatcher {
final String supportError = TextParser.supports(url, responseHeader.mime());
if (supportError != null) throw new IOException("no parser support: " + supportError);
try {
documents = TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), response.profile().scraper(), response.depth(), response.getContent());
documents = TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), response.profile().scraper(), timezoneOffset, response.depth(), response.getContent());
if (documents == null) throw new IOException("document == null");
} catch (final Exception e) {
throw new IOException("parser error: " + e.getMessage());

@ -152,7 +152,7 @@ public class EventTracker {
}
public long getTime() {
if (this.time instanceof String) try {
return GenericFormatter.SHORT_SECOND_FORMATTER.parse((String) this.time).getTime();
return GenericFormatter.SHORT_SECOND_FORMATTER.parse((String) this.time, 0).getTime().getTime();
} catch (ParseException e) {
return -1L;
}
@ -162,7 +162,7 @@ public class EventTracker {
}
public Date getDate() {
if (this.time instanceof String) try {
return GenericFormatter.SHORT_SECOND_FORMATTER.parse((String) this.time);
return GenericFormatter.SHORT_SECOND_FORMATTER.parse((String) this.time, 0).getTime();
} catch (ParseException e) {
return null;
}
if (this.time instanceof Long) return new Date((Long) this.time);


@ -1942,7 +1942,8 @@ public final class Switchboard extends serverSwitch {
"",
surrogate.getDate(),
this.crawler.defaultSurrogateProfile.handle(),
0);
0,
this.crawler.defaultSurrogateProfile.timezoneOffset());
response = new Response(request, null, null, this.crawler.defaultSurrogateProfile, false, null);
final IndexingQueueEntry queueEntry =
new IndexingQueueEntry(response, new Document[] {document}, null);
@ -2571,6 +2572,7 @@ public final class Switchboard extends serverSwitch {
response.getMimeType(),
response.getCharacterEncoding(),
response.profile().scraper(),
response.profile().timezoneOffset(),
response.depth(),
response.getContent());
if ( documents == null ) {
@ -2673,7 +2675,8 @@ public final class Switchboard extends serverSwitch {
nextEntry.getValue(),
new Date(),
response.profile().handle(),
nextdepth));
nextdepth,
response.profile().timezoneOffset()));
} catch (final MalformedURLException e ) {
ConcurrentLog.logException(e);
}
@ -2754,7 +2757,8 @@ public final class Switchboard extends serverSwitch {
in.documents[i], in.queueEntry.profile().scraper(), in.queueEntry.profile().indexText(),
in.queueEntry.profile().indexMedia(),
LibraryProvider.dymLib, true,
this.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.dates_in_content_dts));
this.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.dates_in_content_dts),
profile.timezoneOffset());
// update image result list statistics
// its good to do this concurrently here, because it needs a DNS lookup
@ -3043,7 +3047,15 @@ public final class Switchboard extends serverSwitch {
int p = userInfo == null ? -1 : userInfo.indexOf(':');
String user = userInfo == null ? FTPClient.ANONYMOUS : userInfo.substring(0, p);
String pw = userInfo == null || p == -1 ? "anomic" : userInfo.substring(p + 1);
this.crawlStacker.enqueueEntriesFTP(this.peers.mySeed().hash.getBytes(), profile.handle(), url.getHost(), url.getPort(), user, pw, false);
this.crawlStacker.enqueueEntriesFTP(
this.peers.mySeed().hash.getBytes(),
profile.handle(),
url.getHost(),
url.getPort(),
user,
pw,
false,
profile.timezoneOffset());
return null;
} catch (final Exception e) {
// mist
@ -3080,7 +3092,8 @@ public final class Switchboard extends serverSwitch {
"CRAWLING-ROOT",
new Date(),
profile.handle(),
0
0,
profile.timezoneOffset()
));
if (reasonString != null) return reasonString;
@ -3134,7 +3147,7 @@ public final class Switchboard extends serverSwitch {
* @throws IOException
* @throws Parser.Failure
*/
public void addToIndex(final Collection<DigestURL> urls, final SearchEvent searchEvent, final String heuristicName, final Map<String, Pattern> collections, boolean doublecheck) {
public void addToIndex(final Collection<DigestURL> urls, final SearchEvent searchEvent, final String heuristicName, final Map<String, Pattern> collections, final boolean doublecheck) {
Map<String, DigestURL> urlmap = new HashMap<String, DigestURL>();
for (DigestURL url: urls) urlmap.put(ASCII.String(url.hash()), url);
if (searchEvent != null) {
@ -3192,7 +3205,7 @@ public final class Switchboard extends serverSwitch {
}
final Condenser condenser = new Condenser(
document, null, true, true, LibraryProvider.dymLib, true,
Switchboard.this.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.dates_in_content_dts));
Switchboard.this.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.dates_in_content_dts), searchEvent.query.timezoneOffset);
ResultImages.registerImages(url, document, true);
Switchboard.this.webStructure.generateCitationReference(url, document);
storeDocumentIndex(
@ -3546,7 +3559,7 @@ public final class Switchboard extends serverSwitch {
final Map<AnchorURL, String> links;
searchEvent.oneFeederStarted();
try {
links = Switchboard.this.loader.loadLinks(url, CacheStrategy.NOCACHE, BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent);
links = Switchboard.this.loader.loadLinks(url, CacheStrategy.NOCACHE, BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent, searchEvent.query.timezoneOffset);
if ( links != null ) {
final Iterator<AnchorURL> i = links.keySet().iterator();
while ( i.hasNext() ) {
@ -3585,7 +3598,7 @@ public final class Switchboard extends serverSwitch {
final Map<AnchorURL, String> links;
DigestURL url;
try {
links = Switchboard.this.loader.loadLinks(startUrl, CacheStrategy.IFFRESH, BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent);
links = Switchboard.this.loader.loadLinks(startUrl, CacheStrategy.IFFRESH, BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent, 0);
if (links != null) {
if (links.size() < 1000) { // limit to 1000 to skip large index pages
final Iterator<AnchorURL> i = links.keySet().iterator();

@ -61,18 +61,27 @@ public class DocumentIndex extends Segment {
} catch (final MalformedURLException e ) {
}
}
BlockingQueue<AnchorURL> queue; // a queue of document ID's
private BlockingQueue<AnchorURL> queue; // a queue of document ID's
private final Worker[] worker;
CallbackListener callback;
private CallbackListener callback;
private int timezoneOffset;
static final ThreadGroup workerThreadGroup = new ThreadGroup("workerThreadGroup");
public DocumentIndex(final File segmentPath, final File archivePath, final File collectionConfigurationPath, final File webgraphConfigurationPath, final CallbackListener callback, final int cachesize)
public DocumentIndex(
final File segmentPath,
final File archivePath,
final File collectionConfigurationPath,
final File webgraphConfigurationPath,
final CallbackListener callback,
final int cachesize,
final int timezoneOffset)
throws IOException {
super(new ConcurrentLog("DocumentIndex"), segmentPath, archivePath,
collectionConfigurationPath == null ? null : new CollectionConfiguration(collectionConfigurationPath, true),
webgraphConfigurationPath == null ? null : new WebgraphConfiguration(webgraphConfigurationPath, true)
);
this.timezoneOffset = timezoneOffset;
super.connectRWI(cachesize, targetFileSize * 4 - 1);
super.connectCitation(cachesize, targetFileSize * 4 - 1);
super.fulltext().connectLocalSolr();
@ -99,7 +108,7 @@ public class DocumentIndex extends Segment {
try {
while ( (f = DocumentIndex.this.queue.take()) != poison ) {
try {
resultRows = add(f);
resultRows = add(f, DocumentIndex.this.timezoneOffset);
for ( final SolrInputDocument resultRow : resultRows ) {
if ( DocumentIndex.this.callback != null ) {
if ( resultRow == null ) {
@ -132,7 +141,7 @@ public class DocumentIndex extends Segment {
this.queue.clear();
}
private SolrInputDocument[] add(final AnchorURL url) throws IOException {
private SolrInputDocument[] add(final AnchorURL url, final int timezoneOffset) throws IOException {
if ( url == null ) {
throw new IOException("file = null");
}
@ -150,7 +159,7 @@ public class DocumentIndex extends Segment {
length = -1;
}
try {
documents = TextParser.parseSource(url, null, null, new VocabularyScraper(), 0, length, url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent, null, null));
documents = TextParser.parseSource(url, null, null, new VocabularyScraper(), timezoneOffset, 0, length, url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent, null, null));
} catch (final Exception e ) {
throw new IOException("cannot parse " + url.toNormalform(false) + ": " + e.getMessage());
}
@ -159,7 +168,7 @@ public class DocumentIndex extends Segment {
int c = 0;
for ( final Document document : documents ) {
if (document == null) continue;
final Condenser condenser = new Condenser(document, null, true, true, LibraryProvider.dymLib, true, true);
final Condenser condenser = new Condenser(document, null, true, true, LibraryProvider.dymLib, true, true, 0);
rows[c++] =
super.storeDocument(
url,

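For reference, a minimal caller sketch for the widened DocumentIndex constructor above; only the parameter order is taken from this hunk, while the paths, cache size and the use of null for the optional arguments are illustrative assumptions, not part of this commit:

    // hypothetical caller, not taken from this commit
    // offset in minutes, JavaScript getTimezoneOffset() convention (west of UTC positive)
    final int timezoneOffset = -TimeZone.getDefault().getOffset(System.currentTimeMillis()) / 60000;
    DocumentIndex index = new DocumentIndex(
            new File("DATA/INDEX/webportal/SEGMENTS"), // segmentPath (example path)
            new File("DATA/ARCHIVE"),                  // archivePath (example path)
            null,                                      // collectionConfigurationPath (optional per the super() call)
            null,                                      // webgraphConfigurationPath (optional per the super() call)
            null,                                      // callback (the worker null-checks it)
            100000,                                    // cachesize (example value)
            timezoneOffset);                           // passed on to TextParser.parseSource in add()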
@ -761,7 +761,7 @@ public class Segment {
}
// get the word set
Set<String> words = null;
words = new Condenser(document, null, true, true, null, false, false).words().keySet();
words = new Condenser(document, null, true, true, null, false, false, 0).words().keySet();
// delete all word references
int count = 0;

@ -315,7 +315,7 @@ public class AccessTracker {
byte[] b = new byte[GenericFormatter.PATTERN_SHORT_SECOND.length()];
raf.readFully(b);
try {
return GenericFormatter.SHORT_SECOND_FORMATTER.parse(UTF8.String(b));
return GenericFormatter.SHORT_SECOND_FORMATTER.parse(UTF8.String(b), 0).getTime();
} catch (ParseException e) {
throw new IOException(e.getMessage());
}
@ -326,8 +326,8 @@ public class AccessTracker {
String file = args[0];
Date from;
try {
from = GenericFormatter.SHORT_SECOND_FORMATTER.parse(args[1]);
Date to = GenericFormatter.SHORT_SECOND_FORMATTER.parse(args[2]);
from = GenericFormatter.SHORT_SECOND_FORMATTER.parse(args[1], 0).getTime();
Date to = GenericFormatter.SHORT_SECOND_FORMATTER.parse(args[2], 0).getTime();
List<EventTracker.Event> dump = readLog(new File(file), from, to);
for (EventTracker.Event s: dump) System.out.println(s.toString());
} catch (ParseException e) {

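The two AccessTracker hunks above reflect the changed formatter API: parse() now also takes the offset in minutes and, judging from the added .getTime() calls, no longer returns a Date directly but a Calendar-like object. A short sketch under that assumption; the timestamp literal is an example in the PATTERN_SHORT_SECOND layout (assumed to be yyyyMMddHHmmss):

    // sketch only; the second argument (0) means no offset, i.e. the timestamp is taken as UTC
    Date from = GenericFormatter.SHORT_SECOND_FORMATTER.parse("20150101120000", 0).getTime();
    Date to   = GenericFormatter.SHORT_SECOND_FORMATTER.parse("20150102120000", 0).getTime();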
@ -41,8 +41,10 @@ public class QueryModifier {
private final StringBuilder modifier;
public String sitehost, sitehash, filetype, protocol, language, author, collection, on, from, to;
public int timezoneOffset;
public QueryModifier() {
public QueryModifier(final int timezoneOffset) {
this.timezoneOffset = timezoneOffset;
this.sitehash = null;
this.sitehost = null;
this.filetype = null;
@ -274,19 +276,19 @@ public class QueryModifier {
if (fq.indexOf(CollectionSchema.dates_in_content_dts.getSolrFieldName()) < 0) {
if (this.on != null && this.on.length() > 0) {
fq.append(" AND ").append(QueryModifier.parseOnExpression(this.on));
fq.append(" AND ").append(QueryModifier.parseOnExpression(this.on, this.timezoneOffset));
}
if (this.from != null && this.from.length() > 0 && (this.to == null || this.to.equals("*"))) {
fq.append(" AND ").append(QueryModifier.parseFromToExpression(this.from, null));
fq.append(" AND ").append(QueryModifier.parseFromToExpression(this.from, null, this.timezoneOffset));
}
if ((this.from == null || this.from.equals("*")) && this.to != null && this.to.length() > 0) {
fq.append(" AND ").append(QueryModifier.parseFromToExpression(null, this.to));
fq.append(" AND ").append(QueryModifier.parseFromToExpression(null, this.to, this.timezoneOffset));
}
if (this.from != null && this.from.length() > 0 && this.to != null && this.to.length() > 0) {
fq.append(" AND ").append(QueryModifier.parseFromToExpression(this.from, this.to));
fq.append(" AND ").append(QueryModifier.parseFromToExpression(this.from, this.to, this.timezoneOffset));
}
}
@ -348,9 +350,9 @@ public class QueryModifier {
return fq.toString();
}
public static String parseOnExpression(String onDescription) {
public static String parseOnExpression(final String onDescription, final int timezoneOffset) {
assert onDescription != null;
Date onDate = DateDetection.parseLine(onDescription);
Date onDate = DateDetection.parseLine(onDescription, timezoneOffset);
StringBuilder filterQuery = new StringBuilder(20);
if (onDate != null) {
@SuppressWarnings({ "deprecation", "static-access" })
@ -360,9 +362,9 @@ public class QueryModifier {
return filterQuery.toString();
}
public static String parseFromToExpression(String from, String to) {
Date fromDate = from == null || from.equals("*") ? null : DateDetection.parseLine(from);
Date toDate = to == null || to.equals("*") ? null : DateDetection.parseLine(to);
public static String parseFromToExpression(final String from, final String to, final int timezoneOffset) {
Date fromDate = from == null || from.equals("*") ? null : DateDetection.parseLine(from, timezoneOffset);
Date toDate = to == null || to.equals("*") ? null : DateDetection.parseLine(to, timezoneOffset);
StringBuilder filterQuery = new StringBuilder(20);
if (fromDate != null && toDate != null) {
@SuppressWarnings({ "deprecation", "static-access" })

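The QueryModifier hunks above thread the offset into DateDetection.parseLine so that on:, from: and to: expressions are normalized before they reach the Solr filter query. A minimal sketch of that normalization, as an assumption about how the offset is applied rather than code from this commit:

    // timezoneOffset in minutes, JavaScript getTimezoneOffset() convention:
    // add it to the local wall-clock time to obtain UTC (east of UTC negative, west positive)
    long localMillis = parsedLocalDate.getTime();                   // date parsed without a zone
    Date utcDate = new Date(localMillis + timezoneOffset * 60000L); // instant stored in the index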
@ -70,7 +70,6 @@ import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrQuery.SortClause;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.DisMaxParams;
import org.apache.solr.common.params.FacetParams;
import org.apache.solr.schema.TrieDateField;
@ -146,6 +145,7 @@ public final class QueryParams {
public LinkedHashSet<String> facetfields;
private SolrQuery cachedQuery;
private CollectionConfiguration solrSchema;
public final int timezoneOffset;
public QueryParams(
final QueryGoal queryGoal,
@ -154,6 +154,7 @@ public final class QueryParams {
final String prefer,
final ContentDomain contentdom,
final String language,
final int timezoneOffset,
final Collection<Tagging.Metatag> metatags,
final CacheStrategy snippetCacheStrategy,
final int itemsPerPage,
@ -183,6 +184,7 @@ public final class QueryParams {
this.ranking = ranking;
this.maxDistance = maxDistance;
this.contentdom = contentdom;
this.timezoneOffset = timezoneOffset;
this.itemsPerPage = Math.min((specialRights) ? 10000 : 1000, itemsPerPage);
this.offset = Math.max(0, Math.min((specialRights) ? 10000 - this.itemsPerPage : 1000 - this.itemsPerPage, offset));
try {
@ -527,19 +529,19 @@ public final class QueryParams {
if (this.solrSchema.contains(CollectionSchema.dates_in_content_dts)) {
if (this.modifier.on != null && this.modifier.on.length() > 0) {
fqs.add(QueryModifier.parseOnExpression(this.modifier.on));
fqs.add(QueryModifier.parseOnExpression(this.modifier.on, this.timezoneOffset));
}
if (this.modifier.from != null && this.modifier.from.length() > 0 && (this.modifier.to == null || this.modifier.to.equals("*"))) {
fqs.add(QueryModifier.parseFromToExpression(this.modifier.from, null));
fqs.add(QueryModifier.parseFromToExpression(this.modifier.from, null, this.timezoneOffset));
}
if ((this.modifier.from == null || this.modifier.from.equals("*")) && this.modifier.to != null && this.modifier.to.length() > 0) {
fqs.add(QueryModifier.parseFromToExpression(null, this.modifier.to));
fqs.add(QueryModifier.parseFromToExpression(null, this.modifier.to, this.timezoneOffset));
}
if (this.modifier.from != null && this.modifier.from.length() > 0 && this.modifier.to != null && this.modifier.to.length() > 0) {
fqs.add(QueryModifier.parseFromToExpression(this.modifier.from, this.modifier.to));
fqs.add(QueryModifier.parseFromToExpression(this.modifier.from, this.modifier.to, this.timezoneOffset));
}
}

@ -358,7 +358,8 @@ public final class HTTPDProxyHandler {
"",
cachedResponseHeader.lastModified(),
sb.crawler.defaultProxyProfile.handle(),
0);
0,
sb.crawler.defaultProxyProfile.timezoneOffset());
final Response response = new Response(
request,
requestHeader,
@ -473,8 +474,8 @@ public final class HTTPDProxyHandler {
"",
responseHeader.lastModified(),
sb.crawler.defaultProxyProfile.handle(),
0);
0,
sb.crawler.defaultProxyProfile.timezoneOffset());
// handle incoming cookies
handleIncomingCookies(responseHeader, host, ip);
