From 39d0f10ca1dffbf088d681c8b708d41ea47c4aaa Mon Sep 17 00:00:00 2001 From: fuchsi Date: Fri, 30 Nov 2007 20:47:27 +0000 Subject: [PATCH] Fix parsing of dates in HTTP headers. RFC 2616 requires a client to support RFC 1123 (default), RFC 1036 and ANSI C formatted date strings (we only supported 1123 before). Closes: http://forum.yacy-websuche.de/viewtopic.php?f=6&t=525 (and probably others). There are servers which break the standards, please report those "DATE ERROR" messages if they contain a "sane" date string. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4243 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/http/httpDate.java | 141 ++++++++++++++++++++++++++ source/de/anomic/http/httpHeader.java | 47 +-------- 2 files changed, 143 insertions(+), 45 deletions(-) create mode 100644 source/de/anomic/http/httpDate.java diff --git a/source/de/anomic/http/httpDate.java b/source/de/anomic/http/httpDate.java new file mode 100644 index 000000000..a95615d31 --- /dev/null +++ b/source/de/anomic/http/httpDate.java @@ -0,0 +1,141 @@ +// httpDate.java +// ------------------------------ +// part of YaCy +// (C) by Bjoern 'Fuchs' Krombholz; fox.box@gmail.com +// first published on http://www.anomic.de +// Frankfurt, Germany, 2005, 2006 +// +// This Class was written by Martin Thelian +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// Using this software in any meaning (reading, learning, copying, compiling, +// running) means that you agree that the Author(s) is (are) not responsible +// for cost, loss of data or any harm that may be caused directly or indirectly +// by usage of this softare or this documentation. The usage of this software +// is on your own risk. The installation and usage (starting/running) of this +// software may allow other people or application to access your computer and +// any attached devices and is highly dependent on the configuration of the +// software which must be done by the user of the software; the author(s) is +// (are) also not responsible for proper configuration and usage of the +// software, even if provoked by documentation provided together with +// the software. +// +// Any changes to this file according to the GPL as documented in the file +// gpl.txt aside this file in the shipment you received can be done to the +// lines that follows this copyright notice here, but changes must not be +// done inside the copyright notive above. A re-distribution must contain +// the intact and unchanged copyright notice. +// Contributions and changes to the program code must be marked as such. 
+ +package de.anomic.http; + +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.Calendar; +import java.util.Date; +import java.util.Locale; +import java.util.TimeZone; + +import de.anomic.server.logging.serverLog; + +/** + * Helper class for parsing HTTP Dates according to RFC 2616 * + */ +public final class httpDate { + + public static String PAT_DATE_RFC1123 = "EEE, dd MMM yyyy HH:mm:ss zzz"; + public static String PAT_DATE_RFC1036 = "EEEE, dd-MMM-yy HH:mm:ss zzz"; + public static String PAT_DATE_ANSI = "EEE MMM d HH:mm:ss yyyy"; + + /** + * RFC 2616 requires that HTTP clients are able to parse all 3 different + * formats. All times MUST be in GMT/UTC, but ... + */ + public static SimpleDateFormat[] DATE_PARSERS = new SimpleDateFormat[] { + // RFC 1123/822 (Standard) "Mon, 12 Nov 2007 10:11:12 GMT" + new SimpleDateFormat(PAT_DATE_RFC1123, Locale.US), + // RFC 1036/850 (old) "Monday, 12-Nov-07 10:11:12 GMT" + new SimpleDateFormat(PAT_DATE_RFC1036, Locale.US), + // ANSI C asctime() "Mon Nov 12 10:11:12 2007" + new SimpleDateFormat(PAT_DATE_ANSI, Locale.US), + }; + + static { + // 2-digit dates are automatically parsed by SimpleDateFormat, + // we need to detect the real year by adding 1900 or 2000 to + // the year value starting with 1990 (before there was no WWW) + Calendar c = Calendar.getInstance(TimeZone.getTimeZone("GMT")); + // 01 Jan 1990 00:00:00 + c.set(1990, 1, 1, 0, 0, 0); + + for (int i = 0; i < DATE_PARSERS.length; i++) { + SimpleDateFormat f = DATE_PARSERS[i]; + // is this necessary? + f.setTimeZone(TimeZone.getTimeZone("GMT")); + f.set2DigitYearStart(c.getTime()); + } + } + + private httpDate() {}; + + /** + * Parse a HTTP string representation of a date into a Date instance. + * @param s The date String to parse. + * @return The Date instance if successful, null otherwise. 
+ */ + public static Date parseHTTPDate(String s) { + try { + return httpDate.parseHTTPDate(s, true); + } catch (ParseException e) { + serverLog.logSevere("HTTPC-header", "DATE ERROR (Parse): " + s); + return null; + } catch (java.lang.NumberFormatException e) { + serverLog.logSevere("HTTPC-header", "DATE ERROR (NumberFormat): " + s); + return null; + } + } + + /** + * Parse a HTTP string representation of a date into a Date instance. + * @param s The date String to parse. + * @param ignoreTimezone parse the timezone? Currently ignored, always parsed. + * @return The Date instance if successful, null otherwise. + * @throws ParseException Thrown, when a parsing problem occured (date String had no leagal format) + * @throws NumberFormatException + */ + public static Date parseHTTPDate(String s, boolean /*unused*/ ignoreTimezone) throws ParseException { + + if ((s == null) || (s.length() < 9)) return null; + s = s.trim(); + + //Why was this here? + //if (s.indexOf("Mrz") > 0) s = s.replaceAll("Mrz", "March"); + + ParseException pe = null; + for(int i = 0; i < DATE_PARSERS.length; i++) { + try { + // if parse() throws an Exception we try the next pattern + return DATE_PARSERS[i].parse(s); + } catch (ParseException e) { + // we re-throw the last Exception when parsing was not possible + pe = e; + } + } + + // no match + throw pe; + } +} diff --git a/source/de/anomic/http/httpHeader.java b/source/de/anomic/http/httpHeader.java index 78e7255f9..6869582fa 100644 --- a/source/de/anomic/http/httpHeader.java +++ b/source/de/anomic/http/httpHeader.java @@ -61,20 +61,16 @@ import java.io.FileReader; import java.io.IOException; import java.net.MalformedURLException; import java.text.Collator; -import java.text.ParseException; -import java.text.SimpleDateFormat; import java.util.Date; import java.util.HashMap; import java.util.Iterator; import java.util.Locale; import java.util.Map; import java.util.Properties; -import java.util.TimeZone; import java.util.TreeMap; import 
java.util.Vector; import de.anomic.server.serverCore; -import de.anomic.server.logging.serverLog; import de.anomic.yacy.yacyURL; @@ -388,48 +384,9 @@ public final class httpHeader extends TreeMap implements Map { Server=Apache/1.3.26 */ - //private static SimpleDateFormat HTTPGMTFormatter = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss 'GMT'"); - private static SimpleDateFormat EMLFormatter = new SimpleDateFormat("dd MMM yyyy HH:mm:ss", Locale.US); - - public static Date parseHTTPDate(String s) { - try { - return parseHTTPDate(s,true); - } catch (ParseException e) { - //System.out.println("ERROR long version parse: " + e.getMessage() + " at position " + e.getErrorOffset()); - serverLog.logSevere("HTTPC-header", "DATE ERROR (Parse): " + s); - return null; - } catch (java.lang.NumberFormatException e) { - //System.out.println("ERROR long version parse: " + e.getMessage() + " at position " + e.getErrorOffset()); - serverLog.logSevere("HTTPC-header", "DATE ERROR (NumberFormat): " + s); - return null; - } - } - - public static Date parseHTTPDate(String s,boolean ignoreTimezone) throws ParseException, NumberFormatException { - - SimpleDateFormat formatter = EMLFormatter; - if ((s == null) || (s.length() < 9)) return null; - s = s.trim(); - if (s.charAt(3) == ',') s = s.substring(5).trim(); // we skip the name of the day - if (s.charAt(9) == ' ') s = s.substring(0, 7) + "20" + s.substring(7); // short year version - if (s.charAt(2) == ',') s = s.substring(0, 2) + s.substring(3); // ommit comma after day of week - if ((s.charAt(0) > '9') && (s.length() > 20) && (s.charAt(2) == ' ')) s = s.substring(3); - if (s.length() > 20) { - if (!ignoreTimezone) { - formatter = (SimpleDateFormat) formatter.clone(); - formatter.setTimeZone(TimeZone.getTimeZone(s.substring(20))); - } - s = s.substring(0, 20).trim(); // truncate remaining, since that must be wrong - } - if (s.indexOf("Mrz") > 0) s = s.replaceAll("Mrz", "March"); - - // parsing the date string - return 
formatter.parse(s); - } - private Date headerDate(String kind) { if (containsKey(kind)) { - Date parsedDate = parseHTTPDate((String) get(kind)); + Date parsedDate = httpDate.parseHTTPDate((String) get(kind)); if (parsedDate == null) parsedDate = new Date(); return new Date(parsedDate.getTime()); } @@ -483,7 +440,7 @@ public final class httpHeader extends TreeMap implements Map { public Object ifRange() { if (containsKey(httpHeader.IF_RANGE)) { try { - Date rangeDate = parseHTTPDate((String) get(httpHeader.IF_RANGE),false); + Date rangeDate = httpDate.parseHTTPDate((String) get(httpHeader.IF_RANGE),false); if (rangeDate != null) return new Date(rangeDate.getTime()); } catch (Exception e) {} return get(httpHeader.IF_RANGE);