- removed dependencies in header framework:

moved http date methods from DateFormatter to HeaderFramework
  changed logging to log4j
- added ftp load access to MultiProtocolURI
- ensured termination of RSS feed iteration

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7067 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 15 years ago
parent 80ba543d4c
commit 844f158686

@ -64,7 +64,7 @@ public class Load_RSS_p {
prop.put("url", url.toNormalform(true, false));
Response entry = sb.loader.load(sb.loader.request(url, true, false), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
byte[] resource = entry == null ? null : entry.getContent();
rss = resource == null ? null : RSSReader.parse(resource);
rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
} catch (IOException e) {
Log.logException(e);
}

@ -34,10 +34,8 @@ import net.yacy.kelondro.logging.Log;
import net.yacy.repository.Blacklist;
import de.anomic.crawler.Latency;
//import de.anomic.http.client.Client;
import de.anomic.http.server.HeaderFramework;
import de.anomic.http.server.RequestHeader;
//import de.anomic.http.server.ResponseContainer;
import de.anomic.http.server.ResponseHeader;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;

@ -302,7 +302,7 @@ public final class HTTPDProxyHandler {
DigestURI url = null;
try {
url = HeaderFramework.getRequestURL(conProp);
url = new DigestURI(HeaderFramework.getRequestURL(conProp));
if (log.isFine()) log.logFine(reqID +" GET "+ url);
if (log.isFinest()) log.logFinest(reqID +" header: "+ requestHeader);

@ -41,19 +41,20 @@ import java.io.FileOutputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.text.Collator;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.TimeZone;
import java.util.TreeMap;
import java.util.Vector;
import java.util.concurrent.ConcurrentHashMap;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.DateFormatter;
import net.yacy.cora.document.MultiProtocolURI;
public class HeaderFramework extends TreeMap<String, String> implements Map<String, String> {
@ -243,6 +244,55 @@ public class HeaderFramework extends TreeMap<String, String> implements Map<Stri
if (othermap != null) this.putAll(othermap);
}
/** SimpleDateFormat patterns for the three HTTP header date notations (RFC 1123, RFC 1036, ANSI C asctime) */
private static final String PATTERN_RFC1123 = "EEE, dd MMM yyyy HH:mm:ss Z"; // with numeric time zone indicator as defined in RFC5322
// full weekday name, 2-digit year and a textual time zone
private static final String PATTERN_RFC1036 = "EEEE, dd-MMM-yy HH:mm:ss zzz";
// asctime() layout; this pattern carries no time zone field
private static final String PATTERN_ANSIC = "EEE MMM d HH:mm:ss yyyy";
// One shared formatter/parser instance per pattern, all created with Locale.US.
// parseHTTPDate synchronizes on the individual instance while parsing with it.
private static final SimpleDateFormat FORMAT_RFC1123 = new SimpleDateFormat(PATTERN_RFC1123, Locale.US);
private static final SimpleDateFormat FORMAT_RFC1036 = new SimpleDateFormat(PATTERN_RFC1036, Locale.US);
private static final SimpleDateFormat FORMAT_ANSIC = new SimpleDateFormat(PATTERN_ANSIC, Locale.US);
// GMT time zone; the static initializer applies it to every entry of FORMATS_HTTP
private static final TimeZone TZ_GMT = TimeZone.getTimeZone("GMT");
// GMT calendar; the static initializer sets it to time 0 and uses the resulting
// date as the start of the 2-digit-year window of every entry of FORMATS_HTTP
private static final Calendar CAL_GMT = Calendar.getInstance(TZ_GMT, Locale.US);
/**
* RFC 2616 requires that HTTP clients are able to parse all 3 different
* formats. All times MUST be in GMT/UTC, but ...
* <p>
* The array order is the order in which parseHTTPDate tries the formats.
*/
private static final SimpleDateFormat[] FORMATS_HTTP = new SimpleDateFormat[] {
// RFC 1123/822 (Standard) "Mon, 12 Nov 2007 10:11:12 GMT"
FORMAT_RFC1123,
// RFC 1036/850 (old) "Monday, 12-Nov-07 10:11:12 GMT"
FORMAT_RFC1036,
// ANSI C asctime() "Mon Nov 12 10:11:12 2007"
FORMAT_ANSIC,
};
/** Initialization of static formats */
static {
// 2-digit dates are automatically parsed by SimpleDateFormat,
// we need to detect the real year by adding 1900 or 2000 to
// the year value starting with 1970
CAL_GMT.setTimeInMillis(0);
for (SimpleDateFormat format: FORMATS_HTTP) {
format.setTimeZone(TZ_GMT);
format.set2DigitYearStart(CAL_GMT.getTime());
}
}
/**
 * Parse a HTTP string representation of a date into a Date instance.
 * <p>
 * The input is trimmed and then offered to each format in FORMATS_HTTP in
 * array order; the result of the first format that parses it is returned.
 *
 * @param s The date String to parse; may be <code>null</code>.
 * @return The Date instance if successful, <code>null</code> otherwise
 *         (<code>s</code> is <code>null</code>, is shorter than 9 characters
 *         after trimming, or matches none of the formats).
 */
public static Date parseHTTPDate(String s) {
    // check for null BEFORE calling trim(); the other order throws a
    // NullPointerException instead of returning null as documented
    if (s == null) return null;
    s = s.trim();
    if (s.length() < 9) return null;
    for (SimpleDateFormat format: FORMATS_HTTP) {
        // the format instances are shared statics and SimpleDateFormat is not
        // thread-safe, so hold the instance's lock while parsing with it
        synchronized (format) {
            try {
                return format.parse(s);
            } catch (final ParseException ignored) {
                // this format does not match; try the next one
            }
        }
    }
    // none of the formats matched
    return null;
}
// we override the put method to make use of the reverseMappingCache
@Override
@ -386,7 +436,7 @@ public class HeaderFramework extends TreeMap<String, String> implements Map<Stri
protected Date headerDate(final String kind) {
if (containsKey(kind)) {
Date parsedDate = DateFormatter.parseHTTPDate(get(kind));
Date parsedDate = parseHTTPDate(get(kind));
if (parsedDate == null) parsedDate = new Date();
return parsedDate;
}
@ -461,7 +511,7 @@ public class HeaderFramework extends TreeMap<String, String> implements Map<Stri
theHeader.append("\r\n");
}
public static DigestURI getRequestURL(final Properties conProp) throws MalformedURLException {
public static MultiProtocolURI getRequestURL(final Properties conProp) throws MalformedURLException {
String host = conProp.getProperty(HeaderFramework.CONNECTION_PROP_HOST);
final String path = conProp.getProperty(HeaderFramework.CONNECTION_PROP_PATH); // always starts with leading '/'
final String args = conProp.getProperty(HeaderFramework.CONNECTION_PROP_ARGS); // may be null if no args were given
@ -475,7 +525,7 @@ public class HeaderFramework extends TreeMap<String, String> implements Map<Stri
host = host.substring(0, pos);
}
final DigestURI url = new DigestURI("http", host, port, (args == null) ? path : path + "?" + args);
final MultiProtocolURI url = new MultiProtocolURI("http", host, port, (args == null) ? path : path + "?" + args);
return url;
}

@ -31,7 +31,6 @@ import java.util.Properties;
import java.util.regex.Pattern;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.kelondro.util.DateFormatter;
import de.anomic.server.serverCore;
@ -106,7 +105,7 @@ public class RequestHeader extends HeaderFramework {
public Object ifRange() {
if (containsKey(IF_RANGE)) {
final Date rangeDate = DateFormatter.parseHTTPDate(get(IF_RANGE));
final Date rangeDate = parseHTTPDate(get(IF_RANGE));
if (rangeDate != null)
return rangeDate;

@ -31,8 +31,7 @@ import java.util.HashMap;
import java.util.Map;
import org.apache.http.Header;
import net.yacy.kelondro.logging.Log;
import org.apache.log4j.Logger;
public class ResponseHeader extends HeaderFramework {
@ -40,6 +39,7 @@ public class ResponseHeader extends HeaderFramework {
// response header properties
private static final long serialVersionUID = 0L;
private static Logger log = Logger.getLogger(ResponseHeader.class);
public ResponseHeader() {
super();
@ -133,16 +133,16 @@ public class ResponseHeader extends HeaderFramework {
// maybe the charset is valid but not installed on this computer
try {
if(!Charset.isSupported(charSetName)) {
Log.logWarning("httpHeader", "charset '"+ charSetName +"' is not supported on this machine, using default ("+ Charset.defaultCharset().name() +")");
log.warn("charset '"+ charSetName +"' is not supported on this machine, using default ("+ Charset.defaultCharset().name() +")");
// use system default
return Charset.defaultCharset();
}
} catch(IllegalCharsetNameException e) {
Log.logSevere("httpHeader", "Charset in header is illegal: '"+ charSetName +"'\n "+ toString() + "\n" + e.getMessage());
log.warn("Charset in header is illegal: '"+ charSetName +"'\n "+ toString() + "\n" + e.getMessage());
// use system default
return Charset.defaultCharset();
} catch (UnsupportedCharsetException e) {
Log.logSevere("httpHeader", "Charset in header is unsupported: '"+ charSetName +"'\n "+ toString() + "\n" + e.getMessage());
log.warn("Charset in header is unsupported: '"+ charSetName +"'\n "+ toString() + "\n" + e.getMessage());
// use system default
return Charset.defaultCharset();
}

@ -327,7 +327,7 @@ public final class yacyClient {
parts.put("count", new StringBody(Integer.toString(maxCount)));
parts.put("time", new StringBody(Long.toString(maxTime)));
final byte[] result = HTTPConnector.getConnector(HTTPLoader.crawlerUserAgent).post(new MultiProtocolURI("http://" + target.getClusterAddress() + "/yacy/urls.xml"), (int) maxTime, target.getHexHash() + ".yacyh", parts);
final RSSReader reader = RSSReader.parse(result);
final RSSReader reader = RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, result);
if (reader == null) {
yacyCore.log.logWarning("yacyClient.queryRemoteCrawlURLs failed asking peer '" + target.getName() + "': probably bad response from remote peer (1), reader == null");
target.put(yacySeed.RCOUNT, "0");

@ -41,6 +41,7 @@ import jcifs.smb.SmbFile;
import jcifs.smb.SmbFileInputStream;
import net.yacy.cora.document.Punycode.PunycodeException;
import net.yacy.cora.protocol.ftp.FTPClient;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.kelondro.util.Domains;
@ -940,6 +941,13 @@ public class MultiProtocolURI implements Serializable {
public InputStream getInputStream(final String userAgent, final int timeout) throws IOException {
if (isFile()) return new FileInputStream(getFSFile());
if (isSMB()) return new SmbFileInputStream(getSmbFile());
if (isFTP()) {
FTPClient client = new FTPClient();
client.open(this.host, this.port < 0 ? 21 : this.port);
byte[] b = client.get(this.path);
client.CLOSE();
return new ByteArrayInputStream(b);
}
if (isHTTP() || isHTTPS()) {
final HTTPClient client = new HTTPClient();
client.setTimout(timeout);
@ -954,6 +962,13 @@ public class MultiProtocolURI implements Serializable {
public byte[] get(final String userAgent, final int timeout) throws IOException {
if (isFile()) return read(new FileInputStream(getFSFile()));
if (isSMB()) return read(new SmbFileInputStream(getSmbFile()));
if (isFTP()) {
FTPClient client = new FTPClient();
client.open(this.host, this.port < 0 ? 21 : this.port);
byte[] b = client.get(this.path);
client.CLOSE();
return b;
}
if (isHTTP() || isHTTPS()) {
final HTTPClient client = new HTTPClient();
client.setTimout(timeout);
@ -1002,6 +1017,7 @@ public class MultiProtocolURI implements Serializable {
new String[]{null, "http://www.anomic.de/home/test?x=1"},
new String[]{null, "http://www.anomic.de/home/test#home"},
new String[]{null, "ftp://ftp.anomic.de/home/test#home"},
new String[]{null, "ftp://bob:builder@ftp.anomic.de/home/test.gif"},
new String[]{null, "http://www.anomic.de/home/../abc/"},
new String[]{null, "mailto:abcdefg@nomailnomail.com"},
new String[]{"http://www.anomic.de/home", "test"},

@ -28,28 +28,20 @@ import java.util.Map;
public class RSSFeed implements Iterable<Hit> {
public static final int DEFAULT_MAXSIZE = 1000;
// class variables
private RSSMessage channel;
private String imageURL;
private Map<String, RSSMessage> messages; // a guid:Item map
private int maxsize;
public RSSFeed() {
messages = Collections.synchronizedMap(new LinkedHashMap<String, RSSMessage>());
channel = null;
maxsize = Integer.MAX_VALUE;
}
public RSSFeed(final int maxsize) {
this();
this.messages = Collections.synchronizedMap(new LinkedHashMap<String, RSSMessage>());
this.channel = null;
this.maxsize = maxsize;
}
/**
 * Sets a new size limit and immediately enforces it.
 *
 * @param maxsize the new limit for the number of stored messages
 */
public void setMaxsize(final int maxsize) {
    this.maxsize = maxsize;
    // call pollMessage() until the message map no longer exceeds the new limit
    while (this.maxsize < messages.size()) {
        pollMessage();
    }
}
/**
 * Replaces the channel message of this feed.
 *
 * @param channelItem the message to store as the channel
 */
public void setChannel(final RSSMessage channelItem) {
this.channel = channelItem;
}
@ -69,6 +61,7 @@ public class RSSFeed implements Iterable<Hit> {
/**
 * Stores a message in the feed under its guid and enforces the size limit.
 * A message that is already stored under the same guid is replaced.
 *
 * @param item the message to add
 */
public void addMessage(final RSSMessage item) {
    messages.put(item.getGuid(), item);
    // in case that the feed is full (size > maxsize) flush the oldest element
    while (this.maxsize < messages.size()) {
        pollMessage();
    }
}
@ -103,17 +96,21 @@ public class RSSFeed implements Iterable<Hit> {
Iterator<String> GUIDiterator;
String lastGUID;
int t;
public messageIterator() {
t = messages.size(); // termination counter
GUIDiterator = messages.keySet().iterator();
lastGUID = null;
}
public boolean hasNext() {
if (t <= 0) return false; // ensure termination
return GUIDiterator.hasNext();
}
public RSSMessage next() {
t--; // ensure termination
try {
lastGUID = GUIDiterator.next();
} catch (ConcurrentModificationException e) {

@ -27,6 +27,8 @@ import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import de.anomic.http.server.HeaderFramework;
import net.yacy.kelondro.util.DateFormatter;
public class RSSMessage implements Hit {
@ -150,7 +152,7 @@ public class RSSMessage implements Hit {
try {
date = DateFormatter.parseISO8601(dateString);
} catch (ParseException e1) {
date = DateFormatter.parseHTTPDate(dateString);
date = HeaderFramework.parseHTTPDate(dateString);
}
}
return date;

@ -41,8 +41,8 @@ public class RSSReader extends DefaultHandler {
private boolean parsingChannel, parsingImage, parsingItem;
private final RSSFeed theChannel;
public RSSReader() {
theChannel = new RSSFeed();
public RSSReader(int maxsize) {
theChannel = new RSSFeed(maxsize);
buffer = new StringBuilder();
item = null;
parsingChannel = false;
@ -50,8 +50,8 @@ public class RSSReader extends DefaultHandler {
parsingItem = false;
}
public RSSReader(final String path) throws IOException {
this();
public RSSReader(int maxsize, final String path) throws IOException {
this(maxsize);
final SAXParserFactory factory = SAXParserFactory.newInstance();
try {
final SAXParser saxParser = factory.newSAXParser();
@ -63,8 +63,8 @@ public class RSSReader extends DefaultHandler {
}
}
public RSSReader(final InputStream stream) throws IOException {
this();
public RSSReader(int maxsize, final InputStream stream) throws IOException {
this(maxsize);
final SAXParserFactory factory = SAXParserFactory.newInstance();
try {
final SAXParser saxParser = factory.newSAXParser();
@ -76,7 +76,7 @@ public class RSSReader extends DefaultHandler {
}
}
public static RSSReader parse(final byte[] a) throws IOException {
public static RSSReader parse(int maxsize, final byte[] a) throws IOException {
// check integrity of array
if ((a == null) || (a.length == 0)) {
@ -99,7 +99,7 @@ public class RSSReader extends DefaultHandler {
// parse stream
RSSReader reader = null;
try {
reader = new RSSReader(bais);
reader = new RSSReader(maxsize, bais);
} catch (final Exception e) {
throw new IOException("parse exception: " + e.getMessage(), e);
}

@ -1509,7 +1509,7 @@ public class FTPClient {
return true;
}
private void open(final String host, final int port) throws IOException {
public void open(final String host, final int port) throws IOException {
if (ControlSocket != null) {
exec("close", false); // close any existing connections first
}
@ -2678,8 +2678,7 @@ public class FTPClient {
}
}
public static void get(final String host, String remoteFile, final File localPath, final String account,
final String password) {
public static void get(final String host, String remoteFile, final File localPath, final String account, final String password) {
try {
final FTPClient c = new FTPClient();
if (remoteFile.length() == 0) {

@ -130,7 +130,7 @@ public class Search {
parts.put("resource", new StringBody(global ? "global" : "local"));
final byte[] result = HTTPConnector.getConnector(HTTPLoader.yacyUserAgent).post(new MultiProtocolURI(rssSearchServiceURL), (int) timeout, uri.getHost(), parts);
//String debug = new String(result); System.out.println("*** DEBUG: " + debug);
final RSSReader reader = RSSReader.parse(result);
final RSSReader reader = RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, result);
if (reader == null) {
throw new IOException("cora.Search failed asking peer '" + uri.getHost() + "': probably bad response from remote peer (1), reader == null");
}

@ -56,7 +56,7 @@ public class rssParser extends AbstractParser implements Parser {
public Document[] parse(MultiProtocolURI url, String mimeType, String charset, InputStream source) throws Failure, InterruptedException {
RSSReader rssReader;
try {
rssReader = new RSSReader(source);
rssReader = new RSSReader(RSSFeed.DEFAULT_MAXSIZE, source);
} catch (IOException e) {
throw new Parser.Failure("Load error:" + e.getMessage(), url);
}

@ -73,23 +73,10 @@ public final class DateFormatter {
/** Date formatter/parser for standard compliant HTTP header dates (RFC 1123) */
private static final SimpleDateFormat FORMAT_RFC1123 = new SimpleDateFormat(PATTERN_RFC1123, Locale.US);
private static final SimpleDateFormat FORMAT_RFC1036 = new SimpleDateFormat(PATTERN_RFC1036, Locale.US);
private static final SimpleDateFormat FORMAT_ANSIC = new SimpleDateFormat(PATTERN_ANSIC, Locale.US);
private static final SimpleDateFormat FORMAT_RFC1123_SHORT = new SimpleDateFormat(PATTERN_RFC1123_SHORT, Locale.US);
/**
* RFC 2616 requires that HTTP clients are able to parse all 3 different
* formats. All times MUST be in GMT/UTC, but ...
*/
private static final SimpleDateFormat[] FORMATS_HTTP = new SimpleDateFormat[] {
// RFC 1123/822 (Standard) "Mon, 12 Nov 2007 10:11:12 GMT"
FORMAT_RFC1123,
// RFC 1036/850 (old) "Monday, 12-Nov-07 10:11:12 GMT"
FORMAT_RFC1036,
// ANSI C asctime() "Mon Nov 12 10:11:12 2007"
new SimpleDateFormat(PATTERN_ANSIC, Locale.US),
};
/** Initialization of static formats */
static {
@ -98,12 +85,6 @@ public final class DateFormatter {
// the year value starting with 1970
CAL_GMT.setTimeInMillis(0);
for (int i = 0; i < DateFormatter.FORMATS_HTTP.length; i++) {
final SimpleDateFormat f = DateFormatter.FORMATS_HTTP[i];
f.setTimeZone(TZ_GMT);
f.set2DigitYearStart(CAL_GMT.getTime());
}
// we want GMT times on the SHORT formats as well as they don't support any timezone
FORMAT_SHORT_DAY.setTimeZone(TZ_GMT);
FORMAT_SHORT_SECOND.setTimeZone(TZ_GMT);
@ -111,27 +92,6 @@ public final class DateFormatter {
FORMAT_ISO8601.setTimeZone(TZ_GMT);
}
/**
 * Parse a HTTP string representation of a date into a Date instance.
 * <p>
 * The input is trimmed and then offered to each entry of FORMATS_HTTP in
 * array order; the first successful result is returned.
 *
 * @param s The date String to parse; may be <code>null</code>.
 * @return The Date instance if successful, <code>null</code> otherwise
 *         (<code>s</code> is <code>null</code>, is shorter than 9 characters
 *         after trimming, or matches none of the formats).
 */
public static Date parseHTTPDate(String s) {
    // check for null BEFORE calling trim(); the other order throws a
    // NullPointerException instead of returning null as documented
    if (s == null) return null;
    s = s.trim();
    if (s.length() < 9) return null;
    for (int i = 0; i < FORMATS_HTTP.length; i++) {
        try {
            return parse(FORMATS_HTTP[i], s);
        } catch (final ParseException ignored) {
            // on ParseException try again with next parser
        }
    }
    // the method didn't return a Date, so we got an illegal String
    //serverLog.logSevere("HTTPC-header", "DATE ERROR (Parse): " + s);
    return null;
}
/**
* Creates a String representation of a Date using the format defined

@ -2,7 +2,7 @@ package de.anomic.kelondro.util;
import java.util.Date;
import net.yacy.kelondro.util.DateFormatter;
import de.anomic.http.server.HeaderFramework;
import junit.framework.TestCase;
@ -12,7 +12,7 @@ public class DateFormatterTest extends TestCase {
* Test of httpHeader date parsing routine
*/
public void testParseHTTPDate() {
Date parsedDate = DateFormatter.parseHTTPDate("Tue, 08 Jul 2003 21:22:46 GMT");
Date parsedDate = HeaderFramework.parseHTTPDate("Tue, 08 Jul 2003 21:22:46 GMT");
// returned date must not be null
assertNotNull(parsedDate);

Loading…
Cancel
Save