diff --git a/htroot/Bookmarks.java b/htroot/Bookmarks.java index 0c6187917..b99874aad 100644 --- a/htroot/Bookmarks.java +++ b/htroot/Bookmarks.java @@ -62,6 +62,7 @@ import de.anomic.index.indexURLEntry; import de.anomic.net.URL; import de.anomic.plasma.plasmaParserDocument; import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.server.serverDate; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.yacy.yacyCore; @@ -258,7 +259,7 @@ public class Bookmarks { prop.put("bookmarks_"+count+"_link", de.anomic.data.wikiCode.replaceXMLEntities(bookmark.getUrl())); prop.put("bookmarks_"+count+"_title", bookmark.getTitle()); prop.put("bookmarks_"+count+"_description", bookmark.getDescription()); - prop.put("bookmarks_"+count+"_date", bookmarksDB.dateToiso8601(new Date(bookmark.getTimeStamp()))); + prop.put("bookmarks_"+count+"_date", serverDate.dateToiso8601(new Date(bookmark.getTimeStamp()))); prop.put("bookmarks_"+count+"_rfc822date", httpc.dateString(new Date(bookmark.getTimeStamp()))); prop.put("bookmarks_"+count+"_public", (bookmark.getPublic()? 1:0)); diff --git a/htroot/xml/bookmarks/posts/all.java b/htroot/xml/bookmarks/posts/all.java index 4e17805ca..d6c6a7cc4 100644 --- a/htroot/xml/bookmarks/posts/all.java +++ b/htroot/xml/bookmarks/posts/all.java @@ -52,6 +52,7 @@ import de.anomic.data.bookmarksDB; import de.anomic.http.httpHeader; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverCodings; +import de.anomic.server.serverDate; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -77,7 +78,7 @@ public class all { prop.putSafeXML("posts_"+count+"_description", bookmark.getDescription()); prop.putSafeXML("posts_"+count+"_md5", serverCodings.encodeMD5Hex(bookmark.getUrl())); date=new Date(bookmark.getTimeStamp()); - prop.putSafeXML("posts_"+count+"_time", bookmarksDB.dateToiso8601(date)); + prop.putSafeXML("posts_"+count+"_time", serverDate.dateToiso8601(date)); prop.putSafeXML("posts_"+count+"_tags", bookmark.getTagsString().replaceAll(","," ")); count++; } diff --git a/htroot/xml/bookmarks/posts/get.java b/htroot/xml/bookmarks/posts/get.java index 5c820daf8..bb3a0b8ae 100644 --- a/htroot/xml/bookmarks/posts/get.java +++ b/htroot/xml/bookmarks/posts/get.java @@ -19,6 +19,7 @@ package xml.bookmarks.posts; +import java.text.ParseException; import java.util.ArrayList; import java.util.Date; import java.util.Iterator; @@ -27,6 +28,7 @@ import de.anomic.data.bookmarksDB; import de.anomic.http.httpHeader; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverCodings; +import de.anomic.server.serverDate; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -46,15 +48,23 @@ public class get { if(post != null && post.containsKey("date")){ date=(String)post.get("date"); }else{ - date=bookmarksDB.dateToiso8601(new Date(System.currentTimeMillis())); + date=serverDate.dateToiso8601(new Date(System.currentTimeMillis())); } int count=0; - ArrayList bookmark_hashes=switchboard.bookmarksDB.getDate(Long.toString(bookmarksDB.iso8601ToDate(date).getTime())).getBookmarkList(); + + Date parsedDate = null; + try { + parsedDate = serverDate.iso8601ToDate(date); + } catch (ParseException e) { + parsedDate = new Date(); + } + + ArrayList bookmark_hashes=switchboard.bookmarksDB.getDate(Long.toString(parsedDate.getTime())).getBookmarkList(); Iterator it=bookmark_hashes.iterator(); bookmarksDB.Bookmark bookmark=null; while(it.hasNext()){ bookmark=switchboard.bookmarksDB.getBookmark((String) it.next()); - if(bookmarksDB.dateToiso8601(new Date(bookmark.getTimeStamp())) == date && + if(serverDate.dateToiso8601(new Date(bookmark.getTimeStamp())) == date && tag==null || bookmark.getTags().contains(tag) && isAdmin || bookmark.getPublic()){ prop.putSafeXML("posts_"+count+"_url", bookmark.getUrl()); diff --git a/source/de/anomic/data/SitemapParser.java b/source/de/anomic/data/SitemapParser.java index 7641a8968..68176e19f 100644 --- a/source/de/anomic/data/SitemapParser.java +++ b/source/de/anomic/data/SitemapParser.java @@ -46,6 +46,7 @@ package de.anomic.data; import java.io.InputStream; import java.net.MalformedURLException; +import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.Date; import java.util.Locale; @@ -66,6 +67,7 @@ import de.anomic.plasma.plasmaCrawlProfile; import de.anomic.plasma.plasmaCrawlZURL; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaURL; +import de.anomic.server.serverDate; import de.anomic.server.logging.serverLog; import de.anomic.yacy.yacyCore; @@ -102,10 +104,6 @@ public class SitemapParser extends DefaultHandler { public static final String SITEMAP_URL_CHANGEFREQ = "changefreq"; public static final String SITEMAP_URL_PRIORITY = "priority"; - // TODO: which local settings should we use here? - private final SimpleDateFormat dateFormater = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss zzz", Locale.US); - - /** * The crawling profile used to parse the URLs contained in the sitemap file */ @@ -136,25 +134,45 @@ public class SitemapParser extends DefaultHandler { */ private int urlCounter = 0; + /** + * the logger + */ private serverLog logger = new serverLog("SITEMAP"); + /** + * The location of the sitemap file + */ private URL siteMapURL = null; + + /** + * The next URL to enqueue + */ private String nextURL = null; + + /** + * last modification date of the {@link #nextURL} + */ private Date lastMod = null; public SitemapParser(plasmaSwitchboard sb, URL sitemap, plasmaCrawlProfile.entry theCrawlingProfile) { if (sb == null) throw new NullPointerException("The switchboard must not be null"); + if (sitemap == null) throw new NullPointerException("The sitemap URL must not be null"); this.switchboard = sb; this.siteMapURL = sitemap; if (theCrawlingProfile == null) { + // create a new profile this.crawlingProfile = createProfile(this.siteMapURL.getHost(),this.siteMapURL.toString()); } else { + // use an existing profile this.crawlingProfile = theCrawlingProfile; } } + /** + * Function to download and parse the sitemap file + */ public void parse() { // download document httpc remote = null; @@ -168,7 +186,8 @@ public class SitemapParser extends DefaultHandler { httpc.response res = remote.GET(this.siteMapURL.getFile(), null); if (res.statusCode != 200) { - throw new Exception("Unable to download the sitemap file. Server returned status: " + res.status); + this.logger.logWarning("Unable to download the sitemap file " + this.siteMapURL + "\nServer returned status: " + res.status); + return; } // getting some metadata @@ -180,36 +199,45 @@ public class SitemapParser extends DefaultHandler { contentMimeType.equals("application/x-gzip") || contentMimeType.equals("application/gzip") )) { + this.logger.logFine("Sitemap file has mimetype " + contentMimeType); contentStream = new GZIPInputStream(contentStream); } this.counterStream = new httpdByteCountInputStream(contentStream,null); // parse it + this.logger.logInfo("Start parsing sitemap file " + this.siteMapURL + + "\n\tMimeType: " + contentMimeType + + "\n\tLength: " + this.contentLength); SAXParser saxParser = SAXParserFactory.newInstance().newSAXParser(); saxParser.parse(this.counterStream, this); } catch (Exception e) { - e.printStackTrace(); + this.logger.logWarning("Unable to parse sitemap file " + this.siteMapURL,e); } finally { - if (remote != null) try { httpc.returnInstance(remote); } catch (Exception e) {} + if (remote != null) try { httpc.returnInstance(remote); } catch (Exception e) {/* ignore this */} } } + /** + * @return the total length of the sitemap file in bytes or -1 if the length is unknown + */ public long getTotalLength() { return this.contentLength; } + /** + * @return the amount of bytes of the sitemap file that were downloaded so far + */ public long getProcessedLength() { return (this.counterStream==null)?0:this.counterStream.getCount(); } + /** + * @return the amount of URLs that were successfully enqueued so far + */ public long getUrlcount() { return this.urlCounter; - } - - public void startDocument() throws SAXException { - // TODO: create a new crawling profile - } + } /** * @param localName local name @@ -236,11 +264,13 @@ public class SitemapParser extends DefaultHandler { /** * @param localName local name * @param qName qualified name + * @throws SAXException * @see DefaultHandler#endElement(java.lang.String, java.lang.String, java.lang.String) */ - public void endElement( String namespaceURI, + public void endElement( + String namespaceURI, String localName, - String qName ) { + String qName ) throws SAXException { this.currentElement = ""; if (qName.equalsIgnoreCase(SITEMAP_URL)) { @@ -277,18 +307,22 @@ public class SitemapParser extends DefaultHandler { this.crawlingProfile ); } catch (InterruptedException e) { - // TODO Auto-generated catch block - e.printStackTrace(); + throw new SAXException ("Parsing interrupted", e); } if (error != null) { try { + this.logger.logInfo("The URL '" + this.nextURL + "' can not be crawled. Reason: " + error); + + // insert URL into the error DB plasmaCrawlZURL.Entry ee = this.switchboard.errorURL.newEntry(new URL(this.nextURL), error); - } catch (MalformedURLException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } + ee.store(); + this.switchboard.errorURL.stackPushEntry(ee); + } catch (MalformedURLException e) {/* ignore this */ } } else { + this.logger.logInfo("New URL '" + this.nextURL + "' added for crawling."); + + // count successfully added URLs this.urlCounter++; } } @@ -298,18 +332,18 @@ public class SitemapParser extends DefaultHandler { if (this.currentElement.equalsIgnoreCase(SITEMAP_URL_LOC)) { // TODO: we need to decode the URL here this.nextURL =(new String(buf,offset,len)).trim(); - if (!this.nextURL.startsWith("http")) { - System.out.println(this.nextURL); + if (!this.nextURL.startsWith("http") && !this.nextURL.startsWith("https")) { + this.logger.logInfo("The url '" + this.nextURL + "' has a wrong format. Ignore it."); + this.nextURL = null; } - } -// else if (this.currentElement.equalsIgnoreCase(SITEMAP_URL_LASTMOD)) { -// try { -// this.lastMod = this.dateFormater.parse(new String(buf,offset,len)); -// } catch (ParseException e) { -// // TODO Auto-generated catch block -// e.printStackTrace(); -// } -// } + } else if (this.currentElement.equalsIgnoreCase(SITEMAP_URL_LASTMOD)) { + String dateStr = new String(buf,offset,len); + try { + this.lastMod = serverDate.iso8601ToDate(dateStr); + } catch (ParseException e) { + this.logger.logInfo("Unable to parse datestring '" + dateStr + "'"); + } + } } private plasmaCrawlProfile.entry createProfile(String domainName, String sitemapURL) { diff --git a/source/de/anomic/data/bookmarksDB.java b/source/de/anomic/data/bookmarksDB.java index 86be6d8c1..7ef26e96b 100644 --- a/source/de/anomic/data/bookmarksDB.java +++ b/source/de/anomic/data/bookmarksDB.java @@ -51,7 +51,6 @@ import java.io.Writer; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; -import java.util.Calendar; import java.util.Comparator; import java.util.Date; import java.util.HashMap; @@ -81,6 +80,7 @@ import de.anomic.kelondro.kelondroMapObjects; import de.anomic.kelondro.kelondroObjects; import de.anomic.kelondro.kelondroObjectsMapEntry; import de.anomic.net.URL; +import de.anomic.server.serverDate; import de.anomic.server.serverFileUtils; import de.anomic.server.logging.serverLog; @@ -98,38 +98,6 @@ public class bookmarksDB { public static String tagHash(String tagName, String user){ return plasmaCondenser.word2hash(user+":"+tagName.toLowerCase()); } - public static String dateToiso8601(Date date){ - return new SimpleDateFormat("yyyy-MM-dd").format(date)+"T"+(new SimpleDateFormat("HH:mm:ss")).format(date)+"Z"; - } - public static Date iso8601ToDate(String iso8601){ - String[] tmp=iso8601.split("T"); - if(tmp.length!=2){ - //Error parsing Date - return new Date(); - } - String day=tmp[0]; - String time=tmp[1]; - if(time.length()>8){ - time=time.substring(0,8); - } - try { - Calendar date=Calendar.getInstance(); - Calendar date2=Calendar.getInstance(); - date.setTime(new SimpleDateFormat("yyyy-MM-dd").parse(day)); - date2.setTime(new SimpleDateFormat("HH:mm:ss").parse(time)); - - date.set(Calendar.HOUR_OF_DAY, date2.get(Calendar.HOUR_OF_DAY)); - date.set(Calendar.MINUTE, date2.get(Calendar.MINUTE)); - date.set(Calendar.SECOND, date2.get(Calendar.SECOND)); - - return date.getTime(); - } catch (ParseException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - return new Date(); - } - public bookmarksDB(File bookmarksFile, File tagsFile, File datesFile, long preloadTime) { // bookmarks tagCache=new HashMap(); @@ -534,7 +502,14 @@ public class bookmarksDB { } bm.setTags(tags, true); if(time != null){ - bm.setTimeStamp(iso8601ToDate(time).getTime()); + + Date parsedDate = null; + try { + parsedDate = serverDate.iso8601ToDate(time); + } catch (ParseException e) { + parsedDate = new Date(); + } + bm.setTimeStamp(parsedDate.getTime()); } if(description!=null){ bm.setProperty(Bookmark.BOOKMARK_DESCRIPTION, description); diff --git a/source/de/anomic/server/serverDate.java b/source/de/anomic/server/serverDate.java index e5bf581cd..8df13acc4 100644 --- a/source/de/anomic/server/serverDate.java +++ b/source/de/anomic/server/serverDate.java @@ -43,6 +43,7 @@ package de.anomic.server; +import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.Calendar; import java.util.Date; @@ -266,7 +267,35 @@ public final class serverDate { return new String(result); } - // the following is only here to compare the kelondroDate with java-Date: + public static Date iso8601ToDate(String iso8601) throws ParseException{ + String[] tmp=iso8601.split("T"); + if(tmp.length!=2){ + //Error parsing Date + return new Date(); + } + String day=tmp[0]; + String time=tmp[1]; + if(time.length()>8){ + time=time.substring(0,8); + } + + Calendar date=Calendar.getInstance(); + Calendar date2=Calendar.getInstance(); + date.setTime(new SimpleDateFormat("yyyy-MM-dd").parse(day)); + date2.setTime(new SimpleDateFormat("HH:mm:ss").parse(time)); + + date.set(Calendar.HOUR_OF_DAY, date2.get(Calendar.HOUR_OF_DAY)); + date.set(Calendar.MINUTE, date2.get(Calendar.MINUTE)); + date.set(Calendar.SECOND, date2.get(Calendar.SECOND)); + + return date.getTime(); + } + + public static String dateToiso8601(Date date){ + return new SimpleDateFormat("yyyy-MM-dd").format(date)+"T"+(new SimpleDateFormat("HH:mm:ss")).format(date)+"Z"; + } + + // the following is only here to compare the kelondroDate with java-Date: private static TimeZone GMTTimeZone = TimeZone.getTimeZone("GMT"); private static Calendar gregorian = new GregorianCalendar(GMTTimeZone); private static SimpleDateFormat testSFormatter = new SimpleDateFormat("yyyyMMddHHmmss");