diff --git a/htroot/Bookmarks.java b/htroot/Bookmarks.java
index cd61924b8..65b662f30 100644
--- a/htroot/Bookmarks.java
+++ b/htroot/Bookmarks.java
@@ -179,9 +179,9 @@ public class Bookmarks {
             prop.put("mode_edit", "0"); // create mode
             prop.put("mode_url", comp.url().toNormalform(false, true));
             prop.putHTML("mode_title", comp.title());
-            prop.putHTML("mode_description", (document == null) ? comp.title(): document.getTitle());
+            prop.putHTML("mode_description", (document == null) ? comp.title(): document.dc_title());
             prop.putHTML("mode_author", comp.author());
-            prop.putHTML("mode_tags", (document == null) ? comp.tags() : document.getKeywords(','));
+            prop.putHTML("mode_tags", (document == null) ? comp.tags() : document.dc_subject(','));
             prop.put("mode_public", "0");
             prop.put("mode_feed", "0"); //TODO: check if it IS a feed
         }
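The hunks that follow repeat one renaming pattern: plasmaParserDocument accessors move to Dublin Core names (per the RFC 5013 list added in the plasmaParserDocument hunk near the end of this patch). As a reading aid, the mapping — compiled from the hunks themselves — is:

    document.getTitle()         -> document.dc_title()
    document.getAuthor()        -> document.dc_creator()
    document.getKeywords(sep)   -> document.dc_subject(sep)
    document.getAbstract()      -> document.dc_description()
    document.getMimeType()      -> document.dc_format()
    document.getLocation()      -> document.dc_source()

The second recurring change is that anchor and media maps are now keyed by yacyURL objects instead of URL strings.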
diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java
index c660c3c49..14a708017 100644
--- a/htroot/ViewFile.java
+++ b/htroot/ViewFile.java
@@ -297,7 +297,7 @@ public class ViewFile {
             }
         }
 
-        resMime = document.getMimeType();
+        resMime = document.dc_format();
         String[] wordArray = wordArray(post.get("words", null));
 
         if (viewMode.equals("parsed")) {
@@ -310,7 +310,7 @@
 
         } else if (viewMode.equals("sentences")) {
             prop.put("viewMode", VIEW_MODE_AS_PARSED_SENTENCES);
-            final Iterator sentences = document.getSentences(pre);
+            final Iterator<StringBuffer> sentences = document.getSentences(pre);
 
             boolean dark = true;
             int i = 0;
@@ -319,7 +319,7 @@
 
             // Search word highlighting
             while (sentences.hasNext()) {
-                sentence = ((StringBuffer) sentences.next()).toString();
+                sentence = sentences.next().toString();
                 if (sentence.trim().length() > 0) {
                     prop.put("viewMode_sentences_" + i + "_nr", i + 1);
                     prop.put("viewMode_sentences_" + i + "_text", markup(wordArray, sentence));
@@ -339,11 +339,11 @@
             i += putMediaInfo(prop, wordArray, i, document.getAudiolinks(), "audio", (i % 2 == 0));
             dark = (i % 2 == 0);
 
-            TreeSet ts = document.getImages();
-            Iterator tsi = ts.iterator();
+            TreeSet<htmlFilterImageEntry> ts = document.getImages();
+            Iterator<htmlFilterImageEntry> tsi = ts.iterator();
             htmlFilterImageEntry entry;
             while (tsi.hasNext()) {
-                entry = (htmlFilterImageEntry) tsi.next();
+                entry = tsi.next();
                 prop.put("viewMode_links_" + i + "_nr", i);
                 prop.put("viewMode_links_" + i + "_dark", dark ? "1" : "0");
                 prop.put("viewMode_links_" + i + "_type", "image");
@@ -399,9 +399,9 @@
         return message;
     }
 
-    private static int putMediaInfo(serverObjects prop, String[] wordArray, int c, Map<String, String> media, String name, boolean dark) {
-        Iterator<Map.Entry<String, String>> mi = media.entrySet().iterator();
-        Map.Entry<String, String> entry;
+    private static int putMediaInfo(serverObjects prop, String[] wordArray, int c, Map<yacyURL, String> media, String name, boolean dark) {
+        Iterator<Map.Entry<yacyURL, String>> mi = media.entrySet().iterator();
+        Map.Entry<yacyURL, String> entry;
         int i = 0;
         while (mi.hasNext()) {
             entry = mi.next();
@@ -409,8 +409,8 @@
             prop.put("viewMode_links_" + c + "_dark", ((dark) ? 1 : 0));
             prop.putHTML("viewMode_links_" + c + "_type", name);
             prop.put("viewMode_links_" + c + "_text", markup(wordArray, (String) entry.getValue()));
-            prop.put("viewMode_links_" + c + "_link", markup(wordArray, (String) entry.getKey()));
-            prop.put("viewMode_links_" + c + "_url", entry.getKey());
+            prop.put("viewMode_links_" + c + "_link", markup(wordArray, entry.getKey().toNormalform(true, false)));
+            prop.put("viewMode_links_" + c + "_url", entry.getKey().toNormalform(true, false));
             prop.putHTML("viewMode_links_" + c + "_attr", "");
             dark = !dark;
             c++;
diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java
index db9f768d0..bb4e1ac71 100644
--- a/htroot/yacysearch.java
+++ b/htroot/yacysearch.java
@@ -241,9 +241,9 @@ public class yacysearch {
                     HashMap map = new HashMap();
                     map.put("url", comp.url().toNormalform(false, true).replace(',', '|'));
                     map.put("title", comp.title().replace(',', ' '));
-                    map.put("description", ((document == null) ? comp.title() : document.getTitle()).replace(',', ' '));
-                    map.put("author", ((document == null) ? "" : document.getAuthor()));
-                    map.put("tags", ((document == null) ? "" : document.getKeywords(' ')));
+                    map.put("description", ((document == null) ? comp.title() : document.dc_title()).replace(',', ' '));
+                    map.put("author", ((document == null) ? "" : document.dc_creator()));
+                    map.put("tags", ((document == null) ? "" : document.dc_subject(' ')));
                     yacyCore.newsPool.publishMyNews(yacyNewsRecord.newRecord(yacyNewsPool.CATEGORY_SURFTIPP_ADD, map));
                     document.close();
                 }
diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java
index b01724e06..048f602a8 100644
--- a/htroot/yacysearchitem.java
+++ b/htroot/yacysearchitem.java
@@ -230,13 +230,11 @@ public class yacysearchitem {
             ArrayList images = result.mediaSnippets();
             if (images != null) {
                 plasmaSnippetCache.MediaSnippet ms;
-                yacyURL url;
                 int c = 0;
                 for (int i = 0; i < images.size(); i++) {
                     ms = (plasmaSnippetCache.MediaSnippet) images.get(i);
-                    try {url = new yacyURL(ms.href, null);} catch (MalformedURLException e) {continue;}
-                    prop.putHTML("content_items_" + i + "_href", ms.href);
-                    prop.put("content_items_" + i + "_code", sb.licensedURLs.aquireLicense(url));
+                    prop.putHTML("content_items_" + i + "_href", ms.href.toNormalform(true, false));
+                    prop.put("content_items_" + i + "_code", sb.licensedURLs.aquireLicense(ms.href));
                     prop.putHTML("content_items_" + i + "_name", shorten(ms.name, namelength));
                     prop.put("content_items_" + i + "_attr", ms.attr); // attributes, here: original size of image
                     c++;
@@ -260,8 +258,8 @@
             int c = 0;
             for (int i = 0; i < media.size(); i++) {
                 ms = (plasmaSnippetCache.MediaSnippet) media.get(i);
-                prop.putHTML("content_items_" + i + "_href", ms.href);
-                prop.putHTML("content_items_" + i + "_hrefshort", nxTools.shortenURLString(ms.href, urllength));
+                prop.putHTML("content_items_" + i + "_href", ms.href.toNormalform(true, false));
+                prop.putHTML("content_items_" + i + "_hrefshort", nxTools.shortenURLString(ms.href.toNormalform(true, false), urllength));
                 prop.putHTML("content_items_" + i + "_name", shorten(ms.name, namelength));
                 prop.put("content_items_" + i + "_col", (col) ? "0" : "1");
                 c++;
"0" : "1"); c++; diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java index 6c702f241..64391837c 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java @@ -100,12 +100,12 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen } // class variables: collectors for links - private HashMap anchors; + private HashMap anchors; private TreeSet images; // String(absolute url)/ImageEntry relation private HashMap metas; private String title; //private String headline; - private List[] headlines; + private List[] headlines; private serverCharBuffer content; private EventListenerList htmlFilterEventListeners = new EventListenerList(); @@ -119,12 +119,13 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen */ private yacyURL root; + @SuppressWarnings("unchecked") public htmlFilterContentScraper(yacyURL root) { // the root value here will not be used to load the resource. // it is only the reference for relative links super(linkTags0, linkTags1); this.root = root; - this.anchors = new HashMap(); + this.anchors = new HashMap(); this.images = new TreeSet(); this.metas = new HashMap(); this.title = ""; @@ -159,11 +160,11 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen return normalizedURL.toLowerCase().split(splitrex); // word components of the url } - private String absolutePath(String relativePath) { + private yacyURL absolutePath(String relativePath) { try { - return yacyURL.newURL(root, relativePath).toNormalform(false, true); + return yacyURL.newURL(root, relativePath); } catch (Exception e) { - return ""; + return null; } } @@ -174,11 +175,9 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen width = Integer.parseInt(tagopts.getProperty("width", "-1")); height = Integer.parseInt(tagopts.getProperty("height", "-1")); } catch (NumberFormatException e) {} - try { - yacyURL url = new yacyURL(absolutePath(tagopts.getProperty("src", "")), null); - htmlFilterImageEntry ie = new htmlFilterImageEntry(url, tagopts.getProperty("alt",""), width, height); - images.add(ie); - } catch (MalformedURLException e) {} + yacyURL url = absolutePath(tagopts.getProperty("src", "")); + htmlFilterImageEntry ie = new htmlFilterImageEntry(url, tagopts.getProperty("alt",""), width, height); + images.add(ie); } if (tagname.equalsIgnoreCase("base")) try { root = new yacyURL(tagopts.getProperty("href", ""), null); @@ -204,10 +203,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen if (href.length() > 0) anchors.put(absolutePath(href), areatitle); } if (tagname.equalsIgnoreCase("link")) { - yacyURL newLink = null; - try { - newLink = new yacyURL(absolutePath(tagopts.getProperty("href", "")), null); - } catch (MalformedURLException e) {} + yacyURL newLink = absolutePath(tagopts.getProperty("href", "")); if (newLink != null) { String type = tagopts.getProperty("rel", ""); @@ -218,7 +214,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen images.add(ie); this.favicon = newLink; } else if (!type.equalsIgnoreCase("stylesheet") && !type.equalsIgnoreCase("alternate stylesheet")) { - anchors.put(newLink.toString(), linktitle); + anchors.put(newLink, linktitle); } } } @@ -346,7 +342,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen } } - public Map 
@@ -346,7 +342,7 @@
         }
     }
 
-    public Map getAnchors() {
+    public Map<yacyURL, String> getAnchors() {
         // returns a url (String) / name (String) relation
         return anchors;
     }
@@ -367,31 +363,44 @@
         return this.favicon;
     }
 
+    /*
+    DC in html example:
+    <meta name="DC.title" lang="en" content="Expressing Dublin Core in HTML/XHTML meta and link elements" />
+    <meta name="DC.creator" content="Andy Powell, UKOLN, University of Bath" />
+    <meta name="DC.identifier" scheme="DCTERMS.URI" content="http://dublincore.org/documents/dcq-html/" />
+    <meta name="DC.format" scheme="DCTERMS.IMT" content="text/html" />
+    <meta name="DC.type" scheme="DCTERMS.DCMIType" content="Text" />
+    */
+
     public String getDescription() {
-        String s = (String) metas.get("description");
+        String s = metas.get("description");
+        if (s == null) s = metas.get("DC.description");
         if (s == null) return ""; else return s;
     }
 
     public String getContentType() {
-        String s = (String) metas.get("content-type");
+        String s = metas.get("content-type");
         if (s == null) return ""; else return s;
     }
 
     public String getAuthor() {
-        String s = (String) metas.get("author");
-        if (s == null) s = (String) metas.get("copyright");
+        String s = metas.get("author");
+        if (s == null) s = metas.get("copyright");
+        if (s == null) s = metas.get("DC.creator");
         if (s == null) return "";
         return s;
     }
 
     public String[] getContentLanguages() {
-        String s = (String) metas.get("content-language");
+        String s = metas.get("content-language");
+        if (s == null) s = metas.get("DC.language");
         if (s == null) s = "";
         return s.split(" |,");
     }
 
     public String[] getKeywords() {
-        String s = (String) metas.get("keywords");
+        String s = metas.get("keywords");
+        if (s == null) s = metas.get("DC.description");
        if (s == null) s = "";
         if (s.length() == 0) {
             return getTitle().toLowerCase().split(splitrex);
@@ -499,4 +508,5 @@
         return scraper;
     }
 
-}
\ No newline at end of file
+}
+
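With the DC.* fallbacks introduced above, a page that carries only Dublin Core metadata is no longer invisible to the scraper. A minimal sketch of such a head section (hypothetical values; the plain author/description/content-language metas still take precedence when present):

    <head>
      <meta name="DC.creator"     content="Jane Doe" />
      <meta name="DC.description" content="An example page" />
      <meta name="DC.language"    content="en" />
    </head>

For this input, getAuthor() returns "Jane Doe", getDescription() returns "An example page", and getContentLanguages() yields {"en"}.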
diff --git a/source/de/anomic/plasma/parser/Parser.java b/source/de/anomic/plasma/parser/Parser.java
index 5b0ec0aad..8377df1ca 100644
--- a/source/de/anomic/plasma/parser/Parser.java
+++ b/source/de/anomic/plasma/parser/Parser.java
@@ -110,7 +110,7 @@ public interface Parser {
      * @return a {@link Hashtable} containing a list of MimeTypes that are supported by
      *         the parser
      */
-    public Hashtable getSupportedMimeTypes();
+    public Hashtable<String, String> getSupportedMimeTypes();
 
     /**
      * This function should be called before reusing the parser object.
diff --git a/source/de/anomic/plasma/parser/odt/odtParser.java b/source/de/anomic/plasma/parser/odt/odtParser.java
index ad8606df6..82a693f4a 100644
--- a/source/de/anomic/plasma/parser/odt/odtParser.java
+++ b/source/de/anomic/plasma/parser/odt/odtParser.java
@@ -91,7 +91,7 @@ public class odtParser extends AbstractParser implements Parser {
         this.parserName = "OASIS OpenDocument V2 Text Document Parser";
     }
 
-    public Hashtable getSupportedMimeTypes() {
+    public Hashtable<String, String> getSupportedMimeTypes() {
         return SUPPORTED_MIME_TYPES;
     }
 
@@ -108,7 +108,7 @@ public class odtParser extends AbstractParser implements Parser {
 
             // opening the file as zip file
             ZipFile zipFile= new ZipFile(dest);
-            Enumeration zipEnum = zipFile.entries();
+            Enumeration<? extends ZipEntry> zipEnum = zipFile.entries();
 
             // looping through all containing files
             while (zipEnum.hasMoreElements()) {
diff --git a/source/de/anomic/plasma/parser/pdf/pdfParser.java b/source/de/anomic/plasma/parser/pdf/pdfParser.java
index 98c212e14..21f965b1d 100644
--- a/source/de/anomic/plasma/parser/pdf/pdfParser.java
+++ b/source/de/anomic/plasma/parser/pdf/pdfParser.java
@@ -86,7 +86,7 @@ public class pdfParser extends AbstractParser implements Parser {
         this.parserName = "Acrobat Portable Document Parser";
     }
 
-    public Hashtable getSupportedMimeTypes() {
+    public Hashtable<String, String> getSupportedMimeTypes() {
         return SUPPORTED_MIME_TYPES;
     }
diff --git a/source/de/anomic/plasma/parser/ppt/pptParser.java b/source/de/anomic/plasma/parser/ppt/pptParser.java
index 1df0fd873..84c2827fd 100644
--- a/source/de/anomic/plasma/parser/ppt/pptParser.java
+++ b/source/de/anomic/plasma/parser/ppt/pptParser.java
@@ -132,7 +132,7 @@ public class pptParser extends AbstractParser implements Parser {
         }
     }
 
-    public Hashtable getSupportedMimeTypes() {
+    public Hashtable<String, String> getSupportedMimeTypes() {
         return SUPPORTED_MIME_TYPES;
     }
diff --git a/source/de/anomic/plasma/parser/ps/psParser.java b/source/de/anomic/plasma/parser/ps/psParser.java
index 0ecbf3ce4..ddd8fccac 100644
--- a/source/de/anomic/plasma/parser/ps/psParser.java
+++ b/source/de/anomic/plasma/parser/ps/psParser.java
@@ -91,7 +91,7 @@ public class psParser extends AbstractParser implements Parser {
         }
     }
 
-    public Hashtable getSupportedMimeTypes() {
+    public Hashtable<String, String> getSupportedMimeTypes() {
         return SUPPORTED_MIME_TYPES;
     }
diff --git a/source/de/anomic/plasma/parser/rpm/rpmParser.java b/source/de/anomic/plasma/parser/rpm/rpmParser.java
index b33f65012..2ed2c643b 100644
--- a/source/de/anomic/plasma/parser/rpm/rpmParser.java
+++ b/source/de/anomic/plasma/parser/rpm/rpmParser.java
@@ -88,7 +88,7 @@ public class rpmParser extends AbstractParser implements Parser {
         this.parserName = "rpm Parser";
     }
 
-    public Hashtable getSupportedMimeTypes() {
+    public Hashtable<String, String> getSupportedMimeTypes() {
         return SUPPORTED_MIME_TYPES;
    }
 
@@ -110,7 +110,7 @@ public class rpmParser extends AbstractParser implements Parser {
         RPMFile rpmFile = null;
         try {
             String summary = null, description = null, packager = null, name = sourceFile.getName();
-            HashMap anchors = new HashMap();
+            HashMap<yacyURL, String> anchors = new HashMap<yacyURL, String>();
             StringBuffer content = new StringBuffer();
 
             // opening the rpm file
@@ -138,7 +138,7 @@ public class rpmParser extends AbstractParser implements Parser {
                 else if (headerNames[i].equalsIgnoreCase("SUMMARY")) summary = tag.toString();
                 else if (headerNames[i].equalsIgnoreCase("DESCRIPTION")) description = tag.toString();
                 else if (headerNames[i].equalsIgnoreCase("PACKAGER")) packager = tag.toString();
-                else if (headerNames[i].equalsIgnoreCase("URL")) anchors.put(tag.toString(), tag.toString());
+                else if (headerNames[i].equalsIgnoreCase("URL")) anchors.put(new yacyURL(tag.toString(), null), tag.toString());
             }
 
             // closing the rpm file
diff --git a/source/de/anomic/plasma/parser/rss/rssParser.java b/source/de/anomic/plasma/parser/rss/rssParser.java
index c8595b3ca..c14459135 100644
--- a/source/de/anomic/plasma/parser/rss/rssParser.java
+++ b/source/de/anomic/plasma/parser/rss/rssParser.java
@@ -96,7 +96,7 @@ public class rssParser extends AbstractParser implements Parser {
         try {
             LinkedList feedSections = new LinkedList();
-            HashMap anchors = new HashMap();
+            HashMap<yacyURL, String> anchors = new HashMap<yacyURL, String>();
             TreeSet images = new TreeSet();
             serverByteBuffer text = new serverByteBuffer();
             serverCharBuffer authors = new serverCharBuffer();
@@ -132,7 +132,7 @@ public class rssParser extends AbstractParser implements Parser {
                 if (itemCreator != null && itemCreator.length() > 0) authors.append(",").append(itemCreator);
 
                 feedSections.add(itemTitle);
-                anchors.put(itemURL.toString(),itemTitle);
+                anchors.put(itemURL, itemTitle);
 
                 if ((text.length() != 0) && (text.byteAt(text.length() - 1) != 32)) text.append((byte) 32);
                 text.append(new serverCharBuffer(htmlFilterAbstractScraper.stripAll(new serverCharBuffer(itemDescr.toCharArray()))).trim().toString()).append(' ');
@@ -149,7 +149,7 @@ public class rssParser extends AbstractParser implements Parser {
                     feedSections.add(itemHeadline);
                 }
 
-                Map itemLinks = scraper.getAnchors();
+                Map<yacyURL, String> itemLinks = scraper.getAnchors();
                 if ((itemLinks != null) && (itemLinks.size() > 0)) {
                     anchors.putAll(itemLinks);
                 }
@@ -191,7 +191,7 @@ public class rssParser extends AbstractParser implements Parser {
         }
     }
 
-    public Hashtable getSupportedMimeTypes() {
+    public Hashtable<String, String> getSupportedMimeTypes() {
         return SUPPORTED_MIME_TYPES;
     }
diff --git a/source/de/anomic/plasma/parser/rtf/rtfParser.java b/source/de/anomic/plasma/parser/rtf/rtfParser.java
index ed8a1128f..c201212ab 100644
--- a/source/de/anomic/plasma/parser/rtf/rtfParser.java
+++ b/source/de/anomic/plasma/parser/rtf/rtfParser.java
@@ -116,7 +116,7 @@ public class rtfParser extends AbstractParser implements Parser {
         }
     }
 
-    public java.util.Hashtable getSupportedMimeTypes() {
+    public Hashtable<String, String> getSupportedMimeTypes() {
         return rtfParser.SUPPORTED_MIME_TYPES;
     }
diff --git a/source/de/anomic/plasma/parser/sevenzip/SZParserExtractCallback.java b/source/de/anomic/plasma/parser/sevenzip/SZParserExtractCallback.java
index 2a254a7fe..23e48163b 100644
--- a/source/de/anomic/plasma/parser/sevenzip/SZParserExtractCallback.java
+++ b/source/de/anomic/plasma/parser/sevenzip/SZParserExtractCallback.java
@@ -48,6 +48,9 @@
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.Map;
 
+import SevenZip.ArchiveExtractCallback;
+import SevenZip.Archive.IInArchive;
+import SevenZip.Archive.SevenZipEntry;
 import de.anomic.plasma.plasmaParser;
 import de.anomic.plasma.plasmaParserDocument;
 import de.anomic.plasma.parser.AbstractParser;
@@ -56,10 +59,6 @@
 import de.anomic.server.serverCachedFileOutputStream;
 import de.anomic.server.logging.serverLog;
 import de.anomic.yacy.yacyURL;
 
-import SevenZip.ArchiveExtractCallback;
-import SevenZip.Archive.IInArchive;
-import SevenZip.Archive.SevenZipEntry;
-
 // wrapper class to redirect output of standard ArchiveExtractCallback to serverLog
 // and parse the extracted content
 public class SZParserExtractCallback extends ArchiveExtractCallback {
@@ -117,7 +116,7 @@
             plasmaParserDocument theDoc;
             // workaround for relative links in file, normally '#' shall be used behind the location, see
             // below for reversion of the effects
-            yacyURL url = yacyURL.newURL(doc.getLocation(), this.prefix + "/" + super.filePath);
+            yacyURL url = yacyURL.newURL(doc.dc_source(), this.prefix + "/" + super.filePath);
             String mime = plasmaParser.getMimeTypeByFileExt(super.filePath.substring(super.filePath.lastIndexOf('.') + 1));
             if (this.cfos.isFallback()) {
                 theDoc = this.parser.parseSource(url, mime, null, this.cfos.getContentFile());
@@ -126,18 +125,20 @@
             }
 
             // revert the above workaround
-            Map nanchors = new HashMap(theDoc.getAnchors().size(), 1f);
-            Iterator it = theDoc.getAnchors().entrySet().iterator();
-            Map.Entry entry;
-            String base = doc.getLocation().toNormalform(false, true);
+            Map<yacyURL, String> nanchors = new HashMap<yacyURL, String>(theDoc.getAnchors().size(), 1f);
+            Iterator<Map.Entry<yacyURL, String>> it = theDoc.getAnchors().entrySet().iterator();
+            Map.Entry<yacyURL, String> entry;
+            String base = doc.dc_source().toNormalform(false, true);
+            String u;
             while (it.hasNext()) {
-                entry = (Map.Entry)it.next();
-                if (((String)entry.getKey()).startsWith(base + "/")) {
-                    String ref = "#" + ((String)entry.getKey()).substring(base.length() + 1);
+                entry = it.next();
+                u = entry.getKey().toNormalform(true, true);
+                if (u.startsWith(base + "/")) {
+                    String ref = "#" + u.substring(base.length() + 1);
                     this.log.logFinest("changing " + entry.getKey() + " to use reference " + ref);
-                    nanchors.put(base + ref, (String)entry.getValue());
+                    nanchors.put(new yacyURL(base + ref, null), entry.getValue());
                 } else {
-                    nanchors.put((String)entry.getKey(), (String)entry.getValue());
+                    nanchors.put(entry.getKey(), entry.getValue());
                 }
             }
             theDoc.getAnchors().clear();
diff --git a/source/de/anomic/plasma/parser/sevenzip/sevenzipParser.java b/source/de/anomic/plasma/parser/sevenzip/sevenzipParser.java
index 69d60e6da..43333a61d 100644
--- a/source/de/anomic/plasma/parser/sevenzip/sevenzipParser.java
+++ b/source/de/anomic/plasma/parser/sevenzip/sevenzipParser.java
@@ -50,7 +50,6 @@
 import SevenZip.IInStream;
 import SevenZip.MyRandomAccessFile;
 import SevenZip.Archive.SevenZip.Handler;
-
 import de.anomic.plasma.plasmaParserDocument;
 import de.anomic.plasma.parser.AbstractParser;
 import de.anomic.plasma.parser.Parser;
@@ -140,7 +139,7 @@
         }
     }
 
-    public Hashtable getSupportedMimeTypes() {
+    public Hashtable<String, String> getSupportedMimeTypes() {
         return SUPPORTED_MIME_TYPES;
     }
 }
diff --git a/source/de/anomic/plasma/parser/swf/swfParser.java b/source/de/anomic/plasma/parser/swf/swfParser.java
index 32645f946..7d64ef4e1 100644
--- a/source/de/anomic/plasma/parser/swf/swfParser.java
+++ b/source/de/anomic/plasma/parser/swf/swfParser.java
@@ -44,11 +44,10 @@
 package de.anomic.plasma.parser.swf;
 
 import java.io.InputStream;
-import java.util.Hashtable;
 import java.util.HashMap;
+import java.util.Hashtable;
 
-import pt.tumba.parser.swf.*;
-
+import pt.tumba.parser.swf.SWF2HTML;
 import de.anomic.plasma.plasmaParserDocument;
 import de.anomic.plasma.parser.AbstractParser;
 import de.anomic.plasma.parser.Parser;
@@ -82,7 +81,7 @@
     /**
     * returns a hashtable containing the mimetypes that are supported by this class
     */
-    public Hashtable getSupportedMimeTypes() {
+    public Hashtable<String, String> getSupportedMimeTypes() {
         return SUPPORTED_MIME_TYPES;
     }
 
@@ -101,7 +100,7 @@ public class swfParser extends AbstractParser implements Parser {
             String[] sections = null;
             String abstrct = null;
             //TreeSet images = null;
-            HashMap anchors = new HashMap();
+            HashMap<yacyURL, String> anchors = new HashMap<yacyURL, String>();
             int urls = 0;
             int urlStart = -1;
             int urlEnd = 0;
@@ -118,7 +117,7 @@ public class swfParser extends AbstractParser implements Parser {
                 urlEnd = contents.indexOf(linebreak,urlStart);
                 url = contents.substring(urlStart,urlEnd);
                 urlnr = (new Integer(++urls)).toString();
-                anchors.put(url,urlnr);
+                anchors.put(new yacyURL(url, null), urlnr);
                 contents = contents.substring(0,urlStart)+contents.substring(urlEnd);
             }
diff --git a/source/de/anomic/plasma/parser/tar/tarParser.java b/source/de/anomic/plasma/parser/tar/tarParser.java
index 08df662a6..93f7b36cd 100644
--- a/source/de/anomic/plasma/parser/tar/tarParser.java
+++ b/source/de/anomic/plasma/parser/tar/tarParser.java
@@ -94,7 +94,7 @@ public class tarParser extends AbstractParser implements Parser {
         this.parserName = "Tape Archive File Parser";
     }
 
-    public Hashtable getSupportedMimeTypes() {
+    public Hashtable<String, String> getSupportedMimeTypes() {
         return SUPPORTED_MIME_TYPES;
     }
 
@@ -131,7 +131,7 @@ public class tarParser extends AbstractParser implements Parser {
             LinkedList docSections = new LinkedList();
             StringBuffer docAbstrct = new StringBuffer();
-            Map docAnchors = new HashMap();
+            Map<yacyURL, String> docAnchors = new HashMap<yacyURL, String>();
             TreeSet docImages = new TreeSet();
 
             // looping through the contained files
@@ -177,15 +177,15 @@ public class tarParser extends AbstractParser implements Parser {
                         // merging all documents together
                         if (docKeywords.length() > 0) docKeywords.append(",");
-                        docKeywords.append(subDoc.getKeywords(','));
+                        docKeywords.append(subDoc.dc_subject(','));
 
                         if (docLongTitle.length() > 0) docLongTitle.append("\n");
-                        docLongTitle.append(subDoc.getTitle());
+                        docLongTitle.append(subDoc.dc_title());
 
                         docSections.addAll(Arrays.asList(subDoc.getSectionTitles()));
 
                         if (docAbstrct.length() > 0) docAbstrct.append("\n");
-                        docAbstrct.append(subDoc.getAbstract());
+                        docAbstrct.append(subDoc.dc_description());
 
                         if (subDoc.getTextLength() > 0) {
                             if (docTextLength > 0) docText.write('\n');
diff --git a/source/de/anomic/plasma/parser/vcf/vcfParser.java b/source/de/anomic/plasma/parser/vcf/vcfParser.java
index 5899588e3..44752e311 100644
--- a/source/de/anomic/plasma/parser/vcf/vcfParser.java
+++ b/source/de/anomic/plasma/parser/vcf/vcfParser.java
@@ -101,7 +101,7 @@ public class vcfParser extends AbstractParser implements Parser {
             StringBuffer parsedTitle = new StringBuffer();
             StringBuffer parsedDataText = new StringBuffer();
             HashMap parsedData = new HashMap();
-            HashMap anchors = new HashMap();
+            HashMap<yacyURL, String> anchors = new HashMap<yacyURL, String>();
             LinkedList parsedNames = new LinkedList();
 
             boolean useLastLine = false;
@@ -211,7 +211,7 @@ public class vcfParser extends AbstractParser implements Parser {
                 } else if (key.toUpperCase().startsWith("URL")) {
                     try {
                         yacyURL newURL = new yacyURL(value, null);
-                        anchors.put(newURL.toString(),newURL.toString());
+                        anchors.put(newURL, newURL.toString());
                         //parsedData.put(key,value);
                     } catch (MalformedURLException ex) {/* ignore this */}
                 } else if (
diff --git a/source/de/anomic/plasma/parser/xls/xlsParser.java b/source/de/anomic/plasma/parser/xls/xlsParser.java
index 010b1d2bd..9689d34ea 100644
--- a/source/de/anomic/plasma/parser/xls/xlsParser.java
+++ b/source/de/anomic/plasma/parser/xls/xlsParser.java
@@ -162,7 +162,7 @@ public class xlsParser extends AbstractParser implements Parser, HSSFListener {
         }
     }
 
-    public Hashtable getSupportedMimeTypes() {
+    public Hashtable<String, String> getSupportedMimeTypes() {
         return SUPPORTED_MIME_TYPES;
     }
diff --git a/source/de/anomic/plasma/parser/zip/zipParser.java b/source/de/anomic/plasma/parser/zip/zipParser.java
index d3fcfb798..2b48249ed 100644
--- a/source/de/anomic/plasma/parser/zip/zipParser.java
+++ b/source/de/anomic/plasma/parser/zip/zipParser.java
@@ -43,7 +43,6 @@
 
 package de.anomic.plasma.parser.zip;
 
-import de.anomic.htmlFilter.htmlFilterImageEntry;
 import java.io.BufferedOutputStream;
 import java.io.File;
 import java.io.FileOutputStream;
@@ -58,6 +57,7 @@
 import java.util.TreeSet;
 import java.util.zip.ZipEntry;
 import java.util.zip.ZipInputStream;
 
+import de.anomic.htmlFilter.htmlFilterImageEntry;
 import de.anomic.plasma.plasmaParser;
 import de.anomic.plasma.plasmaParserDocument;
 import de.anomic.plasma.parser.AbstractParser;
@@ -114,7 +114,7 @@
             StringBuffer docLongTitle = new StringBuffer();
             LinkedList docSections = new LinkedList();
             StringBuffer docAbstrct = new StringBuffer();
-            Map docAnchors = new HashMap();
+            Map<yacyURL, String> docAnchors = new HashMap<yacyURL, String>();
             TreeSet docImages = new TreeSet();
 
             // creating a new parser class to parse the unzipped content
@@ -160,15 +160,15 @@
                     // merging all documents together
                     if (docKeywords.length() > 0) docKeywords.append(",");
-                    docKeywords.append(subDoc.getKeywords(','));
+                    docKeywords.append(subDoc.dc_subject(','));
 
                     if (docLongTitle.length() > 0) docLongTitle.append("\n");
-                    docLongTitle.append(subDoc.getTitle());
+                    docLongTitle.append(subDoc.dc_title());
 
                     docSections.addAll(Arrays.asList(subDoc.getSectionTitles()));
 
                     if (docAbstrct.length() > 0) docAbstrct.append("\n");
-                    docAbstrct.append(subDoc.getAbstract());
+                    docAbstrct.append(subDoc.dc_description());
 
                     if (subDoc.getTextLength() > 0) {
                         if (docTextLength > 0) docText.write('\n');
diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java
index 6dae035c8..5b0a80cc5 100644
--- a/source/de/anomic/plasma/plasmaCondenser.java
+++ b/source/de/anomic/plasma/plasmaCondenser.java
@@ -73,6 +73,7 @@
 import de.anomic.kelondro.kelondroBitfield;
 import de.anomic.kelondro.kelondroMSetTools;
 import de.anomic.server.serverCodings;
 import de.anomic.yacy.yacySeedDB;
+import de.anomic.yacy.yacyURL;
 
 public final class plasmaCondenser {
@@ -130,9 +131,9 @@
 
         //System.out.println("DEBUG: condensing " + document.getMainLongTitle() + ", indexText=" + Boolean.toString(indexText) + ", indexMedia=" + Boolean.toString(indexMedia));
 
-        insertTextToWords(document.getLocation().toNormalform(false, true), 0, indexRWIEntry.flag_app_url, RESULT_FLAGS);
+        insertTextToWords(document.dc_source().toNormalform(false, true), 0, indexRWIEntry.flag_app_url, RESULT_FLAGS);
 
-        Map.Entry entry;
+        Map.Entry<yacyURL, String> entry;
         if (indexText) {
             createCondensement(document.getText(), document.getCharset());
             // the phrase counter:
@@ -147,9 +148,9 @@
             // phrase 99 is taken from the media Link url and anchor description
             // phrase 100 and above are lines from the text
 
-            insertTextToWords(document.getTitle(), 1, indexRWIEntry.flag_app_descr, RESULT_FLAGS);
-            insertTextToWords(document.getAbstract(), 3, indexRWIEntry.flag_app_descr, RESULT_FLAGS);
-            insertTextToWords(document.getAuthor(), 4, indexRWIEntry.flag_app_descr, RESULT_FLAGS);
+            insertTextToWords(document.dc_title(), 1, indexRWIEntry.flag_app_descr, RESULT_FLAGS);
+            insertTextToWords(document.dc_description(), 3, indexRWIEntry.flag_app_descr, RESULT_FLAGS);
+            insertTextToWords(document.dc_creator(), 4, indexRWIEntry.flag_app_descr, RESULT_FLAGS);
             // missing: tags!
             String[] titles = document.getSectionTitles();
             for (int i = 0; i < titles.length; i++) {
@@ -157,10 +158,10 @@
             }
 
             // anchors
-            Iterator i = document.getAnchors().entrySet().iterator();
+            Iterator<Map.Entry<yacyURL, String>> i = document.getAnchors().entrySet().iterator();
             while (i.hasNext()) {
-                entry = (Map.Entry) i.next();
-                insertTextToWords((String) entry.getKey(), 98, indexRWIEntry.flag_app_reference, RESULT_FLAGS);
+                entry = i.next();
+                insertTextToWords(entry.getKey().toNormalform(false, false), 98, indexRWIEntry.flag_app_reference, RESULT_FLAGS);
                 insertTextToWords((String) entry.getValue(), 98, indexRWIEntry.flag_app_reference, RESULT_FLAGS);
             }
         } else {
@@ -172,45 +173,45 @@
 
         if (indexMedia) {
             // audio
-            Iterator i = document.getAudiolinks().entrySet().iterator();
+            Iterator<Map.Entry<yacyURL, String>> i = document.getAudiolinks().entrySet().iterator();
             while (i.hasNext()) {
-                entry = (Map.Entry) i.next();
-                insertTextToWords((String) entry.getKey(), 99, flag_cat_hasaudio, RESULT_FLAGS);
+                entry = i.next();
+                insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasaudio, RESULT_FLAGS);
                 insertTextToWords((String) entry.getValue(), 99, flag_cat_hasaudio, RESULT_FLAGS);
             }
 
             // video
             i = document.getVideolinks().entrySet().iterator();
             while (i.hasNext()) {
-                entry = (Map.Entry) i.next();
-                insertTextToWords((String) entry.getKey(), 99, flag_cat_hasvideo, RESULT_FLAGS);
+                entry = i.next();
+                insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasvideo, RESULT_FLAGS);
                 insertTextToWords((String) entry.getValue(), 99, flag_cat_hasvideo, RESULT_FLAGS);
             }
 
             // applications
             i = document.getApplinks().entrySet().iterator();
             while (i.hasNext()) {
-                entry = (Map.Entry) i.next();
-                insertTextToWords((String) entry.getKey(), 99, flag_cat_hasapp, RESULT_FLAGS);
+                entry = i.next();
+                insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasapp, RESULT_FLAGS);
                 insertTextToWords((String) entry.getValue(), 99, flag_cat_hasapp, RESULT_FLAGS);
             }
 
             // images
-            i = document.getImages().iterator();
+            Iterator<htmlFilterImageEntry> j = document.getImages().iterator();
             htmlFilterImageEntry ientry;
-            while (i.hasNext()) {
-                ientry = (htmlFilterImageEntry) i.next();
-                insertTextToWords(ientry.url().toNormalform(false, true), 99, flag_cat_hasimage, RESULT_FLAGS);
+            while (j.hasNext()) {
+                ientry = j.next();
+                insertTextToWords(ientry.url().toNormalform(false, false), 99, flag_cat_hasimage, RESULT_FLAGS);
                 insertTextToWords(ientry.alt(), 99, flag_cat_hasimage, RESULT_FLAGS);
             }
 
             // finally check all words for missing flag entry
-            Iterator<Map.Entry<String, wordStatProp>> j = words.entrySet().iterator();
+            Iterator<Map.Entry<String, wordStatProp>> k = words.entrySet().iterator();
             wordStatProp wprop;
             Map.Entry<String, wordStatProp> we;
-            while (j.hasNext()) {
-                we = j.next();
-                wprop = (wordStatProp) we.getValue();
+            while (k.hasNext()) {
+                we = k.next();
+                wprop = we.getValue();
                 if (wprop.flags == null) {
                     wprop.flags = (kelondroBitfield) RESULT_FLAGS.clone();
                     words.put(we.getKey(), wprop);
@@ -305,19 +306,19 @@
     public static class wordStatProp {
         // object carries statistics for words and sentences
 
-        public int count;              // number of occurrences
-        public int posInText;          // unique handle, is initialized with word position (excluding double occurring words)
-        public int posInPhrase;        // position of word in phrase
-        public int numOfPhrase;        // number of phrase. 'normal' phrases begin with number 100
-        public HashSet hash;           // a set of handles to all sentences where this word appears
-        public kelondroBitfield flags; // the flag bits for each word
+        public int count;              // number of occurrences
+        public int posInText;          // unique handle, is initialized with word position (excluding double occurring words)
+        public int posInPhrase;        // position of word in phrase
+        public int numOfPhrase;        // number of phrase. 'normal' phrases begin with number 100
+        private HashSet<Integer> hash; // a set of handles to all sentences where this word appears
+        public kelondroBitfield flags; // the flag bits for each word
 
         public wordStatProp(int handle, int pip, int nop) {
             this.count = 1;
             this.posInText = handle;
             this.posInPhrase = pip;
             this.numOfPhrase = nop;
-            this.hash = new HashSet();
+            this.hash = new HashSet<Integer>();
             this.flags = null;
         }
 
@@ -326,7 +327,7 @@
         }
 
         public void check(int i) {
-            hash.add(Integer.toString(i));
+            hash.add(new Integer(i));
         }
     }
@@ -334,14 +335,14 @@
     public static class phraseStatProp {
         // object carries statistics for words and sentences
 
-        public int count;  // number of occurrences
-        public int handle; // unique handle, is initialized with sentence counter
-        public HashSet hash; //
+        public int count;  // number of occurrences
+        public int handle; // unique handle, is initialized with sentence counter
+        private HashSet<Integer> hash; //
 
         public phraseStatProp(int handle) {
             this.count = 1;
             this.handle = handle;
-            this.hash = new HashSet();
+            this.hash = new HashSet<Integer>();
         }
 
         public void inc() {
@@ -349,7 +350,7 @@
         }
 
         public void check(int i) {
-            hash.add(Integer.toString(i));
+            hash.add(new Integer(i));
         }
     }
 
@@ -362,7 +363,7 @@
     }
 
     private void createCondensement(InputStream is, String charset) throws UnsupportedEncodingException {
-        HashSet currsentwords = new HashSet();
+        HashSet<String> currsentwords = new HashSet<String>();
         StringBuffer sentence = new StringBuffer(100);
         String word = "";
         String k;
@@ -376,7 +377,6 @@
         int allsentencecounter = 0;
         int idx;
         int wordInSentenceCounter = 1;
-        Iterator it, it1;
         boolean comb_indexof = false, last_last = false, last_index = false;
         RandomAccessFile fa;
         final boolean dumpWords = false;
@@ -405,6 +405,7 @@
 
             // distinguish punctuation and words
             wordlen = word.length();
+            Iterator<String> it;
             if ((wordlen == 1) && (htmlFilterContentScraper.punctuation(word.charAt(0)))) {
                 // store sentence
                 if (sentence.length() > 0) {
@@ -493,9 +494,9 @@
         String[] s;
         int wc;
         Object o;
-        it = sentences.keySet().iterator();
-        while (it.hasNext()) {
-            o = it.next();
+        Iterator<StringBuffer> sit = sentences.keySet().iterator();
+        while (sit.hasNext()) {
+            o = sit.next();
             if (o != null) {
                 sentence = (StringBuffer) o;
                 wc = (sentence.length() - 1) / numlength;
@@ -511,15 +512,15 @@
             }
         }
 
-        Map.Entry entry;
+        Map.Entry<String, wordStatProp> entry;
         // we search for similar words and reorganize the corresponding sentences
         // a word is similar, if a shortened version is equal
-        it = words.entrySet().iterator(); // enumerates the keys in descending order
-        wordsearch: while (it.hasNext()) {
-            entry = (Map.Entry) it.next();
-            word = (String) entry.getKey();
+        Iterator<Map.Entry<String, wordStatProp>> wi = words.entrySet().iterator(); // enumerates the keys in descending order
+        wordsearch: while (wi.hasNext()) {
+            entry = wi.next();
+            word = entry.getKey();
             wordlen = word.length();
-            wsp = (wordStatProp) entry.getValue();
+            wsp = entry.getValue();
             for (int i = wordcut; i > 0; i--) {
                 if (wordlen > i) {
                     k = word.substring(0, wordlen - i);
@@ -528,9 +529,9 @@
                         // corresponding links
                         // in sentences that use this word
                         wsp1 = (wordStatProp) words.get(k);
-                        it1 = wsp.hash.iterator(); // we iterate over all sentences that refer to this word
+                        Iterator<Integer> it1 = wsp.hash.iterator(); // we iterate over all sentences that refer to this word
                         while (it1.hasNext()) {
-                            idx = Integer.parseInt((String) it1.next()); // number of a sentence
+                            idx = it1.next().intValue(); // number of a sentence
                             s = (String[]) orderedSentences[idx];
                             for (int j = 2; j < s.length; j++) {
                                 if (s[j].equals(intString(wsp.posInText, numlength)))
@@ -542,7 +543,7 @@
                         wsp1.count = wsp1.count + wsp.count;
                         words.put(k, wsp1);
                         // remove current word
-                        it.remove();
+                        wi.remove();
                         continue wordsearch;
                     }
                 }
@@ -550,8 +551,8 @@
         }
 
         // depending on the orderedSentences structure, we rebuild the sentence
-        // HashMap to eliminate double occuring sentences
-        sentences = new HashMap();
+        // HashMap to eliminate double occurring sentences
+        sentences = new HashMap<StringBuffer, phraseStatProp>();
         int le;
         for (int i = 0; i < orderedSentences.length; i++) {
             le = ((String[]) orderedSentences[i]).length;
@@ -560,7 +561,7 @@
                 sentence.append(((String[]) orderedSentences[i])[j]);
             if (sentences.containsKey(sentence)) {
                 // add sentence counter to counter of found sentence
-                psp = (phraseStatProp) sentences.get(sentence);
+                psp = sentences.get(sentence);
                 psp.count = psp.count + Integer.parseInt(((String[]) orderedSentences[i])[0]);
                 sentences.put(sentence, psp);
                 // System.out.println("Found double occurring sentence " + i + "
@@ -596,14 +597,14 @@
         // this structure is only needed to reconstruct the text
         String word;
         wordStatProp wsp;
-        Map.Entry entry;
-        Iterator it;
+        Map.Entry<String, wordStatProp> entry;
+        Iterator<Map.Entry<String, wordStatProp>> it;
         String[] orderedWords = new String[words.size() + 99]; // uuiiii, the '99' is only a quick hack...
         it = words.entrySet().iterator(); // enumerates the keys in ascending order
         while (it.hasNext()) {
-            entry = (Map.Entry) it.next();
-            word = (String) entry.getKey();
-            wsp = (wordStatProp) entry.getValue();
+            entry = it.next();
+            word = entry.getKey();
+            wsp = entry.getValue();
             orderedWords[wsp.posInText] = word;
         }
@@ -632,14 +633,14 @@
         // we reconstruct the sentence hashtable again and create by-handle ordered entries
         // this structure is needed to present the strings in the right order in a printout
         int wc;
-        Iterator it;
         phraseStatProp psp;
         String[] s;
         StringBuffer sentence;
         Object[] orderedSentences = new Object[sentences.size()];
-        for (int i = 0; i < sentences.size(); i++)
+        for (int i = 0; i < sentences.size(); i++) {
             orderedSentences[i] = null; // this array must be initialized
-        it = sentences.keySet().iterator();
+        }
+        Iterator<StringBuffer> it = sentences.keySet().iterator();
         while (it.hasNext()) {
             sentence = (StringBuffer) it.next();
             wc = (sentence.length() - 1) / numlength;
@@ -717,9 +718,9 @@
     }
 
-    private static class unsievedWordsEnum implements Enumeration {
+    private static class unsievedWordsEnum implements Enumeration<StringBuffer> {
         // returns an enumeration of StringBuffer Objects
-        Object buffer = null;
+        StringBuffer buffer = null;
         sentencesFromInputStreamEnum e;
         StringBuffer s;
 
@@ -770,8 +771,8 @@
             return buffer != null;
         }
 
-        public Object nextElement() {
-            Object r = buffer;
+        public StringBuffer nextElement() {
+            StringBuffer r = buffer;
             buffer = nextElement0();
             return r;
         }
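The phrase numbers passed to insertTextToWords in the plasmaCondenser hunks above encode where a term was found; collected from this diff (flag constants abbreviated to their suffix):

    phrase   0   -> dc_source() URL            (flag_app_url)
    phrase   1   -> dc_title()                 (flag_app_descr)
    phrase   3   -> dc_description()           (flag_app_descr)
    phrase   4   -> dc_creator()               (flag_app_descr)
    phrase  98   -> anchor URLs and link texts (flag_app_reference)
    phrase  99   -> media/image URLs and texts (flag_cat_has*)
    phrase 100+  -> sentences of the body text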
("/r=" + dp.referrer + ", d=" + dp.depth + ", c=" + dp.count) : " "); + ey = domnamesi.next(); + dp = ey.getValue(); + domname = ey.getKey() + ((attr) ? ("/r=" + dp.referrer + ", d=" + dp.depth + ", c=" + dp.count) : " "); } return domname; } diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index b66f4d5f4..c4f8dca6b 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -432,12 +432,13 @@ public final class plasmaParser { String fullClassName = plasmaParserPkgName + "." + currentDir.getName() + "." + className; try { // trying to load the parser class by its name - Class parserClass = Class.forName(fullClassName); - Object theParser = parserClass.newInstance(); - if (!(theParser instanceof Parser)) continue; + Class parserClass = Class.forName(fullClassName); + Object theParser0 = (Parser) parserClass.newInstance(); + if (!(theParser0 instanceof Parser)) continue; + Parser theParser = (Parser) theParser0; // testing if all needed libx libraries are available - String[] neededLibx = ((Parser)theParser).getLibxDependences(); + String[] neededLibx = theParser.getLibxDependences(); StringBuffer neededLibxBuf = new StringBuffer(); if (neededLibx != null) { for (int libxId=0; libxId < neededLibx.length; libxId++) { @@ -451,7 +452,7 @@ public final class plasmaParser { } // loading the list of mime-types that are supported by this parser class - Hashtable supportedMimeTypes = ((Parser) theParser).getSupportedMimeTypes(); + Hashtable supportedMimeTypes = theParser.getSupportedMimeTypes(); // creating a parser info object ParserInfo parserInfo = new ParserInfo(); @@ -462,7 +463,7 @@ public final class plasmaParser { parserInfo.parserVersionNr = ((Parser)theParser).getVersion(); parserInfo.parserName = ((Parser) theParser).getName(); - Iterator mimeTypeIterator = supportedMimeTypes.keySet().iterator(); + Iterator mimeTypeIterator = supportedMimeTypes.keySet().iterator(); while (mimeTypeIterator.hasNext()) { String mimeType = (String) mimeTypeIterator.next(); availableParserList.put(mimeType, parserInfo); @@ -490,9 +491,9 @@ public final class plasmaParser { public void close() { // clearing the parser list - Iterator configs = parserConfigList.values().iterator(); + Iterator configs = parserConfigList.values().iterator(); while (configs.hasNext()) { - plasmaParserConfig currentConfig = (plasmaParserConfig) configs.next(); + plasmaParserConfig currentConfig = configs.next(); synchronized (currentConfig.enabledParserList) { currentConfig.enabledParserList.clear(); } @@ -684,29 +685,24 @@ public final class plasmaParser { } public plasmaParserDocument transformScraper(yacyURL location, String mimeType, String charSet, htmlFilterContentScraper scraper) { - try { - String[] sections = new String[scraper.getHeadlines(1).length + scraper.getHeadlines(2).length + scraper.getHeadlines(3).length + scraper.getHeadlines(4).length]; - int p = 0; - for (int i = 1; i <= 4; i++) for (int j = 0; j < scraper.getHeadlines(i).length; j++) sections[p++] = scraper.getHeadlines(i)[j]; - plasmaParserDocument ppd = new plasmaParserDocument( - new yacyURL(location.toNormalform(true, true), null), - mimeType, - charSet, - scraper.getKeywords(), - scraper.getTitle(), - scraper.getAuthor(), - sections, - scraper.getDescription(), - scraper.getText(), - scraper.getAnchors(), - scraper.getImages()); - //scraper.close(); - ppd.setFavicon(scraper.getFavicon()); - return ppd; - } catch (MalformedURLException e) { - 
@@ -684,29 +685,24 @@
     }
 
     public plasmaParserDocument transformScraper(yacyURL location, String mimeType, String charSet, htmlFilterContentScraper scraper) {
-        try {
-            String[] sections = new String[scraper.getHeadlines(1).length + scraper.getHeadlines(2).length + scraper.getHeadlines(3).length + scraper.getHeadlines(4).length];
-            int p = 0;
-            for (int i = 1; i <= 4; i++) for (int j = 0; j < scraper.getHeadlines(i).length; j++) sections[p++] = scraper.getHeadlines(i)[j];
-            plasmaParserDocument ppd = new plasmaParserDocument(
-                    new yacyURL(location.toNormalform(true, true), null),
-                    mimeType,
-                    charSet,
-                    scraper.getKeywords(),
-                    scraper.getTitle(),
-                    scraper.getAuthor(),
-                    sections,
-                    scraper.getDescription(),
-                    scraper.getText(),
-                    scraper.getAnchors(),
-                    scraper.getImages());
-            //scraper.close();
-            ppd.setFavicon(scraper.getFavicon());
-            return ppd;
-        } catch (MalformedURLException e) {
-            //e.printStackTrace();
-            return null;
-        }
+        String[] sections = new String[scraper.getHeadlines(1).length + scraper.getHeadlines(2).length + scraper.getHeadlines(3).length + scraper.getHeadlines(4).length];
+        int p = 0;
+        for (int i = 1; i <= 4; i++) for (int j = 0; j < scraper.getHeadlines(i).length; j++) sections[p++] = scraper.getHeadlines(i)[j];
+        plasmaParserDocument ppd = new plasmaParserDocument(
+                location,
+                mimeType,
+                charSet,
+                scraper.getKeywords(),
+                scraper.getTitle(),
+                scraper.getAuthor(),
+                sections,
+                scraper.getDescription(),
+                scraper.getText(),
+                scraper.getAnchors(),
+                scraper.getImages());
+        //scraper.close();
+        ppd.setFavicon(scraper.getFavicon());
+        return ppd;
     }
 
    /**
@@ -737,7 +733,7 @@
                 Parser theParser = makeParser(parserClassName);
 
                 // checking if the created parser really supports the given mimetype
-                Hashtable supportedMimeTypes = theParser.getSupportedMimeTypes();
+                Hashtable<String, String> supportedMimeTypes = theParser.getSupportedMimeTypes();
                 if ((supportedMimeTypes != null) && (supportedMimeTypes.containsKey(mimeType))) {
                     parserInfo.incUsageCounter();
                     return theParser;
@@ -751,64 +747,73 @@
     }
 
-    static Map allReflinks(Set links) {
+    static Map<yacyURL, String> allReflinks(Set<?> links) {
         // links is either a Set of Strings (with urls) or htmlFilterImageEntries
         // we find all links that are part of a reference inside a url
-        HashMap v = new HashMap();
-        Iterator i = links.iterator();
+        HashMap<yacyURL, String> v = new HashMap<yacyURL, String>();
+        Iterator<?> i = links.iterator();
         Object o;
-        String url;
+        yacyURL url;
+        String u;
         int pos;
-        loop: while (i.hasNext()) {
+        loop: while (i.hasNext()) try {
             o = i.next();
-            if (o instanceof String) url = (String) o;
-            else if (o instanceof htmlFilterImageEntry) url = ((htmlFilterImageEntry) o).url().toNormalform(true, true);
+            if (o instanceof yacyURL) url = (yacyURL) o;
+            else if (o instanceof String) url = new yacyURL((String) o, null);
+            else if (o instanceof htmlFilterImageEntry) url = ((htmlFilterImageEntry) o).url();
             else {
                 assert false;
                 continue;
             }
-            if ((pos = url.toLowerCase().indexOf("http://",7)) > 0) {
+            u = url.toNormalform(true, true);
+            if ((pos = u.toLowerCase().indexOf("http://",7)) > 0) {
                 i.remove();
-                url = url.substring(pos);
-                while ((pos = url.toLowerCase().indexOf("http://",7)) > 0) url = url.substring(pos);
+                u = u.substring(pos);
+                while ((pos = u.toLowerCase().indexOf("http://",7)) > 0) u = u.substring(pos);
+                url = new yacyURL(u, null);
                 if (!(v.containsKey(url))) v.put(url, "ref");
                 continue loop;
             }
-            if ((pos = url.toLowerCase().indexOf("/www.",7)) > 0) {
+            if ((pos = u.toLowerCase().indexOf("/www.",7)) > 0) {
                 i.remove();
-                url = "http:/" + url.substring(pos);
-                while ((pos = url.toLowerCase().indexOf("/www.",7)) > 0) url = "http:/" + url.substring(pos);
+                u = "http:/" + u.substring(pos);
+                while ((pos = u.toLowerCase().indexOf("/www.",7)) > 0) u = "http:/" + u.substring(pos);
+                url = new yacyURL(u, null);
                 if (!(v.containsKey(url))) v.put(url, "ref");
                 continue loop;
             }
-        }
+        } catch (MalformedURLException e) {}
         return v;
     }
 
-    static Map allSubpaths(Set links) {
+    static Map<yacyURL, String> allSubpaths(Set<?> links) {
         // links is either a Set of Strings (urls) or a Set of htmlFilterImageEntries
-        HashMap v = new HashMap();
-        Iterator i = links.iterator();
+        HashMap<yacyURL, String> v = new HashMap<yacyURL, String>();
+        Iterator<?> i = links.iterator();
         Object o;
-        String url;
+        yacyURL url;
+        String u;
         int pos;
-        while (i.hasNext()) {
+        while (i.hasNext()) try {
             o = i.next();
-            if (o instanceof String) url = (String) o;
-            else if (o instanceof htmlFilterImageEntry) url = ((htmlFilterImageEntry) o).url().toNormalform(true, true);
+            if (o instanceof yacyURL) url = (yacyURL) o;
+            else if (o instanceof String) url = new yacyURL((String) o, null);
+            else if (o instanceof htmlFilterImageEntry) url = ((htmlFilterImageEntry) o).url();
             else {
                 assert false;
                 continue;
             }
-            if (url.endsWith("/")) url = url.substring(0, url.length() - 1);
-            pos = url.lastIndexOf("/");
+            u = url.toNormalform(true, true);
+            if (u.endsWith("/")) u = u.substring(0, u.length() - 1);
+            pos = u.lastIndexOf("/");
             while (pos > 8) {
-                url = url.substring(0, pos + 1);
+                u = u.substring(0, pos + 1);
+                url = new yacyURL(u, null);
                 if (!(v.containsKey(url))) v.put(url, "sub");
-                url = url.substring(0, pos);
-                pos = url.lastIndexOf("/");
+                u = u.substring(0, pos);
+                pos = u.lastIndexOf("/");
             }
-        }
+        } catch (MalformedURLException e) {}
         return v;
     }
@@ -883,24 +888,24 @@
             // printing out all parsed sentences
             if (document != null) {
                 System.out.print("Document titel: ");
-                System.out.println(document.getTitle());
+                System.out.println(document.dc_title());
 
                 // found text
-                final Iterator sentences = document.getSentences(false);
+                final Iterator<StringBuffer> sentences = document.getSentences(false);
                 int i = 0;
                 if (sentences != null) while (sentences.hasNext()) {
                     System.out.print("line " + i + ": ");
-                    System.out.println(((StringBuffer) sentences.next()).toString());
+                    System.out.println(sentences.next().toString());
                     i++;
                 }
 
                 // found links
                 int anchorNr = 0;
-                Map anchors = document.getAnchors();
-                Iterator anchorIter = anchors.keySet().iterator();
+                Map<yacyURL, String> anchors = document.getAnchors();
+                Iterator<yacyURL> anchorIter = anchors.keySet().iterator();
                 while (anchorIter.hasNext()) {
-                    String key = (String) anchorIter.next();
-                    System.out.println("URL " + anchorNr + ":\t" + key + " | " + anchors.get(key));
+                    yacyURL key = anchorIter.next();
+                    System.out.println("URL " + anchorNr + ":\t" + key.toString() + " | " + anchors.get(key));
                     anchorNr++;
                 }
                 document.close();
@@ -913,9 +918,9 @@
     public static boolean supportedContent(yacyURL url, String mimeType) {
         if (url == null) throw new NullPointerException();
 
-        Iterator configs = parserConfigList.values().iterator();
+        Iterator<plasmaParserConfig> configs = parserConfigList.values().iterator();
         while (configs.hasNext()) {
-            plasmaParserConfig currentConfig = (plasmaParserConfig) configs.next();
+            plasmaParserConfig currentConfig = configs.next();
             synchronized (currentConfig.enabledParserList) {
                 if (currentConfig.supportedContent(url, mimeType)) return true;
             }
@@ -944,7 +949,7 @@
         config.initParseableMimeTypes(configStr);
     }
 
-    public static String[] setEnabledParserList(String parserMode, Set mimeTypeSet) {
+    public static String[] setEnabledParserList(String parserMode, Set<String> mimeTypeSet) {
         if (!PARSER_MODE.contains(parserMode)) throw new IllegalArgumentException();
 
         plasmaParserConfig config = (plasmaParserConfig) parserConfigList.get(parserMode);
@@ -956,9 +961,9 @@
     }
 
     public static boolean supportedFileExtContains(String fileExt) {
-        Iterator configs = parserConfigList.values().iterator();
+        Iterator<plasmaParserConfig> configs = parserConfigList.values().iterator();
         while (configs.hasNext()) {
-            plasmaParserConfig currentConfig = (plasmaParserConfig) configs.next();
+            plasmaParserConfig currentConfig = configs.next();
             synchronized (currentConfig.enabledParserList) {
                 if (currentConfig.supportedFileExtContains(fileExt)) return true;
             }
@@ -968,9 +973,9 @@ public final class plasmaParser {
     }
 
     public static boolean supportedMimeTypesContains(String mimeType) {
-        Iterator configs = parserConfigList.values().iterator();
+        Iterator<plasmaParserConfig> configs = parserConfigList.values().iterator();
         while (configs.hasNext()) {
-            plasmaParserConfig currentConfig = (plasmaParserConfig) configs.next();
+            plasmaParserConfig currentConfig = configs.next();
             synchronized (currentConfig.enabledParserList) {
                 if (currentConfig.supportedMimeTypesContains(mimeType)) return true;
             }
@@ -985,7 +990,7 @@
             throw new IllegalArgumentException("The object key must be of type string.");
 
         // loading class by name
-        Class moduleClass = Class.forName((String)name);
+        Class<?> moduleClass = Class.forName((String)name);
 
         // instantiating class
         Parser theParser = (Parser) moduleClass.newInstance();
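To see what the reworked helper returns, trace allSubpaths with one hypothetical link (behavior read directly off the loop above; every parent path is emitted as a yacyURL mapped to "sub"):

    // input:  http://example.com/docs/api/index.html
    // yields: http://example.com/docs/api/
    //         http://example.com/docs/
    //         http://example.com/

allReflinks does the analogous thing for URLs embedded inside other URLs — a second "http://" or a "/www." found after position 7 — mapping each extracted yacyURL to "ref".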
diff --git a/source/de/anomic/plasma/plasmaParserConfig.java b/source/de/anomic/plasma/plasmaParserConfig.java
index 0b3ffad27..808334e99 100644
--- a/source/de/anomic/plasma/plasmaParserConfig.java
+++ b/source/de/anomic/plasma/plasmaParserConfig.java
@@ -65,12 +65,12 @@ public class plasmaParserConfig {
      * @see #loadEnabledParserList()
      * @see #setEnabledParserList(Enumeration)
      */
-    final HashSet enabledParserList = new HashSet();
+    final HashSet<String> enabledParserList = new HashSet<String>();
 
     /**
      * A list of file extensions that are supported by all enabled parsers
     */
-    final HashSet supportedFileExt = new HashSet();
+    final HashSet<String> supportedFileExt = new HashSet<String>();
 
     /**
      * Parsermode this configuration belongs to
@@ -134,29 +134,29 @@ public class plasmaParserConfig {
     }
 
     public void initParseableMimeTypes(String enabledMimeTypes) {
-        HashSet mimeTypes = null;
+        HashSet<String> mimeTypes = null;
         if ((enabledMimeTypes == null) || (enabledMimeTypes.length() == 0)) {
-            mimeTypes = new HashSet();
+            mimeTypes = new HashSet<String>();
         } else {
             String[] enabledMimeTypeList = enabledMimeTypes.split(",");
-            mimeTypes = new HashSet(enabledMimeTypeList.length);
+            mimeTypes = new HashSet<String>(enabledMimeTypeList.length);
             for (int i = 0; i < enabledMimeTypeList.length; i++) mimeTypes.add(enabledMimeTypeList[i].toLowerCase().trim());
         }
         setEnabledParserList(mimeTypes);
     }
 
     public void enableAllParsers() {
-        Set availableMimeTypes = plasmaParser.availableParserList.keySet();
+        Set<String> availableMimeTypes = plasmaParser.availableParserList.keySet();
         setEnabledParserList(availableMimeTypes);
     }
 
-    public String[] setEnabledParserList(Set mimeTypeSet) {
+    public String[] setEnabledParserList(Set<String> mimeTypeSet) {
 
-        HashSet newEnabledParsers = new HashSet();
-        HashSet newSupportedFileExt = new HashSet();
+        HashSet<String> newEnabledParsers = new HashSet<String>();
+        HashSet<String> newSupportedFileExt = new HashSet<String>();
 
         if (mimeTypeSet != null) {
-            Iterator mimeTypes = mimeTypeSet.iterator();
+            Iterator<String> mimeTypes = mimeTypeSet.iterator();
             while (mimeTypes.hasNext()) {
                 String mimeType = (String) mimeTypes.next();
                 if (plasmaParser.availableParserList.containsKey(mimeType)) {
@@ -166,7 +166,7 @@ public class plasmaParserConfig {
                         theParser = plasmaParser.makeParser(((ParserInfo)plasmaParser.availableParserList.get(mimeType)).parserClassName);
 
                         // getting a list of mimeTypes that the parser supports
-                        Hashtable parserSupportsMimeTypes = theParser.getSupportedMimeTypes();
+                        Hashtable<String, String> parserSupportsMimeTypes = theParser.getSupportedMimeTypes();
                         if (parserSupportsMimeTypes != null) {
                             Object supportedExtensions = parserSupportsMimeTypes.get(mimeType);
                             if ((supportedExtensions != null) &&
@@ -202,9 +202,10 @@ public class plasmaParserConfig {
         return (String[])newEnabledParsers.toArray(new String[newEnabledParsers.size()]);
     }
 
-    public HashSet getEnabledParserList() {
+    @SuppressWarnings("unchecked")
+    public HashSet<String> getEnabledParserList() {
         synchronized (this.enabledParserList) {
-            return (HashSet) this.enabledParserList.clone();
+            return (HashSet<String>) this.enabledParserList.clone();
         }
     }
 }
\ No newline at end of file
diff --git a/source/de/anomic/plasma/plasmaParserDocument.java b/source/de/anomic/plasma/plasmaParserDocument.java
index 6669f6942..014eedefc 100644
--- a/source/de/anomic/plasma/plasmaParserDocument.java
+++ b/source/de/anomic/plasma/plasmaParserDocument.java
@@ -48,7 +48,6 @@
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.net.MalformedURLException;
 
 import de.anomic.server.serverCachedFileOutputStream;
 import de.anomic.server.serverFileUtils;
@@ -67,21 +66,21 @@ import de.anomic.plasma.parser.Parser;
 
 public class plasmaParserDocument {
 
-    private yacyURL location;       // the source url
+    private yacyURL source;         // the source url
     private String mimeType;        // mimeType as taken from http header
     private String charset;         // the charset of the document
     private List keywords;          // most resources provide a keyword field
     private StringBuffer title;     // a document title, taken from title or h1 tag; shall appear as headline of search result
-    private StringBuffer author;    // author or copyright
+    private StringBuffer creator;   // author or copyright
     private List sections;          // if present: more titles/headlines appearing in the document
-    private StringBuffer abstrct;   // an abstract, if present: short content description
+    private StringBuffer description; // an abstract, if present: short content description
     private Object text;            // the clear text, all that is visible
-    private Map anchors;            // all links embedded as clickeable entities (anchor tags)
+    private Map<yacyURL, String> anchors; // all links embedded as clickeable entities (anchor tags)
     private TreeSet images;         // all visible pictures in document
     // the anchors and images - Maps are URL-to-EntityDescription mappings.
     // The EntityDescription appear either as visible text in anchors or as alternative
     // text in image tags.
-    private Map hyperlinks, audiolinks, videolinks, applinks;
+    private Map<yacyURL, String> hyperlinks, audiolinks, videolinks, applinks;
     private Map emaillinks;
     private yacyURL favicon;
     private boolean resorted;
@@ -90,16 +89,16 @@ public class plasmaParserDocument {
 
     protected plasmaParserDocument(yacyURL location, String mimeType, String charset,
                     String[] keywords, String title, String author,
                     String[] sections, String abstrct,
-                    Object text, Map anchors, TreeSet images) {
-        this.location = location;
+                    Object text, Map<yacyURL, String> anchors, TreeSet<htmlFilterImageEntry> images) {
+        this.source = location;
         this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType;
         this.charset = charset;
         this.keywords = (keywords == null) ? new LinkedList() : Arrays.asList(keywords);
         this.title = (title == null) ? new StringBuffer() : new StringBuffer(title);
-        this.author = (author == null) ? new StringBuffer() : new StringBuffer(author);
+        this.creator = (author == null) ? new StringBuffer() : new StringBuffer(author);
         this.sections = (sections == null) ? new LinkedList() : Arrays.asList(sections);
-        this.abstrct = (abstrct == null) ? new StringBuffer() : new StringBuffer(abstrct);
-        this.anchors = (anchors == null) ? new HashMap(0) : anchors;
+        this.description = (abstrct == null) ? new StringBuffer() : new StringBuffer(abstrct);
+        this.anchors = (anchors == null) ? new HashMap<yacyURL, String>(0) : anchors;
         this.images = (images == null) ? new TreeSet() : images;
         this.hyperlinks = null;
         this.audiolinks = null;
diff --git a/source/de/anomic/plasma/plasmaParserDocument.java b/source/de/anomic/plasma/plasmaParserDocument.java
index 6669f6942..014eedefc 100644
--- a/source/de/anomic/plasma/plasmaParserDocument.java
+++ b/source/de/anomic/plasma/plasmaParserDocument.java
@@ -48,7 +48,6 @@ import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.net.MalformedURLException;
 
 import de.anomic.server.serverCachedFileOutputStream;
 import de.anomic.server.serverFileUtils;
@@ -67,21 +66,21 @@ import de.anomic.plasma.parser.Parser;
 
 public class plasmaParserDocument {
 
-    private yacyURL location;       // the source url
+    private yacyURL source;         // the source url
     private String mimeType;        // mimeType as taken from http header
     private String charset;         // the charset of the document
     private List keywords;          // most resources provide a keyword field
     private StringBuffer title;     // a document title, taken from title or h1 tag; shall appear as headline of search result
-    private StringBuffer author;    // author or copyright
+    private StringBuffer creator;   // author or copyright
     private List sections;          // if present: more titles/headlines appearing in the document
-    private StringBuffer abstrct;   // an abstract, if present: short content description
+    private StringBuffer description; // an abstract, if present: short content description
     private Object text;            // the clear text, all that is visible
-    private Map anchors;            // all links embedded as clickeable entities (anchor tags)
+    private Map<yacyURL, String> anchors; // all links embedded as clickeable entities (anchor tags)
     private TreeSet images;         // all visible pictures in document
     // the anchors and images - Maps are URL-to-EntityDescription mappings.
     // The EntityDescription appear either as visible text in anchors or as alternative
     // text in image tags.
-    private Map hyperlinks, audiolinks, videolinks, applinks;
+    private Map<yacyURL, String> hyperlinks, audiolinks, videolinks, applinks;
     private Map emaillinks;
     private yacyURL favicon;
     private boolean resorted;
@@ -90,16 +89,16 @@ public class plasmaParserDocument {
 
     protected plasmaParserDocument(yacyURL location, String mimeType, String charset,
                     String[] keywords, String title, String author,
                     String[] sections, String abstrct,
-                    Object text, Map anchors, TreeSet images) {
-        this.location = location;
+                    Object text, Map<yacyURL, String> anchors, TreeSet images) {
+        this.source = location;
         this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType;
         this.charset = charset;
         this.keywords = (keywords == null) ? new LinkedList() : Arrays.asList(keywords);
         this.title = (title == null) ? new StringBuffer() : new StringBuffer(title);
-        this.author = (author == null) ? new StringBuffer() : new StringBuffer(author);
+        this.creator = (author == null) ? new StringBuffer() : new StringBuffer(author);
         this.sections = (sections == null) ? new LinkedList() : Arrays.asList(sections);
-        this.abstrct = (abstrct == null) ? new StringBuffer() : new StringBuffer(abstrct);
-        this.anchors = (anchors == null) ? new HashMap(0) : anchors;
+        this.description = (abstrct == null) ? new StringBuffer() : new StringBuffer(abstrct);
+        this.anchors = (anchors == null) ? new HashMap<yacyURL, String>(0) : anchors;
         this.images = (images == null) ? new TreeSet() : images;
         this.hyperlinks = null;
         this.audiolinks = null;
@@ -125,32 +124,90 @@ public class plasmaParserDocument {
 
     public plasmaParserDocument(yacyURL location, String mimeType, String charset,
                     String[] keywords, String title, String author,
                     String[] sections, String abstrct,
-                    byte[] text, Map anchors, TreeSet images) {
+                    byte[] text, Map<yacyURL, String> anchors, TreeSet images) {
         this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
     }
 
     public plasmaParserDocument(yacyURL location, String mimeType, String charset,
                     String[] keywords, String title, String author,
                     String[] sections, String abstrct,
-                    File text, Map anchors, TreeSet images) {
+                    File text, Map<yacyURL, String> anchors, TreeSet images) {
         this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
     }
 
     public plasmaParserDocument(yacyURL location, String mimeType, String charset,
                     String[] keywords, String title, String author,
                     String[] sections, String abstrct,
-                    serverCachedFileOutputStream text, Map anchors, TreeSet images) {
+                    serverCachedFileOutputStream text, Map<yacyURL, String> anchors, TreeSet images) {
         this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
     }
 
-    public yacyURL getLocation() {
-        return this.location;
+    /*
+DC according to rfc 5013
+
+* dc_title
+* dc_creator
+* dc_subject
+* dc_description
+* dc_publisher
+dc_contributor
+dc_date
+dc_type
+* dc_format
+* dc_identifier
+* dc_source
+dc_language
+dc_relation
+dc_coverage
+dc_rights
+     */
+
+    public String dc_title() {
+        return title.toString();
+    }
+
+    public String dc_creator() {
+        if (creator != null) return creator.toString(); else return new String();
     }
 
-    public String getMimeType() {
+    public String dc_subject(char separator) {
+        // sort out doubles and empty words
+        TreeSet hs = new TreeSet();
+        String s;
+        for (int i = 0; i < this.keywords.size(); i++) {
+            if (this.keywords.get(i) == null) continue;
+            s = ((String)this.keywords.get(i)).trim();
+            if (s.length() > 0) hs.add(s.toLowerCase());
+        }
+        if (hs.size() == 0) return "";
+        // generate a new list
+        StringBuffer sb = new StringBuffer(this.keywords.size() * 6);
+        Iterator i = hs.iterator();
+        while (i.hasNext()) sb.append(i.next()).append(separator);
+        return sb.substring(0, sb.length() - 1);
+    }
+
+    public String dc_description() {
+        if (description != null) return description.toString(); else return dc_title();
+    }
+
+    public String dc_publisher() {
+        // if we don't have a publisher, simply return the host/domain name
+        return this.source.getHost();
+    }
+
+    public String dc_format() {
         return this.mimeType;
     }
 
+    public String dc_identifier() {
+        return "yacy.net:" + this.source.hash();
+    }
+
+    public yacyURL dc_source() {
+        return this.source;
+    }
+
     /**
      * @return the supposed charset of this document or null if unknown
      */
@@ -158,26 +215,14 @@ public class plasmaParserDocument {
         return this.charset;
     }
 
-    public String getTitle() {
-        return title.toString();
-    }
-
     public String[] getSectionTitles() {
         if (sections != null) {
             return (String[])sections.toArray(new String[this.sections.size()]);
         } else {
-            return new String[] { getTitle() };
+            return new String[] { dc_title() };
         }
     }
 
-    public String getAbstract() {
-        if (abstrct != null) return abstrct.toString(); else return getTitle();
-    }
-
-    public String getAuthor() {
-        if (author != null) return author.toString(); else return new String();
-    }
-
     public InputStream getText() {
         try {
             if (this.text == null) return null;
@@ -236,28 +281,11 @@ public class plasmaParserDocument {
         return e;
     }
 
-    public String getKeywords(char separator) {
-        // sort out doubles and empty words
-        TreeSet hs = new TreeSet();
-        String s;
-        for (int i = 0; i < this.keywords.size(); i++) {
-            if (this.keywords.get(i) == null) continue;
-            s = ((String)this.keywords.get(i)).trim();
-            if (s.length() > 0) hs.add(s.toLowerCase());
-        }
-        if (hs.size() == 0) return "";
-        // generate a new list
-        StringBuffer sb = new StringBuffer(this.keywords.size() * 6);
-        Iterator i = hs.iterator();
-        while (i.hasNext()) sb.append(i.next()).append(separator);
-        return sb.substring(0, sb.length() - 1);
-    }
-
     public List getKeywords() {
         return this.keywords;
     }
 
-    public Map getAnchors() {
+    public Map<yacyURL, String> getAnchors() {
         // returns all links embedded as anchors (clickeable entities)
         // this is a url(String)/text(String) map
         return anchors;
@@ -266,18 +294,18 @@ public class plasmaParserDocument {
 
     // the next three methods provide a calculated view on the getAnchors/getImages:
 
-    public Map getHyperlinks() {
+    public Map<yacyURL, String> getHyperlinks() {
         // this is a subset of the getAnchor-set: only links to other hyperrefs
         if (!resorted) resortLinks();
         return hyperlinks;
     }
 
-    public Map getAudiolinks() {
+    public Map<yacyURL, String> getAudiolinks() {
         if (!resorted) resortLinks();
         return this.audiolinks;
     }
 
-    public Map getVideolinks() {
+    public Map<yacyURL, String> getVideolinks() {
         if (!resorted) resortLinks();
         return this.videolinks;
     }
@@ -289,7 +317,7 @@ public class plasmaParserDocument {
         return images;
     }
 
-    public Map getApplinks() {
+    public Map<yacyURL, String> getApplinks() {
         if (!resorted) resortLinks();
         return this.applinks;
     }
@@ -307,17 +335,19 @@ public class plasmaParserDocument {
         String u;
         int extpos, qpos;
         String ext = null;
-        Iterator<Map.Entry<String, String>> i = anchors.entrySet().iterator();
-        hyperlinks = new HashMap<String, String>();
-        videolinks = new HashMap<String, String>();
-        audiolinks = new HashMap<String, String>();
-        applinks = new HashMap<String, String>();
+        Iterator<Map.Entry<yacyURL, String>> i = anchors.entrySet().iterator();
+        hyperlinks = new HashMap<yacyURL, String>();
+        videolinks = new HashMap<yacyURL, String>();
+        audiolinks = new HashMap<yacyURL, String>();
+        applinks = new HashMap<yacyURL, String>();
         emaillinks = new HashMap();
         TreeSet collectedImages = new TreeSet(); // this is a set that is collected now and joined later to the imagelinks
-        Map.Entry<String, String> entry;
+        Map.Entry<yacyURL, String> entry;
         while (i.hasNext()) {
             entry = i.next();
-            u = entry.getKey();
+            url = entry.getKey();
+            if (url == null) continue;
+            u = url.toNormalform(true, false);
             if ((u != null) && (u.startsWith("mailto:"))) {
                 emaillinks.put(u.substring(7), entry.getValue());
             } else {
@@ -328,21 +358,16 @@ public class plasmaParserDocument {
                 } else {
                     ext = u.substring(extpos + 1).toLowerCase();
                 }
-                try {
-                    url = new yacyURL(u, null);
-                    u = url.toNormalform(true, true);
-                    if (plasmaParser.mediaExtContains(ext)) {
-                        // this is not a normal anchor, its a media link
-                        if (plasmaParser.imageExtContains(ext)) {
-                            collectedImages.add(new htmlFilterImageEntry(url, (String) entry.getValue(), -1, -1));
-                        }
-                        else if (plasmaParser.audioExtContains(ext)) audiolinks.put(u, (String)entry.getValue());
-                        else if (plasmaParser.videoExtContains(ext)) videolinks.put(u, (String)entry.getValue());
-                        else if (plasmaParser.appsExtContains(ext)) applinks.put(u, (String)entry.getValue());
-                    } else {
-                        hyperlinks.put(u, (String)entry.getValue());
+                if (plasmaParser.mediaExtContains(ext)) {
+                    // this is not a normal anchor, its a media link
+                    if (plasmaParser.imageExtContains(ext)) {
+                        collectedImages.add(new htmlFilterImageEntry(url, (String) entry.getValue(), -1, -1));
                     }
-                } catch (MalformedURLException e1) {
+                    else if (plasmaParser.audioExtContains(ext)) audiolinks.put(url, (String)entry.getValue());
+                    else if (plasmaParser.videoExtContains(ext)) videolinks.put(url, (String)entry.getValue());
+                    else if (plasmaParser.appsExtContains(ext)) applinks.put(url, (String)entry.getValue());
+                } else {
+                    hyperlinks.put(url, (String) entry.getValue());
                 }
             }
         }
@@ -378,12 +403,12 @@ public class plasmaParserDocument {
         this.sections.addAll(Arrays.asList(doc.getSectionTitles()));
 
         if (this.title.length() > 0) this.title.append('\n');
-        this.title.append(doc.getTitle());
+        this.title.append(doc.dc_title());
 
         this.keywords.addAll(doc.getKeywords());
 
-        if (this.abstrct.length() > 0) this.abstrct.append('\n');
-        this.abstrct.append(doc.getAbstract());
+        if (this.description.length() > 0) this.description.append('\n');
+        this.description.append(doc.dc_description());
 
         if (!(this.text instanceof serverCachedFileOutputStream)) {
             this.text = new serverCachedFileOutputStream(Parser.MAX_KEEP_IN_MEMORY_SIZE);
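
Note: with this renaming, plasmaParserDocument exposes its metadata through Dublin Core style accessors (the RFC 5013 element names in the comment block above). A sketch of how a caller reads them, assuming a parsed plasmaParserDocument instance is in scope (hypothetical usage, not code from the patch):

    // assumes: import de.anomic.plasma.plasmaParserDocument; import de.anomic.yacy.yacyURL;
    static void printDublinCore(plasmaParserDocument doc) {
        String title  = doc.dc_title();       // was getTitle()
        String author = doc.dc_creator();     // was getAuthor(); empty string if unknown
        String tags   = doc.dc_subject(',');  // was getKeywords(','); deduplicated, lower-cased
        String mime   = doc.dc_format();      // was getMimeType()
        yacyURL where = doc.dc_source();      // was getLocation()
        System.out.println(title + " by " + author + " (" + mime + ") at "
                + where.toNormalform(false, true) + " [" + tags + "]");
    }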
" + ((desc.length() == 0) ? url : desc) + ""; continue; @@ -643,22 +645,23 @@ public class plasmaSnippetCache { public static ArrayList computeMediaSnippets(plasmaParserDocument document, Set queryhashes, int mediatype) { if (document == null) return new ArrayList(); - Map media = null; + Map media = null; if (mediatype == plasmaSearchQuery.CONTENTDOM_AUDIO) media = document.getAudiolinks(); else if (mediatype == plasmaSearchQuery.CONTENTDOM_VIDEO) media = document.getVideolinks(); else if (mediatype == plasmaSearchQuery.CONTENTDOM_APP) media = document.getApplinks(); if (media == null) return null; - Iterator> i = media.entrySet().iterator(); - Map.Entry entry; - String url, desc; + Iterator> i = media.entrySet().iterator(); + Map.Entry entry; + yacyURL url; + String desc; Set s; ArrayList result = new ArrayList(); while (i.hasNext()) { entry = i.next(); url = entry.getKey(); desc = entry.getValue(); - s = removeAppearanceHashes(url, queryhashes); + s = removeAppearanceHashes(url.toNormalform(false, false), queryhashes); if (s.size() == 0) { result.add(new MediaSnippet(mediatype, url, desc, null)); continue; @@ -678,14 +681,15 @@ public class plasmaSnippetCache { Iterator i = images.iterator(); htmlFilterImageEntry ientry; - String url, desc; + yacyURL url; + String desc; Set s; ArrayList result = new ArrayList(); while (i.hasNext()) { ientry = i.next(); - url = ientry.url().toNormalform(true, true); + url = ientry.url(); desc = ientry.alt(); - s = removeAppearanceHashes(url, queryhashes); + s = removeAppearanceHashes(url.toNormalform(false, false), queryhashes); if (s.size() == 0) { result.add(new MediaSnippet(plasmaSearchQuery.CONTENTDOM_IMAGE, url, desc, ientry.width() + " x " + ientry.height())); continue; diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 9f1fa0262..f2b038ffe 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -138,7 +138,7 @@ import de.anomic.plasma.crawler.plasmaCrawlQueues; import de.anomic.plasma.crawler.plasmaProtocolLoader; import de.anomic.plasma.dbImport.dbImportManager; import de.anomic.plasma.parser.ParserException; -import de.anomic.plasma.plasmaSwitchboardQueue.Entry; +import de.anomic.plasma.plasmaCondenser.wordStatProp; import de.anomic.plasma.urlPattern.defaultURLPattern; import de.anomic.plasma.urlPattern.plasmaURLPattern; import de.anomic.server.serverAbstractSwitch; @@ -998,8 +998,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser this.log.logConfig("Starting blacklist engine ..."); try { - Class blacklistClass = Class.forName(blacklistClassName); - Constructor blacklistClassConstr = blacklistClass.getConstructor( new Class[] { File.class } ); + Class blacklistClass = Class.forName(blacklistClassName); + Constructor blacklistClassConstr = blacklistClass.getConstructor( new Class[] { File.class } ); urlBlacklist = (plasmaURLPattern) blacklistClassConstr.newInstance(new Object[] { blacklistsPath }); this.log.logFine("Used blacklist engine class: " + blacklistClassName); this.log.logConfig("Using blacklist engine: " + urlBlacklist.getEngineInfo()); @@ -1276,8 +1276,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser String wikiParserClassName = getConfig(WIKIPARSER_CLASS, WIKIPARSER_CLASS_DEFAULT); this.log.logConfig("Loading wiki parser " + wikiParserClassName + " ..."); try { - Class wikiParserClass = Class.forName(wikiParserClassName); - Constructor 
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index 9f1fa0262..f2b038ffe 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -138,7 +138,7 @@ import de.anomic.plasma.crawler.plasmaCrawlQueues;
 import de.anomic.plasma.crawler.plasmaProtocolLoader;
 import de.anomic.plasma.dbImport.dbImportManager;
 import de.anomic.plasma.parser.ParserException;
-import de.anomic.plasma.plasmaSwitchboardQueue.Entry;
+import de.anomic.plasma.plasmaCondenser.wordStatProp;
 import de.anomic.plasma.urlPattern.defaultURLPattern;
 import de.anomic.plasma.urlPattern.plasmaURLPattern;
 import de.anomic.server.serverAbstractSwitch;
@@ -998,8 +998,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
         this.log.logConfig("Starting blacklist engine ...");
         try {
-            Class blacklistClass = Class.forName(blacklistClassName);
-            Constructor blacklistClassConstr = blacklistClass.getConstructor( new Class[] { File.class } );
+            Class<?> blacklistClass = Class.forName(blacklistClassName);
+            Constructor<?> blacklistClassConstr = blacklistClass.getConstructor( new Class[] { File.class } );
             urlBlacklist = (plasmaURLPattern) blacklistClassConstr.newInstance(new Object[] { blacklistsPath });
             this.log.logFine("Used blacklist engine class: " + blacklistClassName);
             this.log.logConfig("Using blacklist engine: " + urlBlacklist.getEngineInfo());
@@ -1276,8 +1276,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
         String wikiParserClassName = getConfig(WIKIPARSER_CLASS, WIKIPARSER_CLASS_DEFAULT);
         this.log.logConfig("Loading wiki parser " + wikiParserClassName + " ...");
         try {
-            Class wikiParserClass = Class.forName(wikiParserClassName);
-            Constructor wikiParserClassConstr = wikiParserClass.getConstructor(new Class[] { plasmaSwitchboard.class });
+            Class<?> wikiParserClass = Class.forName(wikiParserClassName);
+            Constructor<?> wikiParserClassConstr = wikiParserClass.getConstructor(new Class[] { plasmaSwitchboard.class });
             wikiParser = (wikiParser)wikiParserClassConstr.newInstance(new Object[] { this });
         } catch (Exception e) {
             this.log.logSevere("Unable to load wiki parser, the wiki won't work", e);
@@ -1532,11 +1532,11 @@
         this.defaultRemoteProfile = null;
         this.defaultTextSnippetProfile = null;
         this.defaultMediaSnippetProfile = null;
-        Iterator i = this.profilesActiveCrawls.profiles(true);
+        Iterator<plasmaCrawlProfile.entry> i = this.profilesActiveCrawls.profiles(true);
         plasmaCrawlProfile.entry profile;
         String name;
         while (i.hasNext()) {
-            profile = (plasmaCrawlProfile.entry) i.next();
+            profile = i.next();
             name = profile.name();
             if (name.equals(CRAWL_PROFILE_PROXY)) this.defaultProxyProfile = profile;
             if (name.equals(CRAWL_PROFILE_REMOTE)) this.defaultRemoteProfile = profile;
@@ -1606,7 +1606,7 @@
             (crawlStacker != null && crawlStacker.size() > 0) ||
             (crawlQueues.noticeURL.notEmpty()))
             return false;
-        final Iterator iter = profilesActiveCrawls.profiles(true);
+        final Iterator<plasmaCrawlProfile.entry> iter = profilesActiveCrawls.profiles(true);
         plasmaCrawlProfile.entry entry;
         boolean hasDoneSomething = false;
         try {
@@ -1615,7 +1615,7 @@
                 if (Thread.currentThread().isInterrupted()) throw new InterruptedException("Shutdown in progress");
 
                 // getting next profile
-                entry = (plasmaCrawlProfile.entry) iter.next();
+                entry = iter.next();
                 if (!((entry.name().equals(CRAWL_PROFILE_PROXY)) ||
                       (entry.name().equals(CRAWL_PROFILE_REMOTE)) ||
                       (entry.name().equals(CRAWL_PROFILE_SNIPPET_TEXT)) ||
@@ -1961,13 +1961,13 @@
         // clean up seed-dbs
         if(getConfigBool("routing.deleteOldSeeds.permission",true)) {
             final long deleteOldSeedsTime = getConfigLong("routing.deleteOldSeeds.time",7)*24*3600000;
-            Iterator e = yacyCore.seedDB.seedsSortedDisconnected(true,yacySeed.LASTSEEN);
+            Iterator<yacySeed> e = yacyCore.seedDB.seedsSortedDisconnected(true,yacySeed.LASTSEEN);
             yacySeed seed = null;
-            ArrayList deleteQueue = new ArrayList();
+            ArrayList<String> deleteQueue = new ArrayList<String>();
             checkInterruption();
             //clean passive seeds
             while(e.hasNext()) {
-                seed = (yacySeed)e.next();
+                seed = e.next();
                 if(seed != null) {
                     //list is sorted -> break when peers are too young to delete
                     if(seed.getLastSeenUTC() > (System.currentTimeMillis()-deleteOldSeedsTime))
@@ -2183,24 +2183,19 @@
             ((processCase == PROCESSCASE_4_PROXY_LOAD) || (processCase == PROCESSCASE_5_LOCAL_CRAWLING)) &&
             ((entry.profile() == null) || (entry.depth() < entry.profile().generalDepth()))
         ) {
-            Map hl = document.getHyperlinks();
-            Iterator i = hl.entrySet().iterator();
-            String nextUrlString;
+            Map<yacyURL, String> hl = document.getHyperlinks();
+            Iterator<Map.Entry<yacyURL, String>> i = hl.entrySet().iterator();
             yacyURL nextUrl;
-            Map.Entry nextEntry;
+            Map.Entry<yacyURL, String> nextEntry;
             while (i.hasNext()) {
                 // check for interruption
                 checkInterruption();
 
                 // fetching the next hyperlink
-                nextEntry = (Map.Entry) i.next();
-                nextUrlString = (String) nextEntry.getKey();
-                try {
-                    nextUrl = new yacyURL(nextUrlString, null);
-
-                    // enqueue the hyperlink into the pre-notice-url db
-                    crawlStacker.enqueueEntry(nextUrl, entry.urlHash(), initiatorPeerHash, (String) nextEntry.getValue(), docDate, entry.depth() + 1, entry.profile());
-                } catch (MalformedURLException e1) {}
+                nextEntry = i.next();
+                nextUrl = nextEntry.getKey();
+                // enqueue the hyperlink into the pre-notice-url db
+                crawlStacker.enqueueEntry(nextUrl, entry.urlHash(), initiatorPeerHash, nextEntry.getValue(), docDate, entry.depth() + 1, entry.profile());
             }
             log.logInfo("CRAWL: ADDED " + hl.size() + " LINKS FROM " + entry.url().toNormalform(false, true) +
                         ", NEW CRAWL STACK SIZE IS " + crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE));
@@ -2210,7 +2205,7 @@
         /* =========================================================================
          * CREATE INDEX
          * ========================================================================= */
-        String docDescription = document.getTitle();
+        String docDescription = document.dc_title();
         yacyURL referrerURL = entry.referrerURL();
 
         String noIndexReason = plasmaCrawlEURL.DENIED_UNSPECIFIED_INDEXING_ERROR;
@@ -2242,8 +2237,8 @@
                 indexURLEntry newEntry = new indexURLEntry(
                         entry.url(),                               // URL
                         docDescription,                            // document description
-                        document.getAuthor(),                      // author
-                        document.getKeywords(' '),                 // tags
+                        document.dc_creator(),                     // author
+                        document.dc_subject(' '),                  // tags
                         "",                                        // ETag
                         docDate,                                   // modification date
                         new Date(),                                // loaded date
@@ -2252,7 +2247,7 @@
                         new byte[0],                               // md5
                         (int) entry.size(),                        // size
                         condenser.RESULT_NUMB_WORDS,               // word count
-                        plasmaHTCache.docType(document.getMimeType()), // doctype
+                        plasmaHTCache.docType(document.dc_format()),   // doctype
                         condenser.RESULT_FLAGS,                    // flags
                         yacyURL.language(entry.url()),             // language
                         ioLinks[0].intValue(),                     // llocal
@@ -2314,7 +2309,7 @@
                     document,                                     // document content
                     condenser,                                    // document condenser
                     yacyURL.language(entry.url()),                // document language
-                    plasmaHTCache.docType(document.getMimeType()),// document type
+                    plasmaHTCache.docType(document.dc_format()),  // document type
                     ioLinks[0].intValue(),                        // outlinkSame
                     ioLinks[1].intValue()                         // outlinkOthers
             );
@@ -2322,31 +2317,31 @@
             /* ========================================================================
              * SEND PAGE INDEX TO STORAGE PEER
              * ======================================================================== */
-            HashMap urlCache = new HashMap(1);
-            urlCache.put(newEntry.hash(),newEntry);
+            HashMap<String, indexURLEntry> urlCache = new HashMap<String, indexURLEntry>(1);
+            urlCache.put(newEntry.hash(), newEntry);
 
-            ArrayList tmpContainers = new ArrayList(condenser.words().size());
+            ArrayList<indexContainer> tmpContainers = new ArrayList<indexContainer>(condenser.words().size());
 
             String language = yacyURL.language(entry.url());
-            char doctype = plasmaHTCache.docType(document.getMimeType());
+            char doctype = plasmaHTCache.docType(document.dc_format());
             indexURLEntry.Components comp = newEntry.comp();
             int urlLength = comp.url().toNormalform(true, true).length();
             int urlComps = htmlFilterContentScraper.urlComps(comp.url().toNormalform(true, true)).length;
 
             // iterate over all words
-            Iterator i = condenser.words().entrySet().iterator();
-            Map.Entry wentry;
+            Iterator<Map.Entry<String, wordStatProp>> i = condenser.words().entrySet().iterator();
+            Map.Entry<String, wordStatProp> wentry;
             plasmaCondenser.wordStatProp wordStat;
             while (i.hasNext()) {
-                wentry = (Map.Entry) i.next();
-                String word = (String) wentry.getKey();
-                wordStat = (plasmaCondenser.wordStatProp) wentry.getValue();
+                wentry = i.next();
+                String word = wentry.getKey();
+                wordStat = wentry.getValue();
                 String wordHash = plasmaCondenser.word2hash(word);
                 indexRWIEntry wordIdxEntry = new indexRWIRowEntry(
                             urlHash,
                             urlLength, urlComps,
                             wordStat.count,
-                            document.getTitle().length(),
+                            document.dc_title().length(),
                             condenser.words().size(),
                             condenser.sentences().size(),
                             wordStat.posInText,
@@ -2371,7 +2366,7 @@
                 // transfering the index to the storage peer
                 indexContainer[] indexData = (indexContainer[]) tmpContainers.toArray(new indexContainer[tmpContainers.size()]);
-                HashMap resultObj = yacyClient.transferIndex(
+                HashMap<String, Object> resultObj = yacyClient.transferIndex(
                             seed,       // target seed
                             indexData,  // word index data
                             urlCache,   // urls
@@ -2392,7 +2387,7 @@
                     document,
                     condenser,
                     yacyURL.language(entry.url()),
-                    plasmaHTCache.docType(document.getMimeType()),
+                    plasmaHTCache.docType(document.dc_format()),
                     ioLinks[0].intValue(),
                     ioLinks[1].intValue()
             );
@@ -2412,7 +2407,7 @@
             log.logInfo("*Indexed " + words + " words in URL " + entry.url() +
                     " [" + entry.urlHash() + "]" +
                     "\n\tDescription: " + docDescription +
-                    "\n\tMimeType: " + document.getMimeType() + " | Charset: " + document.getCharset() + " | " +
+                    "\n\tMimeType: " + document.dc_format() + " | Charset: " + document.getCharset() + " | " +
                     "Size: " + document.getTextLength() + " bytes | " +
                     "Anchors: " + ((document.getAnchors() == null) ? 0 : document.getAnchors().size()) +
                     "\n\tStackingTime: " + (stackEndTime-stackStartTime) + " ms | " +
@@ -2548,7 +2543,7 @@
         plasmaParserDocument document = plasmaSnippetCache.parseDocument(comp.url(), resourceContentLength.longValue(), resourceContent);
 
         // get the word set
-        Set words = null;
+        Set<String> words = null;
         try {
             words = new plasmaCondenser(document, true, true).words().keySet();
         } catch (UnsupportedEncodingException e) {
@@ -2642,10 +2637,10 @@
     }
 
-    public static int accessFrequency(HashMap tracker, String host) {
+    public static int accessFrequency(HashMap<String, TreeSet<Long>> tracker, String host) {
         // returns the access frequency in queries per hour for a given host and a specific tracker
         long timeInterval = 1000 * 60 * 60;
-        TreeSet accessSet = (TreeSet) tracker.get(host);
+        TreeSet<Long> accessSet = tracker.get(host);
         if (accessSet == null) return 0;
         return accessSet.tailSet(new Long(System.currentTimeMillis() - timeInterval)).size();
     }
@@ -2769,7 +2764,7 @@
         try {
             // find a list of DHT-peers
             double maxDist = 0.2;
-            ArrayList seeds = yacyCore.dhtAgent.getDHTTargets(log, peerCount, Math.min(8, (int) (yacyCore.seedDB.sizeConnected() * maxDist)), dhtChunk.firstContainer().getWordHash(), dhtChunk.lastContainer().getWordHash(), maxDist);
+            ArrayList<yacySeed> seeds = yacyCore.dhtAgent.getDHTTargets(log, peerCount, Math.min(8, (int) (yacyCore.seedDB.sizeConnected() * maxDist)), dhtChunk.firstContainer().getWordHash(), dhtChunk.lastContainer().getWordHash(), maxDist);
             if (seeds.size() < peerCount) {
                 log.logWarning("found not enough (" + seeds.size() + ") peers for distribution for dhtchunk [" + dhtChunk.firstContainer().getWordHash() + " .. " + dhtChunk.lastContainer().getWordHash() + "]");
                 return false;
@@ -2784,8 +2779,8 @@
             int retries = 0;
 
             // starting up multiple DHT transfer threads
-            Iterator seedIter = seeds.iterator();
-            ArrayList transfer = new ArrayList(peerCount);
+            Iterator<yacySeed> seedIter = seeds.iterator();
+            ArrayList<plasmaDHTTransfer> transfer = new ArrayList<plasmaDHTTransfer>(peerCount);
             while (hc1 < peerCount && (transfer.size() > 0 || seedIter.hasNext())) {
 
                 // starting up some transfer threads
@@ -2804,12 +2799,12 @@
                 }
 
                 // waiting for the transfer threads to finish
-                Iterator transferIter = transfer.iterator();
+                Iterator<plasmaDHTTransfer> transferIter = transfer.iterator();
                 while (transferIter.hasNext()) {
                     // check for interruption
                     checkInterruption();
 
-                    plasmaDHTTransfer t = (plasmaDHTTransfer)transferIter.next();
+                    plasmaDHTTransfer t = transferIter.next();
                     if (!t.isAlive()) {
                         // remove finished thread from the list
                         transferIter.remove();
diff --git a/source/de/anomic/plasma/plasmaWebStructure.java b/source/de/anomic/plasma/plasmaWebStructure.java
index ec12f459c..b72bb5521 100644
--- a/source/de/anomic/plasma/plasmaWebStructure.java
+++ b/source/de/anomic/plasma/plasmaWebStructure.java
@@ -29,14 +29,13 @@ package de.anomic.plasma;
 
 import java.io.File;
 import java.io.IOException;
-import java.net.MalformedURLException;
 import java.util.ConcurrentModificationException;
 import java.util.Date;
+import java.util.HashMap;
 import java.util.Iterator;
 import java.util.Map;
-import java.util.HashMap;
-import java.util.TreeMap;
 import java.util.SortedMap;
+import java.util.TreeMap;
 import java.util.TreeSet;
 
 import de.anomic.kelondro.kelondroBase64Order;
@@ -97,8 +96,8 @@ public class plasmaWebStructure {
         assert url.hash().equals(baseurlhash);
 
         // generate citation reference
-        Map hl = document.getHyperlinks();
-        Iterator it = hl.keySet().iterator();
+        Map<yacyURL, String> hl = document.getHyperlinks();
+        Iterator<yacyURL> it = hl.keySet().iterator();
         String nexturlhash;
         StringBuffer cpg = new StringBuffer(12 * (hl.size() + 1) + 1);
         StringBuffer cpl = new StringBuffer(12 * (hl.size() + 1) + 1);
@@ -106,20 +105,18 @@ public class plasmaWebStructure {
         int GCount = 0;
         int LCount = 0;
         while (it.hasNext()) {
-            try {
-                nexturlhash = (new yacyURL(it.next(), null)).hash();
-                if (nexturlhash != null) {
-                    if (nexturlhash.substring(6).equals(lhp)) {
-                        // this is a inbound link
-                        cpl.append(nexturlhash.substring(0, 6)); // store only local part
-                        LCount++;
-                    } else {
-                        // this is a outbound link
-                        cpg.append(nexturlhash); // store complete hash
-                        GCount++;
-                    }
+            nexturlhash = it.next().hash();
+            if (nexturlhash != null) {
+                if (nexturlhash.substring(6).equals(lhp)) {
+                    // this is a inbound link
+                    cpl.append(nexturlhash.substring(0, 6)); // store only local part
+                    LCount++;
+                } else {
+                    // this is a outbound link
+                    cpg.append(nexturlhash); // store complete hash
+                    GCount++;
                 }
-            } catch (MalformedURLException e) {}
+            }
         }
 
         // append this reference to buffer
diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java
index e66896867..82019cf44 100644
--- a/source/de/anomic/plasma/plasmaWordIndex.java
+++ b/source/de/anomic/plasma/plasmaWordIndex.java
@@ -301,7 +301,7 @@ public final class plasmaWordIndex implements indexRI {
             wprop = wentry.getValue();
             assert (wprop.flags != null);
             ientry = new indexRWIRowEntry(url.hash(),
-                        urlLength, urlComps, (document == null) ? urlLength : document.getTitle().length(),
+                        urlLength, urlComps, (document == null) ? urlLength : document.dc_title().length(),
                         wprop.count,
                         condenser.words().size(),
                         condenser.sentences().size(),
diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java
index ecf42330d..5919c7917 100644
--- a/source/de/anomic/yacy/yacyClient.java
+++ b/source/de/anomic/yacy/yacyClient.java
@@ -55,6 +55,7 @@ import de.anomic.http.httpRemoteProxyConfig;
 import de.anomic.http.httpc;
 import de.anomic.index.indexContainer;
 import de.anomic.index.indexRWIEntry;
+import de.anomic.index.indexRWIRowEntry;
 import de.anomic.index.indexURLEntry;
 import de.anomic.kelondro.kelondroBase64Order;
 import de.anomic.kelondro.kelondroBitfield;
@@ -575,19 +576,19 @@ public final class yacyClient {
 
         // read index abstract
         if (abstractCache != null) {
-            Iterator i = result.entrySet().iterator();
-            Map.Entry entry;
-            TreeMap singleAbstract;
+            Iterator<Map.Entry<String, String>> i = result.entrySet().iterator();
+            Map.Entry<String, String> entry;
+            TreeMap<String, String> singleAbstract;
             String wordhash;
             serverByteBuffer ci;
             while (i.hasNext()) {
-                entry = (Map.Entry) i.next();
-                if (((String) entry.getKey()).startsWith("indexabstract.")) {
-                    wordhash = ((String) entry.getKey()).substring(14);
+                entry = i.next();
+                if (entry.getKey().startsWith("indexabstract.")) {
+                    wordhash = entry.getKey().substring(14);
                     synchronized (abstractCache) {
                         singleAbstract = (TreeMap) abstractCache.get(wordhash); // a mapping from url-hashes to a string of peer-hashes
-                        if (singleAbstract == null) singleAbstract = new TreeMap();
-                        ci = new serverByteBuffer(((String) entry.getValue()).getBytes());
+                        if (singleAbstract == null) singleAbstract = new TreeMap<String, String>();
+                        ci = new serverByteBuffer(entry.getValue().getBytes());
                         //System.out.println("DEBUG-ABSTRACTFETCH: for word hash " + wordhash + " received " + ci.toString());
                         indexContainer.decompressIndex(singleAbstract, ci, target.hash);
                         abstractCache.put(wordhash, singleAbstract);
@@ -621,7 +622,7 @@ public final class yacyClient {
         return urls;
     }
 
-    public static HashMap permissionMessage(String targetHash) {
+    public static HashMap<String, String> permissionMessage(String targetHash) {
         // ask for allowed message size and attachement size
         // if this replies null, the peer does not answer
         if (yacyCore.seedDB == null || yacyCore.seedDB.mySeed() == null) { return null; }
@@ -651,7 +652,7 @@ public final class yacyClient {
         }
     }
 
-    public static HashMap postMessage(String targetHash, String subject, byte[] message) {
+    public static HashMap<String, String> postMessage(String targetHash, String subject, byte[] message) {
         // this post a message to the remote message board
 
         // prepare request
@@ -699,7 +700,7 @@ public final class yacyClient {
         return address;
     }
 
-    public static HashMap transferPermission(String targetAddress, long filesize, String filename) {
+    public static HashMap<String, String> transferPermission(String targetAddress, long filesize, String filename) {
         // prepare request
         final serverObjects post = yacyNetwork.basicRequestPost(plasmaSwitchboard.getSwitchboard(), null);
@@ -731,7 +732,7 @@ public final class yacyClient {
         }
     }
 
-    public static HashMap transferStore(String targetAddress, String access, String filename, byte[] file) {
+    public static HashMap<String, String> transferStore(String targetAddress, String access, String filename, byte[] file) {
         // prepare request
         final serverObjects post = yacyNetwork.basicRequestPost(plasmaSwitchboard.getSwitchboard(), null);
@@ -741,7 +742,7 @@
         post.put("filesize", Long.toString(file.length));
         post.put("md5", serverCodings.encodeMD5Hex(file));
         post.put("access", access);
-        HashMap files = new HashMap();
+        HashMap<String, byte[]> files = new HashMap<String, byte[]>();
         files.put("filename", file);
 
         // send request
@@ -766,7 +767,7 @@
     }
 
     public static String transfer(String targetAddress, String filename, byte[] file) {
-        HashMap phase1 = transferPermission(targetAddress, file.length, filename);
+        HashMap<String, String> phase1 = transferPermission(targetAddress, file.length, filename);
         if (phase1 == null) return "no connection to remote address " + targetAddress + "; phase 1";
         String access = (String) phase1.get("access");
         String nextaddress = (String) phase1.get("address");
@@ -778,7 +779,7 @@
         if (!(response.equals("ok"))) return "remote peer rejected transfer: " + response;
         String accesscode = serverCodings.encodeMD5Hex(kelondroBase64Order.standardCoder.encodeString(access));
         if (protocol.equals("http")) {
-            HashMap phase2 = transferStore(nextaddress, accesscode, filename, file);
+            HashMap<String, String> phase2 = transferStore(nextaddress, accesscode, filename, file);
             if (phase2 == null) return "no connection to remote address " + targetAddress + "; phase 2";
             response = (String) phase2.get("response");
             if (response == null) return "wrong return values from other peer; phase 2";
@@ -848,14 +849,14 @@
         }
     }
 
-    public static HashMap transferIndex(yacySeed targetSeed, indexContainer[] indexes, HashMap urlCache, boolean gzipBody, int timeout) {
+    public static HashMap<String, Object> transferIndex(yacySeed targetSeed, indexContainer[] indexes, HashMap<String, indexURLEntry> urlCache, boolean gzipBody, int timeout) {
 
-        HashMap resultObj = new HashMap();
+        HashMap<String, Object> resultObj = new HashMap<String, Object>();
         int payloadSize = 0;
         try {
 
             // check if we got all necessary urls in the urlCache (only for debugging)
-            Iterator eenum;
+            Iterator<indexRWIRowEntry> eenum;
             indexRWIEntry entry;
             for (int i = 0; i < indexes.length; i++) {
                 eenum = indexes[i].entries();
@@ -879,13 +880,13 @@
             String result = (String) in.get("result");
             if (result == null) {
-                resultObj.put("result","no_result_1");
+                resultObj.put("result", "no_result_1");
                 return resultObj;
             }
 
             if (!(result.equals("ok"))) {
                 targetSeed.setFlagAcceptRemoteIndex(false);
                 yacyCore.seedDB.update(targetSeed.hash, targetSeed);
-                resultObj.put("result",result);
+                resultObj.put("result", result);
                 return resultObj;
             }
@@ -938,7 +939,7 @@
         }
     }
 
-    private static HashMap transferRWI(yacySeed targetSeed, indexContainer[] indexes, boolean gzipBody, int timeout) {
+    private static HashMap<String, Object> transferRWI(yacySeed targetSeed, indexContainer[] indexes, boolean gzipBody, int timeout) {
         final String address = targetSeed.getPublicAddress();
         if (address == null) { return null; }
@@ -953,7 +954,7 @@
         int indexcount = 0;
         final StringBuffer entrypost = new StringBuffer(indexes.length*73);
-        Iterator eenum;
+        Iterator<indexRWIRowEntry> eenum;
         indexRWIEntry entry;
         for (int i = 0; i < indexes.length; i++) {
             eenum = indexes[i].entries();
@@ -968,7 +969,7 @@
         if (indexcount == 0) {
             // nothing to do but everything ok
-            final HashMap result = new HashMap(2);
+            final HashMap<String, Object> result = new HashMap<String, Object>(2);
             result.put("result", "ok");
             result.put("unknownURL", "");
             return result;
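
Note: transferIndex and transferRWI now return HashMap<String, Object>, with the outcome under the "result" key as the hunks above show. A sketch of how a caller checks it, assuming seed, indexData and urlCache are prepared as in the plasmaSwitchboard hunk earlier (the timeout value is hypothetical):

    HashMap<String, Object> resultObj = yacyClient.transferIndex(seed, indexData, urlCache, true, 60000);
    String outcome = (String) resultObj.get("result"); // "ok", "no_result_1", or an error message
    if (!"ok".equals(outcome)) {
        // the peer rejected the transfer; the switchboard reacts by flagging the target seed
    }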
diff --git a/source/de/anomic/yacy/yacyURL.java b/source/de/anomic/yacy/yacyURL.java
index d3c1ebd00..d63e343ac 100644
--- a/source/de/anomic/yacy/yacyURL.java
+++ b/source/de/anomic/yacy/yacyURL.java
@@ -891,7 +891,7 @@ public class yacyURL {
     }
 
     public int hashCode() {
-        return this.toString().hashCode();
+        return this.hash().hashCode();
     }
 
     public int compareTo(Object h) {
diff --git a/source/de/anomic/yacy/yacyVersion.java b/source/de/anomic/yacy/yacyVersion.java
index 32742ca39..6683387e5 100644
--- a/source/de/anomic/yacy/yacyVersion.java
+++ b/source/de/anomic/yacy/yacyVersion.java
@@ -29,7 +29,6 @@ package de.anomic.yacy;
 
 import java.io.File;
 import java.io.IOException;
-import java.net.MalformedURLException;
 import java.util.ArrayList;
 import java.util.Comparator;
 import java.util.HashMap;
@@ -298,17 +297,13 @@ public final class yacyVersion implements Comparator, Comparable
-        Map anchors = scraper.getAnchors(); // a url (String) / name (String) relation
-        Iterator i = anchors.keySet().iterator();
+        Map<yacyURL, String> anchors = scraper.getAnchors(); // a url (String) / name (String) relation
+        Iterator<yacyURL> i = anchors.keySet().iterator();
         TreeSet devreleases = new TreeSet();
         TreeSet mainreleases = new TreeSet();
         yacyVersion release;
         while (i.hasNext()) {
-            try {
-                url = new yacyURL((String) i.next(), null);
-            } catch (MalformedURLException e1) {
-                continue; // just ignore invalid urls
-            }
+            url = i.next();
             try {
                 release = new yacyVersion(url);
                 //System.out.println("r " + release.toAnchor());
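
Note: basing yacyURL.hashCode() on hash() rather than toString() matters once link maps are keyed by yacyURL, as throughout this patch: two instances that normalize to the same YaCy URL hash must land in the same bucket of a HashMap<yacyURL, String>. A sketch of the contract, assuming yacyURL.equals() is consistent with hash() (the example URL is made up):

    // assumes: import java.util.HashMap; import de.anomic.yacy.yacyURL;
    static void demo() throws java.net.MalformedURLException {
        yacyURL a = new yacyURL("http://example.net/index.html", null);
        yacyURL b = new yacyURL("http://example.net/index.html", null);
        HashMap<yacyURL, String> anchors = new HashMap<yacyURL, String>();
        anchors.put(a, "anchor text");
        assert a.hashCode() == b.hashCode();   // same YaCy hash, same bucket
        System.out.println(anchors.get(b));    // an equal key finds the entry again
    }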