- added parsing of Dublin Core-compliant metadata (see RFC 5013 and ISO 15836) to the html parser

- refactoring of plasmaParserDocument to use Dublin Core-compatible property names
- redesign of url handling in parser and condenser (fewer String-to-yacyURL conversions)
- more generics

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4352 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 17 years ago
parent cfd4fecd12
commit efd0b8371a
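
The recurring pattern in the hunks below is that link collections move from Map<String, String> to Map<yacyURL, String>: URLs are parsed once where they are discovered and normalized to strings only where a string is actually needed. A minimal sketch of the new shape (the demo class is illustrative; the yacyURL constructor and toNormalform signature are taken from the diff):

    import java.net.MalformedURLException;
    import java.util.HashMap;
    import java.util.Map;

    import de.anomic.yacy.yacyURL;

    public class AnchorMapDemo {
        public static void main(String[] args) throws MalformedURLException {
            // before: HashMap<String, String> keyed by raw link strings
            // after: keys are parsed once into yacyURL objects
            Map<yacyURL, String> anchors = new HashMap<yacyURL, String>();
            anchors.put(new yacyURL("http://example.org/a.html", null), "label");
            for (Map.Entry<yacyURL, String> e : anchors.entrySet()) {
                // consumers normalize on demand instead of re-parsing strings
                System.out.println(e.getKey().toNormalform(true, false) + " -> " + e.getValue());
            }
        }
    }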

@ -179,9 +179,9 @@ public class Bookmarks {
prop.put("mode_edit", "0"); // create mode
prop.put("mode_url", comp.url().toNormalform(false, true));
prop.putHTML("mode_title", comp.title());
prop.putHTML("mode_description", (document == null) ? comp.title(): document.getTitle());
prop.putHTML("mode_description", (document == null) ? comp.title(): document.dc_title());
prop.putHTML("mode_author", comp.author());
prop.putHTML("mode_tags", (document == null) ? comp.tags() : document.getKeywords(','));
prop.putHTML("mode_tags", (document == null) ? comp.tags() : document.dc_subject(','));
prop.put("mode_public", "0");
prop.put("mode_feed", "0"); //TODO: check if it IS a feed
}
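
The accessor renames used throughout these hunks map the old plasmaParserDocument names onto RFC 5013 element names; each pair appears as an old/new line somewhere below:

    // getTitle()     -> dc_title()
    // getAuthor()    -> dc_creator()
    // getKeywords(c) -> dc_subject(c)
    // getAbstract()  -> dc_description()
    // getMimeType()  -> dc_format()
    // getLocation()  -> dc_source()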

@ -297,7 +297,7 @@ public class ViewFile {
}
}
resMime = document.getMimeType();
resMime = document.dc_format();
String[] wordArray = wordArray(post.get("words", null));
if (viewMode.equals("parsed")) {
@ -310,7 +310,7 @@ public class ViewFile {
} else if (viewMode.equals("sentences")) {
prop.put("viewMode", VIEW_MODE_AS_PARSED_SENTENCES);
final Iterator sentences = document.getSentences(pre);
final Iterator<StringBuffer> sentences = document.getSentences(pre);
boolean dark = true;
int i = 0;
@ -319,7 +319,7 @@ public class ViewFile {
// Search word highlighting
while (sentences.hasNext()) {
sentence = ((StringBuffer) sentences.next()).toString();
sentence = sentences.next().toString();
if (sentence.trim().length() > 0) {
prop.put("viewMode_sentences_" + i + "_nr", i + 1);
prop.put("viewMode_sentences_" + i + "_text", markup(wordArray, sentence));
@ -339,11 +339,11 @@ public class ViewFile {
i += putMediaInfo(prop, wordArray, i, document.getAudiolinks(), "audio", (i % 2 == 0));
dark = (i % 2 == 0);
TreeSet ts = document.getImages();
Iterator tsi = ts.iterator();
TreeSet<htmlFilterImageEntry> ts = document.getImages();
Iterator<htmlFilterImageEntry> tsi = ts.iterator();
htmlFilterImageEntry entry;
while (tsi.hasNext()) {
entry = (htmlFilterImageEntry) tsi.next();
entry = tsi.next();
prop.put("viewMode_links_" + i + "_nr", i);
prop.put("viewMode_links_" + i + "_dark", dark ? "1" : "0");
prop.put("viewMode_links_" + i + "_type", "image");
@ -399,9 +399,9 @@ public class ViewFile {
return message;
}
private static int putMediaInfo(serverObjects prop, String[] wordArray, int c, Map<String, String> media, String name, boolean dark) {
Iterator<Map.Entry<String, String>> mi = media.entrySet().iterator();
Map.Entry<String, String> entry;
private static int putMediaInfo(serverObjects prop, String[] wordArray, int c, Map<yacyURL, String> media, String name, boolean dark) {
Iterator<Map.Entry<yacyURL, String>> mi = media.entrySet().iterator();
Map.Entry<yacyURL, String> entry;
int i = 0;
while (mi.hasNext()) {
entry = mi.next();
@ -409,8 +409,8 @@ public class ViewFile {
prop.put("viewMode_links_" + c + "_dark", ((dark) ? 1 : 0));
prop.putHTML("viewMode_links_" + c + "_type", name);
prop.put("viewMode_links_" + c + "_text", markup(wordArray, (String) entry.getValue()));
prop.put("viewMode_links_" + c + "_link", markup(wordArray, (String) entry.getKey()));
prop.put("viewMode_links_" + c + "_url", entry.getKey());
prop.put("viewMode_links_" + c + "_link", markup(wordArray, entry.getKey().toNormalform(true, false)));
prop.put("viewMode_links_" + c + "_url", entry.getKey().toNormalform(true, false));
prop.putHTML("viewMode_links_" + c + "_attr", "");
dark = !dark;
c++;
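
With the signature change, callers hand putMediaInfo a map keyed by yacyURL and the method normalizes each key for display. A hypothetical call, with prop, wordArray and i as in the surrounding handler:

    Map<yacyURL, String> media = new HashMap<yacyURL, String>();
    media.put(new yacyURL("http://example.org/clip.ogg", null), "intro clip");
    i += putMediaInfo(prop, wordArray, i, media, "audio", (i % 2 == 0));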

@ -241,9 +241,9 @@ public class yacysearch {
HashMap<String, String> map = new HashMap<String, String>();
map.put("url", comp.url().toNormalform(false, true).replace(',', '|'));
map.put("title", comp.title().replace(',', ' '));
map.put("description", ((document == null) ? comp.title() : document.getTitle()).replace(',', ' '));
map.put("author", ((document == null) ? "" : document.getAuthor()));
map.put("tags", ((document == null) ? "" : document.getKeywords(' ')));
map.put("description", ((document == null) ? comp.title() : document.dc_title()).replace(',', ' '));
map.put("author", ((document == null) ? "" : document.dc_creator()));
map.put("tags", ((document == null) ? "" : document.dc_subject(' ')));
yacyCore.newsPool.publishMyNews(yacyNewsRecord.newRecord(yacyNewsPool.CATEGORY_SURFTIPP_ADD, map));
document.close();
}

@ -230,13 +230,11 @@ public class yacysearchitem {
ArrayList<plasmaSnippetCache.MediaSnippet> images = result.mediaSnippets();
if (images != null) {
plasmaSnippetCache.MediaSnippet ms;
yacyURL url;
int c = 0;
for (int i = 0; i < images.size(); i++) {
ms = (plasmaSnippetCache.MediaSnippet) images.get(i);
try {url = new yacyURL(ms.href, null);} catch (MalformedURLException e) {continue;}
prop.putHTML("content_items_" + i + "_href", ms.href);
prop.put("content_items_" + i + "_code", sb.licensedURLs.aquireLicense(url));
prop.putHTML("content_items_" + i + "_href", ms.href.toNormalform(true, false));
prop.put("content_items_" + i + "_code", sb.licensedURLs.aquireLicense(ms.href));
prop.putHTML("content_items_" + i + "_name", shorten(ms.name, namelength));
prop.put("content_items_" + i + "_attr", ms.attr); // attributes, here: original size of image
c++;
@ -260,8 +258,8 @@ public class yacysearchitem {
int c = 0;
for (int i = 0; i < media.size(); i++) {
ms = (plasmaSnippetCache.MediaSnippet) media.get(i);
prop.putHTML("content_items_" + i + "_href", ms.href);
prop.putHTML("content_items_" + i + "_hrefshort", nxTools.shortenURLString(ms.href, urllength));
prop.putHTML("content_items_" + i + "_href", ms.href.toNormalform(true, false));
prop.putHTML("content_items_" + i + "_hrefshort", nxTools.shortenURLString(ms.href.toNormalform(true, false), urllength));
prop.putHTML("content_items_" + i + "_name", shorten(ms.name, namelength));
prop.put("content_items_" + i + "_col", (col) ? "0" : "1");
c++;

@ -100,12 +100,12 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
}
// class variables: collectors for links
private HashMap<String, String> anchors;
private HashMap<yacyURL, String> anchors;
private TreeSet<htmlFilterImageEntry> images; // String(absolute url)/ImageEntry relation
private HashMap<String, String> metas;
private String title;
//private String headline;
private List[] headlines;
private List<String>[] headlines;
private serverCharBuffer content;
private EventListenerList htmlFilterEventListeners = new EventListenerList();
@ -119,12 +119,13 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
*/
private yacyURL root;
@SuppressWarnings("unchecked")
public htmlFilterContentScraper(yacyURL root) {
// the root value here will not be used to load the resource.
// it is only the reference for relative links
super(linkTags0, linkTags1);
this.root = root;
this.anchors = new HashMap<String, String>();
this.anchors = new HashMap<yacyURL, String>();
this.images = new TreeSet<htmlFilterImageEntry>();
this.metas = new HashMap<String, String>();
this.title = "";
@ -159,11 +160,11 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
return normalizedURL.toLowerCase().split(splitrex); // word components of the url
}
private String absolutePath(String relativePath) {
private yacyURL absolutePath(String relativePath) {
try {
return yacyURL.newURL(root, relativePath).toNormalform(false, true);
return yacyURL.newURL(root, relativePath);
} catch (Exception e) {
return "";
return null;
}
}
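
absolutePath now returns a resolved yacyURL, or null if resolution fails, instead of an empty string; callers therefore test for null rather than catching MalformedURLException. A sketch of the contract (path and label illustrative):

    yacyURL link = absolutePath("../images/logo.png"); // resolved against this.root
    if (link != null) anchors.put(link, "logo");       // null means the link is dropped

Note that the img handler below adds its htmlFilterImageEntry without such a null check, so a malformed src attribute would propagate a null url into the image set.
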
@ -174,11 +175,9 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
width = Integer.parseInt(tagopts.getProperty("width", "-1"));
height = Integer.parseInt(tagopts.getProperty("height", "-1"));
} catch (NumberFormatException e) {}
try {
yacyURL url = new yacyURL(absolutePath(tagopts.getProperty("src", "")), null);
htmlFilterImageEntry ie = new htmlFilterImageEntry(url, tagopts.getProperty("alt",""), width, height);
images.add(ie);
} catch (MalformedURLException e) {}
yacyURL url = absolutePath(tagopts.getProperty("src", ""));
htmlFilterImageEntry ie = new htmlFilterImageEntry(url, tagopts.getProperty("alt",""), width, height);
images.add(ie);
}
if (tagname.equalsIgnoreCase("base")) try {
root = new yacyURL(tagopts.getProperty("href", ""), null);
@ -204,10 +203,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
if (href.length() > 0) anchors.put(absolutePath(href), areatitle);
}
if (tagname.equalsIgnoreCase("link")) {
yacyURL newLink = null;
try {
newLink = new yacyURL(absolutePath(tagopts.getProperty("href", "")), null);
} catch (MalformedURLException e) {}
yacyURL newLink = absolutePath(tagopts.getProperty("href", ""));
if (newLink != null) {
String type = tagopts.getProperty("rel", "");
@ -218,7 +214,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
images.add(ie);
this.favicon = newLink;
} else if (!type.equalsIgnoreCase("stylesheet") && !type.equalsIgnoreCase("alternate stylesheet")) {
anchors.put(newLink.toString(), linktitle);
anchors.put(newLink, linktitle);
}
}
}
@ -346,7 +342,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
}
}
public Map<String, String> getAnchors() {
public Map<yacyURL, String> getAnchors() {
// returns a url (yacyURL) / name (String) relation
return anchors;
}
@ -367,31 +363,44 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
return this.favicon;
}
/*
DC in html example:
<meta name="DC.title" lang="en" content="Expressing Dublin Core in HTML/XHTML meta and link elements" />
<meta name="DC.creator" content="Andy Powell, UKOLN, University of Bath" />
<meta name="DC.identifier" scheme="DCTERMS.URI" content="http://dublincore.org/documents/dcq-html/" />
<meta name="DC.format" scheme="DCTERMS.IMT" content="text/html" />
<meta name="DC.type" scheme="DCTERMS.DCMIType" content="Text" />
*/
public String getDescription() {
String s = (String) metas.get("description");
String s = metas.get("description");
if (s == null) s = metas.get("DC.description");
if (s == null) return ""; else return s;
}
public String getContentType() {
String s = (String) metas.get("content-type");
String s = metas.get("content-type");
if (s == null) return ""; else return s;
}
public String getAuthor() {
String s = (String) metas.get("author");
if (s == null) s = (String) metas.get("copyright");
String s = metas.get("author");
if (s == null) s = metas.get("copyright");
if (s == null) s = metas.get("DC.creator");
if (s == null) return "";
return s;
}
public String[] getContentLanguages() {
String s = (String) metas.get("content-language");
String s = metas.get("content-language");
if (s == null) s = metas.get("DC.language");
if (s == null) s = "";
return s.split(" |,");
}
public String[] getKeywords() {
String s = (String) metas.get("keywords");
String s = metas.get("keywords");
if (s == null) s = metas.get("DC.description");
if (s == null) s = "";
if (s.length() == 0) {
return getTitle().toLowerCase().split(splitrex);
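
Each accessor above tries the plain HTML meta name first and falls back to the DC.* variant. A hypothetical helper capturing the pattern (getMeta is not in the commit):

    private String getMeta(String htmlName, String dcName) {
        String s = metas.get(htmlName);        // e.g. "description"
        if (s == null) s = metas.get(dcName);  // e.g. "DC.description"
        return (s == null) ? "" : s;
    }
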
@ -499,4 +508,5 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
return scraper;
}
}
}

@ -110,7 +110,7 @@ public interface Parser {
* @return a {@link Hashtable} containing a list of MimeTypes that are supported by
* the parser
*/
public Hashtable getSupportedMimeTypes();
public Hashtable<String, String> getSupportedMimeTypes();
/**
* This function should be called before reusing the parser object.
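
Each parser's Hashtable maps a supported MIME type to the comma-separated file extensions it handles; plasmaParserConfig later reads the values to build supportedFileExt. A typical typed table, following the pattern of the parsers below (entries illustrative):

    private static final Hashtable<String, String> SUPPORTED_MIME_TYPES = new Hashtable<String, String>();
    static {
        // mime type -> comma-separated file extensions
        SUPPORTED_MIME_TYPES.put("application/pdf", "pdf");
    }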

@ -91,7 +91,7 @@ public class odtParser extends AbstractParser implements Parser {
this.parserName = "OASIS OpenDocument V2 Text Document Parser";
}
public Hashtable getSupportedMimeTypes() {
public Hashtable<String, String> getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
@ -108,7 +108,7 @@ public class odtParser extends AbstractParser implements Parser {
// opening the file as zip file
ZipFile zipFile= new ZipFile(dest);
Enumeration zipEnum = zipFile.entries();
Enumeration<? extends ZipEntry> zipEnum = zipFile.entries();
// looping through all containing files
while (zipEnum.hasMoreElements()) {

@ -86,7 +86,7 @@ public class pdfParser extends AbstractParser implements Parser {
this.parserName = "Acrobat Portable Document Parser";
}
public Hashtable getSupportedMimeTypes() {
public Hashtable<String, String> getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}

@ -132,7 +132,7 @@ public class pptParser extends AbstractParser implements Parser {
}
}
public Hashtable getSupportedMimeTypes() {
public Hashtable<String, String> getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}

@ -91,7 +91,7 @@ public class psParser extends AbstractParser implements Parser {
}
}
public Hashtable getSupportedMimeTypes() {
public Hashtable<String, String> getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}

@ -88,7 +88,7 @@ public class rpmParser extends AbstractParser implements Parser {
this.parserName = "rpm Parser";
}
public Hashtable getSupportedMimeTypes() {
public Hashtable<String, String> getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
@ -110,7 +110,7 @@ public class rpmParser extends AbstractParser implements Parser {
RPMFile rpmFile = null;
try {
String summary = null, description = null, packager = null, name = sourceFile.getName();
HashMap<String, String> anchors = new HashMap<String, String>();
HashMap<yacyURL, String> anchors = new HashMap<yacyURL, String>();
StringBuffer content = new StringBuffer();
// opening the rpm file
@ -138,7 +138,7 @@ public class rpmParser extends AbstractParser implements Parser {
else if (headerNames[i].equalsIgnoreCase("SUMMARY")) summary = tag.toString();
else if (headerNames[i].equalsIgnoreCase("DESCRIPTION")) description = tag.toString();
else if (headerNames[i].equalsIgnoreCase("PACKAGER")) packager = tag.toString();
else if (headerNames[i].equalsIgnoreCase("URL")) anchors.put(tag.toString(), tag.toString());
else if (headerNames[i].equalsIgnoreCase("URL")) anchors.put(new yacyURL(tag.toString(), null), tag.toString());
}
// closing the rpm file

@ -96,7 +96,7 @@ public class rssParser extends AbstractParser implements Parser {
try {
LinkedList<String> feedSections = new LinkedList<String>();
HashMap<String, String> anchors = new HashMap<String, String>();
HashMap<yacyURL, String> anchors = new HashMap<yacyURL, String>();
TreeSet<htmlFilterImageEntry> images = new TreeSet<htmlFilterImageEntry>();
serverByteBuffer text = new serverByteBuffer();
serverCharBuffer authors = new serverCharBuffer();
@ -132,7 +132,7 @@ public class rssParser extends AbstractParser implements Parser {
if (itemCreator != null && itemCreator.length() > 0) authors.append(",").append(itemCreator);
feedSections.add(itemTitle);
anchors.put(itemURL.toString(),itemTitle);
anchors.put(itemURL, itemTitle);
if ((text.length() != 0) && (text.byteAt(text.length() - 1) != 32)) text.append((byte) 32);
text.append(new serverCharBuffer(htmlFilterAbstractScraper.stripAll(new serverCharBuffer(itemDescr.toCharArray()))).trim().toString()).append(' ');
@ -149,7 +149,7 @@ public class rssParser extends AbstractParser implements Parser {
feedSections.add(itemHeadline);
}
Map<String, String> itemLinks = scraper.getAnchors();
Map<yacyURL, String> itemLinks = scraper.getAnchors();
if ((itemLinks != null) && (itemLinks.size() > 0)) {
anchors.putAll(itemLinks);
}
@ -191,7 +191,7 @@ public class rssParser extends AbstractParser implements Parser {
}
}
public Hashtable getSupportedMimeTypes() {
public Hashtable<String, String> getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}

@ -116,7 +116,7 @@ public class rtfParser extends AbstractParser implements Parser {
}
}
public java.util.Hashtable getSupportedMimeTypes() {
public Hashtable<String, String> getSupportedMimeTypes() {
return rtfParser.SUPPORTED_MIME_TYPES;
}

@ -48,6 +48,9 @@ import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import SevenZip.ArchiveExtractCallback;
import SevenZip.Archive.IInArchive;
import SevenZip.Archive.SevenZipEntry;
import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.parser.AbstractParser;
@ -56,10 +59,6 @@ import de.anomic.server.serverCachedFileOutputStream;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacyURL;
import SevenZip.ArchiveExtractCallback;
import SevenZip.Archive.IInArchive;
import SevenZip.Archive.SevenZipEntry;
// wrapper class to redirect output of standard ArchiveExtractCallback to serverLog
// and parse the extracted content
public class SZParserExtractCallback extends ArchiveExtractCallback {
@ -117,7 +116,7 @@ public class SZParserExtractCallback extends ArchiveExtractCallback {
plasmaParserDocument theDoc;
// workaround for relative links in the file; normally a '#' would be appended to the location, see
// below, where the effect is reverted
yacyURL url = yacyURL.newURL(doc.getLocation(), this.prefix + "/" + super.filePath);
yacyURL url = yacyURL.newURL(doc.dc_source(), this.prefix + "/" + super.filePath);
String mime = plasmaParser.getMimeTypeByFileExt(super.filePath.substring(super.filePath.lastIndexOf('.') + 1));
if (this.cfos.isFallback()) {
theDoc = this.parser.parseSource(url, mime, null, this.cfos.getContentFile());
@ -126,18 +125,20 @@ public class SZParserExtractCallback extends ArchiveExtractCallback {
}
// revert the above workaround
Map<String, String> nanchors = new HashMap<String, String>(theDoc.getAnchors().size(), 1f);
Iterator it = theDoc.getAnchors().entrySet().iterator();
Map.Entry entry;
String base = doc.getLocation().toNormalform(false, true);
Map<yacyURL, String> nanchors = new HashMap<yacyURL, String>(theDoc.getAnchors().size(), 1f);
Iterator<Map.Entry<yacyURL, String>> it = theDoc.getAnchors().entrySet().iterator();
Map.Entry<yacyURL, String> entry;
String base = doc.dc_source().toNormalform(false, true);
String u;
while (it.hasNext()) {
entry = (Map.Entry)it.next();
if (((String)entry.getKey()).startsWith(base + "/")) {
String ref = "#" + ((String)entry.getKey()).substring(base.length() + 1);
entry = it.next();
u = entry.getKey().toNormalform(true, true);
if (u.startsWith(base + "/")) {
String ref = "#" + u.substring(base.length() + 1);
this.log.logFinest("changing " + entry.getKey() + " to use reference " + ref);
nanchors.put(base + ref, (String)entry.getValue());
nanchors.put(new yacyURL(base + ref, null), entry.getValue());
} else {
nanchors.put((String)entry.getKey(), (String)entry.getValue());
nanchors.put(entry.getKey(), entry.getValue());
}
}
theDoc.getAnchors().clear();
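
The loop converts pseudo-paths inside the archive back into fragment references on the archive URL itself. A worked trace of the string logic (values illustrative):

    String base = "http://host/archive.7z";                  // doc.dc_source(), normalized
    String u = "http://host/archive.7z/docs/readme.html";    // anchor found inside the archive
    if (u.startsWith(base + "/")) {
        String ref = "#" + u.substring(base.length() + 1);   // "#docs/readme.html"
        System.out.println(base + ref);                      // http://host/archive.7z#docs/readme.html
    }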

@ -50,7 +50,6 @@ import java.util.Hashtable;
import SevenZip.IInStream;
import SevenZip.MyRandomAccessFile;
import SevenZip.Archive.SevenZip.Handler;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.parser.AbstractParser;
import de.anomic.plasma.parser.Parser;
@ -140,7 +139,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
}
}
public Hashtable getSupportedMimeTypes() {
public Hashtable<String, String> getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
}

@ -44,11 +44,10 @@
package de.anomic.plasma.parser.swf;
import java.io.InputStream;
import java.util.Hashtable;
import java.util.HashMap;
import java.util.Hashtable;
import pt.tumba.parser.swf.*;
import pt.tumba.parser.swf.SWF2HTML;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.parser.AbstractParser;
import de.anomic.plasma.parser.Parser;
@ -82,7 +81,7 @@ public class swfParser extends AbstractParser implements Parser {
/**
* returns a hashtable containing the mimetypes that are supported by this class
*/
public Hashtable getSupportedMimeTypes() {
public Hashtable<String, String> getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
@ -101,7 +100,7 @@ public class swfParser extends AbstractParser implements Parser {
String[] sections = null;
String abstrct = null;
//TreeSet images = null;
HashMap<String, String> anchors = new HashMap<String, String>();
HashMap<yacyURL, String> anchors = new HashMap<yacyURL, String>();
int urls = 0;
int urlStart = -1;
int urlEnd = 0;
@ -118,7 +117,7 @@ public class swfParser extends AbstractParser implements Parser {
urlEnd = contents.indexOf(linebreak,urlStart);
url = contents.substring(urlStart,urlEnd);
urlnr = (new Integer(++urls)).toString();
anchors.put(url,urlnr);
anchors.put(new yacyURL(url, null), urlnr);
contents = contents.substring(0,urlStart)+contents.substring(urlEnd);
}

@ -94,7 +94,7 @@ public class tarParser extends AbstractParser implements Parser {
this.parserName = "Tape Archive File Parser";
}
public Hashtable getSupportedMimeTypes() {
public Hashtable<String, String> getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
@ -131,7 +131,7 @@ public class tarParser extends AbstractParser implements Parser {
LinkedList<String> docSections = new LinkedList<String>();
StringBuffer docAbstrct = new StringBuffer();
Map<String, String> docAnchors = new HashMap<String, String>();
Map<yacyURL, String> docAnchors = new HashMap<yacyURL, String>();
TreeSet<htmlFilterImageEntry> docImages = new TreeSet<htmlFilterImageEntry>();
// looping through the contained files
@ -177,15 +177,15 @@ public class tarParser extends AbstractParser implements Parser {
// merging all documents together
if (docKeywords.length() > 0) docKeywords.append(",");
docKeywords.append(subDoc.getKeywords(','));
docKeywords.append(subDoc.dc_subject(','));
if (docLongTitle.length() > 0) docLongTitle.append("\n");
docLongTitle.append(subDoc.getTitle());
docLongTitle.append(subDoc.dc_title());
docSections.addAll(Arrays.asList(subDoc.getSectionTitles()));
if (docAbstrct.length() > 0) docAbstrct.append("\n");
docAbstrct.append(subDoc.getAbstract());
docAbstrct.append(subDoc.dc_description());
if (subDoc.getTextLength() > 0) {
if (docTextLength > 0) docText.write('\n');

@ -101,7 +101,7 @@ public class vcfParser extends AbstractParser implements Parser {
StringBuffer parsedTitle = new StringBuffer();
StringBuffer parsedDataText = new StringBuffer();
HashMap<String, String> parsedData = new HashMap<String, String>();
HashMap<String, String> anchors = new HashMap<String, String>();
HashMap<yacyURL, String> anchors = new HashMap<yacyURL, String>();
LinkedList<String> parsedNames = new LinkedList<String>();
boolean useLastLine = false;
@ -211,7 +211,7 @@ public class vcfParser extends AbstractParser implements Parser {
} else if (key.toUpperCase().startsWith("URL")) {
try {
yacyURL newURL = new yacyURL(value, null);
anchors.put(newURL.toString(),newURL.toString());
anchors.put(newURL, newURL.toString());
//parsedData.put(key,value);
} catch (MalformedURLException ex) {/* ignore this */}
} else if (

@ -162,7 +162,7 @@ public class xlsParser extends AbstractParser implements Parser, HSSFListener {
}
}
public Hashtable getSupportedMimeTypes() {
public Hashtable<String, String> getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}

@ -43,7 +43,6 @@
package de.anomic.plasma.parser.zip;
import de.anomic.htmlFilter.htmlFilterImageEntry;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
@ -58,6 +57,7 @@ import java.util.TreeSet;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.parser.AbstractParser;
@ -114,7 +114,7 @@ public class zipParser extends AbstractParser implements Parser {
StringBuffer docLongTitle = new StringBuffer();
LinkedList<String> docSections = new LinkedList<String>();
StringBuffer docAbstrct = new StringBuffer();
Map<String, String> docAnchors = new HashMap<String, String>();
Map<yacyURL, String> docAnchors = new HashMap<yacyURL, String>();
TreeSet<htmlFilterImageEntry> docImages = new TreeSet<htmlFilterImageEntry>();
// creating a new parser class to parse the unzipped content
@ -160,15 +160,15 @@ public class zipParser extends AbstractParser implements Parser {
// merging all documents together
if (docKeywords.length() > 0) docKeywords.append(",");
docKeywords.append(subDoc.getKeywords(','));
docKeywords.append(subDoc.dc_subject(','));
if (docLongTitle.length() > 0) docLongTitle.append("\n");
docLongTitle.append(subDoc.getTitle());
docLongTitle.append(subDoc.dc_title());
docSections.addAll(Arrays.asList(subDoc.getSectionTitles()));
if (docAbstrct.length() > 0) docAbstrct.append("\n");
docAbstrct.append(subDoc.getAbstract());
docAbstrct.append(subDoc.dc_description());
if (subDoc.getTextLength() > 0) {
if (docTextLength > 0) docText.write('\n');

@ -73,6 +73,7 @@ import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroMSetTools;
import de.anomic.server.serverCodings;
import de.anomic.yacy.yacySeedDB;
import de.anomic.yacy.yacyURL;
public final class plasmaCondenser {
@ -130,9 +131,9 @@ public final class plasmaCondenser {
//System.out.println("DEBUG: condensing " + document.getMainLongTitle() + ", indexText=" + Boolean.toString(indexText) + ", indexMedia=" + Boolean.toString(indexMedia));
insertTextToWords(document.getLocation().toNormalform(false, true), 0, indexRWIEntry.flag_app_url, RESULT_FLAGS);
insertTextToWords(document.dc_source().toNormalform(false, true), 0, indexRWIEntry.flag_app_url, RESULT_FLAGS);
Map.Entry entry;
Map.Entry<yacyURL, String> entry;
if (indexText) {
createCondensement(document.getText(), document.getCharset());
// the phrase counter:
@ -147,9 +148,9 @@ public final class plasmaCondenser {
// phrase 99 is taken from the media Link url and anchor description
// phrase 100 and above are lines from the text
insertTextToWords(document.getTitle(), 1, indexRWIEntry.flag_app_descr, RESULT_FLAGS);
insertTextToWords(document.getAbstract(), 3, indexRWIEntry.flag_app_descr, RESULT_FLAGS);
insertTextToWords(document.getAuthor(), 4, indexRWIEntry.flag_app_descr, RESULT_FLAGS);
insertTextToWords(document.dc_title(), 1, indexRWIEntry.flag_app_descr, RESULT_FLAGS);
insertTextToWords(document.dc_description(), 3, indexRWIEntry.flag_app_descr, RESULT_FLAGS);
insertTextToWords(document.dc_creator(), 4, indexRWIEntry.flag_app_descr, RESULT_FLAGS);
// missing: tags!
String[] titles = document.getSectionTitles();
for (int i = 0; i < titles.length; i++) {
@ -157,10 +158,10 @@ public final class plasmaCondenser {
}
// anchors
Iterator i = document.getAnchors().entrySet().iterator();
Iterator<Map.Entry<yacyURL, String>> i = document.getAnchors().entrySet().iterator();
while (i.hasNext()) {
entry = (Map.Entry) i.next();
insertTextToWords((String) entry.getKey(), 98, indexRWIEntry.flag_app_reference, RESULT_FLAGS);
entry = i.next();
insertTextToWords(entry.getKey().toNormalform(false, false), 98, indexRWIEntry.flag_app_reference, RESULT_FLAGS);
insertTextToWords((String) entry.getValue(), 98, indexRWIEntry.flag_app_reference, RESULT_FLAGS);
}
} else {
@ -172,45 +173,45 @@ public final class plasmaCondenser {
if (indexMedia) {
// audio
Iterator i = document.getAudiolinks().entrySet().iterator();
Iterator<Map.Entry<yacyURL, String>> i = document.getAudiolinks().entrySet().iterator();
while (i.hasNext()) {
entry = (Map.Entry) i.next();
insertTextToWords((String) entry.getKey(), 99, flag_cat_hasaudio, RESULT_FLAGS);
entry = i.next();
insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasaudio, RESULT_FLAGS);
insertTextToWords((String) entry.getValue(), 99, flag_cat_hasaudio, RESULT_FLAGS);
}
// video
i = document.getVideolinks().entrySet().iterator();
while (i.hasNext()) {
entry = (Map.Entry) i.next();
insertTextToWords((String) entry.getKey(), 99, flag_cat_hasvideo, RESULT_FLAGS);
entry = i.next();
insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasvideo, RESULT_FLAGS);
insertTextToWords((String) entry.getValue(), 99, flag_cat_hasvideo, RESULT_FLAGS);
}
// applications
i = document.getApplinks().entrySet().iterator();
while (i.hasNext()) {
entry = (Map.Entry) i.next();
insertTextToWords((String) entry.getKey(), 99, flag_cat_hasapp, RESULT_FLAGS);
entry = i.next();
insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasapp, RESULT_FLAGS);
insertTextToWords((String) entry.getValue(), 99, flag_cat_hasapp, RESULT_FLAGS);
}
// images
i = document.getImages().iterator();
Iterator<htmlFilterImageEntry> j = document.getImages().iterator();
htmlFilterImageEntry ientry;
while (i.hasNext()) {
ientry = (htmlFilterImageEntry) i.next();
insertTextToWords(ientry.url().toNormalform(false, true), 99, flag_cat_hasimage, RESULT_FLAGS);
while (j.hasNext()) {
ientry = j.next();
insertTextToWords(ientry.url().toNormalform(false, false), 99, flag_cat_hasimage, RESULT_FLAGS);
insertTextToWords(ientry.alt(), 99, flag_cat_hasimage, RESULT_FLAGS);
}
// finally check all words for missing flag entry
Iterator<Map.Entry<String, wordStatProp>> j = words.entrySet().iterator();
Iterator<Map.Entry<String, wordStatProp>> k = words.entrySet().iterator();
wordStatProp wprop;
Map.Entry<String, wordStatProp> we;
while (j.hasNext()) {
we = j.next();
wprop = (wordStatProp) we.getValue();
while (k.hasNext()) {
we = k.next();
wprop = we.getValue();
if (wprop.flags == null) {
wprop.flags = (kelondroBitfield) RESULT_FLAGS.clone();
words.put(we.getKey(), wprop);
@ -305,19 +306,19 @@ public final class plasmaCondenser {
public static class wordStatProp {
// object carries statistics for words and sentences
public int count; // number of occurrences
public int posInText; // unique handle, is initialized with word position (excluding double occurring words)
public int posInPhrase; // position of word in phrase
public int numOfPhrase; // number of phrase. 'normal' phrases begin with number 100
public HashSet hash; // a set of handles to all sentences where this word appears
public kelondroBitfield flags; // the flag bits for each word
public int count; // number of occurrences
public int posInText; // unique handle, is initialized with word position (excluding double occurring words)
public int posInPhrase; // position of word in phrase
public int numOfPhrase; // number of phrase. 'normal' phrases begin with number 100
private HashSet<Integer> hash; // a set of handles to all sentences where this word appears
public kelondroBitfield flags; // the flag bits for each word
public wordStatProp(int handle, int pip, int nop) {
this.count = 1;
this.posInText = handle;
this.posInPhrase = pip;
this.numOfPhrase = nop;
this.hash = new HashSet();
this.hash = new HashSet<Integer>();
this.flags = null;
}
@ -326,7 +327,7 @@ public final class plasmaCondenser {
}
public void check(int i) {
hash.add(Integer.toString(i));
hash.add(new Integer(i));
}
}
@ -334,14 +335,14 @@ public final class plasmaCondenser {
public static class phraseStatProp {
// object carries statistics for words and sentences
public int count; // number of occurrences
public int handle; // unique handle, is initialized with sentence counter
public HashSet hash; //
public int count; // number of occurrences
public int handle; // unique handle, is initialized with sentence counter
private HashSet<Integer> hash; //
public phraseStatProp(int handle) {
this.count = 1;
this.handle = handle;
this.hash = new HashSet();
this.hash = new HashSet<Integer>();
}
public void inc() {
@ -349,7 +350,7 @@ public final class plasmaCondenser {
}
public void check(int i) {
hash.add(Integer.toString(i));
hash.add(new Integer(i));
}
}
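
Keeping the sentence handles as Integer objects instead of their String form spares the Integer.parseInt round trip when the hash set is read back, as in the condenser loop further down. A standalone sketch:

    HashSet<Integer> hash = new HashSet<Integer>();
    hash.add(new Integer(42));           // wordStatProp.check(42)
    for (Integer handle : hash) {
        int idx = handle.intValue();     // previously Integer.parseInt(...)
        System.out.println("sentence " + idx);
    }
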
@ -362,7 +363,7 @@ public final class plasmaCondenser {
}
private void createCondensement(InputStream is, String charset) throws UnsupportedEncodingException {
HashSet currsentwords = new HashSet();
HashSet<String> currsentwords = new HashSet<String>();
StringBuffer sentence = new StringBuffer(100);
String word = "";
String k;
@ -376,7 +377,6 @@ public final class plasmaCondenser {
int allsentencecounter = 0;
int idx;
int wordInSentenceCounter = 1;
Iterator it, it1;
boolean comb_indexof = false, last_last = false, last_index = false;
RandomAccessFile fa;
final boolean dumpWords = false;
@ -405,6 +405,7 @@ public final class plasmaCondenser {
// distinguish punctuation and words
wordlen = word.length();
Iterator<String> it;
if ((wordlen == 1) && (htmlFilterContentScraper.punctuation(word.charAt(0)))) {
// store sentence
if (sentence.length() > 0) {
@ -493,9 +494,9 @@ public final class plasmaCondenser {
String[] s;
int wc;
Object o;
it = sentences.keySet().iterator();
while (it.hasNext()) {
o = it.next();
Iterator<StringBuffer> sit = sentences.keySet().iterator();
while (sit.hasNext()) {
o = sit.next();
if (o != null) {
sentence = (StringBuffer) o;
wc = (sentence.length() - 1) / numlength;
@ -511,15 +512,15 @@ public final class plasmaCondenser {
}
}
Map.Entry entry;
Map.Entry<String, wordStatProp> entry;
// we search for similar words and reorganize the corresponding sentences
// a word is similar, if a shortened version is equal
it = words.entrySet().iterator(); // enumerates the keys in descending order
wordsearch: while (it.hasNext()) {
entry = (Map.Entry) it.next();
word = (String) entry.getKey();
Iterator<Map.Entry<String, wordStatProp>> wi = words.entrySet().iterator(); // enumerates the keys in descending order
wordsearch: while (wi.hasNext()) {
entry = wi.next();
word = entry.getKey();
wordlen = word.length();
wsp = (wordStatProp) entry.getValue();
wsp = entry.getValue();
for (int i = wordcut; i > 0; i--) {
if (wordlen > i) {
k = word.substring(0, wordlen - i);
@ -528,9 +529,9 @@ public final class plasmaCondenser {
// corresponding links
// in sentences that use this word
wsp1 = (wordStatProp) words.get(k);
it1 = wsp.hash.iterator(); // we iterate over all sentences that refer to this word
Iterator<Integer> it1 = wsp.hash.iterator(); // we iterate over all sentences that refer to this word
while (it1.hasNext()) {
idx = Integer.parseInt((String) it1.next()); // number of a sentence
idx = it1.next().intValue(); // number of a sentence
s = (String[]) orderedSentences[idx];
for (int j = 2; j < s.length; j++) {
if (s[j].equals(intString(wsp.posInText, numlength)))
@ -542,7 +543,7 @@ public final class plasmaCondenser {
wsp1.count = wsp1.count + wsp.count;
words.put(k, wsp1);
// remove current word
it.remove();
wi.remove();
continue wordsearch;
}
}
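
The wordsearch loop merges a word into an already-known shorter word when cutting up to wordcut trailing characters produces a hit: with wordcut = 2, "parsers" is probed as "parse" and then "parser", and if "parser" exists in words, the counts and sentence references are merged and "parsers" is removed. A trace of the probing (values illustrative):

    String word = "parsers";
    int wordcut = 2; // illustrative value
    for (int i = wordcut; i > 0; i--) {
        if (word.length() > i) {
            String k = word.substring(0, word.length() - i);
            System.out.println("probe: " + k); // "parse", then "parser"
        }
    }
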
@ -550,8 +551,8 @@ public final class plasmaCondenser {
}
// depending on the orderedSentences structure, we rebuild the sentence
// HashMap to eliminate double occuring sentences
sentences = new HashMap();
// HashMap to eliminate double occurring sentences
sentences = new HashMap<StringBuffer, phraseStatProp>();
int le;
for (int i = 0; i < orderedSentences.length; i++) {
le = ((String[]) orderedSentences[i]).length;
@ -560,7 +561,7 @@ public final class plasmaCondenser {
sentence.append(((String[]) orderedSentences[i])[j]);
if (sentences.containsKey(sentence)) {
// add sentence counter to counter of found sentence
psp = (phraseStatProp) sentences.get(sentence);
psp = sentences.get(sentence);
psp.count = psp.count + Integer.parseInt(((String[]) orderedSentences[i])[0]);
sentences.put(sentence, psp);
// System.out.println("Found double occurring sentence " + i + "
@ -596,14 +597,14 @@ public final class plasmaCondenser {
// this structure is only needed to reconstruct the text
String word;
wordStatProp wsp;
Map.Entry entry;
Iterator it;
Map.Entry<String, wordStatProp> entry;
Iterator<Map.Entry<String, wordStatProp>> it;
String[] orderedWords = new String[words.size() + 99]; // uuiiii, the '99' is only a quick hack...
it = words.entrySet().iterator(); // enumerates the keys in ascending order
while (it.hasNext()) {
entry = (Map.Entry) it.next();
word = (String) entry.getKey();
wsp = (wordStatProp) entry.getValue();
entry = it.next();
word = entry.getKey();
wsp = entry.getValue();
orderedWords[wsp.posInText] = word;
}
@ -632,14 +633,14 @@ public final class plasmaCondenser {
// we reconstruct the sentence hashtable again and create by-handle ordered entries
// this structure is needed to present the strings in the right order in a printout
int wc;
Iterator it;
phraseStatProp psp;
String[] s;
StringBuffer sentence;
Object[] orderedSentences = new Object[sentences.size()];
for (int i = 0; i < sentences.size(); i++)
for (int i = 0; i < sentences.size(); i++) {
orderedSentences[i] = null; // this array must be initialized
it = sentences.keySet().iterator();
}
Iterator<StringBuffer> it = sentences.keySet().iterator();
while (it.hasNext()) {
sentence = (StringBuffer) it.next();
wc = (sentence.length() - 1) / numlength;
@ -717,9 +718,9 @@ public final class plasmaCondenser {
}
private static class unsievedWordsEnum implements Enumeration {
private static class unsievedWordsEnum implements Enumeration<StringBuffer> {
// returns an enumeration of StringBuffer Objects
Object buffer = null;
StringBuffer buffer = null;
sentencesFromInputStreamEnum e;
StringBuffer s;
@ -770,8 +771,8 @@ public final class plasmaCondenser {
return buffer != null;
}
public Object nextElement() {
Object r = buffer;
public StringBuffer nextElement() {
StringBuffer r = buffer;
buffer = nextElement0();
return r;
}

@ -91,18 +91,18 @@ public class plasmaCrawlProfile {
return profileTable.size();
}
public Iterator profiles(boolean up) {
public Iterator<entry> profiles(boolean up) {
// enumerates profile entries
try {
return new profileIterator(up);
} catch (IOException e) {
return new HashSet().iterator();
return new HashSet<entry>().iterator();
}
}
public class profileIterator implements Iterator {
public class profileIterator implements Iterator<entry> {
// the iterator iterates all keys, which are String objects
kelondroCloneableIterator handleIterator;
kelondroCloneableIterator<String> handleIterator;
String lastkey;
public profileIterator(boolean up) throws IOException {
handleIterator = profileTable.keys(up, false);
@ -116,7 +116,7 @@ public class plasmaCrawlProfile {
return false;
}
}
public Object next() {
public entry next() {
try {
lastkey = (String) handleIterator.next();
return getEntry(lastkey);
@ -140,7 +140,7 @@ public class plasmaCrawlProfile {
} catch (IOException e) {}
}
public entry newEntry(Map mem) {
public entry newEntry(Map<String, String> mem) {
entry ne = new entry(mem);
try {
profileTable.set(ne.handle(), ne.map());
@ -466,19 +466,19 @@ public class plasmaCrawlProfile {
}
public String domName(boolean attr, int index){
Iterator domnamesi = doms.entrySet().iterator();
Iterator<Map.Entry<String, DomProfile>> domnamesi = doms.entrySet().iterator();
String domname="";
Map.Entry ey;
Map.Entry<String, DomProfile> ey;
DomProfile dp;
int i = 0;
while ((domnamesi.hasNext()) && (i < index)) {
ey = (Map.Entry) domnamesi.next();
ey = domnamesi.next();
i++;
}
if(domnamesi.hasNext()){
ey = (Map.Entry) domnamesi.next();
dp = (DomProfile) ey.getValue();
domname = ((String) ey.getKey()) + ((attr) ? ("/r=" + dp.referrer + ", d=" + dp.depth + ", c=" + dp.count) : " ");
ey = domnamesi.next();
dp = ey.getValue();
domname = ey.getKey() + ((attr) ? ("/r=" + dp.referrer + ", d=" + dp.depth + ", c=" + dp.count) : " ");
}
return domname;
}

@ -432,12 +432,13 @@ public final class plasmaParser {
String fullClassName = plasmaParserPkgName + "." + currentDir.getName() + "." + className;
try {
// trying to load the parser class by its name
Class parserClass = Class.forName(fullClassName);
Object theParser = parserClass.newInstance();
if (!(theParser instanceof Parser)) continue;
Class<?> parserClass = Class.forName(fullClassName);
Object theParser0 = parserClass.newInstance();
if (!(theParser0 instanceof Parser)) continue;
Parser theParser = (Parser) theParser0;
// testing if all needed libx libraries are available
String[] neededLibx = ((Parser)theParser).getLibxDependences();
String[] neededLibx = theParser.getLibxDependences();
StringBuffer neededLibxBuf = new StringBuffer();
if (neededLibx != null) {
for (int libxId=0; libxId < neededLibx.length; libxId++) {
@ -451,7 +452,7 @@ public final class plasmaParser {
}
// loading the list of mime-types that are supported by this parser class
Hashtable<String, String> supportedMimeTypes = ((Parser) theParser).getSupportedMimeTypes();
Hashtable<String, String> supportedMimeTypes = theParser.getSupportedMimeTypes();
// creating a parser info object
ParserInfo parserInfo = new ParserInfo();
@ -462,7 +463,7 @@ public final class plasmaParser {
parserInfo.parserVersionNr = ((Parser)theParser).getVersion();
parserInfo.parserName = ((Parser) theParser).getName();
Iterator mimeTypeIterator = supportedMimeTypes.keySet().iterator();
Iterator<String> mimeTypeIterator = supportedMimeTypes.keySet().iterator();
while (mimeTypeIterator.hasNext()) {
String mimeType = (String) mimeTypeIterator.next();
availableParserList.put(mimeType, parserInfo);
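
The loader instantiates each parser reflectively and registers the same ParserInfo once per supported MIME type, with the instanceof test done before any cast. Condensed sketch of that path (class name illustrative; parserInfo as built above):

    try {
        Class<?> parserClass = Class.forName("de.anomic.plasma.parser.pdf.pdfParser");
        Object instance = parserClass.newInstance();
        if (instance instanceof Parser) {
            Parser p = (Parser) instance;
            Iterator<String> mimes = p.getSupportedMimeTypes().keySet().iterator();
            while (mimes.hasNext()) {
                availableParserList.put(mimes.next(), parserInfo); // one entry per supported type
            }
        }
    } catch (Exception e) { /* parser is skipped */ }
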
@ -490,9 +491,9 @@ public final class plasmaParser {
public void close() {
// clearing the parser list
Iterator configs = parserConfigList.values().iterator();
Iterator<plasmaParserConfig> configs = parserConfigList.values().iterator();
while (configs.hasNext()) {
plasmaParserConfig currentConfig = (plasmaParserConfig) configs.next();
plasmaParserConfig currentConfig = configs.next();
synchronized (currentConfig.enabledParserList) {
currentConfig.enabledParserList.clear();
}
@ -684,29 +685,24 @@ public final class plasmaParser {
}
public plasmaParserDocument transformScraper(yacyURL location, String mimeType, String charSet, htmlFilterContentScraper scraper) {
try {
String[] sections = new String[scraper.getHeadlines(1).length + scraper.getHeadlines(2).length + scraper.getHeadlines(3).length + scraper.getHeadlines(4).length];
int p = 0;
for (int i = 1; i <= 4; i++) for (int j = 0; j < scraper.getHeadlines(i).length; j++) sections[p++] = scraper.getHeadlines(i)[j];
plasmaParserDocument ppd = new plasmaParserDocument(
new yacyURL(location.toNormalform(true, true), null),
mimeType,
charSet,
scraper.getKeywords(),
scraper.getTitle(),
scraper.getAuthor(),
sections,
scraper.getDescription(),
scraper.getText(),
scraper.getAnchors(),
scraper.getImages());
//scraper.close();
ppd.setFavicon(scraper.getFavicon());
return ppd;
} catch (MalformedURLException e) {
//e.printStackTrace();
return null;
}
String[] sections = new String[scraper.getHeadlines(1).length + scraper.getHeadlines(2).length + scraper.getHeadlines(3).length + scraper.getHeadlines(4).length];
int p = 0;
for (int i = 1; i <= 4; i++) for (int j = 0; j < scraper.getHeadlines(i).length; j++) sections[p++] = scraper.getHeadlines(i)[j];
plasmaParserDocument ppd = new plasmaParserDocument(
location,
mimeType,
charSet,
scraper.getKeywords(),
scraper.getTitle(),
scraper.getAuthor(),
sections,
scraper.getDescription(),
scraper.getText(),
scraper.getAnchors(),
scraper.getImages());
//scraper.close();
ppd.setFavicon(scraper.getFavicon());
return ppd;
}
/**
@ -737,7 +733,7 @@ public final class plasmaParser {
Parser theParser = makeParser(parserClassName);
// checking if the created parser really supports the given mimetype
Hashtable supportedMimeTypes = theParser.getSupportedMimeTypes();
Hashtable<String, String> supportedMimeTypes = theParser.getSupportedMimeTypes();
if ((supportedMimeTypes != null) && (supportedMimeTypes.containsKey(mimeType))) {
parserInfo.incUsageCounter();
return theParser;
@ -751,64 +747,73 @@ public final class plasmaParser {
}
static Map<String, String> allReflinks(Set links) {
static Map<yacyURL, String> allReflinks(Set<?> links) {
// links is a Set of yacyURL, Strings (with urls) or htmlFilterImageEntries
// we find all links that are part of a reference inside a url
HashMap<String, String> v = new HashMap<String, String>();
Iterator i = links.iterator();
HashMap<yacyURL, String> v = new HashMap<yacyURL, String>();
Iterator<?> i = links.iterator();
Object o;
String url;
yacyURL url;
String u;
int pos;
loop: while (i.hasNext()) {
loop: while (i.hasNext()) try {
o = i.next();
if (o instanceof String) url = (String) o;
else if (o instanceof htmlFilterImageEntry) url = ((htmlFilterImageEntry) o).url().toNormalform(true, true);
if (o instanceof yacyURL) url = (yacyURL) o;
else if (o instanceof String) url = new yacyURL((String) o, null);
else if (o instanceof htmlFilterImageEntry) url = ((htmlFilterImageEntry) o).url();
else {
assert false;
continue;
}
if ((pos = url.toLowerCase().indexOf("http://",7)) > 0) {
u = url.toNormalform(true, true);
if ((pos = u.toLowerCase().indexOf("http://",7)) > 0) {
i.remove();
url = url.substring(pos);
while ((pos = url.toLowerCase().indexOf("http://",7)) > 0) url = url.substring(pos);
u = u.substring(pos);
while ((pos = u.toLowerCase().indexOf("http://",7)) > 0) u = u.substring(pos);
url = new yacyURL(u, null);
if (!(v.containsKey(url))) v.put(url, "ref");
continue loop;
}
if ((pos = url.toLowerCase().indexOf("/www.",7)) > 0) {
if ((pos = u.toLowerCase().indexOf("/www.",7)) > 0) {
i.remove();
url = "http:/" + url.substring(pos);
while ((pos = url.toLowerCase().indexOf("/www.",7)) > 0) url = "http:/" + url.substring(pos);
u = "http:/" + u.substring(pos);
while ((pos = u.toLowerCase().indexOf("/www.",7)) > 0) u = "http:/" + u.substring(pos);
url = new yacyURL(u, null);
if (!(v.containsKey(url))) v.put(url, "ref");
continue loop;
}
}
} catch (MalformedURLException e) {}
return v;
}
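
allReflinks unwraps redirector-style links: whenever another "http://" occurs past position 7 of the normalized URL, everything before it is cut away and the remainder is registered as a "ref" link. Worked example of the string loop:

    String u = "http://rd.example/redirect?u=http://target/page.html";
    int pos;
    while ((pos = u.toLowerCase().indexOf("http://", 7)) > 0) u = u.substring(pos);
    System.out.println(u); // http://target/page.html
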
static Map<String, String> allSubpaths(Set links) {
static Map<yacyURL, String> allSubpaths(Set<?> links) {
// links is a Set of yacyURL, Strings (urls) or htmlFilterImageEntries
HashMap<String, String> v = new HashMap<String, String>();
Iterator i = links.iterator();
HashMap<yacyURL, String> v = new HashMap<yacyURL, String>();
Iterator<?> i = links.iterator();
Object o;
String url;
yacyURL url;
String u;
int pos;
while (i.hasNext()) {
while (i.hasNext()) try {
o = i.next();
if (o instanceof String) url = (String) o;
else if (o instanceof htmlFilterImageEntry) url = ((htmlFilterImageEntry) o).url().toNormalform(true, true);
if (o instanceof yacyURL) url = (yacyURL) o;
else if (o instanceof String) url = new yacyURL((String) o, null);
else if (o instanceof htmlFilterImageEntry) url = ((htmlFilterImageEntry) o).url();
else {
assert false;
continue;
}
if (url.endsWith("/")) url = url.substring(0, url.length() - 1);
pos = url.lastIndexOf("/");
u = url.toNormalform(true, true);
if (u.endsWith("/")) u = u.substring(0, u.length() - 1);
pos = u.lastIndexOf("/");
while (pos > 8) {
url = url.substring(0, pos + 1);
u = u.substring(0, pos + 1);
url = new yacyURL(u, null);
if (!(v.containsKey(url))) v.put(url, "sub");
url = url.substring(0, pos);
pos = url.lastIndexOf("/");
u = u.substring(0, pos);
pos = u.lastIndexOf("/");
}
}
} catch (MalformedURLException e) {}
return v;
}
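
allSubpaths registers every parent directory of a link as a "sub" entry, walking slashes from the right as long as they sit past position 8 (inside the path, not the scheme or host). For http://host/a/b/c.html that yields http://host/a/b/, http://host/a/ and http://host/:

    String u = "http://host/a/b/c.html";
    int pos = u.lastIndexOf("/");
    while (pos > 8) {
        System.out.println(u.substring(0, pos + 1)); // candidate subpath, stored as yacyURL
        u = u.substring(0, pos);
        pos = u.lastIndexOf("/");
    }
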
@ -883,24 +888,24 @@ public final class plasmaParser {
// printing out all parsed sentences
if (document != null) {
System.out.print("Document titel: ");
System.out.println(document.getTitle());
System.out.println(document.dc_title());
// found text
final Iterator sentences = document.getSentences(false);
final Iterator<StringBuffer> sentences = document.getSentences(false);
int i = 0;
if (sentences != null) while (sentences.hasNext()) {
System.out.print("line " + i + ": ");
System.out.println(((StringBuffer) sentences.next()).toString());
System.out.println(sentences.next().toString());
i++;
}
// found links
int anchorNr = 0;
Map anchors = document.getAnchors();
Iterator anchorIter = anchors.keySet().iterator();
Map<yacyURL, String> anchors = document.getAnchors();
Iterator<yacyURL> anchorIter = anchors.keySet().iterator();
while (anchorIter.hasNext()) {
String key = (String) anchorIter.next();
System.out.println("URL " + anchorNr + ":\t" + key + " | " + anchors.get(key));
yacyURL key = anchorIter.next();
System.out.println("URL " + anchorNr + ":\t" + key.toString() + " | " + anchors.get(key));
anchorNr++;
}
document.close();
@ -913,9 +918,9 @@ public final class plasmaParser {
public static boolean supportedContent(yacyURL url, String mimeType) {
if (url == null) throw new NullPointerException();
Iterator configs = parserConfigList.values().iterator();
Iterator<plasmaParserConfig> configs = parserConfigList.values().iterator();
while (configs.hasNext()) {
plasmaParserConfig currentConfig = (plasmaParserConfig) configs.next();
plasmaParserConfig currentConfig = configs.next();
synchronized (currentConfig.enabledParserList) {
if (currentConfig.supportedContent(url, mimeType)) return true;
}
@ -944,7 +949,7 @@ public final class plasmaParser {
config.initParseableMimeTypes(configStr);
}
public static String[] setEnabledParserList(String parserMode, Set mimeTypeSet) {
public static String[] setEnabledParserList(String parserMode, Set<String> mimeTypeSet) {
if (!PARSER_MODE.contains(parserMode)) throw new IllegalArgumentException();
plasmaParserConfig config = (plasmaParserConfig) parserConfigList.get(parserMode);
@ -956,9 +961,9 @@ public final class plasmaParser {
}
public static boolean supportedFileExtContains(String fileExt) {
Iterator configs = parserConfigList.values().iterator();
Iterator<plasmaParserConfig> configs = parserConfigList.values().iterator();
while (configs.hasNext()) {
plasmaParserConfig currentConfig = (plasmaParserConfig) configs.next();
plasmaParserConfig currentConfig = configs.next();
synchronized (currentConfig.enabledParserList) {
if (currentConfig.supportedFileExtContains(fileExt)) return true;
}
@ -968,9 +973,9 @@ public final class plasmaParser {
}
public static boolean supportedMimeTypesContains(String mimeType) {
Iterator configs = parserConfigList.values().iterator();
Iterator<plasmaParserConfig> configs = parserConfigList.values().iterator();
while (configs.hasNext()) {
plasmaParserConfig currentConfig = (plasmaParserConfig) configs.next();
plasmaParserConfig currentConfig = configs.next();
synchronized (currentConfig.enabledParserList) {
if (currentConfig.supportedMimeTypesContains(mimeType)) return true;
}
@ -985,7 +990,7 @@ public final class plasmaParser {
throw new IllegalArgumentException("The object key must be of type string.");
// loading class by name
Class moduleClass = Class.forName((String)name);
Class<?> moduleClass = Class.forName((String)name);
// instantiating class
Parser theParser = (Parser) moduleClass.newInstance();

@ -65,12 +65,12 @@ public class plasmaParserConfig {
* @see #loadEnabledParserList()
* @see #setEnabledParserList(Enumeration)
*/
final HashSet enabledParserList = new HashSet();
final HashSet<String> enabledParserList = new HashSet<String>();
/**
* A list of file extensions that are supported by all enabled parsers
*/
final HashSet supportedFileExt = new HashSet();
final HashSet<String> supportedFileExt = new HashSet<String>();
/**
* Parsermode this configuration belongs to
@ -134,29 +134,29 @@ public class plasmaParserConfig {
}
public void initParseableMimeTypes(String enabledMimeTypes) {
HashSet mimeTypes = null;
HashSet<String> mimeTypes = null;
if ((enabledMimeTypes == null) || (enabledMimeTypes.length() == 0)) {
mimeTypes = new HashSet();
mimeTypes = new HashSet<String>();
} else {
String[] enabledMimeTypeList = enabledMimeTypes.split(",");
mimeTypes = new HashSet(enabledMimeTypeList.length);
mimeTypes = new HashSet<String>(enabledMimeTypeList.length);
for (int i = 0; i < enabledMimeTypeList.length; i++) mimeTypes.add(enabledMimeTypeList[i].toLowerCase().trim());
}
setEnabledParserList(mimeTypes);
}
public void enableAllParsers() {
Set availableMimeTypes = plasmaParser.availableParserList.keySet();
Set<String> availableMimeTypes = plasmaParser.availableParserList.keySet();
setEnabledParserList(availableMimeTypes);
}
public String[] setEnabledParserList(Set mimeTypeSet) {
public String[] setEnabledParserList(Set<String> mimeTypeSet) {
HashSet newEnabledParsers = new HashSet();
HashSet newSupportedFileExt = new HashSet();
HashSet<String> newEnabledParsers = new HashSet<String>();
HashSet<String> newSupportedFileExt = new HashSet<String>();
if (mimeTypeSet != null) {
Iterator mimeTypes = mimeTypeSet.iterator();
Iterator<String> mimeTypes = mimeTypeSet.iterator();
while (mimeTypes.hasNext()) {
String mimeType = (String) mimeTypes.next();
if (plasmaParser.availableParserList.containsKey(mimeType)) {
@ -166,7 +166,7 @@ public class plasmaParserConfig {
theParser = plasmaParser.makeParser(((ParserInfo)plasmaParser.availableParserList.get(mimeType)).parserClassName);
// getting a list of mimeTypes that the parser supports
Hashtable parserSupportsMimeTypes = theParser.getSupportedMimeTypes();
Hashtable<String, String> parserSupportsMimeTypes = theParser.getSupportedMimeTypes();
if (parserSupportsMimeTypes != null) {
Object supportedExtensions = parserSupportsMimeTypes.get(mimeType);
if ((supportedExtensions != null) &&
@ -202,9 +202,10 @@ public class plasmaParserConfig {
return (String[])newEnabledParsers.toArray(new String[newEnabledParsers.size()]);
}
public HashSet getEnabledParserList() {
@SuppressWarnings("unchecked")
public HashSet<String> getEnabledParserList() {
synchronized (this.enabledParserList) {
return (HashSet) this.enabledParserList.clone();
return (HashSet<String>) this.enabledParserList.clone();
}
}
}
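
HashSet.clone() returns Object, so the typed copy above needs an unchecked cast and the @SuppressWarnings annotation. A cast-free alternative under the same locking would be the copy constructor:

    public HashSet<String> getEnabledParserList() {
        synchronized (this.enabledParserList) {
            return new HashSet<String>(this.enabledParserList); // typed copy, no unchecked cast
        }
    }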

@ -48,7 +48,6 @@ import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import de.anomic.server.serverCachedFileOutputStream;
import de.anomic.server.serverFileUtils;
@ -67,21 +66,21 @@ import de.anomic.plasma.parser.Parser;
public class plasmaParserDocument {
private yacyURL location; // the source url
private yacyURL source; // the source url
private String mimeType; // mimeType as taken from http header
private String charset; // the charset of the document
private List<String> keywords; // most resources provide a keyword field
private StringBuffer title; // a document title, taken from title or h1 tag; shall appear as headline of search result
private StringBuffer author; // author or copyright
private StringBuffer creator; // author or copyright
private List<String> sections; // if present: more titles/headlines appearing in the document
private StringBuffer abstrct; // an abstract, if present: short content description
private StringBuffer description; // an abstract, if present: short content description
private Object text; // the clear text, all that is visible
private Map<String, String> anchors; // all links embedded as clickable entities (anchor tags)
private Map<yacyURL, String> anchors; // all links embedded as clickable entities (anchor tags)
private TreeSet<htmlFilterImageEntry> images; // all visible pictures in document
// the anchors and images - Maps are URL-to-EntityDescription mappings.
// The EntityDescription appear either as visible text in anchors or as alternative
// text in image tags.
private Map<String, String> hyperlinks, audiolinks, videolinks, applinks;
private Map<yacyURL, String> hyperlinks, audiolinks, videolinks, applinks;
private Map<String, String> emaillinks;
private yacyURL favicon;
private boolean resorted;
@ -90,16 +89,16 @@ public class plasmaParserDocument {
protected plasmaParserDocument(yacyURL location, String mimeType, String charset,
String[] keywords, String title, String author,
String[] sections, String abstrct,
Object text, Map<String, String> anchors, TreeSet<htmlFilterImageEntry> images) {
this.location = location;
Object text, Map<yacyURL, String> anchors, TreeSet<htmlFilterImageEntry> images) {
this.source = location;
this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType;
this.charset = charset;
this.keywords = (keywords == null) ? new LinkedList<String>() : Arrays.asList(keywords);
this.title = (title == null) ? new StringBuffer() : new StringBuffer(title);
this.author = (author == null) ? new StringBuffer() : new StringBuffer(author);
this.creator = (author == null) ? new StringBuffer() : new StringBuffer(author);
this.sections = (sections == null) ? new LinkedList<String>() : Arrays.asList(sections);
this.abstrct = (abstrct == null) ? new StringBuffer() : new StringBuffer(abstrct);
this.anchors = (anchors == null) ? new HashMap<String, String>(0) : anchors;
this.description = (abstrct == null) ? new StringBuffer() : new StringBuffer(abstrct);
this.anchors = (anchors == null) ? new HashMap<yacyURL, String>(0) : anchors;
this.images = (images == null) ? new TreeSet<htmlFilterImageEntry>() : images;
this.hyperlinks = null;
this.audiolinks = null;
@ -125,32 +124,90 @@ public class plasmaParserDocument {
public plasmaParserDocument(yacyURL location, String mimeType, String charset,
String[] keywords, String title, String author,
String[] sections, String abstrct,
byte[] text, Map<String, String> anchors, TreeSet<htmlFilterImageEntry> images) {
byte[] text, Map<yacyURL, String> anchors, TreeSet<htmlFilterImageEntry> images) {
this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
}
public plasmaParserDocument(yacyURL location, String mimeType, String charset,
String[] keywords, String title, String author,
String[] sections, String abstrct,
File text, Map<String, String> anchors, TreeSet<htmlFilterImageEntry> images) {
File text, Map<yacyURL, String> anchors, TreeSet<htmlFilterImageEntry> images) {
this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
}
public plasmaParserDocument(yacyURL location, String mimeType, String charset,
String[] keywords, String title, String author,
String[] sections, String abstrct,
serverCachedFileOutputStream text, Map<String, String> anchors, TreeSet<htmlFilterImageEntry> images) {
serverCachedFileOutputStream text, Map<yacyURL, String> anchors, TreeSet<htmlFilterImageEntry> images) {
this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
}
public yacyURL getLocation() {
return this.location;
/*
DC according to RFC 5013 (asterisks mark the elements implemented as getters below)
* dc_title
* dc_creator
* dc_subject
* dc_description
* dc_publisher
dc_contributor
dc_date
dc_type
* dc_format
* dc_identifier
* dc_source
dc_language
dc_relation
dc_coverage
dc_rights
*/
public String dc_title() {
return title.toString();
}
public String dc_creator() {
if (creator != null) return creator.toString(); else return new String();
}
public String getMimeType() {
public String dc_subject(char separator) {
// sort out duplicates and empty words
TreeSet<String> hs = new TreeSet<String>();
String s;
for (int i = 0; i < this.keywords.size(); i++) {
if (this.keywords.get(i) == null) continue;
s = ((String)this.keywords.get(i)).trim();
if (s.length() > 0) hs.add(s.toLowerCase());
}
if (hs.size() == 0) return "";
// generate a new list
StringBuffer sb = new StringBuffer(this.keywords.size() * 6);
Iterator<String> i = hs.iterator();
while (i.hasNext()) sb.append(i.next()).append(separator);
return sb.substring(0, sb.length() - 1);
}
public String dc_description() {
if (description != null) return description.toString(); else return dc_title();
}
public String dc_publisher() {
// if we don't have a publisher, simply return the host/domain name
return this.source.getHost();
}
public String dc_format() {
return this.mimeType;
}
public String dc_identifier() {
return "yacy.net:" + this.source.hash();
}
public yacyURL dc_source() {
return this.source;
}
/**
* @return the supposed charset of this document or <code>null</code> if unknown
*/
@ -158,26 +215,14 @@ public class plasmaParserDocument {
return this.charset;
}
public String getTitle() {
return title.toString();
}
public String[] getSectionTitles() {
if (sections != null) {
return (String[])sections.toArray(new String[this.sections.size()]);
} else {
return new String[] { getTitle() };
return new String[] { dc_title() };
}
}
public String getAbstract() {
if (abstrct != null) return abstrct.toString(); else return getTitle();
}
public String getAuthor() {
if (author != null) return author.toString(); else return new String();
}
public InputStream getText() {
try {
if (this.text == null) return null;
@ -236,28 +281,11 @@ public class plasmaParserDocument {
return e;
}
public String getKeywords(char separator) {
// sort out duplicates and empty words
TreeSet<String> hs = new TreeSet<String>();
String s;
for (int i = 0; i < this.keywords.size(); i++) {
if (this.keywords.get(i) == null) continue;
s = ((String)this.keywords.get(i)).trim();
if (s.length() > 0) hs.add(s.toLowerCase());
}
if (hs.size() == 0) return "";
// generate a new list
StringBuffer sb = new StringBuffer(this.keywords.size() * 6);
Iterator<String> i = hs.iterator();
while (i.hasNext()) sb.append(i.next()).append(separator);
return sb.substring(0, sb.length() - 1);
}
public List<String> getKeywords() {
return this.keywords;
}
public Map<String, String> getAnchors() {
public Map<yacyURL, String> getAnchors() {
// returns all links embedded as anchors (clickable entities)
// this is a url(yacyURL)/text(String) map
return anchors;
@ -266,18 +294,18 @@ public class plasmaParserDocument {
// the next three methods provide a calculated view on the getAnchors/getImages:
public Map<String, String> getHyperlinks() {
public Map<yacyURL, String> getHyperlinks() {
// this is a subset of the getAnchor-set: only links to other hyperrefs
if (!resorted) resortLinks();
return hyperlinks;
}
public Map<String, String> getAudiolinks() {
public Map<yacyURL, String> getAudiolinks() {
if (!resorted) resortLinks();
return this.audiolinks;
}
public Map<String, String> getVideolinks() {
public Map<yacyURL, String> getVideolinks() {
if (!resorted) resortLinks();
return this.videolinks;
}
@ -289,7 +317,7 @@ public class plasmaParserDocument {
return images;
}
public Map<String, String> getApplinks() {
public Map<yacyURL, String> getApplinks() {
if (!resorted) resortLinks();
return this.applinks;
}
@ -307,17 +335,19 @@ public class plasmaParserDocument {
String u;
int extpos, qpos;
String ext = null;
Iterator<Map.Entry<String, String>> i = anchors.entrySet().iterator();
hyperlinks = new HashMap<String, String>();
videolinks = new HashMap<String, String>();
audiolinks = new HashMap<String, String>();
applinks = new HashMap<String, String>();
Iterator<Map.Entry<yacyURL, String>> i = anchors.entrySet().iterator();
hyperlinks = new HashMap<yacyURL, String>();
videolinks = new HashMap<yacyURL, String>();
audiolinks = new HashMap<yacyURL, String>();
applinks = new HashMap<yacyURL, String>();
emaillinks = new HashMap<String, String>();
TreeSet<htmlFilterImageEntry> collectedImages = new TreeSet<htmlFilterImageEntry>(); // this is a set that is collected now and joined later to the imagelinks
Map.Entry<String, String> entry;
Map.Entry<yacyURL, String> entry;
while (i.hasNext()) {
entry = i.next();
u = entry.getKey();
url = entry.getKey();
if (url == null) continue;
u = url.toNormalform(true, false);
if ((u != null) && (u.startsWith("mailto:"))) {
emaillinks.put(u.substring(7), entry.getValue());
} else {
@ -328,21 +358,16 @@ public class plasmaParserDocument {
} else {
ext = u.substring(extpos + 1).toLowerCase();
}
try {
url = new yacyURL(u, null);
u = url.toNormalform(true, true);
if (plasmaParser.mediaExtContains(ext)) {
// this is not a normal anchor, it's a media link
if (plasmaParser.imageExtContains(ext)) {
collectedImages.add(new htmlFilterImageEntry(url, (String) entry.getValue(), -1, -1));
}
else if (plasmaParser.audioExtContains(ext)) audiolinks.put(u, (String)entry.getValue());
else if (plasmaParser.videoExtContains(ext)) videolinks.put(u, (String)entry.getValue());
else if (plasmaParser.appsExtContains(ext)) applinks.put(u, (String)entry.getValue());
} else {
hyperlinks.put(u, (String)entry.getValue());
if (plasmaParser.mediaExtContains(ext)) {
// this is not a normal anchor, it's a media link
if (plasmaParser.imageExtContains(ext)) {
collectedImages.add(new htmlFilterImageEntry(url, (String) entry.getValue(), -1, -1));
}
} catch (MalformedURLException e1) {
else if (plasmaParser.audioExtContains(ext)) audiolinks.put(url, (String)entry.getValue());
else if (plasmaParser.videoExtContains(ext)) videolinks.put(url, (String)entry.getValue());
else if (plasmaParser.appsExtContains(ext)) applinks.put(url, (String)entry.getValue());
} else {
hyperlinks.put(url, (String) entry.getValue());
}
}
}
@ -378,12 +403,12 @@ public class plasmaParserDocument {
this.sections.addAll(Arrays.asList(doc.getSectionTitles()));
if (this.title.length() > 0) this.title.append('\n');
this.title.append(doc.getTitle());
this.title.append(doc.dc_title());
this.keywords.addAll(doc.getKeywords());
if (this.abstrct.length() > 0) this.abstrct.append('\n');
this.abstrct.append(doc.getAbstract());
if (this.description.length() > 0) this.description.append('\n');
this.description.append(doc.dc_description());
if (!(this.text instanceof serverCachedFileOutputStream)) {
this.text = new serverCachedFileOutputStream(Parser.MAX_KEEP_IN_MEMORY_SIZE);

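The block above replaces the old getTitle/getKeywords/getAuthor accessors of plasmaParserDocument with Dublin Core style names. A minimal sketch (not part of the commit) of how a caller might read the new metadata; the DublinCoreDump class name is invented for illustration and a parsed plasmaParserDocument is assumed to be available:

import de.anomic.plasma.plasmaParserDocument;

public class DublinCoreDump {
    // prints the Dublin Core view of a parsed document; every accessor
    // used below is one introduced by this commit
    public static void dump(plasmaParserDocument document) {
        System.out.println("dc:title       = " + document.dc_title());
        System.out.println("dc:creator     = " + document.dc_creator());
        System.out.println("dc:subject     = " + document.dc_subject(','));
        System.out.println("dc:description = " + document.dc_description());
        System.out.println("dc:publisher   = " + document.dc_publisher());
        System.out.println("dc:format      = " + document.dc_format());
        System.out.println("dc:identifier  = " + document.dc_identifier());
        System.out.println("dc:source      = " + document.dc_source().toNormalform(false, true));
    }
}
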
@ -79,11 +79,11 @@ public final class plasmaSearchImages {
// add also links from pages one step deeper, if depth > 0
if (depth > 0) {
Iterator<String> i = document.getHyperlinks().keySet().iterator();
Iterator<yacyURL> i = document.getHyperlinks().keySet().iterator();
String nexturlstring;
while (i.hasNext()) {
try {
nexturlstring = new yacyURL(i.next(), null).toNormalform(true, true);
nexturlstring = i.next().toNormalform(true, true);
addAll(new plasmaSearchImages(serverDate.remainingTime(start, maxTime, 10), new yacyURL(nexturlstring, null), depth - 1));
} catch (MalformedURLException e1) {
e1.printStackTrace();

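The plasmaSearchImages change shows the redesigned url handling at work: hyperlink keys now arrive as yacyURL objects, so the round-trip through new yacyURL(String, null) and its MalformedURLException handling can disappear. A hedged sketch of the resulting iteration pattern; the class and method names are invented for illustration:

import java.util.Map;
import de.anomic.yacy.yacyURL;

public class TypedLinkIteration {
    // iterate typed keys directly; no String parsing, no MalformedURLException
    public static void followLinks(Map<yacyURL, String> hyperlinks) {
        for (yacyURL url : hyperlinks.keySet()) {
            String normalized = url.toNormalform(true, true);
            System.out.println("would follow: " + normalized);
        }
    }
}
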
@ -228,8 +228,9 @@ public class plasmaSnippetCache {
public static class MediaSnippet {
public int type;
public String href, name, attr;
public MediaSnippet(int type, String href, String name, String attr) {
public yacyURL href;
public String name, attr;
public MediaSnippet(int type, yacyURL href, String name, String attr) {
this.type = type;
this.href = href;
this.name = name;
@ -469,17 +470,18 @@ public class plasmaSnippetCache {
return snippetsCache.get(key);
}
private static String computeMediaSnippet(Map<String, String> media, Set<String> queryhashes) {
Iterator<Map.Entry<String, String>> i = media.entrySet().iterator();
Map.Entry<String, String> entry;
String url, desc;
private static String computeMediaSnippet(Map<yacyURL, String> media, Set<String> queryhashes) {
Iterator<Map.Entry<yacyURL, String>> i = media.entrySet().iterator();
Map.Entry<yacyURL, String> entry;
yacyURL url;
String desc;
Set<String> s;
String result = "";
while (i.hasNext()) {
entry = i.next();
url = entry.getKey();
desc = entry.getValue();
s = removeAppearanceHashes(url, queryhashes);
s = removeAppearanceHashes(url.toNormalform(false, false), queryhashes);
if (s.size() == 0) {
result += "<br /><a href=\"" + url + "\">" + ((desc.length() == 0) ? url : desc) + "</a>";
continue;
@ -643,22 +645,23 @@ public class plasmaSnippetCache {
public static ArrayList<MediaSnippet> computeMediaSnippets(plasmaParserDocument document, Set<String> queryhashes, int mediatype) {
if (document == null) return new ArrayList<MediaSnippet>();
Map<String, String> media = null;
Map<yacyURL, String> media = null;
if (mediatype == plasmaSearchQuery.CONTENTDOM_AUDIO) media = document.getAudiolinks();
else if (mediatype == plasmaSearchQuery.CONTENTDOM_VIDEO) media = document.getVideolinks();
else if (mediatype == plasmaSearchQuery.CONTENTDOM_APP) media = document.getApplinks();
if (media == null) return null;
Iterator<Map.Entry<String, String>> i = media.entrySet().iterator();
Map.Entry<String, String> entry;
String url, desc;
Iterator<Map.Entry<yacyURL, String>> i = media.entrySet().iterator();
Map.Entry<yacyURL, String> entry;
yacyURL url;
String desc;
Set<String> s;
ArrayList<MediaSnippet> result = new ArrayList<MediaSnippet>();
while (i.hasNext()) {
entry = i.next();
url = entry.getKey();
desc = entry.getValue();
s = removeAppearanceHashes(url, queryhashes);
s = removeAppearanceHashes(url.toNormalform(false, false), queryhashes);
if (s.size() == 0) {
result.add(new MediaSnippet(mediatype, url, desc, null));
continue;
@ -678,14 +681,15 @@ public class plasmaSnippetCache {
Iterator<htmlFilterImageEntry> i = images.iterator();
htmlFilterImageEntry ientry;
String url, desc;
yacyURL url;
String desc;
Set<String> s;
ArrayList<MediaSnippet> result = new ArrayList<MediaSnippet>();
while (i.hasNext()) {
ientry = i.next();
url = ientry.url().toNormalform(true, true);
url = ientry.url();
desc = ientry.alt();
s = removeAppearanceHashes(url, queryhashes);
s = removeAppearanceHashes(url.toNormalform(false, false), queryhashes);
if (s.size() == 0) {
result.add(new MediaSnippet(plasmaSearchQuery.CONTENTDOM_IMAGE, url, desc, ientry.width() + " x " + ientry.height()));
continue;

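MediaSnippet now carries its target as a typed yacyURL instead of a String. A sketch of constructing one under the new signature; the url, description and size values are placeholders:

import java.net.MalformedURLException;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaSnippetCache;
import de.anomic.yacy.yacyURL;

public class MediaSnippetExample {
    public static void main(String[] args) throws MalformedURLException {
        // placeholder url; any image link found by the parser would do
        yacyURL href = new yacyURL("http://example.org/pic.jpg", null);
        plasmaSnippetCache.MediaSnippet ms = new plasmaSnippetCache.MediaSnippet(
                plasmaSearchQuery.CONTENTDOM_IMAGE, href, "example picture", "640 x 480");
        System.out.println(ms.href.toNormalform(false, true) + " (" + ms.attr + ")");
    }
}
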
@ -138,7 +138,7 @@ import de.anomic.plasma.crawler.plasmaCrawlQueues;
import de.anomic.plasma.crawler.plasmaProtocolLoader;
import de.anomic.plasma.dbImport.dbImportManager;
import de.anomic.plasma.parser.ParserException;
import de.anomic.plasma.plasmaSwitchboardQueue.Entry;
import de.anomic.plasma.plasmaCondenser.wordStatProp;
import de.anomic.plasma.urlPattern.defaultURLPattern;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverAbstractSwitch;
@ -998,8 +998,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
this.log.logConfig("Starting blacklist engine ...");
try {
Class blacklistClass = Class.forName(blacklistClassName);
Constructor blacklistClassConstr = blacklistClass.getConstructor( new Class[] { File.class } );
Class<?> blacklistClass = Class.forName(blacklistClassName);
Constructor<?> blacklistClassConstr = blacklistClass.getConstructor( new Class[] { File.class } );
urlBlacklist = (plasmaURLPattern) blacklistClassConstr.newInstance(new Object[] { blacklistsPath });
this.log.logFine("Used blacklist engine class: " + blacklistClassName);
this.log.logConfig("Using blacklist engine: " + urlBlacklist.getEngineInfo());
@ -1276,8 +1276,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String wikiParserClassName = getConfig(WIKIPARSER_CLASS, WIKIPARSER_CLASS_DEFAULT);
this.log.logConfig("Loading wiki parser " + wikiParserClassName + " ...");
try {
Class wikiParserClass = Class.forName(wikiParserClassName);
Constructor wikiParserClassConstr = wikiParserClass.getConstructor(new Class[] { plasmaSwitchboard.class });
Class<?> wikiParserClass = Class.forName(wikiParserClassName);
Constructor<?> wikiParserClassConstr = wikiParserClass.getConstructor(new Class[] { plasmaSwitchboard.class });
wikiParser = (wikiParser)wikiParserClassConstr.newInstance(new Object[] { this });
} catch (Exception e) {
this.log.logSevere("Unable to load wiki parser, the wiki won't work", e);
@ -1532,11 +1532,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
this.defaultRemoteProfile = null;
this.defaultTextSnippetProfile = null;
this.defaultMediaSnippetProfile = null;
Iterator i = this.profilesActiveCrawls.profiles(true);
Iterator<plasmaCrawlProfile.entry> i = this.profilesActiveCrawls.profiles(true);
plasmaCrawlProfile.entry profile;
String name;
while (i.hasNext()) {
profile = (plasmaCrawlProfile.entry) i.next();
profile = i.next();
name = profile.name();
if (name.equals(CRAWL_PROFILE_PROXY)) this.defaultProxyProfile = profile;
if (name.equals(CRAWL_PROFILE_REMOTE)) this.defaultRemoteProfile = profile;
@ -1606,7 +1606,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
(crawlStacker != null && crawlStacker.size() > 0) ||
(crawlQueues.noticeURL.notEmpty()))
return false;
final Iterator iter = profilesActiveCrawls.profiles(true);
final Iterator<plasmaCrawlProfile.entry> iter = profilesActiveCrawls.profiles(true);
plasmaCrawlProfile.entry entry;
boolean hasDoneSomething = false;
try {
@ -1615,7 +1615,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (Thread.currentThread().isInterrupted()) throw new InterruptedException("Shutdown in progress");
// getting next profile
entry = (plasmaCrawlProfile.entry) iter.next();
entry = iter.next();
if (!((entry.name().equals(CRAWL_PROFILE_PROXY)) ||
(entry.name().equals(CRAWL_PROFILE_REMOTE)) ||
(entry.name().equals(CRAWL_PROFILE_SNIPPET_TEXT)) ||
@ -1961,13 +1961,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// clean up seed-dbs
if(getConfigBool("routing.deleteOldSeeds.permission",true)) {
final long deleteOldSeedsTime = getConfigLong("routing.deleteOldSeeds.time",7)*24*3600000;
Iterator e = yacyCore.seedDB.seedsSortedDisconnected(true,yacySeed.LASTSEEN);
Iterator<yacySeed> e = yacyCore.seedDB.seedsSortedDisconnected(true,yacySeed.LASTSEEN);
yacySeed seed = null;
ArrayList deleteQueue = new ArrayList();
ArrayList<String> deleteQueue = new ArrayList<String>();
checkInterruption();
//clean passive seeds
while(e.hasNext()) {
seed = (yacySeed)e.next();
seed = e.next();
if(seed != null) {
//list is sorted -> break when peers are too young to delete
if(seed.getLastSeenUTC() > (System.currentTimeMillis()-deleteOldSeedsTime))
@ -2183,24 +2183,19 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
((processCase == PROCESSCASE_4_PROXY_LOAD) || (processCase == PROCESSCASE_5_LOCAL_CRAWLING)) &&
((entry.profile() == null) || (entry.depth() < entry.profile().generalDepth()))
) {
Map hl = document.getHyperlinks();
Iterator i = hl.entrySet().iterator();
String nextUrlString;
Map<yacyURL, String> hl = document.getHyperlinks();
Iterator<Map.Entry<yacyURL, String>> i = hl.entrySet().iterator();
yacyURL nextUrl;
Map.Entry nextEntry;
Map.Entry<yacyURL, String> nextEntry;
while (i.hasNext()) {
// check for interruption
checkInterruption();
// fetching the next hyperlink
nextEntry = (Map.Entry) i.next();
nextUrlString = (String) nextEntry.getKey();
try {
nextUrl = new yacyURL(nextUrlString, null);
// enqueue the hyperlink into the pre-notice-url db
crawlStacker.enqueueEntry(nextUrl, entry.urlHash(), initiatorPeerHash, (String) nextEntry.getValue(), docDate, entry.depth() + 1, entry.profile());
} catch (MalformedURLException e1) {}
nextEntry = i.next();
nextUrl = nextEntry.getKey();
// enqueue the hyperlink into the pre-notice-url db
crawlStacker.enqueueEntry(nextUrl, entry.urlHash(), initiatorPeerHash, nextEntry.getValue(), docDate, entry.depth() + 1, entry.profile());
}
log.logInfo("CRAWL: ADDED " + hl.size() + " LINKS FROM " + entry.url().toNormalform(false, true) +
", NEW CRAWL STACK SIZE IS " + crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE));
@ -2210,7 +2205,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
/* =========================================================================
* CREATE INDEX
* ========================================================================= */
String docDescription = document.getTitle();
String docDescription = document.dc_title();
yacyURL referrerURL = entry.referrerURL();
String noIndexReason = plasmaCrawlEURL.DENIED_UNSPECIFIED_INDEXING_ERROR;
@ -2242,8 +2237,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
indexURLEntry newEntry = new indexURLEntry(
entry.url(), // URL
docDescription, // document description
document.getAuthor(), // author
document.getKeywords(' '), // tags
document.dc_creator(), // author
document.dc_subject(' '), // tags
"", // ETag
docDate, // modification date
new Date(), // loaded date
@ -2252,7 +2247,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
new byte[0], // md5
(int) entry.size(), // size
condenser.RESULT_NUMB_WORDS, // word count
plasmaHTCache.docType(document.getMimeType()), // doctype
plasmaHTCache.docType(document.dc_format()), // doctype
condenser.RESULT_FLAGS, // flags
yacyURL.language(entry.url()), // language
ioLinks[0].intValue(), // llocal
@ -2314,7 +2309,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
document, // document content
condenser, // document condenser
yacyURL.language(entry.url()), // document language
plasmaHTCache.docType(document.getMimeType()),// document type
plasmaHTCache.docType(document.dc_format()),// document type
ioLinks[0].intValue(), // outlinkSame
ioLinks[1].intValue() // outlinkOthers
);
@ -2322,31 +2317,31 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
/* ========================================================================
* SEND PAGE INDEX TO STORAGE PEER
* ======================================================================== */
HashMap urlCache = new HashMap(1);
urlCache.put(newEntry.hash(),newEntry);
HashMap<String, indexURLEntry> urlCache = new HashMap<String, indexURLEntry>(1);
urlCache.put(newEntry.hash(), newEntry);
ArrayList tmpContainers = new ArrayList(condenser.words().size());
ArrayList<indexContainer> tmpContainers = new ArrayList<indexContainer>(condenser.words().size());
String language = yacyURL.language(entry.url());
char doctype = plasmaHTCache.docType(document.getMimeType());
char doctype = plasmaHTCache.docType(document.dc_format());
indexURLEntry.Components comp = newEntry.comp();
int urlLength = comp.url().toNormalform(true, true).length();
int urlComps = htmlFilterContentScraper.urlComps(comp.url().toNormalform(true, true)).length;
// iterate over all words
Iterator i = condenser.words().entrySet().iterator();
Map.Entry wentry;
Iterator<Map.Entry<String, wordStatProp>> i = condenser.words().entrySet().iterator();
Map.Entry<String, wordStatProp> wentry;
plasmaCondenser.wordStatProp wordStat;
while (i.hasNext()) {
wentry = (Map.Entry) i.next();
String word = (String) wentry.getKey();
wordStat = (plasmaCondenser.wordStatProp) wentry.getValue();
wentry = i.next();
String word = wentry.getKey();
wordStat = wentry.getValue();
String wordHash = plasmaCondenser.word2hash(word);
indexRWIEntry wordIdxEntry = new indexRWIRowEntry(
urlHash,
urlLength, urlComps,
wordStat.count,
document.getTitle().length(),
document.dc_title().length(),
condenser.words().size(),
condenser.sentences().size(),
wordStat.posInText,
@ -2371,7 +2366,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// transferring the index to the storage peer
indexContainer[] indexData = (indexContainer[]) tmpContainers.toArray(new indexContainer[tmpContainers.size()]);
HashMap resultObj = yacyClient.transferIndex(
HashMap<String, Object> resultObj = yacyClient.transferIndex(
seed, // target seed
indexData, // word index data
urlCache, // urls
@ -2392,7 +2387,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
document,
condenser,
yacyURL.language(entry.url()),
plasmaHTCache.docType(document.getMimeType()),
plasmaHTCache.docType(document.dc_format()),
ioLinks[0].intValue(),
ioLinks[1].intValue()
);
@ -2412,7 +2407,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
log.logInfo("*Indexed " + words + " words in URL " + entry.url() +
" [" + entry.urlHash() + "]" +
"\n\tDescription: " + docDescription +
"\n\tMimeType: " + document.getMimeType() + " | Charset: " + document.getCharset() + " | " +
"\n\tMimeType: " + document.dc_format() + " | Charset: " + document.getCharset() + " | " +
"Size: " + document.getTextLength() + " bytes | " +
"Anchors: " + ((document.getAnchors() == null) ? 0 : document.getAnchors().size()) +
"\n\tStackingTime: " + (stackEndTime-stackStartTime) + " ms | " +
@ -2548,7 +2543,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
plasmaParserDocument document = plasmaSnippetCache.parseDocument(comp.url(), resourceContentLength.longValue(), resourceContent);
// get the word set
Set words = null;
Set<String> words = null;
try {
words = new plasmaCondenser(document, true, true).words().keySet();
} catch (UnsupportedEncodingException e) {
@ -2642,10 +2637,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
public static int accessFrequency(HashMap tracker, String host) {
public static int accessFrequency(HashMap<String, TreeSet<Long>> tracker, String host) {
// returns the access frequency in queries per hour for a given host and a specific tracker
long timeInterval = 1000 * 60 * 60;
TreeSet accessSet = (TreeSet) tracker.get(host);
TreeSet<Long> accessSet = tracker.get(host);
if (accessSet == null) return 0;
return accessSet.tailSet(new Long(System.currentTimeMillis() - timeInterval)).size();
}
@ -2769,7 +2764,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
try {
// find a list of DHT-peers
double maxDist = 0.2;
ArrayList seeds = yacyCore.dhtAgent.getDHTTargets(log, peerCount, Math.min(8, (int) (yacyCore.seedDB.sizeConnected() * maxDist)), dhtChunk.firstContainer().getWordHash(), dhtChunk.lastContainer().getWordHash(), maxDist);
ArrayList<yacySeed> seeds = yacyCore.dhtAgent.getDHTTargets(log, peerCount, Math.min(8, (int) (yacyCore.seedDB.sizeConnected() * maxDist)), dhtChunk.firstContainer().getWordHash(), dhtChunk.lastContainer().getWordHash(), maxDist);
if (seeds.size() < peerCount) {
log.logWarning("found not enough (" + seeds.size() + ") peers for distribution for dhtchunk [" + dhtChunk.firstContainer().getWordHash() + " .. " + dhtChunk.lastContainer().getWordHash() + "]");
return false;
@ -2784,8 +2779,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
int retries = 0;
// starting up multiple DHT transfer threads
Iterator seedIter = seeds.iterator();
ArrayList transfer = new ArrayList(peerCount);
Iterator<yacySeed> seedIter = seeds.iterator();
ArrayList<plasmaDHTTransfer> transfer = new ArrayList<plasmaDHTTransfer>(peerCount);
while (hc1 < peerCount && (transfer.size() > 0 || seedIter.hasNext())) {
// starting up some transfer threads
@ -2804,12 +2799,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
// waiting for the transfer threads to finish
Iterator transferIter = transfer.iterator();
Iterator<plasmaDHTTransfer> transferIter = transfer.iterator();
while (transferIter.hasNext()) {
// check for interruption
checkInterruption();
plasmaDHTTransfer t = (plasmaDHTTransfer)transferIter.next();
plasmaDHTTransfer t = transferIter.next();
if (!t.isAlive()) {
// remove finished thread from the list
transferIter.remove();

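Beyond the dc_* renames, most of the churn in plasmaSwitchboard is generics hygiene: raw Iterator, HashMap and ArrayList become parameterized, and the reflective class loading moves to wildcard types. The reflection part in isolation, as a self-contained sketch (class name and arguments are placeholders):

import java.io.File;
import java.lang.reflect.Constructor;

public class ReflectiveLoad {
    // mirrors the Class<?> / Constructor<?> pattern now used for the
    // blacklist engine and the wiki parser above
    public static Object load(String className, File path) throws Exception {
        Class<?> clazz = Class.forName(className);
        Constructor<?> ctor = clazz.getConstructor(new Class[] { File.class });
        return ctor.newInstance(new Object[] { path });
    }
}
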
@ -29,14 +29,13 @@ package de.anomic.plasma;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ConcurrentModificationException;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.HashMap;
import java.util.TreeMap;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.TreeSet;
import de.anomic.kelondro.kelondroBase64Order;
@ -97,8 +96,8 @@ public class plasmaWebStructure {
assert url.hash().equals(baseurlhash);
// generate citation reference
Map<String, String> hl = document.getHyperlinks();
Iterator<String> it = hl.keySet().iterator();
Map<yacyURL, String> hl = document.getHyperlinks();
Iterator<yacyURL> it = hl.keySet().iterator();
String nexturlhash;
StringBuffer cpg = new StringBuffer(12 * (hl.size() + 1) + 1);
StringBuffer cpl = new StringBuffer(12 * (hl.size() + 1) + 1);
@ -106,20 +105,18 @@ public class plasmaWebStructure {
int GCount = 0;
int LCount = 0;
while (it.hasNext()) {
try {
nexturlhash = (new yacyURL(it.next(), null)).hash();
if (nexturlhash != null) {
if (nexturlhash.substring(6).equals(lhp)) {
// this is an inbound link
cpl.append(nexturlhash.substring(0, 6)); // store only local part
LCount++;
} else {
// this is an outbound link
cpg.append(nexturlhash); // store complete hash
GCount++;
}
nexturlhash = it.next().hash();
if (nexturlhash != null) {
if (nexturlhash.substring(6).equals(lhp)) {
// this is an inbound link
cpl.append(nexturlhash.substring(0, 6)); // store only local part
LCount++;
} else {
// this is an outbound link
cpg.append(nexturlhash); // store complete hash
GCount++;
}
} catch (MalformedURLException e) {}
}
}
// append this reference to buffer

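With yacyURL keys, the citation scan in plasmaWebStructure no longer needs a try/catch per link; it simply asks each url for its hash. The inbound/outbound split itself reduces to a suffix comparison. A hedged sketch of that check; the 12-character hash layout with the host part in the trailing 6 characters is inferred from the buffer sizing and substring offsets above, not stated in the source:

public class LinkClassifier {
    // inbound if the link's host-hash part matches the source page's
    // host-hash part "lhp" (assumed 6 characters, per the code above)
    public static boolean isInbound(String nexturlhash, String lhp) {
        return nexturlhash.substring(6).equals(lhp);
    }
}
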
@ -301,7 +301,7 @@ public final class plasmaWordIndex implements indexRI {
wprop = wentry.getValue();
assert (wprop.flags != null);
ientry = new indexRWIRowEntry(url.hash(),
urlLength, urlComps, (document == null) ? urlLength : document.getTitle().length(),
urlLength, urlComps, (document == null) ? urlLength : document.dc_title().length(),
wprop.count,
condenser.words().size(),
condenser.sentences().size(),

@ -55,6 +55,7 @@ import de.anomic.http.httpRemoteProxyConfig;
import de.anomic.http.httpc;
import de.anomic.index.indexContainer;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexRWIRowEntry;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield;
@ -575,19 +576,19 @@ public final class yacyClient {
// read index abstract
if (abstractCache != null) {
Iterator i = result.entrySet().iterator();
Map.Entry entry;
TreeMap singleAbstract;
Iterator<Map.Entry<String, String>> i = result.entrySet().iterator();
Map.Entry<String, String> entry;
TreeMap<String, String> singleAbstract;
String wordhash;
serverByteBuffer ci;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
if (((String) entry.getKey()).startsWith("indexabstract.")) {
wordhash = ((String) entry.getKey()).substring(14);
entry = i.next();
if (entry.getKey().startsWith("indexabstract.")) {
wordhash = entry.getKey().substring(14);
synchronized (abstractCache) {
singleAbstract = (TreeMap<String, String>) abstractCache.get(wordhash); // a mapping from url-hashes to a string of peer-hashes
if (singleAbstract == null) singleAbstract = new TreeMap();
ci = new serverByteBuffer(((String) entry.getValue()).getBytes());
if (singleAbstract == null) singleAbstract = new TreeMap<String, String>();
ci = new serverByteBuffer(entry.getValue().getBytes());
//System.out.println("DEBUG-ABSTRACTFETCH: for word hash " + wordhash + " received " + ci.toString());
indexContainer.decompressIndex(singleAbstract, ci, target.hash);
abstractCache.put(wordhash, singleAbstract);
@ -621,7 +622,7 @@ public final class yacyClient {
return urls;
}
public static HashMap permissionMessage(String targetHash) {
public static HashMap<String, String> permissionMessage(String targetHash) {
// ask for allowed message size and attachment size
// if this replies null, the peer does not answer
if (yacyCore.seedDB == null || yacyCore.seedDB.mySeed() == null) { return null; }
@ -651,7 +652,7 @@ public final class yacyClient {
}
}
public static HashMap postMessage(String targetHash, String subject, byte[] message) {
public static HashMap<String, String> postMessage(String targetHash, String subject, byte[] message) {
// this posts a message to the remote message board
// prepare request
@ -699,7 +700,7 @@ public final class yacyClient {
return address;
}
public static HashMap transferPermission(String targetAddress, long filesize, String filename) {
public static HashMap<String, String> transferPermission(String targetAddress, long filesize, String filename) {
// prepare request
final serverObjects post = yacyNetwork.basicRequestPost(plasmaSwitchboard.getSwitchboard(), null);
@ -731,7 +732,7 @@ public final class yacyClient {
}
}
public static HashMap transferStore(String targetAddress, String access, String filename, byte[] file) {
public static HashMap<String, String> transferStore(String targetAddress, String access, String filename, byte[] file) {
// prepare request
final serverObjects post = yacyNetwork.basicRequestPost(plasmaSwitchboard.getSwitchboard(), null);
@ -741,7 +742,7 @@ public final class yacyClient {
post.put("filesize", Long.toString(file.length));
post.put("md5", serverCodings.encodeMD5Hex(file));
post.put("access", access);
HashMap files = new HashMap();
HashMap<String, byte[]> files = new HashMap<String, byte[]>();
files.put("filename", file);
// send request
@ -766,7 +767,7 @@ public final class yacyClient {
}
public static String transfer(String targetAddress, String filename, byte[] file) {
HashMap phase1 = transferPermission(targetAddress, file.length, filename);
HashMap<String, String> phase1 = transferPermission(targetAddress, file.length, filename);
if (phase1 == null) return "no connection to remote address " + targetAddress + "; phase 1";
String access = (String) phase1.get("access");
String nextaddress = (String) phase1.get("address");
@ -778,7 +779,7 @@ public final class yacyClient {
if (!(response.equals("ok"))) return "remote peer rejected transfer: " + response;
String accesscode = serverCodings.encodeMD5Hex(kelondroBase64Order.standardCoder.encodeString(access));
if (protocol.equals("http")) {
HashMap phase2 = transferStore(nextaddress, accesscode, filename, file);
HashMap<String, String> phase2 = transferStore(nextaddress, accesscode, filename, file);
if (phase2 == null) return "no connection to remote address " + targetAddress + "; phase 2";
response = (String) phase2.get("response");
if (response == null) return "wrong return values from other peer; phase 2";
@ -848,14 +849,14 @@ public final class yacyClient {
}
}
public static HashMap transferIndex(yacySeed targetSeed, indexContainer[] indexes, HashMap urlCache, boolean gzipBody, int timeout) {
public static HashMap<String, Object> transferIndex(yacySeed targetSeed, indexContainer[] indexes, HashMap<String, indexURLEntry> urlCache, boolean gzipBody, int timeout) {
HashMap resultObj = new HashMap();
HashMap<String, Object> resultObj = new HashMap<String, Object>();
int payloadSize = 0;
try {
// check if we got all necessary urls in the urlCache (only for debugging)
Iterator eenum;
Iterator<indexRWIRowEntry> eenum;
indexRWIEntry entry;
for (int i = 0; i < indexes.length; i++) {
eenum = indexes[i].entries();
@ -879,13 +880,13 @@ public final class yacyClient {
String result = (String) in.get("result");
if (result == null) {
resultObj.put("result","no_result_1");
resultObj.put("result", "no_result_1");
return resultObj;
}
if (!(result.equals("ok"))) {
targetSeed.setFlagAcceptRemoteIndex(false);
yacyCore.seedDB.update(targetSeed.hash, targetSeed);
resultObj.put("result",result);
resultObj.put("result", result);
return resultObj;
}
@ -938,7 +939,7 @@ public final class yacyClient {
}
}
private static HashMap transferRWI(yacySeed targetSeed, indexContainer[] indexes, boolean gzipBody, int timeout) {
private static HashMap<String, String> transferRWI(yacySeed targetSeed, indexContainer[] indexes, boolean gzipBody, int timeout) {
final String address = targetSeed.getPublicAddress();
if (address == null) { return null; }
@ -953,7 +954,7 @@ public final class yacyClient {
int indexcount = 0;
final StringBuffer entrypost = new StringBuffer(indexes.length*73);
Iterator eenum;
Iterator<indexRWIRowEntry> eenum;
indexRWIEntry entry;
for (int i = 0; i < indexes.length; i++) {
eenum = indexes[i].entries();
@ -968,7 +969,7 @@ public final class yacyClient {
if (indexcount == 0) {
// nothing to do but everything ok
final HashMap result = new HashMap(2);
final HashMap<String, String> result = new HashMap<String, String>(2);
result.put("result", "ok");
result.put("unknownURL", "");
return result;

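The yacyClient signatures now advertise HashMap<String, String> (and HashMap<String, Object> for transferIndex), so callers lose their casts. A sketch of consuming the transfer handshake under the new types; the wrapper class is invented and the address and filename are placeholders:

import java.util.HashMap;
import de.anomic.yacy.yacyClient;

public class TransferExample {
    public static String send(String targetAddress, String filename, byte[] file) {
        HashMap<String, String> phase1 = yacyClient.transferPermission(targetAddress, file.length, filename);
        if (phase1 == null) return "no connection to " + targetAddress + "; phase 1";
        // previously: (String) phase1.get("access") etc.
        String access = phase1.get("access");
        String address = phase1.get("address");
        return "access=" + access + " via " + address;
    }
}
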
@ -891,7 +891,7 @@ public class yacyURL {
}
public int hashCode() {
return this.toString().hashCode();
return this.hash().hashCode();
}
public int compareTo(Object h) {

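This one-liner matters for the whole commit: now that yacyURL serves as a HashMap and TreeMap key throughout, hashCode() is derived from the YaCy url hash rather than the string form, keeping it aligned with the hash-based comparison the class presumably uses for equality. A small sketch of the property this buys; the urls are placeholders:

import java.net.MalformedURLException;
import de.anomic.yacy.yacyURL;

public class HashConsistency {
    public static void main(String[] args) throws MalformedURLException {
        yacyURL a = new yacyURL("http://example.org/index.html", null);
        yacyURL b = new yacyURL("http://example.org/index.html", null);
        // equal url hashes now imply equal hashCode values
        System.out.println(a.hash().equals(b.hash()) && a.hashCode() == b.hashCode());
    }
}
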
@ -29,7 +29,6 @@ package de.anomic.yacy;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
@ -298,17 +297,13 @@ public final class yacyVersion implements Comparator<yacyVersion>, Comparable<ya
}
// analyse links in scraper resource, and find link to latest release in it
Map<String, String> anchors = scraper.getAnchors(); // a url (String) / name (String) relation
Iterator<String> i = anchors.keySet().iterator();
Map<yacyURL, String> anchors = scraper.getAnchors(); // a url (yacyURL) / name (String) relation
Iterator<yacyURL> i = anchors.keySet().iterator();
TreeSet<yacyVersion> devreleases = new TreeSet<yacyVersion>();
TreeSet<yacyVersion> mainreleases = new TreeSet<yacyVersion>();
yacyVersion release;
while (i.hasNext()) {
try {
url = new yacyURL((String) i.next(), null);
} catch (MalformedURLException e1) {
continue; // just ignore invalid urls
}
url = i.next();
try {
release = new yacyVersion(url);
//System.out.println("r " + release.toAnchor());
