Changed the search-result heuristic (also used in greedy learning mode) to use the outbound links of the result if the result is a full index document; otherwise it falls back to the default loader method (sketched below).

- The above revealed that the parser's start URL parameter, declared as AnchorURL, only uses methods of the parent class DigestURL (changed the parameter declaration accordingly).
pull/44/head
reger 9 years ago
parent caf9e98f09
commit 06d0e2aeb9
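
To make the new behaviour concrete, here is a minimal sketch of the decision path taken by the rewritten Switchboard.heuristicSearchResults, condensed from the diff further down. The URIMetadataNode.getLinks call, the loader.loadLinks signature and the same-host check are taken from that diff; the helper class and method name are hypothetical, the committed code runs this logic inside an anonymous Thread, and the import paths are assumed from the YaCy source tree of that era. If the result document already carries its outbound links in the index metadata, those are used directly; only otherwise is the page fetched through the default loader and its links parsed:

    // Hypothetical helper for illustration only; not the exact committed code.
    import java.net.MalformedURLException;
    import java.util.HashSet;
    import java.util.Iterator;
    import java.util.Map;
    import java.util.Set;

    import net.yacy.cora.document.id.AnchorURL;
    import net.yacy.cora.document.id.DigestURL;
    import net.yacy.cora.federate.yacy.CacheStrategy;
    import net.yacy.cora.protocol.ClientIdentification;
    import net.yacy.kelondro.data.meta.URIMetadataNode;
    import net.yacy.repository.Blacklist.BlacklistType;
    import net.yacy.repository.LoaderDispatcher;

    final class HeuristicSketch {

        /** Collect the crawler candidates for one search result. */
        static Set<DigestURL> outboundLinks(final URIMetadataNode resulturl, final LoaderDispatcher loader) {
            final DigestURL startUrl = resulturl.url();
            final Set<DigestURL> urls = new HashSet<DigestURL>();

            // 1) A full index document already lists its outbound links in the metadata.
            final Iterator<String> outlinks = URIMetadataNode.getLinks(resulturl, false);
            while (outlinks.hasNext()) {
                try {
                    urls.add(new DigestURL(outlinks.next()));
                } catch (final MalformedURLException e) { /* skip malformed links */ }
            }
            if (!urls.isEmpty()) return urls;

            // 2) Otherwise use the default loader method: fetch the page and extract its links.
            try {
                final Map<AnchorURL, String> links = loader.loadLinks(startUrl,
                        CacheStrategy.IFFRESH, BlacklistType.SEARCH,
                        ClientIdentification.yacyIntranetCrawlerAgent, 0);
                if (links != null && links.size() < 1000) { // limit to 1000 to skip large index pages
                    for (final AnchorURL url : links.keySet()) {
                        final boolean islocal = (url.getHost() == null && startUrl.getHost() == null)
                                || (url.getHost() != null && startUrl.getHost() != null
                                    && url.getHost().contentEquals(startUrl.getHost()));
                        if (!islocal) urls.add(url); // only external links go to the crawler
                    }
                }
            } catch (final Throwable e) { /* give up silently, as the committed code does */ }
            return urls;
        }
    }

The point of the change is that a rich index document spares a second network round trip: the crawler candidates come straight from the stored metadata, and the loader is only consulted for thin results.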

@ -143,7 +143,7 @@ public class yacysearchitem {
if (authenticated) { // only needed if authorized
boolean bookmarkexists;
// check url exists in bookkmarks
bookmarkexists = sb.bookmarksDB.getBookmark(urlhash) != null;
bookmarkexists = sb.bookmarksDB.getBookmark(urlhash) != null;
prop.put("content_authorized_bookmark", !bookmarkexists);
// bookmark icon check for YMarks
//prop.put("content_authorized_bookmark", sb.tables.bookmarks.hasBookmark("admin", urlhash) ? "0" : "1");
@ -188,7 +188,6 @@ public class yacysearchitem {
prop.putXML("content_link", resultUrlstring); // putXML for rss
}
// prop.putHTML("content_value", Interaction.TripleGet(result.urlstring(), "http://virtual.x/hasvalue", "anonymous"));
// END interaction
boolean isAtomFeed = header.get(HeaderFramework.CONNECTION_PROP_EXT, "").equals("atom");
@ -303,7 +302,7 @@ public class yacysearchitem {
boolean stealthmode = p2pmode && theSearch.query.isLocal();
if ((sb.getConfigBool(SwitchboardConstants.HEURISTIC_SEARCHRESULTS, false) ||
(sb.getConfigBool(SwitchboardConstants.GREEDYLEARNING_ACTIVE, false) && sb.getConfigBool(SwitchboardConstants.GREEDYLEARNING_ENABLED, false) && Memory.load() < 1.0)) &&
!stealthmode) sb.heuristicSearchResults(resultUrlstring);
!stealthmode) sb.heuristicSearchResults(result);
theSearch.query.transmitcount = item + 1;
return prop;
}

@ -859,7 +859,7 @@ public class Response {
final String supportError = TextParser.supports(url(), this.responseHeader == null ? null : this.responseHeader.getContentType());
if (supportError != null) throw new Parser.Failure("no parser support:" + supportError, url());
try {
return TextParser.parseSource(new AnchorURL(url()), this.responseHeader == null ? null : this.responseHeader.getContentType(), this.responseHeader == null ? StandardCharsets.UTF_8.name() : this.responseHeader.getCharacterEncoding(), new VocabularyScraper(), this.request.timezoneOffset(), this.request.depth(), this.content);
return TextParser.parseSource(url(), this.responseHeader == null ? null : this.responseHeader.getContentType(), this.responseHeader == null ? StandardCharsets.UTF_8.name() : this.responseHeader.getCharacterEncoding(), new VocabularyScraper(), this.request.timezoneOffset(), this.request.depth(), this.content);
} catch (final Exception e) {
return null;
}

@ -26,7 +26,7 @@ package net.yacy.document;
import java.io.InputStream;
import java.util.Set;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
public interface Parser {
@ -55,7 +55,7 @@ public interface Parser {
* @throws InterruptedException
*/
public Document[] parse(
AnchorURL url,
DigestURL url,
String mimeType,
String charset,
VocabularyScraper scraper,

@ -34,7 +34,7 @@ import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.util.CommonPattern;
import net.yacy.document.parser.apkParser;
@ -161,7 +161,7 @@ public final class TextParser {
}
public static Document[] parseSource(
final AnchorURL location,
final DigestURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
@ -194,7 +194,7 @@ public final class TextParser {
}
public static Document[] parseSource(
final AnchorURL location,
final DigestURL location,
String mimeType,
final String charset,
final VocabularyScraper scraper,
@ -220,7 +220,7 @@ public final class TextParser {
}
public static Document[] parseSource(
final AnchorURL location,
final DigestURL location,
String mimeType,
final String charset,
final VocabularyScraper scraper,
@ -262,7 +262,7 @@ public final class TextParser {
}
private static Document[] parseSource(
final AnchorURL location,
final DigestURL location,
final String mimeType,
final Parser parser,
final String charset,
@ -285,7 +285,7 @@ public final class TextParser {
}
private static Document[] parseSource(
final AnchorURL location,
final DigestURL location,
final String mimeType,
final Set<Parser> parsers,
final String charset,

@ -40,6 +40,7 @@ import java.util.jar.JarEntry;
import java.util.jar.JarFile;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
@ -56,7 +57,7 @@ public class apkParser extends AbstractParser implements Parser {
@Override
public Document[] parse(
final AnchorURL location,
final DigestURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
@ -91,7 +92,7 @@ public class apkParser extends AbstractParser implements Parser {
return docs;
}
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final JarFile jf) {
public Document[] parse(final DigestURL location, final String mimeType, final String charset, final JarFile jf) {
StringBuilder sb = new StringBuilder();
String title = location.getFileName();
AndroidManifestParser manifest = null;
@ -142,11 +143,11 @@ public class apkParser extends AbstractParser implements Parser {
null,
null,
singleList(title),
"",
null,
manifest == null ? "" : manifest.packageName,
null,
null,
0.0f, 0.0f,
0.0d, 0.0d,
sb.toString(),
links,
null,

@ -35,7 +35,7 @@ import java.util.HashSet;
import java.util.List;
import java.util.Set;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser;
@ -71,7 +71,7 @@ public class audioTagParser extends AbstractParser implements Parser {
@Override
public Document[] parse(
final AnchorURL location,
final DigestURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
@ -172,7 +172,7 @@ public class audioTagParser extends AbstractParser implements Parser {
location.getHost(), // publisher
null, // sections
descriptions, // abstrct
0.0f, 0.0f, // lon, lat
0.0d, 0.0d, // lon, lat
text.toString(), // text
null,
null,
@ -191,11 +191,11 @@ public class audioTagParser extends AbstractParser implements Parser {
null,
null,
singleList(filename), // title
"", // author
null, // author
location.getHost(),
null,
null,
0.0f, 0.0f,
0.0d, 0.0d,
location.toTokens(),
null,
null,

@ -6,7 +6,6 @@ import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.data.ymark.YMarkUtil;
@ -39,7 +38,7 @@ public class AugmentParser extends AbstractParser implements Parser {
@Override
public Document[] parse(
final AnchorURL location,
final DigestURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,

@ -32,7 +32,6 @@ import java.io.FileOutputStream;
import java.io.InputStream;
import java.util.Date;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.document.AbstractParser;
@ -65,7 +64,7 @@ public class bzipParser extends AbstractParser implements Parser {
@Override
public Document[] parse(
final AnchorURL location,
final DigestURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,

@ -33,7 +33,7 @@ import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.CommonPattern;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
@ -54,7 +54,7 @@ public class csvParser extends AbstractParser implements Parser {
@Override
public Document[] parse(
final AnchorURL location,
final DigestURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
@ -77,11 +77,11 @@ public class csvParser extends AbstractParser implements Parser {
null,
null,
singleList(concatRow(table.get(0))),
"",
null,
"",
null,
null,
0.0f, 0.0f,
0.0d, 0.0d,
sb.toString(),
null,
null,

@ -32,7 +32,7 @@ import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.CommonPattern;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
@ -60,7 +60,7 @@ public class docParser extends AbstractParser implements Parser {
@SuppressWarnings("deprecation")
@Override
public Document[] parse(
final AnchorURL location,
final DigestURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
@ -121,7 +121,7 @@ public class docParser extends AbstractParser implements Parser {
extractor.getDocSummaryInformation().getCompany(), // publisher
null,
descriptions,
0.0f, 0.0f,
0.0d, 0.0d,
contents.toString(),
null,
null,

@ -25,7 +25,7 @@ package net.yacy.document.parser;
import java.io.InputStream;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@ -62,7 +62,7 @@ public class dwgParser extends AbstractParser implements Parser {
@Override
public Document[] parse(
final AnchorURL location,
final DigestURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,

@ -25,9 +25,8 @@
package net.yacy.document.parser;
import java.io.InputStream;
import java.util.Date;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
@ -47,7 +46,7 @@ public class genericParser extends AbstractParser implements Parser {
@Override
public Document[] parse(
final AnchorURL location,
final DigestURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
@ -63,17 +62,17 @@ public class genericParser extends AbstractParser implements Parser {
null,
null,
singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title
"", // author
null, // author
location.getHost(),
null,
null,
0.0f, 0.0f,
0.0d, 0.0d,
location.toTokens(),
null,
null,
null,
false,
new Date())};
null)};
return docs;
}
}

@ -33,7 +33,6 @@ import java.io.InputStream;
import java.util.Date;
import java.util.zip.GZIPInputStream;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.document.AbstractParser;
@ -64,7 +63,7 @@ public class gzipParser extends AbstractParser implements Parser {
@Override
public Document[] parse(
final AnchorURL location,
final DigestURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,

@ -39,7 +39,6 @@ import java.util.LinkedHashMap;
import java.util.Set;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.protocol.ClientIdentification;
@ -88,7 +87,7 @@ public class htmlParser extends AbstractParser implements Parser {
@Override
public Document[] parse(
final AnchorURL location,
final DigestURL location,
final String mimeType,
final String documentCharset,
final VocabularyScraper vocscraper,
@ -382,9 +381,9 @@ public class htmlParser extends AbstractParser implements Parser {
public static void main(final String[] args) {
// test parsing of a url
AnchorURL url;
DigestURL url;
try {
url = new AnchorURL(args[0]);
url = new DigestURL(args[0]);
final byte[] content = url.get(ClientIdentification.yacyInternetCrawlerAgent, null, null);
final Document[] document = new htmlParser().parse(url, "text/html", StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0, new ByteArrayInputStream(content));
final String title = document[0].dc_title();

@ -84,7 +84,7 @@ public class genericImageParser extends AbstractParser implements Parser {
@Override
public Document[] parse(
final AnchorURL location,
final DigestURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
@ -213,7 +213,7 @@ public class genericImageParser extends AbstractParser implements Parser {
}
private ImageInfo parseJavaImage(
final AnchorURL location,
final DigestURL location,
final InputStream sourceStream) throws Parser.Failure {
BufferedImage image = null;
try {
@ -228,7 +228,7 @@ public class genericImageParser extends AbstractParser implements Parser {
}
private ImageInfo parseJavaImage(
final AnchorURL location,
final DigestURL location,
final BufferedImage image) {
final ImageInfo ii = new ImageInfo(location);
ii.image = image;
@ -265,12 +265,12 @@ public class genericImageParser extends AbstractParser implements Parser {
}
private class ImageInfo {
public AnchorURL location;
public DigestURL location;
public BufferedImage image;
public StringBuilder info;
public int height;
public int width;
public ImageInfo(final AnchorURL location) {
public ImageInfo(final DigestURL location) {
this.location = location;
this.image = null;
this.info = new StringBuilder();

@ -44,7 +44,7 @@ import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
@ -85,7 +85,7 @@ public class metadataImageParser extends AbstractParser implements Parser {
@Override
public Document[] parse(
final AnchorURL location,
final DigestURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
@ -167,7 +167,7 @@ public class metadataImageParser extends AbstractParser implements Parser {
new HashSet<String>(0), // languages
keywords == null ? new String[]{} : keywords.split(keywords.indexOf(',') > 0 ? "," : " "), // keywords
singleList(title), // title
author == null ? "" : author, // author
author == null ? null : author, // author
location.getHost(), // Publisher
null, // sections
descriptions, // description

@ -28,7 +28,6 @@ import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.util.ConcurrentLog;
@ -80,7 +79,7 @@ public class svgParser extends AbstractParser implements Parser {
@Override
public Document[] parse(
final AnchorURL location,
final DigestURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
@ -128,7 +127,7 @@ public class svgParser extends AbstractParser implements Parser {
"",
null,
null,
0.0f, 0.0f,
0.0d, 0.0d,
docDescription, // text - for this image description is best text we have
null,
null,

@ -21,9 +21,8 @@
package net.yacy.document.parser;
import java.io.InputStream;
import java.util.Date;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
@ -60,7 +59,7 @@ public class linkScraperParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(
final AnchorURL location,
final DigestURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
@ -81,17 +80,17 @@ public class linkScraperParser extends AbstractParser implements Parser {
null,
null,
singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title
"", // author
null, // author
location.getHost(),
null,
null,
0.0f, 0.0f,
0.0d, 0.0d,
location.toTokens(),
htmlParserDoc == null ? null : htmlParserDoc.getAnchors(),
htmlParserDoc == null ? null : htmlParserDoc.getRSS(),
htmlParserDoc == null ? null : htmlParserDoc.getImages(),
false,
new Date())};
null)};
return docs;
}
}

@ -36,7 +36,7 @@ import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@ -73,7 +73,7 @@ public class mmParser extends AbstractParser implements Parser {
@Override
public Document[] parse(
final AnchorURL location,
final DigestURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
@ -119,7 +119,7 @@ public class mmParser extends AbstractParser implements Parser {
null,
null,
null,
0.0f, 0.0f,
0.0d, 0.0d,
content,
null,
null,

@ -44,7 +44,6 @@ import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
@ -220,7 +219,7 @@ public class odtParser extends AbstractParser implements Parser {
@Override
public Document[] parse(
final AnchorURL location,
final DigestURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,

@ -44,7 +44,7 @@ import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
@ -93,7 +93,7 @@ public class ooxmlParser extends AbstractParser implements Parser {
return parser;
}
private Document[] parse(final AnchorURL location, final String mimeType, @SuppressWarnings("unused") final String charset, final File dest) throws Parser.Failure, InterruptedException {
private Document[] parse(final DigestURL location, final String mimeType, @SuppressWarnings("unused") final String charset, final File dest) throws Parser.Failure, InterruptedException {
CharBuffer writer = null;
try {
@ -206,7 +206,7 @@ public class ooxmlParser extends AbstractParser implements Parser {
@Override
public Document[] parse(
final AnchorURL location,
final DigestURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,

@ -55,6 +55,7 @@ import org.apache.pdfbox.util.PDFTextStripper;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser;
@ -89,7 +90,7 @@ public class pdfParser extends AbstractParser implements Parser {
@Override
public Document[] parse(
final AnchorURL location,
final DigestURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
@ -253,7 +254,7 @@ public class pdfParser extends AbstractParser implements Parser {
docPublisher,
null,
null,
0.0f, 0.0f,
0.0d, 0.0d,
contentBytes,
pdflinksCombined,
null,

@ -33,7 +33,7 @@ import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser;
@ -65,7 +65,7 @@ public class pptParser extends AbstractParser implements Parser {
*/
@Override
public Document[] parse(
final AnchorURL location,
final DigestURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
@ -114,7 +114,7 @@ public class pptParser extends AbstractParser implements Parser {
pptExtractor.getDocSummaryInformation().getCompany(),
null,
descriptions,
0.0f, 0.0f,
0.0d, 0.0d,
contents,
null,
null,

@ -37,7 +37,6 @@ import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.Date;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
@ -110,7 +109,7 @@ public class psParser extends AbstractParser implements Parser {
null, // languages
null, // keywords
null, // title
"", // author
null, // author
"", // publisher
null, // sections
null, // abstract
@ -259,7 +258,7 @@ public class psParser extends AbstractParser implements Parser {
@Override
public Document[] parse(
final AnchorURL location,
final DigestURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,

@ -30,7 +30,7 @@ import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@ -47,7 +47,7 @@ public class rdfParser extends AbstractParser implements Parser {
@Override
public Document[] parse(
final AnchorURL location,
final DigestURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
@ -65,8 +65,8 @@ public class rdfParser extends AbstractParser implements Parser {
Document doc;
String all = "rdfdatasource";
doc = new Document(location, mimeType, charset, null, null, null, singleList(""), "",
"", null, new ArrayList<String>(0), 0, 0, all, null, null, null, false, new Date());
doc = new Document(location, mimeType, charset, null, null, null, singleList(""), null,
"", null, null, 0, 0, all, null, null, null, false, new Date());
docs.add(doc);

@ -17,7 +17,6 @@ import java.util.Date;
import java.util.HashSet;
import java.util.Set;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser;
@ -49,7 +48,7 @@ public class RDFaParser extends AbstractParser implements Parser {
@Override
public Document[] parse(
final AnchorURL url,
final DigestURL url,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
@ -103,7 +102,7 @@ public class RDFaParser extends AbstractParser implements Parser {
}
private Document[] parseHtml(
final AnchorURL url,
final DigestURL url,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
@ -190,7 +189,7 @@ public class RDFaParser extends AbstractParser implements Parser {
if (aReader != null) {
RDFaParser aParser = new RDFaParser();
try {
aParser.parse(new AnchorURL(args[0]), "", "", new VocabularyScraper(), 0, aURL.openStream());
aParser.parse(new DigestURL(args[0]), "", "", new VocabularyScraper(), 0, aURL.openStream());
} catch (final FileNotFoundException e) {
e.printStackTrace();
} catch (final IOException e) {

@ -37,7 +37,6 @@ import java.util.Set;
import net.yacy.cora.document.feed.Hit;
import net.yacy.cora.document.feed.RSSFeed;
import net.yacy.cora.document.feed.RSSReader;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
@ -60,7 +59,7 @@ public class rssParser extends AbstractParser implements Parser {
@Override
public Document[] parse(
final AnchorURL location,
final DigestURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
@ -77,11 +76,11 @@ public class rssParser extends AbstractParser implements Parser {
final RSSFeed feed = rssReader.getFeed();
//RSSMessage channel = feed.getChannel();
final List<Document> docs = new ArrayList<Document>();
AnchorURL itemuri;
DigestURL itemuri;
Set<String> languages;
Document doc;
for (final Hit item: feed) try {
itemuri = new AnchorURL(item.getLink());
itemuri = new DigestURL(item.getLink());
languages = new HashSet<String>();
languages.add(item.getLanguage());
doc = new Document(

@ -29,12 +29,11 @@ package net.yacy.document.parser;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.Date;
import javax.swing.text.DefaultStyledDocument;
import javax.swing.text.rtf.RTFEditorKit;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@ -55,7 +54,7 @@ public class rtfParser extends AbstractParser implements Parser {
@Override
public Document[] parse(
final AnchorURL location,
final DigestURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
@ -83,17 +82,17 @@ public class rtfParser extends AbstractParser implements Parser {
replaceAll("\n"," ").
replaceAll("\r"," ").
replaceAll("\t"," ")),
"", // TODO: AUTHOR
null, // TODO: AUTHOR
"", // TODO: publisher
null,
null,
0.0f, 0.0f,
0.0d, 0.0d,
bodyText,
null,
null,
null,
false,
new Date())};
null)};
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e;

@ -35,6 +35,7 @@ import java.io.OutputStream;
import java.util.Date;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser;
@ -58,7 +59,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
}
public Document parse(
final AnchorURL location,
final DigestURL location,
final String mimeType,
final String charset,
final int timezoneOffset,
@ -110,7 +111,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
}
public Document parse(
final AnchorURL location,
final DigestURL location,
final String mimeType,
final String charset,
final int timezoneOffset,
@ -120,7 +121,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
@Override
public Document[] parse(
final AnchorURL location,
final DigestURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,

@ -31,7 +31,7 @@ import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@ -59,7 +59,7 @@ public class sidAudioParser extends AbstractParser implements Parser {
@Override
public Document[] parse(
final AnchorURL location,
final DigestURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
@ -97,7 +97,7 @@ public class sidAudioParser extends AbstractParser implements Parser {
header.get("publisher"),
null,
null,
0.0f, 0.0f,
0.0d, 0.0d,
null,
null,
null,

@ -40,7 +40,6 @@ import java.util.zip.GZIPInputStream;
import javax.xml.parsers.DocumentBuilderFactory;
import net.yacy.cora.date.ISO8601Formatter;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
@ -52,7 +51,6 @@ import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.io.ByteCountInputStream;
import org.w3c.dom.CharacterData;
@ -71,7 +69,7 @@ public class sitemapParser extends AbstractParser implements Parser {
@Override
public Document[] parse(
final AnchorURL location,
final DigestURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
@ -94,15 +92,15 @@ public class sitemapParser extends AbstractParser implements Parser {
null,
null,
singleList(""),
null,
"",
"",
null,
new ArrayList<String>(),
0.0f, 0.0f,
null,
0.0d, 0.0d,
null,
null,
null,
null,
new LinkedHashMap<DigestURL, ImageEntry>(),
false,
new Date());
docs.add(doc);

@ -31,7 +31,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@ -56,7 +56,7 @@ public class swfParser extends AbstractParser implements Parser {
*/
@Override
public Document[] parse(
final AnchorURL location,
final DigestURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,

@ -34,6 +34,7 @@ import java.util.zip.GZIPInputStream;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
@ -65,7 +66,7 @@ public class tarParser extends AbstractParser implements Parser {
@Override
public Document[] parse(
final AnchorURL location,
final DigestURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,

@ -34,7 +34,7 @@ import java.util.List;
import java.util.Map;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Condenser;
@ -59,7 +59,7 @@ public class torrentParser extends AbstractParser implements Parser {
@Override
public Document[] parse(
final AnchorURL location,
final DigestURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
@ -114,7 +114,7 @@ public class torrentParser extends AbstractParser implements Parser {
location.getHost(),
null,
null,
0.0f, 0.0f,
0.0d, 0.0d,
filenames.toString(),
null,
null,
@ -127,7 +127,7 @@ public class torrentParser extends AbstractParser implements Parser {
try {
byte[] b = FileUtils.read(new File(args[0]));
torrentParser parser = new torrentParser();
Document[] d = parser.parse(new AnchorURL("http://localhost/test.torrent"), null, StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0, new ByteArrayInputStream(b));
Document[] d = parser.parse(new DigestURL("http://localhost/test.torrent"), null, StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0, new ByteArrayInputStream(b));
Condenser c = new Condenser(d[0], null, true, true, LibraryProvider.dymLib, false, false, 0);
Map<String, Word> w = c.words();
for (Map.Entry<String, Word> e: w.entrySet()) System.out.println("Word: " + e.getKey() + " - " + e.getValue().posInText);

@ -43,6 +43,7 @@ import java.util.List;
import net.yacy.cora.date.ISO8601Formatter;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.util.CommonPattern;
import net.yacy.document.AbstractParser;
@ -69,7 +70,7 @@ public class vcfParser extends AbstractParser implements Parser {
@Override
public Document[] parse(
final AnchorURL location,
final DigestURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
@ -235,11 +236,11 @@ public class vcfParser extends AbstractParser implements Parser {
null, // set of languages
null, // a list of extracted keywords
singleList(parsedTitle.toString()), // a long document title
"", // TODO: AUTHOR
null, // TODO: AUTHOR
"", // the publisher
sections, // an array of section headlines
descriptions, // an abstract
0.0f, 0.0f,
0.0d, 0.0d,
text, // the parsed document text
anchors, // a map of extracted anchors
null,

@ -33,7 +33,7 @@ import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
@ -69,7 +69,7 @@ public class vsdParser extends AbstractParser implements Parser {
*/
@Override
public Document[] parse(
final AnchorURL location,
final DigestURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,

@ -31,7 +31,6 @@ import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.Date;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser;
@ -70,7 +69,7 @@ public class xlsParser extends AbstractParser implements Parser {
*/
@Override
public Document[] parse(
final AnchorURL location,
final DigestURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
@ -129,11 +128,11 @@ public class xlsParser extends AbstractParser implements Parser {
null,
null,
singleList(location.getFile()),
"", // TODO: AUTHOR
null, // TODO: AUTHOR
"", // TODO: publisher
null,
null,
0.0f, 0.0f,
0.0d, 0.0d,
contents,
null,
null,

@ -67,7 +67,7 @@ public class zipParser extends AbstractParser implements Parser {
@Override
public Document[] parse(
final AnchorURL location,
final DigestURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,

@ -560,7 +560,7 @@ public final class LoaderDispatcher {
* @return a map from URLs to the anchor texts of the urls
* @throws IOException
*/
public final Map<AnchorURL, String> loadLinks(final AnchorURL url, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final ClientIdentification.Agent agent, final int timezoneOffset) throws IOException {
public final Map<AnchorURL, String> loadLinks(final DigestURL url, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final ClientIdentification.Agent agent, final int timezoneOffset) throws IOException {
final Response response = load(request(url, true, false), cacheStrategy, Integer.MAX_VALUE, blacklistType, agent);
if (response == null) throw new IOException("response == null");
final ResponseHeader responseHeader = response.getResponseHeader();

@ -3755,42 +3755,57 @@ public final class Switchboard extends serverSwitch {
}.start();
}
public final void heuristicSearchResults(final String url) {
/**
* Get the outbound links of the result and add each unique link to crawler queue
* Is input resulturl a full index document with outboundlinks these will be used
* otherwise url is loaded and links are extracted/parsed
*
* @param resulturl the result doc which outbound links to add to crawler
*/
public final void heuristicSearchResults(final URIMetadataNode resulturl) {
new Thread() {
@Override
public void run() {
// get the links for a specific site
final AnchorURL startUrl;
try {
startUrl = new AnchorURL(url);
} catch (final MalformedURLException e) {
ConcurrentLog.logException(e);
return;
}
final DigestURL startUrl = resulturl.url();
// result might be rich metadata, try to get outbound links directly from result
Set<DigestURL> urls;
Iterator<String> outlinkit = URIMetadataNode.getLinks(resulturl, false);
if (outlinkit.hasNext()) {
urls = new HashSet<DigestURL>();
while (outlinkit.hasNext()) {
try {
urls.add(new DigestURL(outlinkit.next()));
} catch (MalformedURLException ex) { }
}
} else { // otherwise get links from loader
urls = null;
final Map<AnchorURL, String> links;
DigestURL url;
try {
links = Switchboard.this.loader.loadLinks(startUrl, CacheStrategy.IFFRESH, BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent, 0);
if (links != null) {
if (links.size() < 1000) { // limit to 1000 to skip large index pages
final Iterator<AnchorURL> i = links.keySet().iterator();
final boolean globalcrawljob = Switchboard.this.getConfigBool(SwitchboardConstants.HEURISTIC_SEARCHRESULTS_CRAWLGLOBAL,false);
Collection<DigestURL> urls = new ArrayList<DigestURL>();
while (i.hasNext()) {
url = i.next();
boolean islocal = (url.getHost() == null && startUrl.getHost() == null) || (url.getHost() != null && startUrl.getHost() != null && url.getHost().contentEquals(startUrl.getHost()));
// add all external links or links to different page to crawler
if ( !islocal ) {// || (!startUrl.getPath().endsWith(url.getPath()))) {
urls.add(url);
try {
final Map<AnchorURL, String> links;
links = Switchboard.this.loader.loadLinks(startUrl, CacheStrategy.IFFRESH, BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent, 0);
if (links != null) {
if (links.size() < 1000) { // limit to 1000 to skip large index pages
final Iterator<AnchorURL> i = links.keySet().iterator();
if (urls == null) urls = new HashSet<DigestURL>();
while (i.hasNext()) {
DigestURL url = i.next();
boolean islocal = (url.getHost() == null && startUrl.getHost() == null) || (url.getHost() != null && startUrl.getHost() != null && url.getHost().contentEquals(startUrl.getHost()));
// add all external links or links to different page to crawler
if ( !islocal ) {// || (!startUrl.getPath().endsWith(url.getPath()))) {
urls.add(url);
}
}
}
addToCrawler(urls, globalcrawljob);
}
}
} catch (final Throwable e) {
} catch (final Throwable e) { }
}
if (urls != null && urls.size() > 0) {
final boolean globalcrawljob = Switchboard.this.getConfigBool(SwitchboardConstants.HEURISTIC_SEARCHRESULTS_CRAWLGLOBAL,false);
addToCrawler(urls, globalcrawljob);
}
}
}.start();
