diff --git a/source/net/yacy/document/AbstractParser.java b/source/net/yacy/document/AbstractParser.java index 15f300d7b..daf4a91a2 100644 --- a/source/net/yacy/document/AbstractParser.java +++ b/source/net/yacy/document/AbstractParser.java @@ -39,7 +39,8 @@ public abstract class AbstractParser implements Parser { protected final Set SUPPORTED_MIME_TYPES = new LinkedHashSet(); protected final Set SUPPORTED_EXTENSIONS = new HashSet(); private final String name; - + protected Object scraperObject; // used scraper or source object if any, otherwise null + /** * initialize a parser with a name * @param name diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index 75b56606f..419dcb625 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -92,13 +92,13 @@ public class Document { private final Set languages; private boolean indexingDenied; private final double lon, lat; - private final Object parserObject; // the source object that was used to create the Document + private final Parser parserObject; // the source object that was used to create the Document private final Map> generic_facets; // a map from vocabulary names to the set of tags for that vocabulary which apply for this document private final Date lastModified; private int crawldepth; public Document(final DigestURL location, final String mimeType, final String charset, - final Object parserObject, + final Parser parserObject, final Set languages, final String[] keywords, final List titles, @@ -160,11 +160,29 @@ public class Document { if (contentDomain != ContentDomain.ALL) return contentDomain; return this.dc_source().getContentDomainFromExt(); } - - public Object getParserObject() { + + /** + * The parser used to generate the document + * @return Parser + */ + public Parser getParserObject() { return this.parserObject; } + /** + * Convenient call to get the source/scraper object of the underlying parser + * if the parser 
uses a scraper, like htmlParser + * @return scraper object typically of type ContentScraper but may also be of type DCEntry + */ + public Object getScraperObject() { + if (this.parserObject instanceof AbstractParser) { + if (((AbstractParser) this.parserObject).scraperObject != null) { + return ((AbstractParser) this.parserObject).scraperObject; + } + } + return null; + } + public Set getContentLanguages() { return this.languages; } @@ -931,9 +949,9 @@ dc_rights // clean up parser data for (final Document doc: docs) { - Object parserObject = doc.getParserObject(); - if (parserObject instanceof ContentScraper) { - final ContentScraper html = (ContentScraper) parserObject; + Object scraper = doc.getScraperObject(); + if (scraper instanceof ContentScraper) { + final ContentScraper html = (ContentScraper) scraper; html.close(); } } @@ -979,9 +997,9 @@ dc_rights if (!entry.getKey().attachedNofollow()) result.put(entry.getKey(), entry.getValue()); } } - final Object parser = d.getParserObject(); - if (parser instanceof ContentScraper) { - final ContentScraper html = (ContentScraper) parser; + final Object scraper = d.getScraperObject(); + if (scraper instanceof ContentScraper) { + final ContentScraper html = (ContentScraper) scraper; String refresh = html.getRefreshPath(); if (refresh != null && refresh.length() > 0) try {result.put(new AnchorURL(refresh), "refresh");} catch (final MalformedURLException e) {} AnchorURL canonical = html.getCanonical(); diff --git a/source/net/yacy/document/content/DCEntry.java b/source/net/yacy/document/content/DCEntry.java index f60af57d3..8d54f93c3 100644 --- a/source/net/yacy/document/content/DCEntry.java +++ b/source/net/yacy/document/content/DCEntry.java @@ -45,6 +45,8 @@ import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.util.CommonPattern; import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.Document; +import net.yacy.document.parser.genericParser; +import net.yacy.search.schema.CollectionSchema; public 
class DCEntry extends MultiMapSolrParams { @@ -330,11 +332,15 @@ public class DCEntry extends MultiMapSolrParams { languages.add(getLanguage()); List t = new ArrayList(1); t.add(getTitle()); + + // for processing during indexing, embed entry as source scraperObject in a standard parserobj object + genericParser parserobj = new genericParser(this); // init the simplest parser with DCEntry as source/scraperObject used during indexing + return new Document( getIdentifier(true), "text/html", StandardCharsets.UTF_8.name(), - this, + parserobj, languages, getSubject(), // might be null t, @@ -343,7 +349,7 @@ public class DCEntry extends MultiMapSolrParams { null, getDescriptions(), getLon(), getLat(), - get("text_t", ""), + get(CollectionSchema.text_t.name(), ""), null, null, null, diff --git a/source/net/yacy/document/parser/genericParser.java b/source/net/yacy/document/parser/genericParser.java index 0d6d64d6b..1d9f2d1c8 100644 --- a/source/net/yacy/document/parser/genericParser.java +++ b/source/net/yacy/document/parser/genericParser.java @@ -44,6 +44,17 @@ public class genericParser extends AbstractParser implements Parser { // this parser is used if no other fits. 
This parser fits all } + /** + * Constructor to allow to set a scraperObject + * because it is desired to keep the scraper/source object protected + * This is used for surrogate parsers to set another source/scraper than ContentScraper + * @param scraper + */ + public genericParser(Object scraper) { + super("Generic Parser"); + this.scraperObject = scraper; + } + @Override public Document[] parse( final DigestURL location, diff --git a/source/net/yacy/document/parser/htmlParser.java b/source/net/yacy/document/parser/htmlParser.java index f668d3001..b293404bb 100644 --- a/source/net/yacy/document/parser/htmlParser.java +++ b/source/net/yacy/document/parser/htmlParser.java @@ -34,9 +34,7 @@ import java.nio.charset.Charset; import java.nio.charset.IllegalCharsetNameException; import java.nio.charset.StandardCharsets; import java.nio.charset.UnsupportedCharsetException; -import java.util.HashSet; import java.util.LinkedHashMap; -import java.util.Set; import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.id.DigestURL; @@ -60,21 +58,29 @@ public class htmlParser extends AbstractParser implements Parser { private static final int maxLinks = 10000; - public final static String[] htmlExtensions = new String[]{ - "htm","html","shtml","shtm","stm","xhtml","phtml","phtm", - "tpl","php","php2","php3","php4","php5","cfm","asp","aspx","tex","txt","msg" - }; - - public final static Set htmlExtensionsSet; - - static { - htmlExtensionsSet = new HashSet<>(htmlExtensions.length); - for (String ext: htmlExtensions) htmlExtensionsSet.add(ext); - } - public htmlParser() { super("Streaming HTML Parser"); - this.SUPPORTED_EXTENSIONS.addAll(htmlExtensionsSet); + this.SUPPORTED_EXTENSIONS.add("htm"); + this.SUPPORTED_EXTENSIONS.add("html"); + this.SUPPORTED_EXTENSIONS.add("shtml"); + this.SUPPORTED_EXTENSIONS.add("shtm"); + this.SUPPORTED_EXTENSIONS.add("stm"); + this.SUPPORTED_EXTENSIONS.add("xhtml"); + this.SUPPORTED_EXTENSIONS.add("phtml"); + 
this.SUPPORTED_EXTENSIONS.add("phtm"); + this.SUPPORTED_EXTENSIONS.add("tpl"); + this.SUPPORTED_EXTENSIONS.add("php"); + this.SUPPORTED_EXTENSIONS.add("php2"); + this.SUPPORTED_EXTENSIONS.add("php3"); + this.SUPPORTED_EXTENSIONS.add("php4"); + this.SUPPORTED_EXTENSIONS.add("php5"); + this.SUPPORTED_EXTENSIONS.add("cfm"); + this.SUPPORTED_EXTENSIONS.add("asp"); + this.SUPPORTED_EXTENSIONS.add("aspx"); + this.SUPPORTED_EXTENSIONS.add("tex"); + this.SUPPORTED_EXTENSIONS.add("txt"); + this.SUPPORTED_EXTENSIONS.add("msg"); + this.SUPPORTED_MIME_TYPES.add("text/html"); this.SUPPORTED_MIME_TYPES.add("text/xhtml+xml"); this.SUPPORTED_MIME_TYPES.add("application/xhtml+xml"); @@ -97,7 +103,8 @@ public class htmlParser extends AbstractParser implements Parser { try { // first get a document from the parsed html Charset[] detectedcharsetcontainer = new Charset[]{null}; - final ContentScraper scraper = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks); + scraperObject = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks); + ContentScraper scraper = (ContentScraper)scraperObject; // shortcut to access ContentScraper methods // parseToScraper also detects/corrects/sets charset from html content tag final Document document = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraper); Document documentSnapshot = null; @@ -130,7 +137,7 @@ public class htmlParser extends AbstractParser implements Parser { * @param scraper * @return */ - private static Document transformScraper(final DigestURL location, final String mimeType, final String charSet, final ContentScraper scraper) { + private Document transformScraper(final DigestURL location, final String mimeType, final String charSet, final ContentScraper scraper) { final String[] sections = new String[ scraper.getHeadlines(1).length + scraper.getHeadlines(2).length + @@ 
-150,7 +157,7 @@ public class htmlParser extends AbstractParser implements Parser { location, mimeType, charSet, - scraper, + this, scraper.getContentLanguages(), scraper.getKeywords(), scraper.getTitles(), @@ -178,7 +185,7 @@ public class htmlParser extends AbstractParser implements Parser { } catch (UnsupportedEncodingException e) { sourceStream = new ByteArrayInputStream(UTF8.getBytes(input)); } - ContentScraper scraper; + ContentScraper scraper; // for this static method no need to init local this.scraperObject try { scraper = parseToScraper(location, documentCharset, vocabularyScraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks); } catch (Failure e) { @@ -242,6 +249,7 @@ public class htmlParser extends AbstractParser implements Parser { } // parsing the content + // for this static method no need to init local this.scraperObject here final ContentScraper scraper = new ContentScraper(location, maxLinks, vocabularyScraper, timezoneOffset); final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false, Math.max(64, Math.min(4096, sourceStream.available()))); try { diff --git a/source/net/yacy/document/parser/swfParser.java b/source/net/yacy/document/parser/swfParser.java index 81bd0473d..f0c7a163f 100644 --- a/source/net/yacy/document/parser/swfParser.java +++ b/source/net/yacy/document/parser/swfParser.java @@ -68,10 +68,9 @@ public class swfParser extends AbstractParser implements Parser { try { final SWF2HTML swf2html = new SWF2HTML(); String contents = ""; - ContentScraper htmlscraper=null; try { - contents = swf2html.convertSWFToHTML(source); - htmlscraper = htmlParser.parseToScraper(location, charset, scraper, timezoneOffset, contents, 100); + contents = swf2html.convertSWFToHTML(source); + scraperObject = htmlParser.parseToScraper(location, charset, scraper, timezoneOffset, contents, 100); } catch (final NegativeArraySizeException e) { throw new Parser.Failure(e.getMessage(), location); } catch (final 
IOException e) { @@ -79,29 +78,9 @@ public class swfParser extends AbstractParser implements Parser { } catch (final Exception e) { throw new Parser.Failure(e.getMessage(), location); } - /* - String url = null; - String urlnr = null; - final String linebreak = System.getProperty("line.separator"); - final List anchors = new ArrayList(); - int urls = 0; - int urlStart = -1; - int urlEnd = 0; - int p0 = 0; - //extracting urls - while ((urlStart = contents.indexOf("http://",urlEnd)) >= 0){ - urlEnd = contents.indexOf(linebreak,urlStart); - url = contents.substring(urlStart,urlEnd); - urlnr = Integer.toString(++urls); - AnchorURL u = new AnchorURL(url); - u.setNameProperty(urlnr); - anchors.add(u); - contents = contents.substring(0,urlStart)+contents.substring(urlEnd); - } - */ - - // As the result of parsing this function must return a plasmaParserDocument object + // As the result of parsing this function must return a plasmaParserDocument object + ContentScraper htmlscraper = (ContentScraper) this.scraperObject; // shortcut to access ContentScraper methods return new Document[]{new Document( location, // url of the source document mimeType, // the documents mime type diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index 6814f8737..be50a96bc 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -601,11 +601,14 @@ public class Segment { crawlProfile != null && document.getDepth() <= crawlProfile.snapshotMaxdepth() && !crawlProfile.snapshotsMustnotmatch().matcher(urlNormalform).matches()) { // load pdf in case that is wanted. 
This can later be used to compute a web page preview in the search results - String ext = MultiProtocolURL.getFileExtension(url.getFile()).toLowerCase(); - if (ext.length() == 0 || url.getFile().length() <= 1 || htmlParser.htmlExtensionsSet.contains(ext)) { + Parser p = document.getParserObject(); + boolean mimesupported = false; + if (p instanceof htmlParser) + mimesupported = ((htmlParser)p).supportedMimeTypes().contains(document.dc_format()); + + if (mimesupported) // STORE IMAGE AND METADATA Transactions.store(vector, true, crawlProfile.snapshotLoadImage(), crawlProfile.snapshotReplaceold(), proxy, acceptLanguage); - } } // STORE TO SOLR diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index e621e22f4..7fda7067c 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -335,29 +335,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri if (allAttr || contains(CollectionSchema.audiolinkscount_i)) add(doc, CollectionSchema.audiolinkscount_i, md.laudio()); if (allAttr || contains(CollectionSchema.videolinkscount_i)) add(doc, CollectionSchema.videolinkscount_i, md.lvideo()); if (allAttr || contains(CollectionSchema.applinkscount_i)) add(doc, CollectionSchema.applinkscount_i, md.lapp()); - if (allAttr || contains(CollectionSchema.text_t)) { - // construct the text from other metadata parts. 
- // This is necessary here since that is used to search the link when no other data (parsed text body) is available - StringBuilder sb = new StringBuilder(120); - // accText(sb, md.dc_title()); // default search field via getQueryFields(), not needed for snippet (always displayed) - // accText(sb, md.dc_creator()); // author is in Default ranking/getQueryFields - // accText(sb, md.dc_publisher()); // has it's own metadata field publisher_t (not part of default queryfields) and mostly N/A - // accText(sb, md.snippet()); // above added to description_txt, default search field via getQueryFields(), description_txt incl. in snippet calculation - accText(sb, md.url().toTokens()); - // accText(sb, keywords); // default search field via getQueryFields(), keywords not incl. in snippet calculation - add(doc, CollectionSchema.text_t, sb.toString()); - } return doc; } - private static void accText(final StringBuilder sb, String text) { - if (text == null || text.length() == 0) return; - if (sb.length() != 0) sb.append(' '); - text = text.trim(); - if (!text.isEmpty() && text.charAt(text.length() - 1) == '.') sb.append(text); else sb.append(text).append('.'); - } - public static class Subgraph { public final ArrayList[] urlProtocols, urlStubs, urlAnchorTexts; @SuppressWarnings("unchecked") @@ -541,11 +522,11 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri Subgraph subgraph = new Subgraph(inboundLinks.size(), outboundLinks.size()); int c = 0; - final Object parser = document.getParserObject(); + final Object scraper = document.getScraperObject(); boolean containsCanonical = false; DigestURL canonical = null; - if (parser instanceof ContentScraper) { - final ContentScraper html = (ContentScraper) parser; + if (scraper instanceof ContentScraper) { + final ContentScraper html = (ContentScraper) scraper; List images = html.getImages(); // header tags @@ -885,9 +866,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements 
Seri } } - if (parser instanceof DCEntry) { + if (scraper instanceof DCEntry) { // the document was created with a surrogate parsing; overwrite all md: -entries to Solr - DCEntry dcentry = (DCEntry) parser; + DCEntry dcentry = (DCEntry) scraper; for (Map.Entry entry: dcentry.getMap().entrySet()) { String tag = entry.getKey(); if (!tag.startsWith("md:") || tag.length() < 4) continue;