eliminate dependency on file-extension in storeDocument, use supported mime-type instead

to also support handling of urls without a corresponding file-extension.
For this, refactor document.getParserObject() to always return a Parser (for clean logic)
and define/move the scraperObject as a field of AbstractParser.
Adjust related calls to getParserObject where actually a scraperObject is wanted.
Additionally, skip appending the url token to the parsed text for dht metadata entries
(returned by default as results by the rwi index).
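
A minimal sketch of the resulting call pattern (illustration only, not part of this commit; doc stands for any parsed Document):

    // before: getParserObject() returned Object and callers probed it for the scraper
    // after:  parser and scraper are reached through separate, typed accessors
    Parser p = doc.getParserObject();        // now always a Parser
    Object scraper = doc.getScraperObject(); // ContentScraper, DCEntry, or null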
pull/67/head
reger 9 years ago
parent ebde21079a
commit 4c7a77662a

@@ -39,7 +39,8 @@ public abstract class AbstractParser implements Parser {
protected final Set<String> SUPPORTED_MIME_TYPES = new LinkedHashSet<String>();
protected final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
private final String name;
protected Object scraperObject; // the scraper or source object used, if any, otherwise null
/**
* initialize a parser with a name
* @param name

@@ -92,13 +92,13 @@ public class Document {
private final Set<String> languages;
private boolean indexingDenied;
private final double lon, lat;
private final Object parserObject; // the source object that was used to create the Document
private final Parser parserObject; // the source object that was used to create the Document
private final Map<String, Set<String>> generic_facets; // a map from vocabulary names to the set of tags for that vocabulary which apply for this document
private final Date lastModified;
private int crawldepth;
public Document(final DigestURL location, final String mimeType, final String charset,
final Object parserObject,
final Parser parserObject,
final Set<String> languages,
final String[] keywords,
final List<String> titles,
@@ -160,11 +160,29 @@ public class Document {
if (contentDomain != ContentDomain.ALL) return contentDomain;
return this.dc_source().getContentDomainFromExt();
}
public Object getParserObject() {
/**
* The parser used to generate the document
* @return Parser
*/
public Parser getParserObject() {
return this.parserObject;
}
/**
* Convenient call to get the source/scraper object of the underlying parser,
* if the parser uses a scraper (like htmlParser)
* @return the scraper object, typically of type ContentScraper but possibly also of type DCEntry
*/
public Object getScraperObject() {
if (this.parserObject instanceof AbstractParser) {
if (((AbstractParser) this.parserObject).scraperObject != null) {
return ((AbstractParser) this.parserObject).scraperObject;
}
}
return null;
}
public Set<String> getContentLanguages() {
return this.languages;
}
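
A short usage sketch (illustration, not part of the diff; doc is a parsed Document) of how indexing code can branch on the concrete scraper type instead of probing the parser object:

    Object scraper = doc.getScraperObject();
    if (scraper instanceof ContentScraper) {
        final ContentScraper html = (ContentScraper) scraper; // html documents
        final AnchorURL canonical = html.getCanonical();
    } else if (scraper instanceof DCEntry) {
        final DCEntry dcentry = (DCEntry) scraper;            // surrogate metadata entries
    }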
@@ -931,9 +949,9 @@ dc_rights
// clean up parser data
for (final Document doc: docs) {
Object parserObject = doc.getParserObject();
if (parserObject instanceof ContentScraper) {
final ContentScraper html = (ContentScraper) parserObject;
Object scraper = doc.getScraperObject();
if (scraper instanceof ContentScraper) {
final ContentScraper html = (ContentScraper) scraper;
html.close();
}
}
@@ -979,9 +997,9 @@ dc_rights
if (!entry.getKey().attachedNofollow()) result.put(entry.getKey(), entry.getValue());
}
}
final Object parser = d.getParserObject();
if (parser instanceof ContentScraper) {
final ContentScraper html = (ContentScraper) parser;
final Object scraper = d.getScraperObject();
if (scraper instanceof ContentScraper) {
final ContentScraper html = (ContentScraper) scraper;
String refresh = html.getRefreshPath();
if (refresh != null && refresh.length() > 0) try {result.put(new AnchorURL(refresh), "refresh");} catch (final MalformedURLException e) {}
AnchorURL canonical = html.getCanonical();

@@ -45,6 +45,8 @@ import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.Document;
import net.yacy.document.parser.genericParser;
import net.yacy.search.schema.CollectionSchema;
public class DCEntry extends MultiMapSolrParams {
@@ -330,11 +332,15 @@ public class DCEntry extends MultiMapSolrParams {
languages.add(getLanguage());
List<String> t = new ArrayList<String>(1);
t.add(getTitle());
// for processing during indexing, embed the entry as source/scraperObject in a standard parser object
genericParser parserobj = new genericParser(this); // init the simplest parser with this DCEntry as its source/scraperObject for use during indexing
return new Document(
getIdentifier(true),
"text/html",
StandardCharsets.UTF_8.name(),
this,
parserobj,
languages,
getSubject(), // might be null
t,
@@ -343,7 +349,7 @@ public class DCEntry extends MultiMapSolrParams {
null,
getDescriptions(),
getLon(), getLat(),
get("text_t", ""),
get(CollectionSchema.text_t.name(), ""),
null,
null,
null,

@@ -44,6 +44,17 @@ public class genericParser extends AbstractParser implements Parser {
// this parser is used if no other fits. This parser fits all
}
/**
* Constructor that allows setting the scraperObject,
* because the scraper/source object is deliberately kept protected.
* Surrogate parsers use this to set a source/scraper other than ContentScraper.
* @param scraper
*/
public genericParser(Object scraper) {
super("Generic Parser");
this.scraperObject = scraper;
}
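// Sketch (illustration, not part of this commit): a surrogate DCEntry rides along as the
// parser's scraperObject and is recovered again during indexing:
//   genericParser parserobj = new genericParser(dcentry); // dcentry: some DCEntry instance
//   // ... pass parserobj as the parserObject argument of the Document constructor ...
//   Object scraper = doc.getScraperObject();              // yields dcentry again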
@Override
public Document[] parse(
final DigestURL location,

@@ -34,9 +34,7 @@ import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.StandardCharsets;
import java.nio.charset.UnsupportedCharsetException;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.Set;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
@@ -60,21 +58,29 @@ public class htmlParser extends AbstractParser implements Parser {
private static final int maxLinks = 10000;
public final static String[] htmlExtensions = new String[]{
"htm","html","shtml","shtm","stm","xhtml","phtml","phtm",
"tpl","php","php2","php3","php4","php5","cfm","asp","aspx","tex","txt","msg"
};
public final static Set<String> htmlExtensionsSet;
static {
htmlExtensionsSet = new HashSet<>(htmlExtensions.length);
for (String ext: htmlExtensions) htmlExtensionsSet.add(ext);
}
public htmlParser() {
super("Streaming HTML Parser");
this.SUPPORTED_EXTENSIONS.addAll(htmlExtensionsSet);
this.SUPPORTED_EXTENSIONS.add("htm");
this.SUPPORTED_EXTENSIONS.add("html");
this.SUPPORTED_EXTENSIONS.add("shtml");
this.SUPPORTED_EXTENSIONS.add("shtm");
this.SUPPORTED_EXTENSIONS.add("stm");
this.SUPPORTED_EXTENSIONS.add("xhtml");
this.SUPPORTED_EXTENSIONS.add("phtml");
this.SUPPORTED_EXTENSIONS.add("phtm");
this.SUPPORTED_EXTENSIONS.add("tpl");
this.SUPPORTED_EXTENSIONS.add("php");
this.SUPPORTED_EXTENSIONS.add("php2");
this.SUPPORTED_EXTENSIONS.add("php3");
this.SUPPORTED_EXTENSIONS.add("php4");
this.SUPPORTED_EXTENSIONS.add("php5");
this.SUPPORTED_EXTENSIONS.add("cfm");
this.SUPPORTED_EXTENSIONS.add("asp");
this.SUPPORTED_EXTENSIONS.add("aspx");
this.SUPPORTED_EXTENSIONS.add("tex");
this.SUPPORTED_EXTENSIONS.add("txt");
this.SUPPORTED_EXTENSIONS.add("msg");
this.SUPPORTED_MIME_TYPES.add("text/html");
this.SUPPORTED_MIME_TYPES.add("text/xhtml+xml");
this.SUPPORTED_MIME_TYPES.add("application/xhtml+xml");
@@ -97,7 +103,8 @@ public class htmlParser extends AbstractParser implements Parser {
try {
// first get a document from the parsed html
Charset[] detectedcharsetcontainer = new Charset[]{null};
final ContentScraper scraper = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks);
scraperObject = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks);
ContentScraper scraper = (ContentScraper) scraperObject; // shortcut to access the ContentScraper methods
// parseToScraper also detects/corrects/sets charset from html content tag
final Document document = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraper);
Document documentSnapshot = null;
@@ -130,7 +137,7 @@ public class htmlParser extends AbstractParser implements Parser {
* @param scraper
* @return
*/
private static Document transformScraper(final DigestURL location, final String mimeType, final String charSet, final ContentScraper scraper) {
private Document transformScraper(final DigestURL location, final String mimeType, final String charSet, final ContentScraper scraper) {
final String[] sections = new String[
scraper.getHeadlines(1).length +
scraper.getHeadlines(2).length +
@@ -150,7 +157,7 @@ public class htmlParser extends AbstractParser implements Parser {
location,
mimeType,
charSet,
scraper,
this,
scraper.getContentLanguages(),
scraper.getKeywords(),
scraper.getTitles(),
@@ -178,7 +185,7 @@ public class htmlParser extends AbstractParser implements Parser {
} catch (UnsupportedEncodingException e) {
sourceStream = new ByteArrayInputStream(UTF8.getBytes(input));
}
ContentScraper scraper;
ContentScraper scraper; // static method: no need to init this.scraperObject here
try {
scraper = parseToScraper(location, documentCharset, vocabularyScraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks);
} catch (Failure e) {
@@ -242,6 +249,7 @@ public class htmlParser extends AbstractParser implements Parser {
}
// parsing the content
// static method: no need to init this.scraperObject here
final ContentScraper scraper = new ContentScraper(location, maxLinks, vocabularyScraper, timezoneOffset);
final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false, Math.max(64, Math.min(4096, sourceStream.available())));
try {

@@ -68,10 +68,9 @@ public class swfParser extends AbstractParser implements Parser {
try {
final SWF2HTML swf2html = new SWF2HTML();
String contents = "";
ContentScraper htmlscraper=null;
try {
contents = swf2html.convertSWFToHTML(source);
htmlscraper = htmlParser.parseToScraper(location, charset, scraper, timezoneOffset, contents, 100);
contents = swf2html.convertSWFToHTML(source);
scraperObject = htmlParser.parseToScraper(location, charset, scraper, timezoneOffset, contents, 100);
} catch (final NegativeArraySizeException e) {
throw new Parser.Failure(e.getMessage(), location);
} catch (final IOException e) {
@@ -79,29 +78,9 @@ public class swfParser extends AbstractParser implements Parser {
} catch (final Exception e) {
throw new Parser.Failure(e.getMessage(), location);
}
/*
String url = null;
String urlnr = null;
final String linebreak = System.getProperty("line.separator");
final List<AnchorURL> anchors = new ArrayList<AnchorURL>();
int urls = 0;
int urlStart = -1;
int urlEnd = 0;
int p0 = 0;
//extracting urls
while ((urlStart = contents.indexOf("http://",urlEnd)) >= 0){
urlEnd = contents.indexOf(linebreak,urlStart);
url = contents.substring(urlStart,urlEnd);
urlnr = Integer.toString(++urls);
AnchorURL u = new AnchorURL(url);
u.setNameProperty(urlnr);
anchors.add(u);
contents = contents.substring(0,urlStart)+contents.substring(urlEnd);
}
*/
// As the result of parsing this function must return a plasmaParserDocument object
ContentScraper htmlscraper = (ContentScraper) this.scraperObject; // shortcut to access the ContentScraper methods
return new Document[]{new Document(
location, // url of the source document
mimeType, // the documents mime type

@@ -601,11 +601,14 @@ public class Segment {
crawlProfile != null && document.getDepth() <= crawlProfile.snapshotMaxdepth() &&
!crawlProfile.snapshotsMustnotmatch().matcher(urlNormalform).matches()) {
// load pdf in case that is wanted. This can later be used to compute a web page preview in the search results
String ext = MultiProtocolURL.getFileExtension(url.getFile()).toLowerCase();
if (ext.length() == 0 || url.getFile().length() <= 1 || htmlParser.htmlExtensionsSet.contains(ext)) {
Parser p = document.getParserObject();
boolean mimesupported = false;
if (p instanceof htmlParser)
mimesupported = ((htmlParser)p).supportedMimeTypes().contains(document.dc_format());
if (mimesupported)
// STORE IMAGE AND METADATA
Transactions.store(vector, true, crawlProfile.snapshotLoadImage(), crawlProfile.snapshotReplaceold(), proxy, acceptLanguage);
}
}
// STORE TO SOLR
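// Illustration (not part of this commit): the snapshot gate now follows the mime type
// instead of the file extension. A hypothetical https://example.com/page.jsp served as
// text/html failed the removed test htmlParser.htmlExtensionsSet.contains(ext), because
// "jsp" is not in the extension set, although the html parser handled the document;
// the new check asks that parser directly and accepts it:
//   mimesupported = ((htmlParser) p).supportedMimeTypes().contains(document.dc_format()); // true for text/html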

@@ -335,29 +335,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if (allAttr || contains(CollectionSchema.audiolinkscount_i)) add(doc, CollectionSchema.audiolinkscount_i, md.laudio());
if (allAttr || contains(CollectionSchema.videolinkscount_i)) add(doc, CollectionSchema.videolinkscount_i, md.lvideo());
if (allAttr || contains(CollectionSchema.applinkscount_i)) add(doc, CollectionSchema.applinkscount_i, md.lapp());
if (allAttr || contains(CollectionSchema.text_t)) {
// construct the text from other metadata parts.
// This is necessary here since that is used to search the link when no other data (parsed text body) is available
StringBuilder sb = new StringBuilder(120);
// accText(sb, md.dc_title()); // default search field via getQueryFields(), not needed for snippet (always displayed)
// accText(sb, md.dc_creator()); // author is in Default ranking/getQueryFields
// accText(sb, md.dc_publisher()); // has it's own metadata field publisher_t (not part of default queryfields) and mostly N/A
// accText(sb, md.snippet()); // above added to description_txt, default search field via getQueryFields(), description_txt incl. in snippet calculation
accText(sb, md.url().toTokens());
// accText(sb, keywords); // default search field via getQueryFields(), keywords not incl. in snippet calculation
add(doc, CollectionSchema.text_t, sb.toString());
}
return doc;
}
private static void accText(final StringBuilder sb, String text) {
if (text == null || text.length() == 0) return;
if (sb.length() != 0) sb.append(' ');
text = text.trim();
if (!text.isEmpty() && text.charAt(text.length() - 1) == '.') sb.append(text); else sb.append(text).append('.');
}
public static class Subgraph {
public final ArrayList<String>[] urlProtocols, urlStubs, urlAnchorTexts;
@SuppressWarnings("unchecked")
@@ -541,11 +522,11 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
Subgraph subgraph = new Subgraph(inboundLinks.size(), outboundLinks.size());
int c = 0;
final Object parser = document.getParserObject();
final Object scraper = document.getScraperObject();
boolean containsCanonical = false;
DigestURL canonical = null;
if (parser instanceof ContentScraper) {
final ContentScraper html = (ContentScraper) parser;
if (scraper instanceof ContentScraper) {
final ContentScraper html = (ContentScraper) scraper;
List<ImageEntry> images = html.getImages();
// header tags
@@ -885,9 +866,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
}
if (parser instanceof DCEntry) {
if (scraper instanceof DCEntry) {
// the document was created with a surrogate parsing; overwrite all md: -entries to Solr
DCEntry dcentry = (DCEntry) parser;
DCEntry dcentry = (DCEntry) scraper;
for (Map.Entry<String, String[]> entry: dcentry.getMap().entrySet()) {
String tag = entry.getKey();
if (!tag.startsWith("md:") || tag.length() < 4) continue;
