diff --git a/defaults/solr.keys.list b/defaults/solr.keys.list index 78a3087f0..ae2faf5b3 100644 --- a/defaults/solr.keys.list +++ b/defaults/solr.keys.list @@ -45,8 +45,11 @@ keywords ## character encoding, string charset_s +## tags of css entries, normalized with absolute URL, textgen +attr_css_tag + ## urls of css entries, normalized with absolute URL, textgen -attr_css +attr_css_url ## number of css entries, int csscount_i @@ -74,11 +77,24 @@ wordcount_i ## internal links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow, textgen attr_inboundlinks_tag -attr_inboundlinks_protocol -attr_inboundlinks_urlstub -attr_inboundlinks_name -attr_inboundlinks_rel -attr_inboundlinks_text + +## internal links, only the protocol +#attr_inboundlinks_protocol + +## internal links, the url only without the protocol +#attr_inboundlinks_urlstub + +## internal links, the name property of the a-tag +#attr_inboundlinks_name + +## internal links, the rel property of the a-tag +#attr_inboundlinks_rel + +## internal links, the rel property of the a-tag, coded binary +#attr_inboundlinks_relcode + +## internal links, the text content of the a-tag +#attr_inboundlinks_text ## total number of inbound links, int inboundlinkscount_i @@ -88,18 +104,43 @@ inboundlinksnoindexcount_i ## external links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow, textgen attr_outboundlinks_tag -attr_outboundlinks_protocol -attr_outboundlinks_urlstub -attr_outboundlinks_name -attr_outboundlinks_rel -attr_outboundlinks_text -## total number of external links, int -outboundlinkscount_i +## external links, only the protocol +#attr_outboundlinks_protocol + +## external links, the url only without the protocol +#attr_outboundlinks_urlstub + +## external links, the name property of the a-tag +#attr_outboundlinks_name + +## external links, the rel property of the a-tag +#attr_outboundlinks_rel + +## external links, the text content of the a-tag +#attr_outboundlinks_text + +## 
total number of external links, int +outboundlinks_i ## number of external links with noindex tag, int outboundlinksnoindexcount_i +## all image tags, encoded as <img> tag inclusive alt- and title property, textgen +attr_images_tag + +## all image links without the protocol and '://' +#attr_images_urlstub + +## all image link protocols +#attr_images_protocol + +## all image link alt tag +#attr_images_alt + +## number of images, int +imagescount_i + ## h1 header, textgen attr_h1 @@ -154,12 +195,6 @@ attr_italiccount ## total number of occurrences of <i>, int italic_i -## all image tags, encoded as <img> tag inclusive alt- and title property, textgen -attr_images - -## number of images, int -imagescount_i - ## flag that shows if a swf file is linked, boolean flash_b @@ -205,6 +240,12 @@ attr_tracker ## number of attribute counts in attr_tracker, textgen attr_trackercount +## names matching title expressions, textgen +attr_title + +## number of matching title expressions, textgen +attr_titlecount + ## fail reason if a page was not loaded. 
if the page was loaded then this field is empty, text failreason_t diff --git a/source/net/yacy/cora/services/federated/solr/SolrScheme.java b/source/net/yacy/cora/services/federated/solr/SolrScheme.java index 063780ae7..ab8909d0b 100644 --- a/source/net/yacy/cora/services/federated/solr/SolrScheme.java +++ b/source/net/yacy/cora/services/federated/solr/SolrScheme.java @@ -128,76 +128,75 @@ public class SolrScheme extends ConfigurationSet { int c = 0; if (isEmpty() || contains("inboundlinkscount_i")) addSolr(solrdoc, "inboundlinkscount_i", yacydoc.inboundLinkCount()); if (isEmpty() || contains("inboundlinksnoindexcount_i")) addSolr(solrdoc, "inboundlinksnoindexcount_i", yacydoc.inboundLinkNoindexCount()); - if (isEmpty() || contains("attr_inboundlinks")) { - final String[] inboundlinksTag = new String[yacydoc.inboundLinkCount()]; - final String[] inboundlinksURLProtocol = new String[yacydoc.inboundLinkCount()]; - final String[] inboundlinksURLStub = new String[yacydoc.inboundLinkCount()]; - final String[] inboundlinksName = new String[yacydoc.inboundLinkCount()]; - final String[] inboundlinksRel = new String[yacydoc.inboundLinkCount()]; - final String[] inboundlinksText = new String[yacydoc.inboundLinkCount()]; - for (final MultiProtocolURI url: yacydoc.inboundLinks()) { - final Properties p = alllinks.get(url); - final String name = p.getProperty("name", ""); // the name attribute - final String rel = p.getProperty("rel", ""); // the rel-attribute - final String text = p.getProperty("text", ""); // the text between the tag - final String urls = url.toNormalform(false, false); - final int pr = urls.indexOf("://"); - inboundlinksURLProtocol[c] = urls.substring(0, pr); - inboundlinksURLStub[c] = urls.substring(pr + 3); - inboundlinksName[c] = name.length() > 0 ? name : ""; - inboundlinksRel[c] = rel.length() > 0 ? rel : ""; - inboundlinksText[c] = text.length() > 0 ? rel : ""; - inboundlinksTag[c] = - " 0 ? 
" rel=\"" + rel + "\"" : "") + - ">" + - ((name.length() > 0) ? name : "") + ""; - c++; - } - addSolr(solrdoc, "attr_inboundlinks_tag", inboundlinksTag); - addSolr(solrdoc, "attr_inboundlinks_protocol", inboundlinksURLProtocol); - addSolr(solrdoc, "attr_inboundlinks_urlstub", inboundlinksURLStub); - addSolr(solrdoc, "attr_inboundlinks_name", inboundlinksName); - addSolr(solrdoc, "attr_inboundlinks_rel", inboundlinksRel); - addSolr(solrdoc, "attr_inboundlinks_text", inboundlinksText); + final String[] inboundlinksTag = new String[yacydoc.inboundLinkCount()]; + final String[] inboundlinksURLProtocol = new String[yacydoc.inboundLinkCount()]; + final String[] inboundlinksURLStub = new String[yacydoc.inboundLinkCount()]; + final String[] inboundlinksName = new String[yacydoc.inboundLinkCount()]; + final String[] inboundlinksRel = new String[yacydoc.inboundLinkCount()]; + final String[] inboundlinksText = new String[yacydoc.inboundLinkCount()]; + for (final MultiProtocolURI url: yacydoc.inboundLinks()) { + final Properties p = alllinks.get(url); + final String name = p.getProperty("name", ""); // the name attribute + final String rel = p.getProperty("rel", ""); // the rel-attribute + final String text = p.getProperty("text", ""); // the text between the tag + final String urls = url.toNormalform(false, false); + final int pr = urls.indexOf("://"); + inboundlinksURLProtocol[c] = urls.substring(0, pr); + inboundlinksURLStub[c] = urls.substring(pr + 3); + inboundlinksName[c] = name.length() > 0 ? name : ""; + inboundlinksRel[c] = rel.length() > 0 ? rel : ""; + inboundlinksText[c] = text.length() > 0 ? text : ""; + inboundlinksTag[c] = + " 0 ? " rel=\"" + rel + "\"" : "") + + (name.length() > 0 ? " name=\"" + name + "\"" : "") + + ">" + + ((text.length() > 0) ? 
text : "") + ""; + c++; } + if (isEmpty() || contains("attr_inboundlinks_tag")) addSolr(solrdoc, "attr_inboundlinks_tag", inboundlinksTag); + if (isEmpty() || contains("attr_inboundlinks_protocol")) addSolr(solrdoc, "attr_inboundlinks_protocol", inboundlinksURLProtocol); + if (isEmpty() || contains("attr_inboundlinks_urlstub")) addSolr(solrdoc, "attr_inboundlinks_urlstub", inboundlinksURLStub); + if (isEmpty() || contains("attr_inboundlinks_name")) addSolr(solrdoc, "attr_inboundlinks_name", inboundlinksName); + if (isEmpty() || contains("attr_inboundlinks_rel")) addSolr(solrdoc, "attr_inboundlinks_rel", inboundlinksRel); + if (isEmpty() || contains("attr_inboundlinks_text")) addSolr(solrdoc, "attr_inboundlinks_text", inboundlinksText); c = 0; if (isEmpty() || contains("outboundlinkscount_i")) addSolr(solrdoc, "outboundlinkscount_i", yacydoc.outboundLinkCount()); if (isEmpty() || contains("outboundlinksnoindexcount_i")) addSolr(solrdoc, "outboundlinksnoindexcount_i", yacydoc.outboundLinkNoindexCount()); - if (isEmpty() || contains("attr_outboundlinks")) { - final String[] outboundlinksTag = new String[yacydoc.outboundLinkCount()]; - final String[] outboundlinksURLProtocol = new String[yacydoc.outboundLinkCount()]; - final String[] outboundlinksURLStub = new String[yacydoc.outboundLinkCount()]; - final String[] outboundlinksName = new String[yacydoc.outboundLinkCount()]; - final String[] outboundlinksRel = new String[yacydoc.outboundLinkCount()]; - final String[] outboundlinksText = new String[yacydoc.outboundLinkCount()]; - for (final MultiProtocolURI url: yacydoc.outboundLinks()) { - final Properties p = alllinks.get(url); - final String name = p.getProperty("name", ""); // the name attribute - final String rel = p.getProperty("rel", ""); // the rel-attribute - final String text = p.getProperty("text", ""); // the text between the tag - final String urls = url.toNormalform(false, false); - final int pr = urls.indexOf("://"); - outboundlinksURLProtocol[c] = 
urls.substring(0, pr); - outboundlinksURLStub[c] = urls.substring(pr + 3); - outboundlinksName[c] = name.length() > 0 ? name : ""; - outboundlinksRel[c] = rel.length() > 0 ? rel : ""; - outboundlinksText[c] = text.length() > 0 ? rel : ""; - outboundlinksTag[c] = - " 0 ? " rel=\"" + rel + "\"" : "") + - ">" + - ((name.length() > 0) ? name : "") + ""; - c++; - } - addSolr(solrdoc, "attr_outboundlinks_tag", outboundlinksTag); - addSolr(solrdoc, "attr_outboundlinks_protocol", outboundlinksURLProtocol); - addSolr(solrdoc, "attr_outboundlinks_urlstub", outboundlinksURLStub); - addSolr(solrdoc, "attr_outboundlinks_name", outboundlinksName); - addSolr(solrdoc, "attr_outboundlinks_rel", outboundlinksRel); - addSolr(solrdoc, "attr_outboundlinks_text", outboundlinksText); + final String[] outboundlinksTag = new String[yacydoc.outboundLinkCount()]; + final String[] outboundlinksURLProtocol = new String[yacydoc.outboundLinkCount()]; + final String[] outboundlinksURLStub = new String[yacydoc.outboundLinkCount()]; + final String[] outboundlinksName = new String[yacydoc.outboundLinkCount()]; + final String[] outboundlinksRel = new String[yacydoc.outboundLinkCount()]; + final String[] outboundlinksText = new String[yacydoc.outboundLinkCount()]; + for (final MultiProtocolURI url: yacydoc.outboundLinks()) { + final Properties p = alllinks.get(url); + final String name = p.getProperty("name", ""); // the name attribute + final String rel = p.getProperty("rel", ""); // the rel-attribute + final String text = p.getProperty("text", ""); // the text between the tag + final String urls = url.toNormalform(false, false); + final int pr = urls.indexOf("://"); + outboundlinksURLProtocol[c] = urls.substring(0, pr); + outboundlinksURLStub[c] = urls.substring(pr + 3); + outboundlinksName[c] = name.length() > 0 ? name : ""; + outboundlinksRel[c] = rel.length() > 0 ? rel : ""; + outboundlinksText[c] = text.length() > 0 ? text : ""; + outboundlinksTag[c] = + " 0 ? 
" rel=\"" + rel + "\"" : "") + + (name.length() > 0 ? " name=\"" + name + "\"" : "") + + ">" + + ((text.length() > 0) ? text : "") + ""; + c++; } + if (isEmpty() || contains("attr_outboundlinks_tag")) addSolr(solrdoc, "attr_outboundlinks_tag", outboundlinksTag); + if (isEmpty() || contains("attr_outboundlinks_protocol")) addSolr(solrdoc, "attr_outboundlinks_protocol", outboundlinksURLProtocol); + if (isEmpty() || contains("attr_outboundlinks_urlstub")) addSolr(solrdoc, "attr_outboundlinks_urlstub", outboundlinksURLStub); + if (isEmpty() || contains("attr_outboundlinks_name")) addSolr(solrdoc, "attr_outboundlinks_name", outboundlinksName); + if (isEmpty() || contains("attr_outboundlinks_rel")) addSolr(solrdoc, "attr_outboundlinks_rel", outboundlinksRel); + if (isEmpty() || contains("attr_outboundlinks_text")) addSolr(solrdoc, "attr_outboundlinks_text", outboundlinksText); + // charset addSolr(solrdoc, "charset_s", yacydoc.getCharset()); @@ -255,27 +254,43 @@ public class SolrScheme extends ConfigurationSet { if (li.length > 0) addSolr(solrdoc, "attr_li", li); // images - if (isEmpty() || contains("attr_images")) { - final Collection imagesc = html.getImages().values(); - final String[] images = new String[imagesc.size()]; - c = 0; - for (final ImageEntry ie: imagesc) images[c++] = ie.toString(); - addSolr(solrdoc, "imagescount_i", images.length); - if (images.length > 0) addSolr(solrdoc, "attr_images", images); + final Collection imagesc = html.getImages().values(); + final String[] imgtags = new String[imagesc.size()]; + final String[] imgprots = new String[imagesc.size()]; + final String[] imgstubs = new String[imagesc.size()]; + final String[] imgalts = new String[imagesc.size()]; + c = 0; + for (final ImageEntry ie: imagesc) { + final MultiProtocolURI uri = ie.url(); + imgtags[c] = ie.toString(); + imgprots[c] = uri.getProtocol(); + imgstubs[c] = uri.toString().substring(imgprots[c].length() + 3); + imgalts[c] = ie.alt(); + c++; } + addSolr(solrdoc, 
"imagescount_i", imgtags.length); + if (isEmpty() || contains("attr_images_tag")) addSolr(solrdoc, "attr_images_tag", imgtags); + if (isEmpty() || contains("attr_images_protocol")) addSolr(solrdoc, "attr_images_protocol", imgprots); + if (isEmpty() || contains("attr_images_urlstub")) addSolr(solrdoc, "attr_images_urlstub", imgstubs); + if (isEmpty() || contains("attr_images_alt")) addSolr(solrdoc, "attr_images_alt", imgalts); // style sheets if (isEmpty() || contains("attr_css")) { final Map csss = html.getCSS(); - final String[] css = new String[csss.size()]; + final String[] css_tag = new String[csss.size()]; + final String[] css_url = new String[csss.size()]; c = 0; for (final Map.Entry entry: csss.entrySet()) { - css[c++] = + final String url = entry.getKey().toNormalform(false, false, false, false); + css_tag[c] = ""; + " href=\""+ url + "\" />"; + css_url[c] = url; + c++; } - addSolr(solrdoc, "csscount_i", css.length); - if (css.length > 0) addSolr(solrdoc, "attr_css", css); + addSolr(solrdoc, "csscount_i", css_tag.length); + if (css_tag.length > 0) addSolr(solrdoc, "attr_css_tag", css_tag); + if (css_url.length > 0) addSolr(solrdoc, "attr_css_url", css_url); } // Scripts diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 53755465b..d4f67ca5a 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -447,6 +447,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { if (h.length() > 0) this.headlines[5].add(h); } else if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) { this.title = recursiveParse(text); + this.evaluationScores.match(Element.title, this.title); } else if ((tagname.equalsIgnoreCase("b")) && (text.length < 1024)) { h = recursiveParse(text); if (h.length() > 0) this.bold.inc(h); diff --git a/source/net/yacy/document/parser/html/Evaluation.java 
b/source/net/yacy/document/parser/html/Evaluation.java index 006c75060..b2e2a00bd 100644 --- a/source/net/yacy/document/parser/html/Evaluation.java +++ b/source/net/yacy/document/parser/html/Evaluation.java @@ -62,6 +62,7 @@ public class Evaluation { public static enum Element { text, + title, bodyclass, divid, csspath, diff --git a/source/net/yacy/document/parser/htmlParser.java b/source/net/yacy/document/parser/htmlParser.java index dc2dc966b..c610c2d63 100644 --- a/source/net/yacy/document/parser/htmlParser.java +++ b/source/net/yacy/document/parser/htmlParser.java @@ -11,12 +11,12 @@ * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. - * + * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program in the file lgpl21.txt * If not, see . 
@@ -34,8 +34,6 @@ import java.nio.charset.IllegalCharsetNameException; import java.nio.charset.UnsupportedCharsetException; import java.util.regex.Pattern; -import com.ibm.icu.text.CharsetDetector; - import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.document.AbstractParser; @@ -47,47 +45,49 @@ import net.yacy.document.parser.html.ScraperInputStream; import net.yacy.document.parser.html.TransformerWriter; import net.yacy.kelondro.util.FileUtils; +import com.ibm.icu.text.CharsetDetector; + public class htmlParser extends AbstractParser implements Parser { private static final Pattern patternUnderline = Pattern.compile("_"); public htmlParser() { - super("HTML Parser"); - SUPPORTED_EXTENSIONS.add("htm"); - SUPPORTED_EXTENSIONS.add("html"); - SUPPORTED_EXTENSIONS.add("phtml"); - SUPPORTED_EXTENSIONS.add("shtml"); - SUPPORTED_EXTENSIONS.add("xhtml"); - SUPPORTED_EXTENSIONS.add("php"); - SUPPORTED_EXTENSIONS.add("php3"); - SUPPORTED_EXTENSIONS.add("php4"); - SUPPORTED_EXTENSIONS.add("php5"); - SUPPORTED_EXTENSIONS.add("cfm"); - SUPPORTED_EXTENSIONS.add("asp"); - SUPPORTED_EXTENSIONS.add("aspx"); - SUPPORTED_EXTENSIONS.add("tex"); - SUPPORTED_EXTENSIONS.add("txt"); + super("HTML Parser"); + this.SUPPORTED_EXTENSIONS.add("htm"); + this.SUPPORTED_EXTENSIONS.add("html"); + this.SUPPORTED_EXTENSIONS.add("phtml"); + this.SUPPORTED_EXTENSIONS.add("shtml"); + this.SUPPORTED_EXTENSIONS.add("xhtml"); + this.SUPPORTED_EXTENSIONS.add("php"); + this.SUPPORTED_EXTENSIONS.add("php3"); + this.SUPPORTED_EXTENSIONS.add("php4"); + this.SUPPORTED_EXTENSIONS.add("php5"); + this.SUPPORTED_EXTENSIONS.add("cfm"); + this.SUPPORTED_EXTENSIONS.add("asp"); + this.SUPPORTED_EXTENSIONS.add("aspx"); + this.SUPPORTED_EXTENSIONS.add("tex"); + this.SUPPORTED_EXTENSIONS.add("txt"); //SUPPORTED_EXTENSIONS.add("js"); - SUPPORTED_EXTENSIONS.add("jsp"); - SUPPORTED_EXTENSIONS.add("mf"); - SUPPORTED_EXTENSIONS.add("pl"); - 
SUPPORTED_EXTENSIONS.add("py"); - SUPPORTED_MIME_TYPES.add("text/html"); - SUPPORTED_MIME_TYPES.add("text/xhtml+xml"); - SUPPORTED_MIME_TYPES.add("application/xhtml+xml"); - SUPPORTED_MIME_TYPES.add("application/x-httpd-php"); - SUPPORTED_MIME_TYPES.add("application/x-tex"); - SUPPORTED_MIME_TYPES.add("text/plain"); - SUPPORTED_MIME_TYPES.add("text/sgml"); - SUPPORTED_MIME_TYPES.add("text/csv"); + this.SUPPORTED_EXTENSIONS.add("jsp"); + this.SUPPORTED_EXTENSIONS.add("mf"); + this.SUPPORTED_EXTENSIONS.add("pl"); + this.SUPPORTED_EXTENSIONS.add("py"); + this.SUPPORTED_MIME_TYPES.add("text/html"); + this.SUPPORTED_MIME_TYPES.add("text/xhtml+xml"); + this.SUPPORTED_MIME_TYPES.add("application/xhtml+xml"); + this.SUPPORTED_MIME_TYPES.add("application/x-httpd-php"); + this.SUPPORTED_MIME_TYPES.add("application/x-tex"); + this.SUPPORTED_MIME_TYPES.add("text/plain"); + this.SUPPORTED_MIME_TYPES.add("text/sgml"); + this.SUPPORTED_MIME_TYPES.add("text/csv"); } - + public static ContentScraper parseToScraper( - final MultiProtocolURI location, - final String documentCharset, + final MultiProtocolURI location, + final String documentCharset, InputStream sourceStream) throws Parser.Failure, IOException { - + // make a scraper String charset = null; @@ -95,72 +95,72 @@ public class htmlParser extends AbstractParser implements Parser { if (documentCharset != null) { charset = patchCharsetEncoding(documentCharset); } - + // nothing found: try to find a meta-tag if (charset == null) { try { final ScraperInputStream htmlFilter = new ScraperInputStream(sourceStream,documentCharset,location,null,false); sourceStream = htmlFilter; charset = htmlFilter.detectCharset(); - } catch (IOException e1) { + } catch (final IOException e1) { throw new Parser.Failure("Charset error:" + e1.getMessage(), location); } } // the author didn't tell us the encoding, try the mozilla-heuristic if (charset == null) { - CharsetDetector det = new CharsetDetector(); + final CharsetDetector det = new 
CharsetDetector(); det.enableInputFilter(true); - InputStream detStream = new BufferedInputStream(sourceStream); + final InputStream detStream = new BufferedInputStream(sourceStream); det.setText(detStream); charset = det.detect().getName(); sourceStream = detStream; } - + // wtf? still nothing, just take system-standard if (charset == null) { charset = Charset.defaultCharset().name(); } - + Charset c; try { c = Charset.forName(charset); - } catch (IllegalCharsetNameException e) { + } catch (final IllegalCharsetNameException e) { c = Charset.defaultCharset(); - } catch (UnsupportedCharsetException e) { + } catch (final UnsupportedCharsetException e) { c = Charset.defaultCharset(); } - + // parsing the content - final ContentScraper scraper = new ContentScraper(location); + final ContentScraper scraper = new ContentScraper(location); final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false); try { FileUtils.copy(sourceStream, writer, c); - } catch (IOException e) { + } catch (final IOException e) { throw new Parser.Failure("IO error:" + e.getMessage(), location); } finally { sourceStream.close(); writer.close(); } - //OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false); + //OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false); //serverFileUtils.copy(sourceFile, hfos); //hfos.close(); if (writer.binarySuspect()) { final String errorMsg = "Binary data found in resource"; - throw new Parser.Failure(errorMsg, location); + throw new Parser.Failure(errorMsg, location); } return scraper; } public Document[] parse( - final MultiProtocolURI location, - final String mimeType, - final String documentCharset, + final MultiProtocolURI location, + final String mimeType, + final String documentCharset, final InputStream sourceStream) throws Parser.Failure, InterruptedException { - + try { return transformScraper(location, mimeType, documentCharset, parseToScraper(location, documentCharset, sourceStream)); - } 
catch (IOException e) { + } catch (final IOException e) { throw new Parser.Failure("IOException in htmlParser: " + e.getMessage(), location); } } @@ -197,7 +197,7 @@ public class htmlParser extends AbstractParser implements Parser { scraper.getRSS(), scraper.getImages(), scraper.indexingDenied())}; - //scraper.close(); + //scraper.close(); for (final Document ppd: ppds) { ppd.setFavicon(scraper.getFavicon()); } @@ -214,10 +214,10 @@ public class htmlParser extends AbstractParser implements Parser { * @return patched encoding name */ public static String patchCharsetEncoding(String encoding) { - + // do nothing with null if ((encoding == null) || (encoding.length() < 3)) return null; - + // trim encoding string encoding = encoding.trim(); @@ -228,7 +228,7 @@ public class htmlParser extends AbstractParser implements Parser { // all other names but such with "windows" use uppercase if (encoding.startsWith("WINDOWS")) encoding = "windows" + encoding.substring(7); if (encoding.startsWith("MACINTOSH")) encoding = "MacRoman"; - + // fix wrong fill characters encoding = patternUnderline.matcher(encoding).replaceAll("-"); @@ -236,7 +236,7 @@ public class htmlParser extends AbstractParser implements Parser { if (encoding.matches(".*UTF[-_]?8.*")) return "UTF-8"; if (encoding.startsWith("US")) return "US-ASCII"; if (encoding.startsWith("KOI")) return "KOI8-R"; - + // patch missing '-' if (encoding.startsWith("windows") && encoding.length() > 7) { final char c = encoding.charAt(7); @@ -244,7 +244,7 @@ public class htmlParser extends AbstractParser implements Parser { encoding = "windows-" + encoding.substring(7); } } - + if (encoding.startsWith("ISO")) { // patch typos if (encoding.length() > 3) { @@ -256,11 +256,11 @@ public class htmlParser extends AbstractParser implements Parser { if (encoding.length() > 8) { final char c = encoding.charAt(8); if ((c >= '0') && (c <= '9')) { - encoding = encoding.substring(0, 8) + "-" + encoding.substring(8); - } + encoding = 
encoding.substring(0, 8) + "-" + encoding.substring(8); + } } } - + // patch wrong name if (encoding.startsWith("ISO-8559")) { // popular typo @@ -279,26 +279,26 @@ public class htmlParser extends AbstractParser implements Parser { return encoding; } - - public static void main(String[] args) { + + public static void main(final String[] args) { // test parsing of a url MultiProtocolURI url; try { url = new MultiProtocolURI(args[0]); - byte[] content = url.get(ClientIdentification.getUserAgent(), 3000); - Document[] document = new htmlParser().parse(url, "text/html", null, new ByteArrayInputStream(content)); - String title = document[0].dc_title(); + final byte[] content = url.get(ClientIdentification.getUserAgent(), 3000); + final Document[] document = new htmlParser().parse(url, "text/html", null, new ByteArrayInputStream(content)); + final String title = document[0].dc_title(); System.out.println(title); System.out.println(CharacterCoding.unicode2html(title, false)); - } catch (MalformedURLException e) { + } catch (final MalformedURLException e) { e.printStackTrace(); - } catch (IOException e) { + } catch (final IOException e) { e.printStackTrace(); - } catch (Parser.Failure e) { + } catch (final Parser.Failure e) { e.printStackTrace(); - } catch (InterruptedException e) { + } catch (final InterruptedException e) { e.printStackTrace(); } } - + } diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java index c7246a50c..330579aad 100644 --- a/source/net/yacy/repository/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -147,7 +147,7 @@ public final class LoaderDispatcher { FileUtils.copy(b, tmp); tmp.renameTo(targetFile); } - + public Response load(final Request request, final CacheStrategy cacheStrategy, final boolean checkBlacklist) throws IOException { return load(request, cacheStrategy, protocolMaxFileSize(request.url()), checkBlacklist); } @@ -274,7 +274,7 @@ public final 
class LoaderDispatcher { if (response != null && response.getContent() != null) { // we got something. Now check if we want to store that to the cache // first check looks if we want to store the content to the cache - if (!crawlProfile.storeHTCache()) { + if (crawlProfile == null || !crawlProfile.storeHTCache()) { // no caching wanted. Thats ok, do not write any message return response; } @@ -294,7 +294,7 @@ public final class LoaderDispatcher { throw new IOException("Unsupported protocol '" + protocol + "' in url " + url); } - + private int protocolMaxFileSize(final DigestURI url) { if (url.isHTTP() || url.isHTTPS()) return this.sb.getConfigInt("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE); diff --git a/startYACY.sh b/startYACY.sh index 47f9adbe8..481618c83 100755 --- a/startYACY.sh +++ b/startYACY.sh @@ -124,6 +124,8 @@ then then ENABLEHUGEPAGES=1 fi + # the G1 GC is supported but not enabled by default in Java7 (it only became the default in Java 9); uncomment the next line to try it + # JAVA_ARGS="$JAVA_ARGS -XX:+UnlockExperimentalVMOptions -XX:+UseG1GC" elif [ $OS = "SunOS" ] then # the UseConcMarkSweepGC option caused a full CPU usage - bug on Darwin.