diff --git a/defaults/solr.keys.list b/defaults/solr.keys.list
index 78a3087f0..ae2faf5b3 100644
--- a/defaults/solr.keys.list
+++ b/defaults/solr.keys.list
@@ -45,8 +45,11 @@ keywords
## character encoding, string
charset_s
+## tags of css entries, normalized with absolute URL, textgen
+attr_css_tag
+
## urls of css entries, normalized with absolute URL, textgen
-attr_css
+attr_css_url
## number of css entries, int
csscount_i
@@ -74,11 +77,24 @@ wordcount_i
## internal links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow, textgen
attr_inboundlinks_tag
-attr_inboundlinks_protocol
-attr_inboundlinks_urlstub
-attr_inboundlinks_name
-attr_inboundlinks_rel
-attr_inboundlinks_text
+
+## internal links, only the protocol
+#attr_inboundlinks_protocol
+
+## internal links, the url only without the protocol
+#attr_inboundlinks_urlstub
+
+## internal links, the name property of the a-tag
+#attr_inboundlinks_name
+
+## internal links, the rel property of the a-tag
+#attr_inboundlinks_rel
+
+## internal links, the rel property of the a-tag, coded binary
+#attr_inboundlinks_relcode
+
+## internal links, the text content of the a-tag
+#attr_inboundlinks_text
## total number of inbound links, int
inboundlinkscount_i
@@ -88,18 +104,43 @@ inboundlinksnoindexcount_i
## external links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow, textgen
attr_outboundlinks_tag
-attr_outboundlinks_protocol
-attr_outboundlinks_urlstub
-attr_outboundlinks_name
-attr_outboundlinks_rel
-attr_outboundlinks_text
-## total number of external links, int
-outboundlinkscount_i
+## external links, only the protocol
+#attr_outboundlinks_protocol
+
+## external links, the url only without the protocol
+#attr_outboundlinks_urlstub
+
+## external links, the name property of the a-tag
+#attr_outboundlinks_name
+
+## external links, the rel property of the a-tag
+#attr_outboundlinks_rel
+
+## external links, the text content of the a-tag
+#attr_outboundlinks_text
+
+## total number of external links, int
+outboundlinkscount_i
## number of external links with noindex tag, int
outboundlinksnoindexcount_i
+## all image tags, encoded as <img> tag inclusive alt- and title property, textgen
+attr_images_tag
+
+## all image links without the protocol and '://'
+#attr_images_urlstub
+
+## all image link protocols
+#attr_images_protocol
+
+## the alt property of all image tags
+#attr_images_alt
+
+## number of images, int
+imagescount_i
+
## h1 header, textgen
attr_h1
@@ -154,12 +195,6 @@ attr_italiccount
## total number of occurrences of <i>, int
italic_i
-## all image tags, encoded as <img> tag inclusive alt- and title property, textgen
-attr_images
-
-## number of images, int
-imagescount_i
-
## flag that shows if a swf file is linked, boolean
flash_b
@@ -205,6 +240,12 @@ attr_tracker
## number of attribute counts in attr_tracker, textgen
attr_trackercount
+## names matching title expressions, textgen
+attr_title
+
+## number of matching title expressions, textgen
+attr_titlecount
+
## fail reason if a page was not loaded. if the page was loaded then this field is empty, text
failreason_t
diff --git a/source/net/yacy/cora/services/federated/solr/SolrScheme.java b/source/net/yacy/cora/services/federated/solr/SolrScheme.java
index 063780ae7..ab8909d0b 100644
--- a/source/net/yacy/cora/services/federated/solr/SolrScheme.java
+++ b/source/net/yacy/cora/services/federated/solr/SolrScheme.java
@@ -128,76 +128,75 @@ public class SolrScheme extends ConfigurationSet {
int c = 0;
if (isEmpty() || contains("inboundlinkscount_i")) addSolr(solrdoc, "inboundlinkscount_i", yacydoc.inboundLinkCount());
if (isEmpty() || contains("inboundlinksnoindexcount_i")) addSolr(solrdoc, "inboundlinksnoindexcount_i", yacydoc.inboundLinkNoindexCount());
- if (isEmpty() || contains("attr_inboundlinks")) {
- final String[] inboundlinksTag = new String[yacydoc.inboundLinkCount()];
- final String[] inboundlinksURLProtocol = new String[yacydoc.inboundLinkCount()];
- final String[] inboundlinksURLStub = new String[yacydoc.inboundLinkCount()];
- final String[] inboundlinksName = new String[yacydoc.inboundLinkCount()];
- final String[] inboundlinksRel = new String[yacydoc.inboundLinkCount()];
- final String[] inboundlinksText = new String[yacydoc.inboundLinkCount()];
- for (final MultiProtocolURI url: yacydoc.inboundLinks()) {
- final Properties p = alllinks.get(url);
- final String name = p.getProperty("name", ""); // the name attribute
- final String rel = p.getProperty("rel", ""); // the rel-attribute
- final String text = p.getProperty("text", ""); // the text between the tag
- final String urls = url.toNormalform(false, false);
- final int pr = urls.indexOf("://");
- inboundlinksURLProtocol[c] = urls.substring(0, pr);
- inboundlinksURLStub[c] = urls.substring(pr + 3);
- inboundlinksName[c] = name.length() > 0 ? name : "";
- inboundlinksRel[c] = rel.length() > 0 ? rel : "";
- inboundlinksText[c] = text.length() > 0 ? rel : "";
- inboundlinksTag[c] =
- " 0 ? " rel=\"" + rel + "\"" : "") +
- ">" +
- ((name.length() > 0) ? name : "") + "";
- c++;
- }
- addSolr(solrdoc, "attr_inboundlinks_tag", inboundlinksTag);
- addSolr(solrdoc, "attr_inboundlinks_protocol", inboundlinksURLProtocol);
- addSolr(solrdoc, "attr_inboundlinks_urlstub", inboundlinksURLStub);
- addSolr(solrdoc, "attr_inboundlinks_name", inboundlinksName);
- addSolr(solrdoc, "attr_inboundlinks_rel", inboundlinksRel);
- addSolr(solrdoc, "attr_inboundlinks_text", inboundlinksText);
+ final String[] inboundlinksTag = new String[yacydoc.inboundLinkCount()];
+ final String[] inboundlinksURLProtocol = new String[yacydoc.inboundLinkCount()];
+ final String[] inboundlinksURLStub = new String[yacydoc.inboundLinkCount()];
+ final String[] inboundlinksName = new String[yacydoc.inboundLinkCount()];
+ final String[] inboundlinksRel = new String[yacydoc.inboundLinkCount()];
+ final String[] inboundlinksText = new String[yacydoc.inboundLinkCount()];
+ for (final MultiProtocolURI url: yacydoc.inboundLinks()) {
+ final Properties p = alllinks.get(url);
+ final String name = p.getProperty("name", ""); // the name attribute
+ final String rel = p.getProperty("rel", ""); // the rel-attribute
+ final String text = p.getProperty("text", ""); // the text between the tag
+ final String urls = url.toNormalform(false, false);
+ final int pr = urls.indexOf("://");
+ inboundlinksURLProtocol[c] = urls.substring(0, pr);
+ inboundlinksURLStub[c] = urls.substring(pr + 3);
+ inboundlinksName[c] = name.length() > 0 ? name : "";
+ inboundlinksRel[c] = rel.length() > 0 ? rel : "";
+ inboundlinksText[c] = text.length() > 0 ? text : "";
+ inboundlinksTag[c] =
+ " 0 ? " rel=\"" + rel + "\"" : "") +
+ (name.length() > 0 ? " name=\"" + name + "\"" : "") +
+ ">" +
+ ((text.length() > 0) ? text : "") + "";
+ c++;
}
+ if (isEmpty() || contains("attr_inboundlinks_tag")) addSolr(solrdoc, "attr_inboundlinks_tag", inboundlinksTag);
+ if (isEmpty() || contains("attr_inboundlinks_protocol")) addSolr(solrdoc, "attr_inboundlinks_protocol", inboundlinksURLProtocol);
+ if (isEmpty() || contains("attr_inboundlinks_urlstub")) addSolr(solrdoc, "attr_inboundlinks_urlstub", inboundlinksURLStub);
+ if (isEmpty() || contains("attr_inboundlinks_name")) addSolr(solrdoc, "attr_inboundlinks_name", inboundlinksName);
+ if (isEmpty() || contains("attr_inboundlinks_rel")) addSolr(solrdoc, "attr_inboundlinks_rel", inboundlinksRel);
+ if (isEmpty() || contains("attr_inboundlinks_text")) addSolr(solrdoc, "attr_inboundlinks_text", inboundlinksText);
c = 0;
if (isEmpty() || contains("outboundlinkscount_i")) addSolr(solrdoc, "outboundlinkscount_i", yacydoc.outboundLinkCount());
if (isEmpty() || contains("outboundlinksnoindexcount_i")) addSolr(solrdoc, "outboundlinksnoindexcount_i", yacydoc.outboundLinkNoindexCount());
- if (isEmpty() || contains("attr_outboundlinks")) {
- final String[] outboundlinksTag = new String[yacydoc.outboundLinkCount()];
- final String[] outboundlinksURLProtocol = new String[yacydoc.outboundLinkCount()];
- final String[] outboundlinksURLStub = new String[yacydoc.outboundLinkCount()];
- final String[] outboundlinksName = new String[yacydoc.outboundLinkCount()];
- final String[] outboundlinksRel = new String[yacydoc.outboundLinkCount()];
- final String[] outboundlinksText = new String[yacydoc.outboundLinkCount()];
- for (final MultiProtocolURI url: yacydoc.outboundLinks()) {
- final Properties p = alllinks.get(url);
- final String name = p.getProperty("name", ""); // the name attribute
- final String rel = p.getProperty("rel", ""); // the rel-attribute
- final String text = p.getProperty("text", ""); // the text between the tag
- final String urls = url.toNormalform(false, false);
- final int pr = urls.indexOf("://");
- outboundlinksURLProtocol[c] = urls.substring(0, pr);
- outboundlinksURLStub[c] = urls.substring(pr + 3);
- outboundlinksName[c] = name.length() > 0 ? name : "";
- outboundlinksRel[c] = rel.length() > 0 ? rel : "";
- outboundlinksText[c] = text.length() > 0 ? rel : "";
- outboundlinksTag[c] =
- " 0 ? " rel=\"" + rel + "\"" : "") +
- ">" +
- ((name.length() > 0) ? name : "") + "";
- c++;
- }
- addSolr(solrdoc, "attr_outboundlinks_tag", outboundlinksTag);
- addSolr(solrdoc, "attr_outboundlinks_protocol", outboundlinksURLProtocol);
- addSolr(solrdoc, "attr_outboundlinks_urlstub", outboundlinksURLStub);
- addSolr(solrdoc, "attr_outboundlinks_name", outboundlinksName);
- addSolr(solrdoc, "attr_outboundlinks_rel", outboundlinksRel);
- addSolr(solrdoc, "attr_outboundlinks_text", outboundlinksText);
+ final String[] outboundlinksTag = new String[yacydoc.outboundLinkCount()];
+ final String[] outboundlinksURLProtocol = new String[yacydoc.outboundLinkCount()];
+ final String[] outboundlinksURLStub = new String[yacydoc.outboundLinkCount()];
+ final String[] outboundlinksName = new String[yacydoc.outboundLinkCount()];
+ final String[] outboundlinksRel = new String[yacydoc.outboundLinkCount()];
+ final String[] outboundlinksText = new String[yacydoc.outboundLinkCount()];
+ for (final MultiProtocolURI url: yacydoc.outboundLinks()) {
+ final Properties p = alllinks.get(url);
+ final String name = p.getProperty("name", ""); // the name attribute
+ final String rel = p.getProperty("rel", ""); // the rel-attribute
+ final String text = p.getProperty("text", ""); // the text between the tag
+ final String urls = url.toNormalform(false, false);
+ final int pr = urls.indexOf("://");
+ outboundlinksURLProtocol[c] = urls.substring(0, pr);
+ outboundlinksURLStub[c] = urls.substring(pr + 3);
+ outboundlinksName[c] = name.length() > 0 ? name : "";
+ outboundlinksRel[c] = rel.length() > 0 ? rel : "";
+ outboundlinksText[c] = text.length() > 0 ? text : "";
+ outboundlinksTag[c] =
+ " 0 ? " rel=\"" + rel + "\"" : "") +
+ (name.length() > 0 ? " name=\"" + name + "\"" : "") +
+ ">" +
+ ((text.length() > 0) ? text : "") + "";
+ c++;
}
+ if (isEmpty() || contains("attr_outboundlinks_tag")) addSolr(solrdoc, "attr_outboundlinks_tag", outboundlinksTag);
+ if (isEmpty() || contains("attr_outboundlinks_protocol")) addSolr(solrdoc, "attr_outboundlinks_protocol", outboundlinksURLProtocol);
+ if (isEmpty() || contains("attr_outboundlinks_urlstub")) addSolr(solrdoc, "attr_outboundlinks_urlstub", outboundlinksURLStub);
+ if (isEmpty() || contains("attr_outboundlinks_name")) addSolr(solrdoc, "attr_outboundlinks_name", outboundlinksName);
+ if (isEmpty() || contains("attr_outboundlinks_rel")) addSolr(solrdoc, "attr_outboundlinks_rel", outboundlinksRel);
+ if (isEmpty() || contains("attr_outboundlinks_text")) addSolr(solrdoc, "attr_outboundlinks_text", outboundlinksText);
+
// charset
addSolr(solrdoc, "charset_s", yacydoc.getCharset());
@@ -255,27 +254,43 @@ public class SolrScheme extends ConfigurationSet {
if (li.length > 0) addSolr(solrdoc, "attr_li", li);
// images
- if (isEmpty() || contains("attr_images")) {
- final Collection imagesc = html.getImages().values();
- final String[] images = new String[imagesc.size()];
- c = 0;
- for (final ImageEntry ie: imagesc) images[c++] = ie.toString();
- addSolr(solrdoc, "imagescount_i", images.length);
- if (images.length > 0) addSolr(solrdoc, "attr_images", images);
+ final Collection imagesc = html.getImages().values();
+ final String[] imgtags = new String[imagesc.size()];
+ final String[] imgprots = new String[imagesc.size()];
+ final String[] imgstubs = new String[imagesc.size()];
+ final String[] imgalts = new String[imagesc.size()];
+ c = 0;
+ for (final ImageEntry ie: imagesc) {
+ final MultiProtocolURI uri = ie.url();
+ imgtags[c] = ie.toString();
+ imgprots[c] = uri.getProtocol();
+ imgstubs[c] = uri.toString().substring(imgprots[c].length() + 3);
+ imgalts[c] = ie.alt();
+ c++;
}
+ addSolr(solrdoc, "imagescount_i", imgtags.length);
+ if (isEmpty() || contains("attr_images_tag")) addSolr(solrdoc, "attr_images_tag", imgtags);
+ if (isEmpty() || contains("attr_images_protocol")) addSolr(solrdoc, "attr_images_protocol", imgprots);
+ if (isEmpty() || contains("attr_images_urlstub")) addSolr(solrdoc, "attr_images_urlstub", imgstubs);
+ if (isEmpty() || contains("attr_images_alt")) addSolr(solrdoc, "attr_images_alt", imgalts);
// style sheets
if (isEmpty() || contains("attr_css")) {
final Map csss = html.getCSS();
- final String[] css = new String[csss.size()];
+ final String[] css_tag = new String[csss.size()];
+ final String[] css_url = new String[csss.size()];
c = 0;
for (final Map.Entry entry: csss.entrySet()) {
- css[c++] =
+ final String url = entry.getKey().toNormalform(false, false, false, false);
+ css_tag[c] =
"";
+ " href=\""+ url + "\" />";
+ css_url[c] = url;
+ c++;
}
- addSolr(solrdoc, "csscount_i", css.length);
- if (css.length > 0) addSolr(solrdoc, "attr_css", css);
+ addSolr(solrdoc, "csscount_i", css_tag.length);
+ if (css_tag.length > 0) addSolr(solrdoc, "attr_css_tag", css_tag);
+ if (css_url.length > 0) addSolr(solrdoc, "attr_css_url", css_url);
}
// Scripts
diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java
index 53755465b..d4f67ca5a 100644
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@@ -447,6 +447,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (h.length() > 0) this.headlines[5].add(h);
} else if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) {
this.title = recursiveParse(text);
+ this.evaluationScores.match(Element.title, this.title);
} else if ((tagname.equalsIgnoreCase("b")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) this.bold.inc(h);
diff --git a/source/net/yacy/document/parser/html/Evaluation.java b/source/net/yacy/document/parser/html/Evaluation.java
index 006c75060..b2e2a00bd 100644
--- a/source/net/yacy/document/parser/html/Evaluation.java
+++ b/source/net/yacy/document/parser/html/Evaluation.java
@@ -62,6 +62,7 @@ public class Evaluation {
public static enum Element {
text,
+ title,
bodyclass,
divid,
csspath,
diff --git a/source/net/yacy/document/parser/htmlParser.java b/source/net/yacy/document/parser/htmlParser.java
index dc2dc966b..c610c2d63 100644
--- a/source/net/yacy/document/parser/htmlParser.java
+++ b/source/net/yacy/document/parser/htmlParser.java
@@ -11,12 +11,12 @@
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
- *
+ *
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
- *
+ *
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
@@ -34,8 +34,6 @@ import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.regex.Pattern;
-import com.ibm.icu.text.CharsetDetector;
-
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.document.AbstractParser;
@@ -47,47 +45,49 @@ import net.yacy.document.parser.html.ScraperInputStream;
import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.util.FileUtils;
+import com.ibm.icu.text.CharsetDetector;
+
public class htmlParser extends AbstractParser implements Parser {
private static final Pattern patternUnderline = Pattern.compile("_");
public htmlParser() {
- super("HTML Parser");
- SUPPORTED_EXTENSIONS.add("htm");
- SUPPORTED_EXTENSIONS.add("html");
- SUPPORTED_EXTENSIONS.add("phtml");
- SUPPORTED_EXTENSIONS.add("shtml");
- SUPPORTED_EXTENSIONS.add("xhtml");
- SUPPORTED_EXTENSIONS.add("php");
- SUPPORTED_EXTENSIONS.add("php3");
- SUPPORTED_EXTENSIONS.add("php4");
- SUPPORTED_EXTENSIONS.add("php5");
- SUPPORTED_EXTENSIONS.add("cfm");
- SUPPORTED_EXTENSIONS.add("asp");
- SUPPORTED_EXTENSIONS.add("aspx");
- SUPPORTED_EXTENSIONS.add("tex");
- SUPPORTED_EXTENSIONS.add("txt");
+ super("HTML Parser");
+ this.SUPPORTED_EXTENSIONS.add("htm");
+ this.SUPPORTED_EXTENSIONS.add("html");
+ this.SUPPORTED_EXTENSIONS.add("phtml");
+ this.SUPPORTED_EXTENSIONS.add("shtml");
+ this.SUPPORTED_EXTENSIONS.add("xhtml");
+ this.SUPPORTED_EXTENSIONS.add("php");
+ this.SUPPORTED_EXTENSIONS.add("php3");
+ this.SUPPORTED_EXTENSIONS.add("php4");
+ this.SUPPORTED_EXTENSIONS.add("php5");
+ this.SUPPORTED_EXTENSIONS.add("cfm");
+ this.SUPPORTED_EXTENSIONS.add("asp");
+ this.SUPPORTED_EXTENSIONS.add("aspx");
+ this.SUPPORTED_EXTENSIONS.add("tex");
+ this.SUPPORTED_EXTENSIONS.add("txt");
//SUPPORTED_EXTENSIONS.add("js");
- SUPPORTED_EXTENSIONS.add("jsp");
- SUPPORTED_EXTENSIONS.add("mf");
- SUPPORTED_EXTENSIONS.add("pl");
- SUPPORTED_EXTENSIONS.add("py");
- SUPPORTED_MIME_TYPES.add("text/html");
- SUPPORTED_MIME_TYPES.add("text/xhtml+xml");
- SUPPORTED_MIME_TYPES.add("application/xhtml+xml");
- SUPPORTED_MIME_TYPES.add("application/x-httpd-php");
- SUPPORTED_MIME_TYPES.add("application/x-tex");
- SUPPORTED_MIME_TYPES.add("text/plain");
- SUPPORTED_MIME_TYPES.add("text/sgml");
- SUPPORTED_MIME_TYPES.add("text/csv");
+ this.SUPPORTED_EXTENSIONS.add("jsp");
+ this.SUPPORTED_EXTENSIONS.add("mf");
+ this.SUPPORTED_EXTENSIONS.add("pl");
+ this.SUPPORTED_EXTENSIONS.add("py");
+ this.SUPPORTED_MIME_TYPES.add("text/html");
+ this.SUPPORTED_MIME_TYPES.add("text/xhtml+xml");
+ this.SUPPORTED_MIME_TYPES.add("application/xhtml+xml");
+ this.SUPPORTED_MIME_TYPES.add("application/x-httpd-php");
+ this.SUPPORTED_MIME_TYPES.add("application/x-tex");
+ this.SUPPORTED_MIME_TYPES.add("text/plain");
+ this.SUPPORTED_MIME_TYPES.add("text/sgml");
+ this.SUPPORTED_MIME_TYPES.add("text/csv");
}
-
+
public static ContentScraper parseToScraper(
- final MultiProtocolURI location,
- final String documentCharset,
+ final MultiProtocolURI location,
+ final String documentCharset,
InputStream sourceStream) throws Parser.Failure, IOException {
-
+
// make a scraper
String charset = null;
@@ -95,72 +95,72 @@ public class htmlParser extends AbstractParser implements Parser {
if (documentCharset != null) {
charset = patchCharsetEncoding(documentCharset);
}
-
+
// nothing found: try to find a meta-tag
if (charset == null) {
try {
final ScraperInputStream htmlFilter = new ScraperInputStream(sourceStream,documentCharset,location,null,false);
sourceStream = htmlFilter;
charset = htmlFilter.detectCharset();
- } catch (IOException e1) {
+ } catch (final IOException e1) {
throw new Parser.Failure("Charset error:" + e1.getMessage(), location);
}
}
// the author didn't tell us the encoding, try the mozilla-heuristic
if (charset == null) {
- CharsetDetector det = new CharsetDetector();
+ final CharsetDetector det = new CharsetDetector();
det.enableInputFilter(true);
- InputStream detStream = new BufferedInputStream(sourceStream);
+ final InputStream detStream = new BufferedInputStream(sourceStream);
det.setText(detStream);
charset = det.detect().getName();
sourceStream = detStream;
}
-
+
// wtf? still nothing, just take system-standard
if (charset == null) {
charset = Charset.defaultCharset().name();
}
-
+
Charset c;
try {
c = Charset.forName(charset);
- } catch (IllegalCharsetNameException e) {
+ } catch (final IllegalCharsetNameException e) {
c = Charset.defaultCharset();
- } catch (UnsupportedCharsetException e) {
+ } catch (final UnsupportedCharsetException e) {
c = Charset.defaultCharset();
}
-
+
// parsing the content
- final ContentScraper scraper = new ContentScraper(location);
+ final ContentScraper scraper = new ContentScraper(location);
final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false);
try {
FileUtils.copy(sourceStream, writer, c);
- } catch (IOException e) {
+ } catch (final IOException e) {
throw new Parser.Failure("IO error:" + e.getMessage(), location);
} finally {
sourceStream.close();
writer.close();
}
- //OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
+ //OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
//serverFileUtils.copy(sourceFile, hfos);
//hfos.close();
if (writer.binarySuspect()) {
final String errorMsg = "Binary data found in resource";
- throw new Parser.Failure(errorMsg, location);
+ throw new Parser.Failure(errorMsg, location);
}
return scraper;
}
public Document[] parse(
- final MultiProtocolURI location,
- final String mimeType,
- final String documentCharset,
+ final MultiProtocolURI location,
+ final String mimeType,
+ final String documentCharset,
final InputStream sourceStream) throws Parser.Failure, InterruptedException {
-
+
try {
return transformScraper(location, mimeType, documentCharset, parseToScraper(location, documentCharset, sourceStream));
- } catch (IOException e) {
+ } catch (final IOException e) {
throw new Parser.Failure("IOException in htmlParser: " + e.getMessage(), location);
}
}
@@ -197,7 +197,7 @@ public class htmlParser extends AbstractParser implements Parser {
scraper.getRSS(),
scraper.getImages(),
scraper.indexingDenied())};
- //scraper.close();
+ //scraper.close();
for (final Document ppd: ppds) {
ppd.setFavicon(scraper.getFavicon());
}
@@ -214,10 +214,10 @@ public class htmlParser extends AbstractParser implements Parser {
* @return patched encoding name
*/
public static String patchCharsetEncoding(String encoding) {
-
+
// do nothing with null
if ((encoding == null) || (encoding.length() < 3)) return null;
-
+
// trim encoding string
encoding = encoding.trim();
@@ -228,7 +228,7 @@ public class htmlParser extends AbstractParser implements Parser {
// all other names but such with "windows" use uppercase
if (encoding.startsWith("WINDOWS")) encoding = "windows" + encoding.substring(7);
if (encoding.startsWith("MACINTOSH")) encoding = "MacRoman";
-
+
// fix wrong fill characters
encoding = patternUnderline.matcher(encoding).replaceAll("-");
@@ -236,7 +236,7 @@ public class htmlParser extends AbstractParser implements Parser {
if (encoding.matches(".*UTF[-_]?8.*")) return "UTF-8";
if (encoding.startsWith("US")) return "US-ASCII";
if (encoding.startsWith("KOI")) return "KOI8-R";
-
+
// patch missing '-'
if (encoding.startsWith("windows") && encoding.length() > 7) {
final char c = encoding.charAt(7);
@@ -244,7 +244,7 @@ public class htmlParser extends AbstractParser implements Parser {
encoding = "windows-" + encoding.substring(7);
}
}
-
+
if (encoding.startsWith("ISO")) {
// patch typos
if (encoding.length() > 3) {
@@ -256,11 +256,11 @@ public class htmlParser extends AbstractParser implements Parser {
if (encoding.length() > 8) {
final char c = encoding.charAt(8);
if ((c >= '0') && (c <= '9')) {
- encoding = encoding.substring(0, 8) + "-" + encoding.substring(8);
- }
+ encoding = encoding.substring(0, 8) + "-" + encoding.substring(8);
+ }
}
}
-
+
// patch wrong name
if (encoding.startsWith("ISO-8559")) {
// popular typo
@@ -279,26 +279,26 @@ public class htmlParser extends AbstractParser implements Parser {
return encoding;
}
-
- public static void main(String[] args) {
+
+ public static void main(final String[] args) {
// test parsing of a url
MultiProtocolURI url;
try {
url = new MultiProtocolURI(args[0]);
- byte[] content = url.get(ClientIdentification.getUserAgent(), 3000);
- Document[] document = new htmlParser().parse(url, "text/html", null, new ByteArrayInputStream(content));
- String title = document[0].dc_title();
+ final byte[] content = url.get(ClientIdentification.getUserAgent(), 3000);
+ final Document[] document = new htmlParser().parse(url, "text/html", null, new ByteArrayInputStream(content));
+ final String title = document[0].dc_title();
System.out.println(title);
System.out.println(CharacterCoding.unicode2html(title, false));
- } catch (MalformedURLException e) {
+ } catch (final MalformedURLException e) {
e.printStackTrace();
- } catch (IOException e) {
+ } catch (final IOException e) {
e.printStackTrace();
- } catch (Parser.Failure e) {
+ } catch (final Parser.Failure e) {
e.printStackTrace();
- } catch (InterruptedException e) {
+ } catch (final InterruptedException e) {
e.printStackTrace();
}
}
-
+
}
diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java
index c7246a50c..330579aad 100644
--- a/source/net/yacy/repository/LoaderDispatcher.java
+++ b/source/net/yacy/repository/LoaderDispatcher.java
@@ -147,7 +147,7 @@ public final class LoaderDispatcher {
FileUtils.copy(b, tmp);
tmp.renameTo(targetFile);
}
-
+
public Response load(final Request request, final CacheStrategy cacheStrategy, final boolean checkBlacklist) throws IOException {
return load(request, cacheStrategy, protocolMaxFileSize(request.url()), checkBlacklist);
}
@@ -274,7 +274,7 @@ public final class LoaderDispatcher {
if (response != null && response.getContent() != null) {
// we got something. Now check if we want to store that to the cache
// first check looks if we want to store the content to the cache
- if (!crawlProfile.storeHTCache()) {
+ if (crawlProfile == null || !crawlProfile.storeHTCache()) {
// no caching wanted. Thats ok, do not write any message
return response;
}
@@ -294,7 +294,7 @@ public final class LoaderDispatcher {
throw new IOException("Unsupported protocol '" + protocol + "' in url " + url);
}
-
+
private int protocolMaxFileSize(final DigestURI url) {
if (url.isHTTP() || url.isHTTPS())
return this.sb.getConfigInt("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
diff --git a/startYACY.sh b/startYACY.sh
index 47f9adbe8..481618c83 100755
--- a/startYACY.sh
+++ b/startYACY.sh
@@ -124,6 +124,8 @@ then
then
ENABLEHUGEPAGES=1
fi
+ # the G1 GC is available in Java 7 but is not the default there (it became the default in Java 9); uncomment the next line to try it
+ # JAVA_ARGS="$JAVA_ARGS -XX:+UnlockExperimentalVMOptions -XX:+UseG1GC"
elif [ $OS = "SunOS" ]
then
# the UseConcMarkSweepGC option caused a full CPU usage - bug on Darwin.