From bfb0d4c69b893a6d1c12bc4bafa5e552778072ea Mon Sep 17 00:00:00 2001 From: reger Date: Tue, 9 Oct 2012 20:02:58 +0200 Subject: [PATCH] - add language detection from tag - add jaudiotagger jar to Netbeans-IDE project classpath --- nbproject/project.xml | 2 +- source/net/yacy/document/parser/html/ContentScraper.java | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/nbproject/project.xml b/nbproject/project.xml index 0fd350051..c77cd1991 100644 --- a/nbproject/project.xml +++ b/nbproject/project.xml @@ -77,7 +77,7 @@ source htroot - lib/activation.jar;lib/apache-mime4j-0.6.jar;lib/apache-solr-core-3.6.1.jar;lib/apache-solr-solrj-3.6.1.jar;lib/arq-2.8.7.jar;lib/bcmail-jdk15-145.jar;lib/bcprov-jdk15-145.jar;lib/commons-codec-1.6.jar;lib/commons-compress-1.4.1.jar;lib/commons-fileupload-1.2.2.jar;lib/commons-httpclient-3.1.jar;lib/commons-io-2.1.jar;lib/commons-jxpath-1.3.jar;lib/commons-lang-2.6.jar;lib/commons-logging-1.1.1.jar;lib/fontbox-1.7.0.jar;lib/geronimo-stax-api_1.0_spec-1.0.1.jar;lib/guava-r05.jar;lib/htmllexer.jar;lib/htmlparser.jar;lib/httpclient-4.2.1.jar;lib/httpcore-4.2.2.jar;lib/httpmime-4.2.1.jar;lib/icu4j-core.jar;lib/iri-0.8.jar;lib/J7Zip-modified.jar;lib/jakarta-oro-2.0.8.jar;lib/jcifs-1.3.15.jar;lib/jcl-over-slf4j-1.6.1.jar;lib/jempbox-1.7.0.jar;lib/jena-2.6.4.jar;lib/jetty-6.1.26-patched-JETTY-1340.jar;lib/jetty-util-6.1.26-patched-JETTY-1340.jar;lib/jsch-0.1.42.jar;lib/json-simple-1.1.jar;lib/jsoup-1.6.3.jar;lib/log4j-1.2.16.jar;lib/log4j-over-slf4j-1.6.1.jar;lib/lucene-analyzers-3.6.1.jar;lib/lucene-core-3.6.1.jar;lib/lucene-highlighter-3.6.1.jar;lib/lucene-phonetic-3.6.1.jar;lib/lucene-spatial-3.6.1.jar;lib/lucene-spellchecker-3.6.1.jar;lib/metadata-extractor-2.4.0-beta-1.jar;lib/mysql-connector-java-5.1.12-bin.jar;lib/pdfbox-1.7.0.jar;lib/poi-3.6-20091214.jar;lib/poi-scratchpad-3.6-20091214.jar;lib/sax-2.0.1.jar;lib/servlet-api-2.5-20081211.jar;lib/slf4j-api-1.6.1.jar;lib/slf4j-jdk14-1.6.1.jar;lib/webcat-0.1-swf.jar;lib/ws
tx-asl-3.2.7.jar;lib/xercesImpl.jar;lib/xml-apis.jar + lib/activation.jar;lib/apache-mime4j-0.6.jar;lib/apache-solr-core-3.6.1.jar;lib/apache-solr-solrj-3.6.1.jar;lib/arq-2.8.7.jar;lib/bcmail-jdk15-145.jar;lib/bcprov-jdk15-145.jar;lib/commons-codec-1.6.jar;lib/commons-compress-1.4.1.jar;lib/commons-fileupload-1.2.2.jar;lib/commons-httpclient-3.1.jar;lib/commons-io-2.1.jar;lib/commons-jxpath-1.3.jar;lib/commons-lang-2.6.jar;lib/commons-logging-1.1.1.jar;lib/fontbox-1.7.0.jar;lib/geronimo-stax-api_1.0_spec-1.0.1.jar;lib/guava-r05.jar;lib/htmllexer.jar;lib/htmlparser.jar;lib/httpclient-4.2.1.jar;lib/httpcore-4.2.2.jar;lib/httpmime-4.2.1.jar;lib/icu4j-core.jar;lib/iri-0.8.jar;lib/J7Zip-modified.jar;lib/jakarta-oro-2.0.8.jar;lib/jaudiotagger-2.0.4-20111207.115108-15.jar;lib/jcifs-1.3.15.jar;lib/jcl-over-slf4j-1.6.1.jar;lib/jempbox-1.7.0.jar;lib/jena-2.6.4.jar;lib/jetty-6.1.26-patched-JETTY-1340.jar;lib/jetty-util-6.1.26-patched-JETTY-1340.jar;lib/jsch-0.1.42.jar;lib/json-simple-1.1.jar;lib/jsoup-1.6.3.jar;lib/log4j-1.2.16.jar;lib/log4j-over-slf4j-1.6.1.jar;lib/lucene-analyzers-3.6.1.jar;lib/lucene-core-3.6.1.jar;lib/lucene-highlighter-3.6.1.jar;lib/lucene-phonetic-3.6.1.jar;lib/lucene-spatial-3.6.1.jar;lib/lucene-spellchecker-3.6.1.jar;lib/metadata-extractor-2.4.0-beta-1.jar;lib/mysql-connector-java-5.1.12-bin.jar;lib/pdfbox-1.7.0.jar;lib/poi-3.6-20091214.jar;lib/poi-scratchpad-3.6-20091214.jar;lib/sax-2.0.1.jar;lib/servlet-api-2.5-20081211.jar;lib/slf4j-api-1.6.1.jar;lib/slf4j-jdk14-1.6.1.jar;lib/webcat-0.1-swf.jar;lib/wstx-asl-3.2.7.jar;lib/xercesImpl.jar;lib/xml-apis.jar 1.6 diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 2990de966..78e84a9dc 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -436,6 +436,10 @@ public class ContentScraper extends AbstractScraper implements Scraper { 
mergeAnchors(src, tagopts /* with property "name" */); this.iframes.add(src); this.evaluationScores.match(Element.iframepath, src.toNormalform(true, false)); + } else if (tagname.equalsIgnoreCase("html")) { + final String lang = tagopts.getProperty("lang", EMPTY_STRING); + if (!lang.isEmpty()) // fake a language meta to preserve detection from the html tag's lang attribute + this.metas.put("dc.language",lang.substring(0,2)); // fix found entries like "hu-hu" } // fire event