Bugfixed UTF-8 decoding and parser.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@346 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 20 years ago
parent 63f9570d3a
commit 712fe9ef18

@ -121,8 +121,9 @@ public class CacheAdmin_p {
else {
htmlFilterContentScraper scraper = new htmlFilterContentScraper(url);
OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
plasmaParserDocument document = switchboard.parser.transformScraper(url, "text/html", scraper);
serverFileUtils.copy(file, os);
os.flush();
plasmaParserDocument document = switchboard.parser.transformScraper(url, "text/html", scraper);
info += "<b>HEADLINE:</b><br>" + scraper.getHeadline() + "<br><br>";
info += "<b>HREF:</b><br>" + formatAnchor(document.getHyperlinks()) + "<br>";
info += "<b>MEDIA:</b><br>" + formatAnchor(document.getMedialinks()) + "<br>";
@ -130,7 +131,7 @@ public class CacheAdmin_p {
info += "<b>TEXT:</b><br><span class=\"small\">" + new String(scraper.getText()) + "</span><br>";
info += "<b>LINES:</b><br><span class=\"small\">";
String[] sentences = document.getSentences();
for (int i = 0; i < sentences.length; i++) info += sentences + "<br>";
for (int i = 0; i < sentences.length; i++) info += sentences[i] + "<br>";
info += "</span><br>";
}
} catch (Exception e) {

File diff suppressed because one or more lines are too long

@ -41,6 +41,7 @@
package de.anomic.htmlFilter;
import java.net.URL;
import java.net.MalformedURLException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
@ -178,4 +179,14 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
System.out.println("TEXT :" + new String(text.getBytes()));
}
/**
 * Ad-hoc smoke test: scrapes a fixed string containing a German umlaut
 * through the content scraper and prints the extracted text, so charset
 * round-tripping can be checked by eye on the console.
 */
public static void main(String[] args) {
    String test = "Nokia kürzt bei Forschung und Entwicklung";
    try {
        htmlFilterContentScraper scraper = new htmlFilterContentScraper(new URL("http://localhost"));
        // NOTE(review): getBytes() uses the platform default charset — presumably
        // UTF-8 is intended here (the commit is a UTF-8 bugfix); confirm.
        scraper.scrapeText(test.getBytes());
        System.out.println(new String(scraper.getText()));
    } catch (MalformedURLException e) {
        // Fix: previously an empty catch silently swallowed the error,
        // making the smoke test pass vacuously. Report it instead.
        e.printStackTrace();
    }
}
}

@ -149,8 +149,8 @@ public final class plasmaParser {
* @see #initMediaExt(String)
*/
static {
initMediaExt(extString2extList("swf,wmv,jpg,jpeg,jpe,rm,mov,mpg,mpeg,mp3,asf,gif,png,avi,zip,rar," +
"sit,hqx,img,dmg,tar,gz,ps,xls,ppt,ram,bz2,arj"));
initMediaExt(extString2extList("swf,wmv,jpg,jpeg,jpe,rm,mov,mpg,mpeg,mp3,asf,gif,png,avi,zip,rar," +
"sit,hqx,img,dmg,tar,gz,ps,xls,ppt,ram,bz2,arj"));
/* ===================================================
* initializing the parser object pool
@ -383,21 +383,21 @@ public final class plasmaParser {
private static void loadEnabledParserList() {
// loading a list of availabe parser from file
Properties prop = new Properties();
Properties prop = new Properties();
BufferedInputStream bufferedIn = null;
try {
prop.load(bufferedIn = new BufferedInputStream(new FileInputStream(new File("yacy.parser"))));
} catch (IOException e) {
System.err.println("ERROR: yacy.parser not found in settings path");
} finally {
try {
prop.load(bufferedIn = new BufferedInputStream(new FileInputStream(new File("yacy.parser"))));
} catch (IOException e) {
System.err.println("ERROR: yacy.parser not found in settings path");
} finally {
if (bufferedIn != null) try{ bufferedIn.close(); }catch(Exception e){}
}
// enable them ...
setEnabledParserList(prop.keySet());
}
private static void loadAvailableParserList() {
}
private static void loadAvailableParserList() {
try {
plasmaParser.availableParserList.clear();
@ -405,24 +405,24 @@ public final class plasmaParser {
String javaClassPath = System.getProperty("java.class.path");
// getting the current package name
String plasmaParserPkgName = plasmaParser.class.getPackage().getName() + ".parser";
String plasmaParserPkgName = plasmaParser.class.getPackage().getName() + ".parser";
serverLog.logInfo("PARSER","Searching for additional content parsers in package " + plasmaParserPkgName);
// getting an uri to the parser subpackage
String packageURI = plasmaParser.class.getResource("/"+plasmaParserPkgName.replace('.','/')).toString();
serverLog.logDebug("PARSER", "Parser directory is " + packageURI);
String packageURI = plasmaParser.class.getResource("/"+plasmaParserPkgName.replace('.','/')).toString();
serverLog.logDebug("PARSER", "Parser directory is " + packageURI);
// open the parser directory
File parserDir = new File(new URI(packageURI));
File parserDir = new File(new URI(packageURI));
if ((parserDir == null) || (!parserDir.exists()) || (!parserDir.isDirectory())) return;
/*
* loop through all subdirectories and test if we can
/*
* loop through all subdirectories and test if we can
* find an additional parser class
*/
File[] parserDirectories = parserDir.listFiles(parserDirectoryFilter);
if (parserDirectories == null) return;
for (int parserDirNr=0; parserDirNr< parserDirectories.length; parserDirNr++) {
for (int parserDirNr=0; parserDirNr< parserDirectories.length; parserDirNr++) {
File currentDir = parserDirectories[parserDirNr];
serverLog.logDebug("PARSER", "Searching in directory " + currentDir.toString());
String[] parserClasses = currentDir.list(parserFileNameFilter);
@ -432,7 +432,7 @@ public final class plasmaParser {
serverLog.logDebug("PARSER", "Testing parser class " + parserClasses[parserNr]);
String className = parserClasses[parserNr].substring(0,parserClasses[parserNr].indexOf(".class"));
String fullClassName = plasmaParserPkgName + "." + currentDir.getName() + "." + className;
try {
try {
// trying to load the parser class by its name
Class parserClass = Class.forName(fullClassName);
Object theParser = parserClass.newInstance();
@ -446,7 +446,7 @@ public final class plasmaParser {
throw new ParserException("Missing dependency detected: '" + neededLibx[libxId] + "'.");
}
}
}
}
// loading the list of mime-types that are supported by this parser class
Hashtable supportedMimeTypes = ((Parser)theParser).getSupportedMimeTypes();
@ -456,31 +456,31 @@ public final class plasmaParser {
availableParserList.put(mimeType,fullClassName);
serverLog.logInfo("PARSER", "Found functional parser for mimeType '" + mimeType + "'.");
}
} catch (Exception e) { /* we can ignore this for the moment */
} catch (Exception e) { /* we can ignore this for the moment */
serverLog.logWarning("PARSER", "Parser '" + className + "' doesn't work correctly and will be ignored.\n [" + e.getClass().getName() + "]: " + e.getMessage());
} catch (Error e) { /* we can ignore this for the moment */
} catch (Error e) { /* we can ignore this for the moment */
serverLog.logWarning("PARSER", "Parser '" + className + "' doesn't work correctly and will be ignored.\n [" + e.getClass().getName() + "]: " + e.getMessage());
}
}
}
}
} catch (Exception e) {
serverLog.logError("PARSER", "Unable to determine all installed parsers. " + e.getMessage());
}
}
public void close() {
}
}
public void close() {
// clearing the parser list
synchronized (this.enabledParserList) {
this.enabledParserList.clear();
}
this.enabledParserList.clear();
}
// closing the parser object pool
try {
this.theParserPool.close();
} catch (Exception e) { }
}
try {
this.theParserPool.close();
} catch (Exception e) { }
}
public plasmaParserDocument parseSource(URL location, String mimeType, byte[] source) {
@ -498,7 +498,6 @@ public final class plasmaParser {
// ... otherwise we make a html scraper and transformer
htmlFilterContentScraper scraper = new htmlFilterContentScraper(location);
OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
hfos.write(source);
hfos.close();
return transformScraper(location, mimeType, scraper);
@ -660,22 +659,24 @@ public final class plasmaParser {
return v;
}
public static void main(String[] args) {
//javac -classpath lib/commons-collections.jar:lib/commons-pool-1.2.jar -sourcepath source source/de/anomic/plasma/plasmaParser.java
//java -cp source:lib/commons-collections.jar:lib/commons-pool-1.2.jar de.anomic.plasma.plasmaParser bug.html bug.out
try {
File in = new File(args[0]);
File out = new File(args[1]);
plasmaParser theParser = new plasmaParser();
public static void main(String[] args) {
//javac -classpath lib/commons-collections.jar:lib/commons-pool-1.2.jar -sourcepath source source/de/anomic/plasma/plasmaParser.java
//java -cp source:lib/commons-collections.jar:lib/commons-pool-1.2.jar de.anomic.plasma.plasmaParser bug.html bug.out
try {
File in = new File(args[0]);
//File out = new File(args[1]);
plasmaParser theParser = new plasmaParser();
theParser.initRealtimeParsableMimeTypes("application/xhtml+xml,text/html,text/plain");
theParser.initParseableMimeTypes("application/atom+xml,application/gzip,application/java-archive,application/msword,application/octet-stream,application/pdf,application/rdf+xml,application/rss+xml,application/rtf,application/x-gzip,application/x-tar,application/xml,application/zip,text/rss,text/rtf,text/xml,application/x-bzip2");
FileInputStream theInput = new FileInputStream(in);
ByteArrayOutputStream theOutput = new ByteArrayOutputStream();
serverFileUtils.copy(theInput, theOutput);
plasmaParserDocument document = theParser.parseSource(new URL("http://brain/~theli/test.pdf"), null, theOutput.toByteArray());
//plasmaParserDocument document = theParser.parseSource(new URL("http://brain.yacy"), "application/pdf", theOutput.toByteArray());
byte[] theText = document.getText();
serverFileUtils.write(theText, out);
FileInputStream theInput = new FileInputStream(in);
ByteArrayOutputStream theOutput = new ByteArrayOutputStream();
serverFileUtils.copy(theInput, theOutput);
plasmaParserDocument document = theParser.parseSource(new URL("http://brain/~theli/test.pdf"), null, theOutput.toByteArray());
//plasmaParserDocument document = theParser.parseSource(new URL("http://brain.yacy"), "application/pdf", theOutput.toByteArray());
//byte[] theText = document.getText();
//serverFileUtils.write(theText, out);
String[] sentences = document.getSentences();
for (int i = 0; i < sentences.length; i++) System.out.println("line " + i + ":" + sentences[i]);
} catch (Exception e) {
e.printStackTrace();
}

@ -105,6 +105,7 @@ public class plasmaSnippetCache {
}
public result retrieve(URL url, Set queryhashes, boolean fetchOnline) {
// heise = "0OQUNU3JSs05"
if (queryhashes.size() == 0) {
//System.out.println("found no queryhashes for url retrieve " + url);
return new result(null, SOURCE_ERROR, "no query hashes given");
@ -250,7 +251,7 @@ public class plasmaSnippetCache {
} catch (IOException e) {}
if (header == null) {
String filename = url.getFile();
String filename = cacheManager.getCachePath(url).getName();
int p = filename.lastIndexOf('.');
if ((p < 0) ||
((p >= 0) && (plasmaParser.supportedFileExtContains(filename.substring(p + 1))))) {

@ -371,6 +371,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
deployThread("99_indexcachemigration", "index cache migration", "migration of index cache data structures 0.37 -> 0.38",
new serverInstantThread(classicCache, "oneStepMigration", "size"), 30000);
}
// test routine for snippet fetch
// url = /www.heise.de/mobil/newsticker/meldung/mail/54980
Set query = new HashSet(); query.add("0OQUNU3JSs05"); // 'heise'
//plasmaSnippetCache.result scr = snippetCache.retrieve(new URL("http://www.heise.de/mobil/newsticker/meldung/mail/54980"), query, true);
plasmaSnippetCache.result scr = snippetCache.retrieve(new URL("http://www.heise.de/security/news/foren/go.shtml?read=1&msg_id=7301419&forum_id=72721"), query, true);
}
private static String ppRamString(int bytes) {

@ -90,12 +90,12 @@ public final class serverFileUtils {
FileInputStream fis = null;
FileOutputStream fos = null;
try {
fis = new FileInputStream(source);
fos = new FileOutputStream(dest);
copy(fis, fos);
fis = new FileInputStream(source);
fos = new FileOutputStream(dest);
copy(fis, fos);
} finally {
if (fis != null) try {fis.close();} catch (Exception e) {}
if (fos != null) try {fos.close();} catch (Exception e) {}
if (fos != null) try {fos.close();} catch (Exception e) {}
}
}
@ -107,16 +107,16 @@ public final class serverFileUtils {
}
/**
 * Reads the whole file into a byte array.
 *
 * The buffer is sized from File.length(), and the read loop keeps calling
 * read() until the buffer is full or EOF, because a single read() call is
 * not guaranteed to fill the requested range.
 *
 * @param source file to read (must fit in memory; length must fit in an int)
 * @return the file content; if the file shrinks between length() and the
 *         reads, the tail of the returned array stays zero-filled
 * @throws IOException if the file cannot be opened or read
 */
public static byte[] read(File source) throws IOException {
    byte[] buffer = new byte[(int) source.length()];
    InputStream fis = null;
    try {
        fis = new FileInputStream(source);
        int p = 0, c;
        while ((c = fis.read(buffer, p, buffer.length - p)) > 0) p += c;
    } finally {
        // always release the file handle, even on a failed read
        if (fis != null) try { fis.close(); } catch (Exception e) {}
    }
    return buffer;
}
public static byte[] readAndZip(File source) throws IOException {

@ -148,6 +148,10 @@ public final class yacy {
plasmaSwitchboard sb = new plasmaSwitchboard(homePath, "yacy.init", "DATA/SETTINGS/httpProxy.conf");
// hardcoded, forced, temporary value-migration
sb.setConfig("htTemplatePath", "htroot/env/templates");
sb.setConfig("parseableExt", "html,htm,txt,php,shtml,asp");
// if we are running an SVN version, we try to detect the used svn revision now ...
if (vString.equals("@" + "REPL_VERSION" + "@")) {
Properties buildProp = new Properties();
@ -188,9 +192,6 @@ public final class yacy {
if (timeout < 60000) timeout = 60000;
int maxSessions = Integer.parseInt(sb.getConfig("httpdMaxSessions", "100"));
// hardcoded, forced, temporary value-migration
sb.setConfig("htTemplatePath", "htroot/env/templates");
// create some directories
File htRootPath = new File(sb.getRootPath(), sb.getConfig("htRootPath", "htroot"));
File htDocsPath = new File(sb.getRootPath(), sb.getConfig("htDocsPath", "DATA/HTDOCS"));

@ -100,7 +100,7 @@ parseableMimeTypes=
# this is important for recognizing <a href> tags that reference non-HTML content
# These files will be excluded from indexing _(Please keep extensions in alphabetical order)_
mediaExt=ace,arj,asf,avi,bin,bz2,css,deb,doc,dmg,gif,gz,hqx,img,iso,jar,jpe,jpg,jpeg,mpeg,mov,mp3,mpg,ogg,png,pdf,ppt,ps,ram,rar,rm,rpm,sit,swf,sxc,sxd,sxi,sxw,tar,tgz,torrent,wmv,xcf,xls,zip
parseableExt=html,htm,txt
parseableExt=html,htm,txt,php,shtml,asp
# Promotion Strings
# These strings appear in the Web Mask of the YACY search client

Loading…
Cancel
Save