From 8a243500361bbba4a7a9977929c22a15c0f1d0b0 Mon Sep 17 00:00:00 2001 From: orbiter Date: Wed, 15 Apr 2009 10:26:24 +0000 Subject: [PATCH] - fix for join method with new generalized RWI data structure (caused by latest commit) - added more functions to mediawiki parser git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5806 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/mediawiki_p.java | 2 +- .../kelondro/text/ReferenceContainer.java | 166 ++++++++++-------- .../kelondro/text/ReferenceFactory.java | 4 +- .../WordReferenceFactory.java | 12 +- .../referencePrototype/WordReferenceRow.java | 1 + source/de/anomic/plasma/plasmaParser.java | 4 +- source/de/anomic/tools/mediawikiIndex.java | 82 +++++++-- 7 files changed, 167 insertions(+), 104 deletions(-) diff --git a/htroot/mediawiki_p.java b/htroot/mediawiki_p.java index 1e46c266f..3449da103 100644 --- a/htroot/mediawiki_p.java +++ b/htroot/mediawiki_p.java @@ -54,7 +54,7 @@ public class mediawiki_p { File dumpFile = new File(sb.getRootPath(), "DATA/HTCACHE/mediawiki/" + dump); if (!dumpFile.exists()) return post; mediawikiIndex.checkIndex(dumpFile); - mediawikiIndex.wikirecord w = mediawikiIndex.find(title.replaceAll(" ", "_"), mediawikiIndex.idxFromWikimediaXML(dumpFile)); + mediawikiIndex.wikisourcerecord w = mediawikiIndex.find(title.replaceAll(" ", "_"), mediawikiIndex.idxFromWikimediaXML(dumpFile)); if (w == null) { return post; } diff --git a/source/de/anomic/kelondro/text/ReferenceContainer.java b/source/de/anomic/kelondro/text/ReferenceContainer.java index 7f55a5c65..c04c2b632 100644 --- a/source/de/anomic/kelondro/text/ReferenceContainer.java +++ b/source/de/anomic/kelondro/text/ReferenceContainer.java @@ -47,26 +47,26 @@ import de.anomic.kelondro.util.ByteBuffer; * This class extends the RowSet with methods for the handling of * special ReferenceRow Row entry objects. */ -public class ReferenceContainer extends RowSet { +public class ReferenceContainer extends RowSet { private String termHash; - private ReferenceFactory factory; + private ReferenceFactory factory; - public ReferenceContainer(final ReferenceFactory factory, final String termHash, final RowSet collection) { + public ReferenceContainer(final ReferenceFactory factory, final String termHash, final RowSet collection) { super(collection); this.factory = factory; this.termHash = termHash; } - public ReferenceContainer(final ReferenceFactory factory, final String termHash, final Row rowdef, final int objectCount) { + public ReferenceContainer(final ReferenceFactory factory, final String termHash, final Row rowdef, final int objectCount) { super(rowdef, objectCount); this.termHash = termHash; this.factory = factory; this.lastTimeWrote = 0; } - public ReferenceContainer topLevelClone() { - final ReferenceContainer newContainer = new ReferenceContainer(this.factory, this.termHash, this.rowdef, this.size()); + public ReferenceContainer topLevelClone() { + final ReferenceContainer newContainer = new ReferenceContainer(this.factory, this.termHash, this.rowdef, this.size()); newContainer.addAllUnique(this); return newContainer; } @@ -93,8 +93,8 @@ public class ReferenceContainer extends RowSet { this.addUnique(entry.toKelondroEntry()); } - public ReferenceContainer merge(final ReferenceContainer c) { - return new ReferenceContainer(this.factory, this.termHash, super.merge(c)); + public ReferenceContainer merge(final ReferenceContainer c) { + return new ReferenceContainer(this.factory, this.termHash, super.merge(c)); } public Reference put(final Reference entry) { @@ -119,13 +119,13 @@ public class ReferenceContainer extends RowSet { return true; } - public int putAllRecent(final ReferenceContainer c) { + public int putAllRecent(final ReferenceContainer c) { // adds all entries in c and checks every entry for double-occurrence // returns the number of new elements if (c == null) return 0; int x = 0; synchronized (c) { - final Iterator i = c.entries(); + final Iterator i = c.entries(); while (i.hasNext()) { try { if (putRecent(i.next())) x++; @@ -138,10 +138,10 @@ public class ReferenceContainer extends RowSet { return x; } - public RT get(final String urlHash) { + public ReferenceType get(final String urlHash) { final Row.Entry entry = this.get(urlHash.getBytes()); if (entry == null) return null; - return this.factory.produce(entry, false); + return this.factory.produceSlow(entry); } /** @@ -149,10 +149,10 @@ public class ReferenceContainer extends RowSet { * if the url hash was found, return the entry, but delete the entry from the container * if the entry was not found, return null. */ - public RT remove(final String urlHash) { + public ReferenceType remove(final String urlHash) { final Row.Entry entry = remove(urlHash.getBytes()); if (entry == null) return null; - return this.factory.produce(entry, false); + return this.factory.produceSlow(entry); } public int removeEntries(final Set urlHashes) { @@ -162,12 +162,12 @@ public class ReferenceContainer extends RowSet { return count; } - public Iterator entries() { + public Iterator entries() { // returns an iterator of indexRWIEntry objects return new entryIterator(); } - public class entryIterator implements Iterator { + public class entryIterator implements Iterator { Iterator rowEntryIterator; @@ -179,10 +179,10 @@ public class ReferenceContainer extends RowSet { return rowEntryIterator.hasNext(); } - public RT next() { + public ReferenceType next() { final Row.Entry rentry = rowEntryIterator.next(); if (rentry == null) return null; - return factory.produce(rentry, false); + return factory.produceSlow(rentry); } public void remove() { @@ -216,10 +216,10 @@ public class ReferenceContainer extends RowSet { containerMergeMethod = meth; } - public static ReferenceContainer joinExcludeContainers( - final ReferenceFactory factory, - final Collection> includeContainers, - final Collection> excludeContainers, + public static ReferenceContainer joinExcludeContainers( + final ReferenceFactory factory, + final Collection> includeContainers, + final Collection> excludeContainers, final int maxDistance) { // join a search result and return the joincount (number of pages after join) @@ -227,22 +227,22 @@ public class ReferenceContainer extends RowSet { if (includeContainers == null) return ReferenceContainer.emptyContainer(factory, null, 0); // join the result - final ReferenceContainer rcLocal = ReferenceContainer.joinContainers(factory, includeContainers, maxDistance); + final ReferenceContainer rcLocal = ReferenceContainer.joinContainers(factory, includeContainers, maxDistance); if (rcLocal == null) return ReferenceContainer.emptyContainer(factory, null, 0); - excludeContainers(rcLocal, excludeContainers); + excludeContainers(factory, rcLocal, excludeContainers); return rcLocal; } - public static ReferenceContainer joinContainers( - final ReferenceFactory factory, - final Collection> containers, + public static ReferenceContainer joinContainers( + final ReferenceFactory factory, + final Collection> containers, final int maxDistance) { // order entities by their size - final TreeMap> map = new TreeMap>(); - ReferenceContainer singleContainer; - final Iterator> i = containers.iterator(); + final TreeMap> map = new TreeMap>(); + ReferenceContainer singleContainer; + final Iterator> i = containers.iterator(); int count = 0; while (i.hasNext()) { // get next entity: @@ -262,7 +262,7 @@ public class ReferenceContainer extends RowSet { // the map now holds the search results in order of number of hits per word // we now must pairwise build up a conjunction of these sets Long k = map.firstKey(); // the smallest, which means, the one with the least entries - ReferenceContainer searchA, searchB, searchResult = map.remove(k); + ReferenceContainer searchA, searchB, searchResult = map.remove(k); while ((map.size() > 0) && (searchResult.size() > 0)) { // take the first element of map which is a result and combine it with result k = map.firstKey(); // the next smallest... @@ -279,14 +279,17 @@ public class ReferenceContainer extends RowSet { return searchResult; } - public static ReferenceContainer excludeContainers(ReferenceContainer pivot, final Collection> containers) { + public static ReferenceContainer excludeContainers( + final ReferenceFactory factory, + ReferenceContainer pivot, + final Collection> containers) { // check if there is any result if ((containers == null) || (containers.size() == 0)) return pivot; // no result, nothing found - final Iterator> i = containers.iterator(); + final Iterator> i = containers.iterator(); while (i.hasNext()) { - pivot = excludeDestructive(pivot, i.next()); + pivot = excludeDestructive(factory, pivot, i.next()); if ((pivot == null) || (pivot.size() == 0)) return null; } @@ -300,10 +303,10 @@ public class ReferenceContainer extends RowSet { return l; } - public static ReferenceContainer joinConstructive( - final ReferenceFactory factory, - final ReferenceContainer i1, - final ReferenceContainer i2, + public static ReferenceContainer joinConstructive( + final ReferenceFactory factory, + final ReferenceContainer i1, + final ReferenceContainer i2, final int maxDistance) { if ((i1 == null) || (i2 == null)) return null; if ((i1.size() == 0) || (i2.size() == 0)) return null; @@ -324,51 +327,52 @@ public class ReferenceContainer extends RowSet { return joinConstructiveByEnumeration(factory, i1, i2, maxDistance); } - private static ReferenceContainer joinConstructiveByTest( - final ReferenceFactory factory, - final ReferenceContainer small, - final ReferenceContainer large, + private static ReferenceContainer joinConstructiveByTest( + final ReferenceFactory factory, + final ReferenceContainer small, + final ReferenceContainer large, final int maxDistance) { System.out.println("DEBUG: JOIN METHOD BY TEST, maxdistance = " + maxDistance); assert small.rowdef.equals(large.rowdef) : "small = " + small.rowdef.toString() + "; large = " + large.rowdef.toString(); final int keylength = small.rowdef.width(0); assert (keylength == large.rowdef.width(0)); - final ReferenceContainer conj = new ReferenceContainer(factory, null, small.rowdef, 0); // start with empty search result - final Iterator se = small.entries(); - RT ie0; - RT ie1; + final ReferenceContainer conj = new ReferenceContainer(factory, null, small.rowdef, 0); // start with empty search result + final Iterator se = small.entries(); + ReferenceType ie1; + ReferenceType ie2; while (se.hasNext()) { - ie0 = se.next(); - ie1 = large.get(ie0.metadataHash()); - if ((ie0 != null) && (ie1 != null)) { - assert (ie0.metadataHash().length() == keylength) : "ie0.urlHash() = " + ie0.metadataHash(); - assert (ie1.metadataHash().length() == keylength) : "ie1.urlHash() = " + ie1.metadataHash(); + ie1 = se.next(); + ie2 = large.get(ie1.metadataHash()); + if ((ie1 != null) && (ie2 != null)) { + assert (ie1.metadataHash().length() == keylength) : "ie0.urlHash() = " + ie1.metadataHash(); + assert (ie2.metadataHash().length() == keylength) : "ie1.urlHash() = " + ie2.metadataHash(); // this is a hit. Calculate word distance: - ie0.join(ie1); - if (ie0.distance() <= maxDistance) conj.add(ie0); + ie1 = factory.produceFast(ie2); + ie1.join(ie2); + if (ie1.distance() <= maxDistance) conj.add(ie1); } } return conj; } - private static ReferenceContainer joinConstructiveByEnumeration( - final ReferenceFactory factory, - final ReferenceContainer i1, - final ReferenceContainer i2, + private static ReferenceContainer joinConstructiveByEnumeration( + final ReferenceFactory factory, + final ReferenceContainer i1, + final ReferenceContainer i2, final int maxDistance) { System.out.println("DEBUG: JOIN METHOD BY ENUMERATION, maxdistance = " + maxDistance); assert i1.rowdef.equals(i2.rowdef) : "i1 = " + i1.rowdef.toString() + "; i2 = " + i2.rowdef.toString(); final int keylength = i1.rowdef.width(0); assert (keylength == i2.rowdef.width(0)); - final ReferenceContainer conj = new ReferenceContainer(factory, null, i1.rowdef, 0); // start with empty search result + final ReferenceContainer conj = new ReferenceContainer(factory, null, i1.rowdef, 0); // start with empty search result if (!((i1.rowdef.getOrdering().signature().equals(i2.rowdef.getOrdering().signature())))) return conj; // ordering must be equal - final Iterator e1 = i1.entries(); - final Iterator e2 = i2.entries(); + final Iterator e1 = i1.entries(); + final Iterator e2 = i2.entries(); int c; if ((e1.hasNext()) && (e2.hasNext())) { - RT ie1; - RT ie2; + ReferenceType ie1; + ReferenceType ie2; ie1 = e1.next(); ie2 = e2.next(); @@ -383,6 +387,7 @@ public class ReferenceContainer extends RowSet { if (e2.hasNext()) ie2 = e2.next(); else break; } else { // we have found the same urls in different searches! + ie1 = factory.produceFast(ie1); ie1.join(ie2); if (ie1.distance() <= maxDistance) conj.add(ie1); if (e1.hasNext()) ie1 = e1.next(); else break; @@ -393,9 +398,10 @@ public class ReferenceContainer extends RowSet { return conj; } - public static ReferenceContainer excludeDestructive( - final ReferenceContainer pivot, - final ReferenceContainer excl) { + public static ReferenceContainer excludeDestructive( + final ReferenceFactory factory, + final ReferenceContainer pivot, + final ReferenceContainer excl) { if (pivot == null) return null; if (excl == null) return pivot; if (pivot.size() == 0) return null; @@ -411,17 +417,17 @@ public class ReferenceContainer extends RowSet { if (stepsEnum > stepsTest) { return excludeDestructiveByTest(pivot, excl); } - return excludeDestructiveByEnumeration(pivot, excl); + return excludeDestructiveByEnumeration(factory, pivot, excl); } - private static ReferenceContainer excludeDestructiveByTest( - final ReferenceContainer pivot, - final ReferenceContainer excl) { + private static ReferenceContainer excludeDestructiveByTest( + final ReferenceContainer pivot, + final ReferenceContainer excl) { assert pivot.rowdef.equals(excl.rowdef) : "small = " + pivot.rowdef.toString() + "; large = " + excl.rowdef.toString(); final int keylength = pivot.rowdef.width(0); assert (keylength == excl.rowdef.width(0)); final boolean iterate_pivot = pivot.size() < excl.size(); - final Iterator se = (iterate_pivot) ? pivot.entries() : excl.entries(); + final Iterator se = (iterate_pivot) ? pivot.entries() : excl.entries(); Reference ie0, ie1; while (se.hasNext()) { ie0 = se.next(); @@ -435,17 +441,20 @@ public class ReferenceContainer extends RowSet { return pivot; } - private static ReferenceContainer excludeDestructiveByEnumeration(final ReferenceContainer pivot, final ReferenceContainer excl) { + private static ReferenceContainer excludeDestructiveByEnumeration( + final ReferenceFactory factory, + final ReferenceContainer pivot, + final ReferenceContainer excl) { assert pivot.rowdef.equals(excl.rowdef) : "i1 = " + pivot.rowdef.toString() + "; i2 = " + excl.rowdef.toString(); final int keylength = pivot.rowdef.width(0); assert (keylength == excl.rowdef.width(0)); if (!((pivot.rowdef.getOrdering().signature().equals(excl.rowdef.getOrdering().signature())))) return pivot; // ordering must be equal - final Iterator e1 = pivot.entries(); - final Iterator e2 = excl.entries(); + final Iterator e1 = pivot.entries(); + final Iterator e2 = excl.entries(); int c; if ((e1.hasNext()) && (e2.hasNext())) { - RT ie1; - RT ie2; + ReferenceType ie1; + ReferenceType ie2; ie1 = e1.next(); ie2 = e2.next(); @@ -460,6 +469,7 @@ public class ReferenceContainer extends RowSet { if (e2.hasNext()) ie2 = e2.next(); else break; } else { // we have found the same urls in different searches! + ie1 = factory.produceFast(ie1); ie1.join(ie2); e1.remove(); if (e1.hasNext()) ie1 = e1.next(); else break; @@ -479,12 +489,12 @@ public class ReferenceContainer extends RowSet { } - public static final ByteBuffer compressIndex(final ReferenceContainer inputContainer, final ReferenceContainer excludeContainer, final long maxtime) { + public static final ByteBuffer compressIndex(final ReferenceContainer inputContainer, final ReferenceContainer excludeContainer, final long maxtime) { // collect references according to domains final long timeout = (maxtime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxtime; final TreeMap doms = new TreeMap(); synchronized (inputContainer) { - final Iterator i = inputContainer.entries(); + final Iterator i = inputContainer.entries(); Reference iEntry; String dom, paths; while (i.hasNext()) { @@ -519,7 +529,7 @@ public class ReferenceContainer extends RowSet { return bb; } - public static final void decompressIndex(final TreeMap target, ByteBuffer ci, final String peerhash) { + public static final void decompressIndex(final TreeMap target, ByteBuffer ci, final String peerhash) { // target is a mapping from url-hashes to a string of peer-hashes if ((ci.byteAt(0) == '{') && (ci.byteAt(ci.length() - 1) == '}')) { //System.out.println("DEBUG-DECOMPRESS: input is " + ci.toString()); diff --git a/source/de/anomic/kelondro/text/ReferenceFactory.java b/source/de/anomic/kelondro/text/ReferenceFactory.java index 5317390eb..4306f922a 100644 --- a/source/de/anomic/kelondro/text/ReferenceFactory.java +++ b/source/de/anomic/kelondro/text/ReferenceFactory.java @@ -30,6 +30,8 @@ import de.anomic.kelondro.index.Row; public interface ReferenceFactory { - public ReferenceType produce(Row.Entry e, boolean fast); + public ReferenceType produceSlow(Row.Entry e); + + public ReferenceType produceFast(ReferenceType e); } diff --git a/source/de/anomic/kelondro/text/referencePrototype/WordReferenceFactory.java b/source/de/anomic/kelondro/text/referencePrototype/WordReferenceFactory.java index b88b697a5..7142a2fc4 100644 --- a/source/de/anomic/kelondro/text/referencePrototype/WordReferenceFactory.java +++ b/source/de/anomic/kelondro/text/referencePrototype/WordReferenceFactory.java @@ -31,11 +31,13 @@ import de.anomic.kelondro.text.ReferenceFactory; public class WordReferenceFactory implements ReferenceFactory { - public WordReference produce(Entry e, boolean fast) { - if (fast) - return new WordReferenceVars(new WordReferenceRow(e)); - else - return new WordReferenceRow(e); + public WordReference produceSlow(Entry e) { + return new WordReferenceRow(e); + } + + public WordReference produceFast(WordReference r) { + if (r instanceof WordReferenceVars) return r; + return new WordReferenceVars(r); } } diff --git a/source/de/anomic/kelondro/text/referencePrototype/WordReferenceRow.java b/source/de/anomic/kelondro/text/referencePrototype/WordReferenceRow.java index a573c611b..79b128aad 100644 --- a/source/de/anomic/kelondro/text/referencePrototype/WordReferenceRow.java +++ b/source/de/anomic/kelondro/text/referencePrototype/WordReferenceRow.java @@ -293,4 +293,5 @@ public final class WordReferenceRow extends AbstractReference implements WordRef throw new UnsupportedOperationException(""); } + } diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index d73318942..8e0d74444 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -632,7 +632,7 @@ public final class plasmaParser { // testing if parsing is supported for this resource if (!plasmaParser.supportedContent(location,mimeType)) { - final String errorMsg = "No parser available to parse mimetype '" + mimeType + "'"; + final String errorMsg = "No parser available to parse mimetype '" + mimeType + "' (1)"; theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg); throw new ParserException(errorMsg,location, "wrong mime type or wrong extension"); } @@ -654,7 +654,7 @@ public final class plasmaParser { } else if (HTMLParsableMimeTypesContains(mimeType)) { doc = parseHtml(location, mimeType, documentCharset, sourceStream); } else { - final String errorMsg = "No parser available to parse mimetype '" + mimeType + "'"; + final String errorMsg = "No parser available to parse mimetype '" + mimeType + "' (2)"; theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg); throw new ParserException(errorMsg,location, "wrong mime type or wrong extension"); } diff --git a/source/de/anomic/tools/mediawikiIndex.java b/source/de/anomic/tools/mediawikiIndex.java index 643f92650..ee88e831c 100644 --- a/source/de/anomic/tools/mediawikiIndex.java +++ b/source/de/anomic/tools/mediawikiIndex.java @@ -40,6 +40,7 @@ import java.io.InputStream; import java.io.PrintWriter; import java.io.RandomAccessFile; import java.io.UnsupportedEncodingException; +import java.net.MalformedURLException; import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.BlockingQueue; import java.util.concurrent.Callable; @@ -53,6 +54,10 @@ import java.util.concurrent.TimeoutException; import de.anomic.data.wiki.wikiCode; import de.anomic.data.wiki.wikiParser; import de.anomic.kelondro.util.ByteBuffer; +import de.anomic.plasma.plasmaParser; +import de.anomic.plasma.plasmaParserDocument; +import de.anomic.plasma.parser.ParserException; +import de.anomic.yacy.yacyURL; /* * this class provides data structures to read a mediawiki dump file in xml format @@ -68,6 +73,18 @@ public class mediawikiIndex { private static final byte[] pagestartb = pagestart.getBytes(); private static final byte[] pageendb = pageend.getBytes(); + private wikiParser wparser; + private plasmaParser hparser; + + public mediawikiIndex(String baseURL) throws MalformedURLException { + yacyURL u = new yacyURL(baseURL, null); + wparser = new wikiCode(u.getHost()); + hparser = new plasmaParser(); + // must be called before usage: + //plasmaParser.initHTMLParsableMimeTypes("text/html"); + //plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_CRAWLER, "text/html"); + } + public static void checkIndex(File wikimediaxml) { File idx = idxFromWikimediaXML(wikimediaxml); if (idx.exists()) return; @@ -136,20 +153,20 @@ public class mediawikiIndex { private static class indexProducer implements Callable { - private BlockingQueue entries; + private BlockingQueue entries; PrintWriter out; - private static wikirecord poison = new wikirecord("", 0, 0); + private static wikisourcerecord poison = new wikisourcerecord("", 0, 0); int count; public indexProducer(int bufferCount, File indexFile) throws IOException { - entries = new ArrayBlockingQueue(bufferCount); + entries = new ArrayBlockingQueue(bufferCount); out = new PrintWriter(new BufferedWriter(new FileWriter(indexFile))); count = 0; out.println(""); } - public void consume(wikirecord b) { + public void consume(wikisourcerecord b) { try { entries.put(b); } catch (InterruptedException e) { @@ -158,7 +175,7 @@ public class mediawikiIndex { } public Integer call() { - wikirecord r; + wikisourcerecord r; try { while(true) { r = entries.take(); @@ -205,7 +222,7 @@ public class mediawikiIndex { } public Integer call() { - wikirecord r; + wikisourcerecord r; wikiraw c; try { while(true) { @@ -215,7 +232,7 @@ public class mediawikiIndex { break; } try { - r = new wikirecord(c.b, c.start, c.end); + r = new wikisourcerecord(c.b, c.start, c.end); producer.consume(r); System.out.println("consumer / record start: " + r.start + ", title : " + r.title); count++; @@ -240,15 +257,15 @@ public class mediawikiIndex { } } - public static class wikirecord { + public static class wikisourcerecord { public long start, end; public String title; - public wikirecord(String title, long start, long end) { + public wikisourcerecord(String title, long start, long end) { this.title = title; this.start = start; this.end = end; } - public wikirecord(byte[] chunk, long start, long end) { + public wikisourcerecord(byte[] chunk, long start, long end) { String s; try { s = new String(chunk, "UTF-8"); @@ -271,6 +288,28 @@ public class mediawikiIndex { this.end = end; } } + public wikiparserrecord newRecord(String title, StringBuffer sb) { + return new wikiparserrecord(title, sb); + } + + public class wikiparserrecord { + public String title; + StringBuffer source; + String html; + yacyURL url; + plasmaParserDocument document; + public wikiparserrecord(String title, StringBuffer sb) { + this.title = title; + this.source = sb; + } + public void genHTML() throws MalformedURLException { + html = wparser.transform(source.toString()); + url = new yacyURL("http://de.wikipedia.org/wiki/" + title, null); + } + public void genDocument() throws InterruptedException, ParserException { + document = hparser.parseSource(url, "text/html", "utf-8", html.getBytes()); + } + } private static class PositionAwareReader { @@ -336,7 +375,7 @@ public class mediawikiIndex { return b; } - public static wikirecord find(String title, File f) throws IOException { + public static wikisourcerecord find(String title, File f) throws IOException { PositionAwareReader in = new PositionAwareReader(f); long start; String m = "" + title + ""; @@ -363,7 +402,7 @@ public class mediawikiIndex { if (q < 0) return null; int length = Integer.parseInt(s.substring(p, q)); //System.out.println("start = " + start + ", length = " + length); - return new wikirecord(title, start, start + length); + return new wikisourcerecord(title, start, start + length); } } return null; @@ -394,8 +433,10 @@ public class mediawikiIndex { StringBuffer sb = new StringBuffer(); boolean page = false, text = false; String title = null; - wikiParser wparser = new wikiCode("de.wikipedia.org"); - //plasmaParser hparser = new plasmaParser(); + plasmaParser.initHTMLParsableMimeTypes("text/html"); + plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_CRAWLER, "text/html"); + mediawikiIndex mi = new mediawikiIndex("http://de.wikipedia.org/wiki/"); + wikiparserrecord record; while ((t = r.readLine()) != null) { if (t.indexOf(pagestart) >= 0) { page = true; @@ -408,8 +449,15 @@ public class mediawikiIndex { if (t.indexOf(textend) >= 0) { text = false; System.out.println("Title: " + title); - System.out.println(wparser.transform(sb.toString())); - System.out.println(); + record = mi.newRecord(title, sb); + record.genHTML(); + try { + record.genDocument(); + System.out.println(new String(record.document.getTextBytes())); + System.out.println(); + } catch (InterruptedException e) { + } catch (ParserException e) { + } sb.setLength(0); continue; } @@ -454,7 +502,7 @@ public class mediawikiIndex { if (s[0].equals("-find")) { try { - wikirecord w = find(s[1], new File(s[2] + ".idx.xml")); + wikisourcerecord w = find(s[1], new File(s[2] + ".idx.xml")); if (w == null) { System.out.println("not found"); } else {