commit df51e4ef07
Binary file not shown.
@@ -1,7 +0,0 @@
This package is part of WebCAT (http://webcat.sourceforge.net/)

WebCAT was developed at the XLDB group of the Department of Informatics of the Faculty of Sciences of the University of Lisbon in Portugal.

WebCAT was written by Bruno Martins.

WebCAT is released under the BSD License. (http://www.opensource.org/licenses/bsd-license.php)
@@ -1,113 +0,0 @@
//swfParser.java
//------------------------
//part of YaCy
//(C) by Michael Peter Christen; mc@yacy.net
//first published on http://www.anomic.de
//Frankfurt, Germany, 2005
//
//this file is contributed by Marc Nause
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation; either version 2 of the License, or
//(at your option) any later version.
//
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package net.yacy.document.parser;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;

import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.ContentScraper;
import pt.tumba.parser.swf.SWF2HTML;

public class swfParser extends AbstractParser implements Parser {

    public swfParser() {
        super("Adobe Flash Parser");
        this.SUPPORTED_EXTENSIONS.add("swf");
        this.SUPPORTED_MIME_TYPES.add("application/x-shockwave-flash");
        this.SUPPORTED_MIME_TYPES.add("application/x-shockwave-flash2-preview");
        this.SUPPORTED_MIME_TYPES.add("application/futuresplash");
        this.SUPPORTED_MIME_TYPES.add("image/vnd.rn-realflash");
    }

    /*
     * parses the source document and returns a Document containing
     * all extracted information about the parsed document
     */
    @Override
    public Document[] parse(
            final DigestURL location,
            final String mimeType,
            final String charset,
            final VocabularyScraper scraper,
            final int timezoneOffset,
            final InputStream source)
            throws Parser.Failure, InterruptedException
    {

        try {
            final SWF2HTML swf2html = new SWF2HTML();
            String contents = "";
            try {
                // convert the SWF stream to HTML, then scrape it like any HTML document;
                // scraperObject is a field inherited from AbstractParser
                contents = swf2html.convertSWFToHTML(source);
                scraperObject = htmlParser.parseToScraper(location, charset, scraper, timezoneOffset, contents, 100);
            } catch (final NegativeArraySizeException e) {
                throw new Parser.Failure(e.getMessage(), location);
            } catch (final IOException e) {
                throw new Parser.Failure(e.getMessage(), location);
            } catch (final Exception e) {
                throw new Parser.Failure(e.getMessage(), location);
            }

            // as the result of parsing, this method must return a Document object
            ContentScraper htmlscraper = (ContentScraper) this.scraperObject; // shortcut to access ContentScraper methods
            return new Document[]{new Document(
                    location, // url of the source document
                    mimeType, // the document's mime type
                    StandardCharsets.UTF_8.name(), // charset of the document text
                    this,
                    htmlscraper.getContentLanguages(),
                    htmlscraper.getKeywords(),
                    htmlscraper.getTitles(),
                    htmlscraper.getAuthor(),
                    htmlscraper.getPublisher(),
                    null, // sections
                    htmlscraper.getDescriptions(),
                    htmlscraper.getLon(), htmlscraper.getLat(),
                    htmlscraper.getText(),
                    htmlscraper.getAnchors(),
                    htmlscraper.getRSS(),
                    null, // images
                    false,
                    htmlscraper.getDate())};
        } catch (final Exception e) {
            if (e instanceof InterruptedException) throw (InterruptedException) e;

            // if an unexpected error occurs, just raise a new Parser.Failure
            final String errorMsg = "Unable to parse the swf document '" + location + "':" + e.getMessage();
            //AbstractParser.log.logSevere(errorMsg);
            throw new Parser.Failure(errorMsg, location);
        }
    }

}
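For context, this removed parser converted SWF content to HTML via SWF2HTML and then reused the htmlParser scraper. A minimal sketch of how it could be invoked directly follows; the sample URL, the local file name, the helper class name, and the null/zero arguments are illustrative assumptions, not part of the deleted code:

import java.io.FileInputStream;
import java.io.InputStream;

import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.Document;
import net.yacy.document.parser.swfParser;

public class SwfParserSketch {
    public static void main(String[] args) throws Exception {
        final swfParser parser = new swfParser();
        final DigestURL location = new DigestURL("http://example.org/sample.swf"); // hypothetical source URL
        try (InputStream source = new FileInputStream("sample.swf")) {             // hypothetical local copy
            final Document[] docs = parser.parse(
                    location,
                    "application/x-shockwave-flash", // one of the supported mime types
                    "UTF-8",                         // charset hint
                    null,                            // no VocabularyScraper
                    0,                               // timezone offset
                    source);
            System.out.println(docs[0].dc_title()); // title extracted from the converted HTML
        }
    }
}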
@@ -0,0 +1,198 @@
package net.yacy.search.index;

import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Iterator;
import java.util.Map;
import net.yacy.cora.document.WordCache;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.retrieval.Response;
import net.yacy.document.Tokenizer;
import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceRow;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.rwi.ReferenceFactory;
import net.yacy.kelondro.rwi.TermSearch;
import net.yacy.kelondro.util.Bitfield;
import static net.yacy.search.index.Segment.catchallWord;
import net.yacy.search.query.QueryGoal;
import org.junit.AfterClass;
import static org.junit.Assert.assertTrue;
import org.junit.BeforeClass;
import org.junit.Test;

public class SegmentTest {

    static Segment index;

    /**
     * Set up the RWI index.
     *
     * @throws IOException
     */
    @BeforeClass
    public static void setUpClass() throws IOException {
        // set up an index segment
        index = new Segment(new ConcurrentLog("SegmentTest"),
                new File("test/DATA/INDEX/webportal/SEGMENTS"),
                new File("test/DATA/INDEX/webportal/ARCHIVE"),
                null, null);

        // connect the RWI index
        index.connectRWI(10, 1024);
    }

    @AfterClass
    public static void tearDownClass() {
        index.close();
        ConcurrentLog.shutdown();
    }

    /**
     * Test of the clear method (for RWI) of class Segment.
     */
    @Test
    public void testClear() throws MalformedURLException, IOException, SpaceExceededException {
        DigestURL url = new DigestURL("http://test.org/test.html");
        int urlComps = MultiProtocolURL.urlComps(url.toNormalform(true)).length;
        int urlLength = url.toNormalform(true).length();

        byte[] termHash = Word.word2hash("test");
        Word word = new Word(1, 1, 1);
        word.flags = new Bitfield(4); // flags must not be null

        WordReferenceRow ientry = new WordReferenceRow(
                url.hash(), urlLength, urlComps, 0, 1, 1,
                System.currentTimeMillis(), System.currentTimeMillis(),
                UTF8.getBytes("en"), Response.DT_TEXT, 0, 0);
        ientry.setWord(word);

        // add a dummy Word and WordReference
        index.termIndex.add(termHash, ientry);

        // check the index count
        long cnt = index.RWICount();
        assertTrue(cnt > 0);

        index.clear();

        // check the index count after clear
        cnt = index.RWICount();
        assertTrue(cnt == 0);
    }

    /**
     * Helper to store a text to the RWI index. This was derived from the
     * Segment.storeDocument() procedure.
     *
     * @param text of the document
     * @throws IOException
     * @throws SpaceExceededException
     */
    private void storeTestDocTextToTermIndex(DigestURL url, String text) throws IOException, SpaceExceededException {

        // set a pseudo url for the simulated test document
        final String urlNormalform = url.toNormalform(true);
        String dc_title = "Test Document";
        // STORE PAGE INDEX INTO WORD INDEX DB
        // create a word prototype which is re-used for all entries
        if (index.termIndex != null) {
            final int outlinksSame = 0;
            final int outlinksOther = 0;
            final int urlLength = urlNormalform.length();
            final int urlComps = MultiProtocolURL.urlComps(url.toNormalform(false)).length;
            final int wordsintitle = CommonPattern.SPACES.split(dc_title).length; // same calculation as for CollectionSchema.title_words_val

            WordCache meaningLib = new WordCache(null);
            boolean doAutotagging = false;
            VocabularyScraper scraper = null;

            // tokenize the text into words and their in-text properties
            Tokenizer t = new Tokenizer(url, text, meaningLib, doAutotagging, scraper);

            // create a WordReference template
            final WordReferenceRow ientry = new WordReferenceRow(
                    url.hash(), urlLength, urlComps, wordsintitle,
                    t.RESULT_NUMB_WORDS, t.RESULT_NUMB_SENTENCES,
                    System.currentTimeMillis(), System.currentTimeMillis(),
                    UTF8.getBytes("en"), Response.DT_TEXT,
                    outlinksSame, outlinksOther);

            // add the words to the RWI index
            Word wprop;
            byte[] wordhash;
            String word;
            for (Map.Entry<String, Word> wentry : t.words().entrySet()) {
                word = wentry.getKey();
                wprop = wentry.getValue();
                assert (wprop.flags != null);
                ientry.setWord(wprop);
                wordhash = Word.word2hash(word);
                index.termIndex.add(wordhash, ientry);
            }
        }
    }

    /**
     * Simulates a multi-word query against the RWI termIndex.
     *
     * @throws SpaceExceededException
     * @throws MalformedURLException
     * @throws IOException
     */
    @Test
    public void testQuery_MultiWordQuery() throws SpaceExceededException, MalformedURLException, IOException {

        // create one test url with this text in the RWI index
        DigestURL url = new DigestURL("http://test.org/test.html");
        storeTestDocTextToTermIndex(url, "One Two Three Four Five. This is a test text. One two three for five");

        // create a query to get the search word hash sets
        QueryGoal qg = new QueryGoal("five test ");
        HandleSet queryHashes = qg.getIncludeHashes();
        HandleSet excludeHashes = qg.getExcludeHashes();
        HandleSet urlselection = null;
        ReferenceFactory<WordReference> termFactory = Segment.wordReferenceFactory;

        // do the search
        TermSearch<WordReference> result = index.termIndex.query(queryHashes, excludeHashes, urlselection, termFactory, Integer.MAX_VALUE);

        // get the joined results
        ReferenceContainer<WordReference> wc = result.joined();

        // we should now have one result (stored to the index above)
        assertTrue("test url hash in result set", wc.has(url.hash()));

        // the returned WordReference is expected to be a joined reference with the properties used in ranking
        Iterator<WordReference> it = wc.entries();
        System.out.println("-----------------");

        // currently the results are not as expected for a multi-word query
        while (it.hasNext()) {
            WordReference r = it.next();
            // position of the search word in the text (first occurrence)
            System.out.println("posintext=" + r.positions() + " (expected=5)");
            // min position of a search word in the text
            System.out.println("minposition=" + r.minposition() + " (expected=5)");
            // max position of a search word in the text
            System.out.println("maxposition=" + r.maxposition() + " (expected=8)");
            // for a multi-word query the distance is expected to be the avg of the search word positions in the text
            System.out.println("distance=" + r.distance() + " (expected=3)");
            // occurrence of the search words in the text
            System.out.println("hitcount=" + r.hitcount() + " (expected=2)");
        }
        System.out.println("-----------------");
    }

}
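The new test class uses JUnit 4, as the org.junit imports show. A minimal sketch for running the suite outside an IDE, assuming JUnit 4 is on the classpath (the runner class name is illustrative):

import org.junit.runner.JUnitCore;
import org.junit.runner.Result;
import org.junit.runner.notification.Failure;

public class RunSegmentTest {
    public static void main(String[] args) {
        // run the test class programmatically
        Result result = JUnitCore.runClasses(net.yacy.search.index.SegmentTest.class);
        // report any failed assertions
        for (Failure failure : result.getFailures()) {
            System.out.println(failure.toString());
        }
        System.out.println("success: " + result.wasSuccessful());
    }
}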