You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
169 lines
6.5 KiB
169 lines
6.5 KiB
|
|
package net.yacy.search.snippet;
|
|
|
|
import java.net.MalformedURLException;
|
|
import net.yacy.cora.document.encoding.ASCII;
|
|
import net.yacy.cora.document.id.DigestURL;
|
|
import net.yacy.cora.federate.yacy.CacheStrategy;
|
|
import net.yacy.cora.storage.HandleSet;
|
|
import net.yacy.cora.util.CommonPattern;
|
|
import net.yacy.kelondro.data.meta.URIMetadataNode;
|
|
import net.yacy.search.query.QueryGoal;
|
|
import net.yacy.search.schema.CollectionSchema;
|
|
import org.apache.solr.common.SolrDocument;
|
|
import static org.junit.Assert.*;
|
|
import org.junit.Before;
|
|
import org.junit.Test;
|
|
|
|
|
|
public class TextSnippetTest {
|
|
|
|
// declare some required parameter
|
|
final CacheStrategy cacheStrategy = CacheStrategy.CACHEONLY;
|
|
final boolean pre = true;
|
|
final int snippetMaxLength = 220;
|
|
final boolean reindexing = false;
|
|
|
|
SolrDocument doc;
|
|
|
|
public TextSnippetTest() {
|
|
}
|
|
|
|
@Before
|
|
public void setUp() throws Exception {
|
|
|
|
// prepare a empty test document
|
|
doc = new SolrDocument();
|
|
DigestURL url = new DigestURL("http://localhost/page.html");
|
|
doc.addField(CollectionSchema.id.name(), ASCII.String(url.hash()));
|
|
doc.addField(CollectionSchema.sku.name(), url.toNormalform(false));
|
|
// for testcases add other fields
|
|
// fields involved in snippet extraction:
|
|
// url, title, keywords, author, text_t
|
|
}
|
|
|
|
@Test
|
|
public void testTextSnippet() {
|
|
|
|
URIMetadataNode testpage = new URIMetadataNode(doc);
|
|
testpage.addField(CollectionSchema.title.name(), "New test case");
|
|
testpage.addField(CollectionSchema.keywords.name(), "junit");
|
|
testpage.addField(CollectionSchema.author.name(), "test author");
|
|
testpage.addField(CollectionSchema.text_t.name(), "A new testcase has been introduced. "
|
|
+ "It includes a few test lines and one line that should match.");
|
|
|
|
String querywords = "testcase line";
|
|
QueryGoal qg = new QueryGoal(querywords);
|
|
HandleSet queryhashes = qg.getIncludeHashes();
|
|
|
|
TextSnippet ts = new TextSnippet(
|
|
null,
|
|
testpage,
|
|
queryhashes,
|
|
cacheStrategy,
|
|
pre,
|
|
snippetMaxLength,
|
|
reindexing
|
|
);
|
|
String rstr = ts.getError();
|
|
assertEquals("testTextSnippet Error Code: ", "", rstr);
|
|
|
|
String[] wordlist = CommonPattern.SPACE.split(querywords);
|
|
rstr = ts.toString();
|
|
System.out.println("testTextSnippet: query=" + querywords);
|
|
System.out.println("testTextSnippet: snippet=" + rstr);
|
|
// check words included in snippet
|
|
for (String word : wordlist) {
|
|
assertTrue("testTextSnippet word included " + word, rstr.contains(word));
|
|
}
|
|
|
|
}
|
|
|
|
/**
|
|
* Test of getLineMarked method, of class TextSnippet.
|
|
*/
|
|
@Test
|
|
public void testGetLineMarked() {
|
|
URIMetadataNode testpage = new URIMetadataNode(doc);
|
|
testpage.addField(CollectionSchema.title.name(), "New test case");
|
|
testpage.addField(CollectionSchema.keywords.name(), "junit");
|
|
testpage.addField(CollectionSchema.author.name(), "test author");
|
|
testpage.addField(CollectionSchema.text_t.name(),
|
|
"A new testcase has been introduced. "
|
|
+ "It includes a few test lines and one line that should match.");
|
|
|
|
String querywords = "testcase line";
|
|
QueryGoal qg = new QueryGoal(querywords);
|
|
HandleSet queryhashes = qg.getIncludeHashes();
|
|
|
|
TextSnippet ts = new TextSnippet(
|
|
null,
|
|
testpage,
|
|
queryhashes,
|
|
cacheStrategy,
|
|
pre,
|
|
snippetMaxLength,
|
|
reindexing
|
|
);
|
|
|
|
String rstr = ts.getError();
|
|
assertEquals("testGetLineMarked Error Code: ", "", rstr);
|
|
|
|
// check words marked in snippet
|
|
rstr = ts.getLineMarked(qg);
|
|
System.out.println("testGetLineMarked: query=" + querywords);
|
|
System.out.println("testGetLineMarked: snippet=" + rstr);
|
|
String[] wordlist = CommonPattern.SPACE.split(querywords);
|
|
for (String wordstr : wordlist) {
|
|
assertTrue("testGetLineMarked marked word " + wordstr, rstr.contains("<b>" + wordstr + "</b>"));
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Test of descriptionline method, of class TextSnippet.
|
|
* checking poper encoding of remaining html in raw snippet line.
|
|
*/
|
|
@Test
|
|
public void testDescriptionline() throws MalformedURLException {
|
|
String rawtestline = "Über großer test case </span> <pre> <hr><hr /></pre>"; // test line with html, risk of snippet format issue
|
|
|
|
DigestURL url = new DigestURL("http://localhost/page.html");
|
|
QueryGoal qg = new QueryGoal("test");
|
|
|
|
// test with raw line (no marking added by YaCy)
|
|
TextSnippet ts = new TextSnippet(
|
|
url.hash(),
|
|
rawtestline,
|
|
true, // isMarked,
|
|
TextSnippet.ResultClass.SOURCE_METADATA, "");
|
|
|
|
String sniptxt = ts.descriptionline(qg); // snippet text for display
|
|
System.out.println("testDescriptionline: snippet=" + sniptxt);
|
|
assertFalse ("HTML code not allowed in snippet text",sniptxt.contains("<pre>")); // display text not to include unwanted html
|
|
|
|
// test with marking of query word
|
|
ts = new TextSnippet(
|
|
url.hash(),
|
|
rawtestline,
|
|
false, // isMarked,
|
|
TextSnippet.ResultClass.SOURCE_METADATA, "");
|
|
|
|
sniptxt = ts.descriptionline(qg);
|
|
System.out.println("testDescriptionline: snippet=" + sniptxt);
|
|
assertFalse ("HTML code not allowed in snippet text",sniptxt.contains("<pre>")); // display text not to include unwanted html
|
|
assertTrue ("Query word not marked", sniptxt.contains("<b>test</b>")); // query word to be marked
|
|
|
|
// test text with some numbers (english/german format)
|
|
rawtestline = "Test Version 1.83 calculates pi to 3,14 always";
|
|
ts = new TextSnippet(
|
|
url.hash(),
|
|
rawtestline,
|
|
false, // isMarked,
|
|
TextSnippet.ResultClass.SOURCE_METADATA, "");
|
|
sniptxt = ts.descriptionline(qg);
|
|
System.out.println("testDescriptionline: (with numbers) snippet="+sniptxt);
|
|
assertTrue ("number (.) broken up",sniptxt.contains("1.83"));
|
|
assertTrue ("number (,) broken up",sniptxt.contains("3,14"));
|
|
}
|
|
}
|