- relaxed matching of string-search (this is now case-insensitive)

- added transport of string-search pattern to remote search protocol
- fixed a problem parsing snippets with a '-' inside

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7700 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent 8fd4e8ea98
commit deda54d684

@ -13,7 +13,8 @@
<p>
With these settings you can activate or deactivate parsing of additional content-types based on their MIME-types.<br />
For a detailed description of the various MIME-types take a look at
<a href="http://www.iana.org/assignments/media-types/">http://www.iana.org/assignments/media-types/</a>
<a href="http://www.iana.org/assignments/media-types/">http://www.iana.org/assignments/media-types/</a>.<br />
If you want to test a specific parser you can do so using the <a href="ViewFile.html">File Viewer</a>.
</p>
<table border="0" cellpadding="2" cellspacing="1">
<tr class="TableHeader" valign="bottom">

@ -35,6 +35,7 @@ import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import de.anomic.data.WorkTables;
import de.anomic.search.QueryParams;
import de.anomic.search.Switchboard;
import de.anomic.search.SwitchboardConstants;
import de.anomic.server.serverObjects;
@ -51,7 +52,7 @@ public class Table_API_p {
int startRecord = 0;
int maximumRecords = 25;
Pattern query = Pattern.compile(".*");
Pattern query = QueryParams.catchall_pattern;
if (post != null && post.containsKey("startRecord")) startRecord = post.getInt("startRecord", 0);
if (post != null && post.containsKey("maximumRecords")) maximumRecords = post.getInt("maximumRecords", 0);
if (post != null && post.containsKey("query") && !post.get("query", "").isEmpty()) {
@ -63,7 +64,7 @@ public class Table_API_p {
prop.put("inline", (inline) ? 1 : 0);
Pattern typefilter = Pattern.compile(".*");
Pattern typefilter = QueryParams.catchall_pattern;
if (post != null && post.containsKey("filter") && post.get("filter", "").length() > 0) {
typefilter = Pattern.compile(post.get("filter", ".*"));
}

@ -102,8 +102,8 @@ public final class search {
final int maxdist= post.getInt("maxdist", Integer.MAX_VALUE);
final String prefer = post.get("prefer", "");
final String contentdom = post.get("contentdom", "text");
final String filter = post.get("filter", ".*");
final Pattern snippetPattern = Pattern.compile(post.get("snippet", ".*"));
final String filter = post.get("filter", ".*"); // a filter on the url
final Pattern snippetPattern = Pattern.compile(post.get("snippet", ".*")); // a filter on the snippet
String sitehash = post.get("sitehash", ""); if (sitehash.length() == 0) sitehash = null;
String authorhash = post.get("authorhash", ""); if (authorhash.length() == 0) authorhash = null;
String language = post.get("language", "");

@ -133,7 +133,7 @@ public final class QueryParams {
this.excludeHashes = Word.words2hashesHandles(cq[1]);
this.fullqueryHashes = Word.words2hashesHandles(cq[2]);
}
this.snippetMatcher = Pattern.compile(".*");
this.snippetMatcher = QueryParams.catchall_pattern;
this.ranking = ranking;
this.tenant = null;
this.maxDistance = Integer.MAX_VALUE;
@ -540,22 +540,25 @@ public final class QueryParams {
}
private static Pattern StringMatchPattern = Pattern.compile(".*?(\".*?\").*");
/**
* Calculate a pattern to match with a string search.
* <p>
* Each double-quoted phrase in the query (located with StringMatchPattern) is taken without
* its quotes and appended to the pattern in order of appearance, separated by wildcards.
* In the StringBuilder variant the pattern starts with the embedded flags (?iu), which
* switch on case-insensitive matching with Unicode-aware case folding.
* <p>
* NOTE(review): the phrase text is inserted into the regular expression unescaped, so regex
* metacharacters inside a quoted phrase are interpreted as pattern syntax and can make
* Pattern.compile throw a PatternSyntaxException. Consider Pattern.quote - confirm the
* intended behavior first.
*
* @param query the search query; only the parts enclosed in double quotes contribute to the pattern
* @return QueryParams.catchall_pattern if the query contains no quoted phrase (StringBuilder
*         variant), otherwise the compiled pattern, which always ends with a trailing wildcard
*/
public static Pattern stringSearchPattern(String query) {
String p = "";
StringBuilder p = new StringBuilder(query.length());
// embedded flags: i = case-insensitive, u = Unicode-aware case folding
p.append("(?iu)");
// number of quoted phrases found so far
int seqc = 0;
while (query.length() > 0) {
// group 1 is the first double-quoted phrase, quotes included; stop when none is left
Matcher m = StringMatchPattern.matcher(query);
if (!m.matches()) break;
p += ".*" + query.substring(m.start(1) + 1, m.end(1) - 1);
// append the phrase without its surrounding quotes, preceded by a reluctant wildcard
p.append(".*?").append(query.substring(m.start(1) + 1, m.end(1) - 1));
// continue scanning behind the closing quote of the phrase just consumed
query = query.substring(m.end(1));
seqc++;
}
p += ".*";
return Pattern.compile(p);
// no quoted phrase at all: return the shared catch-all pattern instead of compiling a new one
if (seqc == 0) return QueryParams.catchall_pattern;
p.append(".*");
return Pattern.compile(p.toString());
}
public static void main(String[] args) {

@ -128,6 +128,7 @@ public final class SearchEvent {
QueryParams.hashSet2hashString(query.excludeHashes),
query.prefer,
query.urlMask,
query.snippetMatcher,
query.targetlang == null ? "" : query.targetlang,
query.sitehash == null ? "" : query.sitehash,
query.authorhash == null ? "" : query.authorhash,

@ -132,11 +132,23 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
private String error;
private ResultClass resultStatus;
/**
* Constructor that forwards its four arguments unchanged to
* init(urlhash, line, errorCode, errortext); it does no other work itself.
*/
public TextSnippet(final byte[] urlhash, final String line, final ResultClass errorCode, final String errortext) {
public TextSnippet(
final byte[] urlhash,
final String line,
final ResultClass errorCode,
final String errortext) {
init(urlhash, line, errorCode, errortext);
}
public TextSnippet(final LoaderDispatcher loader, final URIMetadataRow.Components comp, final HandleSet queryhashes, final CrawlProfile.CacheStrategy cacheStrategy, final boolean pre, final int snippetMaxLength, final int maxDocLen, final boolean reindexing) {
public TextSnippet(
final LoaderDispatcher loader,
final URIMetadataRow.Components comp,
final HandleSet queryhashes,
final CrawlProfile.CacheStrategy cacheStrategy,
final boolean pre,
final int snippetMaxLength,
final int maxDocLen,
final boolean reindexing) {
// heise = "0OQUNU3JSs05"
final DigestURI url = comp.url();
if (queryhashes.isEmpty()) {
@ -298,13 +310,14 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
return resultStatus;
}
private final static Pattern splitPattern = Pattern.compile(" |-");
public String getLineMarked(final HandleSet queryHashes) {
if (line == null) return "";
if (queryHashes == null || queryHashes.isEmpty()) return line.trim();
if (line.endsWith(".")) line = line.substring(0, line.length() - 1);
final Iterator<byte[]> i = queryHashes.iterator();
byte[] h;
final String[] words = line.split(" ");
final String[] words = splitPattern.split(line);
while (i.hasNext()) {
h = i.next();
for (int j = 0; j < words.length; j++) {

@ -409,6 +409,7 @@ public final class yacyClient {
final String urlhashes,
final Pattern prefer,
final Pattern filter,
final Pattern snippet,
final String language,
final String sitehash,
final String authorhash,
@ -445,7 +446,7 @@ public final class yacyClient {
try {
result = new SearchResult(
yacyNetwork.basicRequestParts(Switchboard.getSwitchboard(), target.hash, crypt.randomSalt()),
mySeed, wordhashes, excludehashes, urlhashes, prefer, filter, language,
mySeed, wordhashes, excludehashes, urlhashes, prefer, filter, snippet, language,
sitehash, authorhash, count, time, maxDistance, global, partitions, target.getHexHash() + ".yacyh", target.getClusterAddress(),
secondarySearchSuperviser, rankingProfile, constraint);
} catch (final IOException e) {
@ -613,6 +614,7 @@ public final class yacyClient {
final String urlhashes,
final Pattern prefer,
final Pattern filter,
final Pattern snippet,
final String language,
final String sitehash,
final String authorhash,
@ -659,8 +661,9 @@ public final class yacyClient {
parts.put("exclude", UTF8.StringBody(excludehashes));
parts.put("duetime", UTF8.StringBody("1000"));
parts.put("urls", UTF8.StringBody(urlhashes));
parts.put("prefer", UTF8.StringBody(prefer.toString()));
parts.put("filter", UTF8.StringBody(filter.toString()));
parts.put("prefer", UTF8.StringBody(prefer.pattern()));
parts.put("filter", UTF8.StringBody(filter.pattern()));
parts.put("snippet", UTF8.StringBody(snippet.pattern()));
parts.put("language", UTF8.StringBody(language));
parts.put("sitehash", UTF8.StringBody(sitehash));
parts.put("authorhash", UTF8.StringBody(authorhash));
@ -1073,8 +1076,9 @@ public final class yacyClient {
UTF8.String(wordhashe),
"", // excludehashes,
"", // urlhashes,
Pattern.compile(""), // prefer,
Pattern.compile(".*"), // filter,
QueryParams.matchnothing_pattern, // prefer,
QueryParams.catchall_pattern, // filter,
QueryParams.catchall_pattern, // snippet,
"", // language,
"", // sitehash,
"", // authorhash,

@ -57,7 +57,7 @@ public class yacySearch extends Thread {
private final int count, maxDistance;
private final long time;
final private RankingProfile rankingProfile;
final private Pattern prefer, filter;
final private Pattern prefer, filter, snippet;
final private String language;
final private Bitfield constraint;
final private yacySeedDB peers;
@ -65,7 +65,9 @@ public class yacySearch extends Thread {
public yacySearch(
final String wordhashes, final String excludehashes,
final String urlhashes,
final Pattern prefer, final Pattern filter,
final Pattern prefer,
final Pattern filter,
final Pattern snippet,
final String language,
final String sitehash, final String authorhash,
final int count, final long time, final int maxDistance,
@ -86,6 +88,7 @@ public class yacySearch extends Thread {
this.urlhashes = urlhashes;
this.prefer = prefer;
this.filter = filter;
this.snippet = snippet;
this.language = language;
this.sitehash = sitehash;
this.authorhash = authorhash;
@ -110,8 +113,9 @@ public class yacySearch extends Thread {
try {
this.urls = yacyClient.search(
peers.mySeed(),
wordhashes, excludehashes, urlhashes, prefer, filter, language,
sitehash, authorhash,
wordhashes, excludehashes, urlhashes,
prefer, filter, snippet,
language, sitehash, authorhash,
count, time, maxDistance, global, partitions,
targetPeer, indexSegment, containerCache, secondarySearchSuperviser,
blacklist, rankingProfile, constraint);
@ -151,7 +155,8 @@ public class yacySearch extends Thread {
public static yacySearch[] primaryRemoteSearches(
final String wordhashes, final String excludehashes,
final Pattern prefer, final Pattern filter, String language,
final Pattern prefer, final Pattern filter, final Pattern snippet,
final String language,
final String sitehash,
final String authorhash,
final int count, long time, final int maxDist,
@ -188,8 +193,8 @@ public class yacySearch extends Thread {
if (targetPeers[i] == null || targetPeers[i].hash == null) continue;
try {
searchThreads[i] = new yacySearch(
wordhashes, excludehashes, "", prefer, filter, language,
sitehash, authorhash,
wordhashes, excludehashes, "", prefer, filter, snippet,
language, sitehash, authorhash,
count, time, maxDist, true, targets, targetPeers[i],
indexSegment, peers, containerCache, secondarySearchSuperviser, blacklist, rankingProfile, constraint);
searchThreads[i].start();
@ -222,7 +227,7 @@ public class yacySearch extends Thread {
if (targetPeer == null || targetPeer.hash == null) return null;
if (clusterselection != null) targetPeer.setAlternativeAddress(clusterselection.get(UTF8.getBytes(targetPeer.hash)));
final yacySearch searchThread = new yacySearch(
wordhashes, "", urlhashes, Pattern.compile(""), Pattern.compile(".*"), "", "", "", 20, time, 9999, true, 0, targetPeer,
wordhashes, "", urlhashes, QueryParams.matchnothing_pattern, QueryParams.catchall_pattern, QueryParams.catchall_pattern, "", "", "", 20, time, 9999, true, 0, targetPeer,
indexSegment, peers, containerCache, null, blacklist, rankingProfile, constraint);
searchThread.start();
return searchThread;

@ -102,6 +102,7 @@ public final class Condenser {
final boolean indexMedia,
final WordCache meaningLib
) {
Thread.currentThread().setName("condenser-" + document.dc_identifier()); // for debugging
// if addMedia == true, then all the media links are also parsed and added to the words
// added media words are flagged with the appropriate media flag
this.intStringFormatter.setMinimumIntegerDigits(numlength);

@ -392,7 +392,6 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
public void add(final ReferenceContainer<ReferenceType> container) throws RowSpaceExceededException {
// this puts the entries into the cache
assert this.cache != null;
if (this.cache == null || container == null || container.isEmpty()) return;
// put new words into cache

Loading…
Cancel
Save