|
|
@ -7,12 +7,12 @@
|
|
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
|
|
* License as published by the Free Software Foundation; either
|
|
|
|
* License as published by the Free Software Foundation; either
|
|
|
|
* version 2.1 of the License, or (at your option) any later version.
|
|
|
|
* version 2.1 of the License, or (at your option) any later version.
|
|
|
|
*
|
|
|
|
*
|
|
|
|
* This library is distributed in the hope that it will be useful,
|
|
|
|
* This library is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
* Lesser General Public License for more details.
|
|
|
|
* Lesser General Public License for more details.
|
|
|
|
*
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU Lesser General Public License
|
|
|
|
* You should have received a copy of the GNU Lesser General Public License
|
|
|
|
* along with this program in the file lgpl21.txt
|
|
|
|
* along with this program in the file lgpl21.txt
|
|
|
|
* If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
* If not, see <http://www.gnu.org/licenses/>.
|
|
|
@ -32,10 +32,10 @@ import net.yacy.kelondro.index.RowSpaceExceededException;
|
|
|
|
import net.yacy.kelondro.logging.Log;
|
|
|
|
import net.yacy.kelondro.logging.Log;
|
|
|
|
|
|
|
|
|
|
|
|
public class SnippetExtractor {
|
|
|
|
public class SnippetExtractor {
|
|
|
|
|
|
|
|
|
|
|
|
String snippetString;
|
|
|
|
String snippetString;
|
|
|
|
HandleSet remainingHashes;
|
|
|
|
HandleSet remainingHashes;
|
|
|
|
|
|
|
|
|
|
|
|
public SnippetExtractor(final Collection<StringBuilder> sentences, final HandleSet queryhashes, int maxLength) throws UnsupportedOperationException {
|
|
|
|
public SnippetExtractor(final Collection<StringBuilder> sentences, final HandleSet queryhashes, int maxLength) throws UnsupportedOperationException {
|
|
|
|
if (sentences == null) throw new UnsupportedOperationException("sentence == null");
|
|
|
|
if (sentences == null) throw new UnsupportedOperationException("sentence == null");
|
|
|
|
if (queryhashes == null || queryhashes.isEmpty()) throw new UnsupportedOperationException("queryhashes == null");
|
|
|
|
if (queryhashes == null || queryhashes.isEmpty()) throw new UnsupportedOperationException("queryhashes == null");
|
|
|
@ -47,7 +47,7 @@ public class SnippetExtractor {
|
|
|
|
int linenumber = 0;
|
|
|
|
int linenumber = 0;
|
|
|
|
int fullmatchcounter = 0;
|
|
|
|
int fullmatchcounter = 0;
|
|
|
|
lookup: for (final StringBuilder sentence: sentences) {
|
|
|
|
lookup: for (final StringBuilder sentence: sentences) {
|
|
|
|
hs = WordTokenizer.hashSentence(sentence.toString(), null);
|
|
|
|
hs = WordTokenizer.hashSentence(sentence.toString(), null, 100);
|
|
|
|
positions = new TreeSet<Integer>();
|
|
|
|
positions = new TreeSet<Integer>();
|
|
|
|
for (final byte[] word: queryhashes) {
|
|
|
|
for (final byte[] word: queryhashes) {
|
|
|
|
pos = hs.get(word);
|
|
|
|
pos = hs.get(word);
|
|
|
@ -69,7 +69,7 @@ public class SnippetExtractor {
|
|
|
|
}
|
|
|
|
}
|
|
|
|
linenumber++;
|
|
|
|
linenumber++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
StringBuilder sentence;
|
|
|
|
StringBuilder sentence;
|
|
|
|
SnippetExtractor tsr;
|
|
|
|
SnippetExtractor tsr;
|
|
|
|
while (!order.isEmpty()) {
|
|
|
|
while (!order.isEmpty()) {
|
|
|
@ -79,27 +79,27 @@ public class SnippetExtractor {
|
|
|
|
} catch (UnsupportedOperationException e) {
|
|
|
|
} catch (UnsupportedOperationException e) {
|
|
|
|
continue;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
snippetString = tsr.snippetString;
|
|
|
|
this.snippetString = tsr.snippetString;
|
|
|
|
if (snippetString != null && snippetString.length() > 0) {
|
|
|
|
if (this.snippetString != null && this.snippetString.length() > 0) {
|
|
|
|
remainingHashes = tsr.remainingHashes;
|
|
|
|
this.remainingHashes = tsr.remainingHashes;
|
|
|
|
if (remainingHashes.isEmpty()) {
|
|
|
|
if (this.remainingHashes.isEmpty()) {
|
|
|
|
// we have found the snippet
|
|
|
|
// we have found the snippet
|
|
|
|
return; // finished!
|
|
|
|
return; // finished!
|
|
|
|
} else if (remainingHashes.size() < queryhashes.size()) {
|
|
|
|
} else if (this.remainingHashes.size() < queryhashes.size()) {
|
|
|
|
// the result has not all words in it.
|
|
|
|
// the result has not all words in it.
|
|
|
|
// find another sentence that represents the missing other words
|
|
|
|
// find another sentence that represents the missing other words
|
|
|
|
// and find recursively more sentences
|
|
|
|
// and find recursively more sentences
|
|
|
|
maxLength = maxLength - snippetString.length();
|
|
|
|
maxLength = maxLength - this.snippetString.length();
|
|
|
|
if (maxLength < 20) maxLength = 20;
|
|
|
|
if (maxLength < 20) maxLength = 20;
|
|
|
|
try {
|
|
|
|
try {
|
|
|
|
tsr = new SnippetExtractor(order.values(), remainingHashes, maxLength);
|
|
|
|
tsr = new SnippetExtractor(order.values(), this.remainingHashes, maxLength);
|
|
|
|
} catch (UnsupportedOperationException e) {
|
|
|
|
} catch (UnsupportedOperationException e) {
|
|
|
|
throw e;
|
|
|
|
throw e;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
final String nextSnippet = tsr.snippetString;
|
|
|
|
final String nextSnippet = tsr.snippetString;
|
|
|
|
if (nextSnippet == null) return;
|
|
|
|
if (nextSnippet == null) return;
|
|
|
|
snippetString = snippetString + (" / " + nextSnippet);
|
|
|
|
this.snippetString = this.snippetString + (" / " + nextSnippet);
|
|
|
|
remainingHashes = tsr.remainingHashes;
|
|
|
|
this.remainingHashes = tsr.remainingHashes;
|
|
|
|
return;
|
|
|
|
return;
|
|
|
|
} else {
|
|
|
|
} else {
|
|
|
|
// error
|
|
|
|
// error
|
|
|
@ -110,7 +110,7 @@ public class SnippetExtractor {
|
|
|
|
}
|
|
|
|
}
|
|
|
|
throw new UnsupportedOperationException("no snippet computed");
|
|
|
|
throw new UnsupportedOperationException("no snippet computed");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
private static int linelengthKey(int givenlength, int maxlength) {
|
|
|
|
private static int linelengthKey(int givenlength, int maxlength) {
|
|
|
|
if (givenlength > maxlength) return 1;
|
|
|
|
if (givenlength > maxlength) return 1;
|
|
|
|
if (givenlength >= maxlength / 2 && givenlength < maxlength) return 7;
|
|
|
|
if (givenlength >= maxlength / 2 && givenlength < maxlength) return 7;
|
|
|
@ -118,15 +118,15 @@ public class SnippetExtractor {
|
|
|
|
if (givenlength >= maxlength / 8 && givenlength < maxlength / 4) return 3;
|
|
|
|
if (givenlength >= maxlength / 8 && givenlength < maxlength / 4) return 3;
|
|
|
|
return 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
private SnippetExtractor(String sentence, final HandleSet queryhashes, final int maxLength) throws UnsupportedOperationException {
|
|
|
|
private SnippetExtractor(String sentence, final HandleSet queryhashes, final int maxLength) throws UnsupportedOperationException {
|
|
|
|
try {
|
|
|
|
try {
|
|
|
|
if (sentence == null) throw new UnsupportedOperationException("no sentence given");
|
|
|
|
if (sentence == null) throw new UnsupportedOperationException("no sentence given");
|
|
|
|
if (queryhashes == null || queryhashes.isEmpty()) throw new UnsupportedOperationException("queryhashes == null");
|
|
|
|
if (queryhashes == null || queryhashes.isEmpty()) throw new UnsupportedOperationException("queryhashes == null");
|
|
|
|
byte[] hash;
|
|
|
|
byte[] hash;
|
|
|
|
|
|
|
|
|
|
|
|
// find all hashes that appear in the sentence
|
|
|
|
// find all hashes that appear in the sentence
|
|
|
|
final Map<byte[], Integer> hs = WordTokenizer.hashSentence(sentence, null);
|
|
|
|
final Map<byte[], Integer> hs = WordTokenizer.hashSentence(sentence, null, 100);
|
|
|
|
final Iterator<byte[]> j = queryhashes.iterator();
|
|
|
|
final Iterator<byte[]> j = queryhashes.iterator();
|
|
|
|
Integer pos;
|
|
|
|
Integer pos;
|
|
|
|
int p, minpos = sentence.length(), maxpos = -1;
|
|
|
|
int p, minpos = sentence.length(), maxpos = -1;
|
|
|
@ -189,11 +189,11 @@ public class SnippetExtractor {
|
|
|
|
throw new UnsupportedOperationException(e.getMessage());
|
|
|
|
throw new UnsupportedOperationException(e.getMessage());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
public String getSnippet() {
|
|
|
|
public String getSnippet() {
|
|
|
|
return this.snippetString;
|
|
|
|
return this.snippetString;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
public HandleSet getRemainingWords() {
|
|
|
|
public HandleSet getRemainingWords() {
|
|
|
|
return this.remainingHashes;
|
|
|
|
return this.remainingHashes;
|
|
|
|
}
|
|
|
|
}
|
|
|
|