Merge branch 'master' of git@github.com:yacy/yacy_search_server.git

pull/77/head
Michael Peter Christen 9 years ago
commit df51e4ef07

@ -16,7 +16,6 @@
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
<classpathentry kind="lib" path="lib/commons-logging-1.2.jar"/>
<classpathentry kind="lib" path="lib/J7Zip-modified.jar"/>
<classpathentry kind="lib" path="lib/webcat-swf-0.1.jar"/>
<classpathentry kind="lib" path="lib/commons-jxpath-1.3.jar"/>
<classpathentry kind="lib" path="lib/jsch-0.1.53.jar"/>
<classpathentry kind="lib" path="lib/jakarta-oro-2.0.8.jar"/>

@ -171,8 +171,8 @@ Please clone our code and help with development!
The code is licensed under the GPL v2.
Compiling YaCy:
- you need java 1.7 and ant
- just compile: "ant clean all" - then you can "./startYACY.sh"
- you need Java 1.7 or later and [Apache Ant](http://ant.apache.org/)
- just compile: "ant clean all" - then you can "./startYACY.sh" or "./startYACY.bat"
- create a release tarball: "ant dist"
- create a Mac OS release: "ant distMacApp" (works only on a Mac)
- create a debian release: "ant deb"

@ -238,7 +238,6 @@
<pathelement location="${lib}/solr-solrj-5.5.2.jar" />
<pathelement location="${lib}/spatial4j-0.5.jar" />
<pathelement location="${lib}/stax2-api_3.1.4.jar" />
<pathelement location="${lib}/webcat-swf-0.1.jar" />
<pathelement location="${lib}/weupnp-0.1.4.jar" />
<pathelement location="${lib}/woodstox-core-asl-4.4.1.jar" />
<pathelement location="${lib}/xercesImpl.jar" />

@ -74,6 +74,7 @@ public class TransNews_p {
// read voting
if ((post != null) && post.containsKey("publishtranslation")) {
Iterator<String> filenameit = localTrans.keySet().iterator();
int msgcounter = 0;
while (filenameit.hasNext()) {
String file = filenameit.next();
Map<String, String> tmptrans = localTrans.get(file);
@ -109,6 +110,7 @@ public class TransNews_p {
map.put("file", file);
map.put("source", sourcetxt);
map.put("target", targettxt);
map.put("#", Integer.toString(msgcounter++));
sb.peers.newsPool.publishMyNews(sb.peers.mySeed(), NewsPool.CATEGORY_TRANSLATION_ADD, map);
}
}

Binary file not shown.

@ -1,7 +0,0 @@
This package is part of WebCAT (http://webcat.sourceforge.net/)
WebCAT was developed at the XLDB group of the Department of Informatics of the Faculty of Sciences of the University of Lisbon in Portugal.
WebCAT was written by Bruno Martins.
WebCAT is released under the BSD License. (http://www.opensource.org/licenses/bsd-license.php)

@ -591,12 +591,7 @@
<artifactId>slf4j-jdk14</artifactId>
<version>1.7.21</version>
<type>jar</type>
</dependency>
<dependency>
<groupId>pt.tumba</groupId>
<artifactId>webcat-swf</artifactId>
<version>0.1</version>
</dependency>
</dependency>
<dependency>
<groupId>org.bitlet</groupId>
<artifactId>weupnp</artifactId>

@ -56,7 +56,6 @@ import net.yacy.document.parser.rssParser;
import net.yacy.document.parser.rtfParser;
import net.yacy.document.parser.sevenzipParser;
import net.yacy.document.parser.sidAudioParser;
import net.yacy.document.parser.swfParser;
import net.yacy.document.parser.tarParser;
import net.yacy.document.parser.torrentParser;
import net.yacy.document.parser.vcfParser;
@ -106,7 +105,6 @@ public final class TextParser {
initParser(new sevenzipParser());
initParser(new sidAudioParser());
initParser(new svgParser());
initParser(new swfParser());
initParser(new tarParser());
initParser(new torrentParser());
initParser(new vcfParser());

@ -74,9 +74,6 @@ public class Tokenizer {
String k;
Tagging.Metatag tag;
int wordlen;
int wordHandle;
int wordHandleCount = 0;
//final int sentenceHandleCount = 0;
int allwordcounter = 0;
int allsentencecounter = 0;
int wordInSentenceCounter = 1;
@ -167,12 +164,10 @@ public class Tokenizer {
Word wsp = this.words.get(word);
if (wsp != null) {
// word already exists
wordHandle = wsp.posInText;
wsp.inc();
} else {
// word does not yet exist, create new word entry
wordHandle = ++wordHandleCount; // let start pos with 1
wsp = new Word(wordHandle, wordInSentenceCounter, allsentencecounter + 100); // nomal sentence start at 100 !
wsp = new Word(allwordcounter, wordInSentenceCounter, allsentencecounter + 100); // nomal sentence start at 100 !
wsp.flags = this.RESULT_FLAGS.clone();
this.words.put(word.toLowerCase(), wsp);
}

@ -1,113 +0,0 @@
//swfParser.java
//------------------------
//part of YaCy
//(C) by Michael Peter Christen; mc@yacy.net
//first published on http://www.anomic.de
//Frankfurt, Germany, 2005
//
//this file is contributed by Marc Nause
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation; either version 2 of the License, or
//(at your option) any later version.
//
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.document.parser;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.ContentScraper;
import pt.tumba.parser.swf.SWF2HTML;
/**
 * Parser for Adobe Flash (SWF) documents. The SWF content is converted to
 * HTML using WebCAT's SWF2HTML converter, and the converted HTML is then
 * handed to the generic HTML scraper to extract text and metadata.
 */
public class swfParser extends AbstractParser implements Parser {

    /**
     * Registers the file extension and mime types this parser handles.
     */
    public swfParser() {
        super("Adobe Flash Parser");
        this.SUPPORTED_EXTENSIONS.add("swf");
        this.SUPPORTED_MIME_TYPES.add("application/x-shockwave-flash");
        this.SUPPORTED_MIME_TYPES.add("application/x-shockwave-flash2-preview");
        this.SUPPORTED_MIME_TYPES.add("application/futuresplash");
        this.SUPPORTED_MIME_TYPES.add("image/vnd.rn-realflash");
    }

    /*
     * parses the source documents and returns a plasmaParserDocument containing
     * all extracted information about the parsed document
     */
    @Override
    public Document[] parse(
            final DigestURL location,       // url of the source document
            final String mimeType,          // mime type as delivered by the server
            final String charset,           // charset hint for the HTML scraper
            final VocabularyScraper scraper,
            final int timezoneOffset,
            final InputStream source)       // raw SWF bytes
            throws Parser.Failure, InterruptedException
    {
        try {
            final SWF2HTML swf2html = new SWF2HTML();
            String contents = "";
            try {
                contents = swf2html.convertSWFToHTML(source);
                // reuse the HTML parser on the converted content; the result is
                // kept in the inherited scraperObject field
                scraperObject = htmlParser.parseToScraper(location, charset, scraper, timezoneOffset, contents, 100);
            } catch (final NegativeArraySizeException e) {
                // SWF2HTML can throw this on malformed input; report as parse failure
                throw new Parser.Failure(e.getMessage(), location);
            } catch (final IOException e) {
                throw new Parser.Failure(e.getMessage(), location);
            } catch (final Exception e) {
                throw new Parser.Failure(e.getMessage(), location);
            }
            // As the result of parsing this function must return a plasmaParserDocument object
            ContentScraper htmlscraper = (ContentScraper) this.scraperObject; // shortcut to access ContentScraper methods
            return new Document[]{new Document(
                    location,                       // url of the source document
                    mimeType,                       // the documents mime type
                    StandardCharsets.UTF_8.name(),  // charset of the document text
                    this,
                    htmlscraper.getContentLanguages(),
                    htmlscraper.getKeywords(),
                    htmlscraper.getTitles(),
                    htmlscraper.getAuthor(),
                    htmlscraper.getPublisher(),
                    null,                           // sections
                    htmlscraper.getDescriptions(),
                    htmlscraper.getLon(), htmlscraper.getLat(),
                    htmlscraper.getText(),
                    htmlscraper.getAnchors(),
                    htmlscraper.getRSS(),
                    null,                           // images
                    false,
                    htmlscraper.getDate())};
        } catch (final Exception e) {
            if (e instanceof InterruptedException) throw (InterruptedException) e;
            // if an unexpected error occurs just log the error and raise a new Parser.Failure
            final String errorMsg = "Unable to parse the swf document '" + location + "':" + e.getMessage();
            //AbstractParser.log.logSevere(errorMsg);
            throw new Parser.Failure(errorMsg, location);
        }
    }
}

@ -66,7 +66,7 @@ public class Word {
// object carries statistics for words and sentences
public int count; // number of occurrences
public int posInText; // unique handle, is initialized with word position (excluding double occurring words)
public int posInText; // unique handle, is initialized with first word position in text
public int posInPhrase; // position of word in phrase
public int numOfPhrase; // number of phrase. 'normal' phrases begin with number 100
public Bitfield flags; // the flag bits for each word

@ -66,7 +66,7 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
new Column("t", Column.celltype_cardinal, Column.encoder_b256, 2, "posintext"),
new Column("r", Column.celltype_cardinal, Column.encoder_b256, 1, "posinphrase"),
new Column("o", Column.celltype_cardinal, Column.encoder_b256, 1, "posofphrase"),
new Column("i", Column.celltype_cardinal, Column.encoder_b256, 1, "worddistance"),
new Column("i", Column.celltype_cardinal, Column.encoder_b256, 1, "worddistance"), // arbitrary column for avg distance of search query words
new Column("k", Column.celltype_cardinal, Column.encoder_b256, 1, "reserve")
},
Base64Order.enhancedCoder
@ -160,6 +160,12 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
this.entry.setCol(col_reserve2, 0);
}
/**
* Constructor for WordReferences from title words or as template for content
* words (with reduced number of input parameters, skipping the parameter
* later set by setWord() for a WordReferenceRow template or not relevant if
* used for words from title).
*/
public WordReferenceRow(final byte[] urlHash,
final int urlLength, // byte-length of complete URL
final int urlComps, // number of path components
@ -253,7 +259,10 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
}
/**
* First position of word in text
* First position of word in text.
* positions() is used to remember word positions for each query word of a
* multi word search query. As we currently don't include a separate posintext()
* function, we use positions to make the posintext value available.
* @return Collection with one element
*/
@Override

@ -253,9 +253,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
this.hitcount, // how often appears this word in the text
this.wordsintext, // total number of words
this.phrasesintext, // total number of phrases
// TODO: position 1 on empty positions may give high ranking scores for unknown pos (needs to be checked if 0 would be appropriate)
this.positions.isEmpty() ? -1 : this.positions.iterator().next(), // position of word in all words
this.positions.isEmpty() ? 0 : this.positions.iterator().next(), // position of word in all words (
this.posinphrase, // position of word in its phrase
this.posofphrase, // number of the phrase where word appears
this.lastModified, // last-modified time of the document where word appears

@ -98,17 +98,29 @@ public abstract class AbstractReference implements Reference {
}
return r;
}
/**
 * Maximum position of the search query words for multi word queries.
 * The value is derived from the joined positions() collection.
 * @return the largest entry of positions()
 */
@Override
public int maxposition() {
    return max(positions());
}
/**
 * Minimum word position of the search query words for multi word queries.
 * The value is derived from the joined positions() collection.
 * @return the smallest entry of positions()
 */
@Override
public int minposition() {
    return min(positions());
}
/**
* The average distance (in words) between search query terms for multi word searches.
* @return word distance
*/
@Override
public int distance() {
if (positions().size() < 2) return 0;

@ -51,14 +51,38 @@ public interface Reference {
@Override
public boolean equals(Object other);
/**
* Joins a Reference into this one, setting the values appropriate for ranking
* @param joined reference
*/
public void join(final Reference oe);
/**
* Positions of search query words for the referenced result url.
* This is only valid for multi word search queries.
* The positions contain the first word position for every search query word
* which has been joined (by join() )
* @return list with word position
*/
public Collection<Integer> positions();
/**
* max position of search query words (for multi word queries)
* @return
*/
public int maxposition();
/**
* min word position of search query words (for multi word queries)
* @return
*/
public int minposition();
/**
* The average distance (in words) between search query terms for multi word searches.
* The distance is calculated from positions()
* @return word distance
*/
public int distance();
}

@ -261,12 +261,33 @@ public class NewsDB {
removeStandards();
}
/**
* Create a new news record and assign data for the unique message id.
* This is composed of Date-String + PeerHash. Date string is with precision
* seconds.
*
* To programmatically create more than one message per second,
* a counter which is added to the date part can be given as attribute
* with key "#"
* @param mySeed
* @param category
* @param attributes
*/
private Record(final Seed mySeed, final String category, final Map<String, String> attributes) {
if (category.length() > NewsDB.categoryStringLength) throw new IllegalArgumentException("category length (" + category.length() + ") exceeds maximum (" + NewsDB.categoryStringLength + ")");
if (attributes.toString().length() > NewsDB.this.attributesMaxLength) throw new IllegalArgumentException("attributes length (" + attributes.toString().length() + ") exceeds maximum (" + NewsDB.this.attributesMaxLength + ")");
this.attributes = attributes;
this.received = null;
this.created = new Date();
// workaround for publishing multiple messages. Message id must be unique and is generated date-string(yyyyMMddhhmmss)+peerhash
// publishing programmatically 2 messages creates same id (because created in the same second). To work around this, if map contains
// key="#" use value as message counter and put this counter as offset in seconds to the id time part 20161231123001 .. 20161231123002 ..
if (attributes.containsKey("#")) {
int cnt = Integer.parseInt(attributes.get("#")); // get number used as counter/offset added as second
// add (counter * millisecond)
this.created = new Date(System.currentTimeMillis() + (cnt * 1000));
} else {
this.created = new Date();
}
this.category = category;
this.distributed = 0;
this.originator = mySeed.hash;
@ -295,6 +316,7 @@ public class NewsDB {
this.attributes.remove("cre");
this.attributes.remove("rec");
this.attributes.remove("dis");
this.attributes.remove("#"); // special attribute for id offset (see Record(mySeed... )
}
@Override
@ -350,4 +372,4 @@ public class NewsDB {
}
}
}
}

@ -259,6 +259,10 @@ public final class Protocol {
mySeed.setIPs(Switchboard.getSwitchboard().myPublicIPs());
} else {
final String myIP = result.get("yourip");
if (myIP == null) {
Network.log.info("yacyClient.hello result error: Peer sent incompleet hello message (key yourip is missing)");
return null; // no success
}
// with the IPv6 extension, this may contain several ips, separated by comma ','
HashSet<String> h = new HashSet<>();
for (String s: CommonPattern.COMMA.split(myIP)) {

@ -226,8 +226,8 @@ public final class Fulltext {
for (String name: instance.getCoreNames()) {
this.solrInstances.getEmbeddedConnector(name).clear();
}
this.commit(false);
}
this.commit(false);
this.solrInstances.clearCaches();
}
}

@ -145,29 +145,6 @@ public final class yacy {
System.out.println(copyright);
System.out.println(hline);
// check java version
try {
"a".isEmpty(); // needs at least Java 1.6
// check java version string (required min 1.7)
final String javaVersion = System.getProperty("java.version");
if (javaVersion != null) { // unknown property !?
int pos = javaVersion.indexOf('.');
int count = 1;
for (; pos < javaVersion.length() && count < 2; pos++) {
if (javaVersion.charAt(pos + 1) == '.') count++;
}
Double dVersion = Double.parseDouble(javaVersion.substring(0, pos));
if (dVersion < 1.7) { // required min java 1.7
System.err.println("STARTUP: Java Version too low. You need at least Java 1.7 to run YaCy");
System.exit(-1);
}
}
} catch (final NoSuchMethodError e) {
System.err.println("STARTUP: Java Version too low. You need at least Java 1.7 to run YaCy");
System.exit(-1);
}
// ensure that there is a DATA directory, if not, create one and if that fails warn and die
mkdirsIfNeseccary(dataHome);
mkdirsIfNeseccary(appHome);

@ -0,0 +1,198 @@
package net.yacy.search.index;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Iterator;
import java.util.Map;
import net.yacy.cora.document.WordCache;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.retrieval.Response;
import net.yacy.document.Tokenizer;
import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceRow;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.rwi.ReferenceFactory;
import net.yacy.kelondro.rwi.TermSearch;
import net.yacy.kelondro.util.Bitfield;
import static net.yacy.search.index.Segment.catchallWord;
import net.yacy.search.query.QueryGoal;
import org.junit.AfterClass;
import static org.junit.Assert.assertTrue;
import org.junit.BeforeClass;
import org.junit.Test;
/**
 * Unit tests for the RWI (reverse word index) part of the Segment class.
 */
public class SegmentTest {

    // index under test; created once for all tests (static, shared)
    static Segment index;

    /**
     * Setup RWI index
     *
     * @throws IOException
     */
    @BeforeClass
    public static void setUpClass() throws IOException {
        // setup a index segment
        index = new Segment(new ConcurrentLog("SegmentTest"),
                new File("test/DATA/INDEX/webportal/SEGMENTS"),
                new File("test/DATA/INDEX/webportal/ARCHIVE"),
                null, null);

        // connect RWI index
        index.connectRWI(10, 1024);
    }

    @AfterClass
    public static void tearDownClass() {
        index.close();
        ConcurrentLog.shutdown();
    }

    /**
     * Test of clear method (for RWI), of class Segment.
     */
    @Test
    public void testClear() throws MalformedURLException, IOException, SpaceExceededException {
        DigestURL url = new DigestURL("http://test.org/test.html");
        int urlComps = MultiProtocolURL.urlComps(url.toNormalform(true)).length;
        int urlLength = url.toNormalform(true).length();

        byte[] termHash = Word.word2hash("test");

        Word word = new Word(1, 1, 1);
        word.flags = new Bitfield(4); // flags must not be null

        WordReferenceRow ientry = new WordReferenceRow(
                url.hash(), urlLength, urlComps, 0, 1, 1,
                System.currentTimeMillis(), System.currentTimeMillis(),
                UTF8.getBytes("en"), Response.DT_TEXT, 0, 0);
        ientry.setWord(word);

        // add a dummy Word and WordReference
        index.termIndex.add(termHash, ientry);

        // check index count
        long cnt = index.RWICount();
        assertTrue(cnt > 0);

        index.clear();

        // check index count after clear
        cnt = index.RWICount();
        assertTrue(cnt == 0);
    }

    /**
     * Helper to store a text to the rwi index. This was derived from the
     * Segment.storeDocument() procedure.
     *
     * @param url pseudo url of the simulated test document
     * @param text of the document
     * @throws IOException
     * @throws SpaceExceededException
     */
    private void storeTestDocTextToTermIndex(DigestURL url, String text) throws IOException, SpaceExceededException {
        // set a pseudo url for the simulated test document
        final String urlNormalform = url.toNormalform(true);
        String dc_title = "Test Document";

        // STORE PAGE INDEX INTO WORD INDEX DB
        // create a word prototype which is re-used for all entries
        if (index.termIndex != null) {
            final int outlinksSame = 0;
            final int outlinksOther = 0;
            final int urlLength = urlNormalform.length();
            final int urlComps = MultiProtocolURL.urlComps(url.toNormalform(false)).length;
            final int wordsintitle = CommonPattern.SPACES.split(dc_title).length; // same calculation as for CollectionSchema.title_words_val

            WordCache meaningLib = new WordCache(null);
            boolean doAutotagging = false;
            VocabularyScraper scraper = null;

            Tokenizer t = new Tokenizer(url, text, meaningLib, doAutotagging, scraper);

            // create a WordReference template
            final WordReferenceRow ientry = new WordReferenceRow(
                    url.hash(), urlLength, urlComps, wordsintitle,
                    t.RESULT_NUMB_WORDS, t.RESULT_NUMB_SENTENCES,
                    System.currentTimeMillis(), System.currentTimeMillis(),
                    UTF8.getBytes("en"), Response.DT_TEXT,
                    outlinksSame, outlinksOther);

            // add the words to rwi index
            Word wprop = null;
            byte[] wordhash;
            String word;
            for (Map.Entry<String, Word> wentry : t.words().entrySet()) {
                word = wentry.getKey();
                wprop = wentry.getValue();
                assert (wprop.flags != null);
                ientry.setWord(wprop);
                wordhash = Word.word2hash(word);
                // note: index is a static field, so access it without a
                // misleading this-qualifier (was: this.index)
                if (index != null) {
                    index.termIndex.add(wordhash, ientry);
                }
            }
        }
    }

    /**
     * Simulates a multi word query for the rwi termIndex
     *
     * @throws SpaceExceededException
     * @throws MalformedURLException
     * @throws IOException
     */
    @Test
    public void testQuery_MultiWordQuery() throws SpaceExceededException, MalformedURLException, IOException {
        // creates one test url with this text in the rwi index
        DigestURL url = new DigestURL("http://test.org/test.html");
        storeTestDocTextToTermIndex(url, "One Two Three Four Five. This is a test text. One two three for five");

        // create a query to get the search word hashsets
        QueryGoal qg = new QueryGoal("five test ");
        HandleSet queryHashes = qg.getIncludeHashes();
        HandleSet excludeHashes = qg.getExcludeHashes();
        HandleSet urlselection = null;
        ReferenceFactory<WordReference> termFactory = Segment.wordReferenceFactory;

        // do the search
        TermSearch<WordReference> result = index.termIndex.query(queryHashes, excludeHashes, urlselection, termFactory, Integer.MAX_VALUE);

        // get the joined results
        ReferenceContainer<WordReference> wc = result.joined();

        // we should have now one result (stored to index above)
        assertTrue("test url hash in result set", wc.has(url.hash()));

        // the returned WordReference is expected to be a joined Reference with properties set used in ranking
        Iterator<WordReference> it = wc.entries();
        System.out.println("-----------------");
        // currently the results are not as expected for a multi-word query
        while (it.hasNext()) {
            WordReference r = it.next();
            // expected to be 1st in text
            System.out.println("posintext=" + r.positions() + " (expected=5)");
            // min position of search word in text
            System.out.println("minposition=" + r.minposition() + " (expected=5)");
            // max position of search word in text
            System.out.println("maxposition=" + r.maxposition() + " (expected=8)");
            // for a multiword query distance expected to be the avg of search word positions in text
            System.out.println("distance=" + r.distance() + " (expected=3)");
            // occurrence of search words in text
            System.out.println("hitcount=" + r.hitcount() + " (expected=2)");
        }
        System.out.println("-----------------");
    }
}
Loading…
Cancel
Save