small features, some bug fixes and performance hacks

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7733 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent e55c254f7b
commit 3ed4a09368

@ -1020,4 +1020,4 @@ color_searchurlhover = #008000
# - to check what is in solr after indexing, open http://localhost:8983/solr/admin/
federated.service.solr.indexing.enabled = false
federated.service.solr.indexing.url = http://127.0.0.1:8983/solr
federated.service.solr.indexing.scheme = SolrCell
federated.service.solr.indexing.scheme = SolrCellExtended

@ -273,7 +273,7 @@ public class Load_RSS_p {
if (RSSLoader.indexTriggered.containsKey(messageurl.hash())) continue loop;
if (sb.urlExists(Segments.Process.LOCALCRAWLING, messageurl.hash()) != null) continue loop;
sb.addToIndex(messageurl, null, null);
RSSLoader.indexTriggered.put(messageurl.hash(), new Date());
RSSLoader.indexTriggered.insertIfAbsent(messageurl.hash(), new Date());
} catch (IOException e) {
Log.logException(e);
} catch (Failure e) {

@ -90,7 +90,7 @@ public class RSSLoader extends Thread {
if (indexTriggered.containsKey(messageurl.hash())) continue loop;
if (sb.urlExists(Segments.Process.LOCALCRAWLING, messageurl.hash()) != null) continue loop;
sb.addToIndex(messageurl, null, null);
indexTriggered.put(messageurl.hash(), new Date());
indexTriggered.insertIfAbsent(messageurl.hash(), new Date());
loadCount++;
} catch (IOException e) {
Log.logException(e);

@ -54,6 +54,12 @@ public class ZURL implements Iterable<ZURL.Entry> {
private static final int EcoFSBufferSize = 2000;
private static final int maxStackSize = 1000;
/**
 * Coarse classification of a failure; the comment on each constant
 * states the situation that constant stands for.
 */
public enum FailCategory {
NETWORK_FAILURE, // an entity could not be loaded
CRAWL_RULE, // the crawler configuration does not want to load the entity
ROBOTS_RULE; // a remote server denies indexing or loading
}
private final static Row rowdef = new Row(
"String urlhash-" + Word.commonHashLength + ", " + // the url's hash
"String executor-" + Word.commonHashLength + ", " + // the crawling executor

@ -547,7 +547,7 @@ public final class Switchboard extends serverSwitch {
// set up the solr interface
String solrurl = this.getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr");
boolean usesolr = this.getConfigBool("federated.service.solr.indexing.enabled", false) & solrurl.length() > 0;
this.solrConnector = (usesolr) ? new SolrSingleConnector(solrurl, SolrScheme.SolrCell) : null;
this.solrConnector = (usesolr) ? new SolrSingleConnector(solrurl, SolrScheme.SolrCellExtended) : null;
// start a loader
log.logConfig("Starting Crawl Loader");

@ -91,7 +91,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
if (cache.containsKey(key)) return;
// learn new snippet
cache.put(key, snippet);
cache.insertIfAbsent(key, snippet);
}
public String get(final String wordhashes, final String urlhash) {

@ -312,8 +312,9 @@ public final class serverCore extends AbstractBusyThread implements BusyThread {
try {
// prepare for new connection
// idleThreadCheck();
this.switchboard.handleBusyState(getJobCount());
if (this.log.isFinest()) this.log.logFinest("* waiting for connections, " + getJobCount() + " sessions running");
int jobCount = getJobCount();
this.switchboard.handleBusyState(jobCount);
if (this.log.isFinest()) this.log.logFinest("* waiting for connections, " + jobCount + " sessions running");
announceThreadBlockApply();
@ -322,10 +323,9 @@ public final class serverCore extends AbstractBusyThread implements BusyThread {
announceThreadBlockRelease();
int pp/*, trycount = 0*/;
if ((pp = getJobCount()) >= this.maxBusySessions) {
if (jobCount >= this.maxBusySessions) {
terminateOldSessions(3000);
this.log.logInfo("termination of old sessions: before = " + pp + ", after = " + getJobCount());
this.log.logInfo("termination of old sessions: before = " + jobCount + ", after = " + getJobCount());
//if (getJobCount() < this.maxBusySessions) break;
//if (trycount++ > 5) break;
//Thread.sleep(1000); // lets try again after a short break
@ -458,7 +458,35 @@ public final class serverCore extends AbstractBusyThread implements BusyThread {
}
public int getJobCount() {
return getJobList().size();
final Thread[] threadList = new Thread[sessionThreadGroup.activeCount()];
serverCore.sessionThreadGroup.enumerate(threadList, false);
int c = 0;
for (Thread t: threadList) {
if (t == null) continue;
if (!(t instanceof Session)) {
//log.logSevere("serverCore.getJobList - thread is not Session: " + t.getClass().getName());
continue;
}
c++;
}
return c;
}
/**
 * Idle sensor: reports whether no session thread is currently running.
 * Takes a snapshot of the threads in the session thread group and stops
 * at the first one that is a {@code Session}, without counting them all.
 *
 * @return {@code true} if the snapshot contains no {@code Session} thread
 */
public boolean idle() {
    // snapshot of the group's threads; unused array slots stay null
    final Thread[] snapshot = new Thread[sessionThreadGroup.activeCount()];
    serverCore.sessionThreadGroup.enumerate(snapshot, false);
    for (int i = 0; i < snapshot.length; i++) {
        // instanceof is false for null, so empty slots and foreign threads are both skipped
        if (snapshot[i] instanceof Session) {
            return false;
        }
    }
    return true;
}
public int getMaxSessionCount() {
@ -468,12 +496,6 @@ public final class serverCore extends AbstractBusyThread implements BusyThread {
public void setMaxSessionCount(final int count) {
this.maxBusySessions = count;
}
// idle sensor: the thread is idle if there are no sessions running
public boolean idle() {
// idleThreadCheck();
return (getJobCount() == 0);
}
public final class Session extends Thread {

@ -53,7 +53,7 @@ public class yacyPeerActions {
userAgents = null;
}
public synchronized boolean connectPeer(final yacySeed seed, final boolean direct) {
public boolean connectPeer(final yacySeed seed, final boolean direct) {
// store a remote peer's seed
// returns true if the peer is new and previously unknown
if (seed == null) {

@ -501,7 +501,7 @@ public class Domains {
final Collection<String> hosts = NAME_CACHE_HIT.getKeys(i);
if (!hosts.isEmpty()) return hosts.iterator().next();
final String host = i.getHostName();
NAME_CACHE_HIT.put(host, i);
NAME_CACHE_HIT.insertIfAbsent(host, i);
return host;
/*
// call i.getHostName() using concurrency to interrupt execution in case of a time-out
@ -560,14 +560,14 @@ public class Domains {
ip = InetAddress.getByName(host); //TimeoutRequest.getByName(host, 1000); // this makes the DNS request to backbone
} catch (final UnknownHostException e) {
// add new entries
NAME_CACHE_MISS.put(host, PRESENT);
NAME_CACHE_MISS.insertIfAbsent(host, PRESENT);
LOOKUP_SYNC.remove(host);
return null;
}
if (ip != null && !ip.isLoopbackAddress() && !matchesList(host, nameCacheNoCachingPatterns)) {
// add new ip cache entries
NAME_CACHE_HIT.put(host, ip);
NAME_CACHE_HIT.insertIfAbsent(host, ip);
// add also the isLocal host name caches
boolean localp = ip.isAnyLocalAddress() || ip.isLinkLocalAddress() || ip.isSiteLocalAddress();

@ -44,15 +44,16 @@ import org.apache.solr.common.SolrInputDocument;
public enum SolrScheme {
SolrCell,
SolrCellExtended,
DublinCore;
public SolrInputDocument yacy2solr(String id, ResponseHeader header, Document document) {
if (this == SolrCell) return yacy2solrSolrCell(id, header, document);
if (this == SolrCellExtended) return yacy2solrSolrCellExtended(id, header, document);
return null;
}
public static SolrInputDocument yacy2solrSolrCell(String id, ResponseHeader header, Document yacydoc) {
public static SolrInputDocument yacy2solrSolrCellExtended(String id, ResponseHeader header, Document yacydoc) {
// we use the SolrCell design as index scheme
SolrInputDocument solrdoc = new SolrInputDocument();
DigestURI digestURI = new DigestURI(yacydoc.dc_source());
@ -199,7 +200,7 @@ public enum SolrScheme {
if (frames.length > 0) solrdoc.addField("attr_frames", frames);
// IFrames
Set<MultiProtocolURI> iframess = html.getFrames();
Set<MultiProtocolURI> iframess = html.getIFrames();
String[] iframes = new String[iframess.size()];
c = 0;
for (MultiProtocolURI entry: iframess) {

@ -170,28 +170,6 @@ public class SolrSingleConnector {
}
}
/*
public void addx(File file, String solrId) throws IOException {
ContentStreamUpdateRequest up = new ContentStreamUpdateRequest("/update/extract");
ModifiableSolrParams params = new ModifiableSolrParams();
List<ContentStream> contentStreams = new ArrayList<ContentStream>();
contentStreams.add(new ContentStreamBase.FileStream(file));
params.set("literal.id", solrId);
params.set("uprefix", "attr_");
params.set("fmap.content", "attr_content");
params.set( UpdateParams.COMMIT, "true" );
params.set( UpdateParams.WAIT_FLUSH, String.valueOf(true));
params.set( UpdateParams.WAIT_SEARCHER, String.valueOf(true));
try {
server.
server.request(up);
} catch (SolrServerException e) {
throw new IOException(e);
}
}
*/
public void add(String id, ResponseHeader header, Document doc) throws IOException {
add(this.scheme.yacy2solr(id, header, doc));
}
@ -297,7 +275,7 @@ public class SolrSingleConnector {
public static void main(String args[]) {
SolrSingleConnector solr;
try {
solr = new SolrSingleConnector("http://127.0.0.1:8983/solr", SolrScheme.SolrCell);
solr = new SolrSingleConnector("http://127.0.0.1:8983/solr", SolrScheme.SolrCellExtended);
solr.clear();
File exampleDir = new File("/Data/workspace2/yacy/test/parsertest/");
long t, t0, a = 0;

@ -49,17 +49,27 @@ public interface ARC<K, V> extends Iterable<Map.Entry<K, V>> {
/**
* put a value to the cache.
* do not return a previous content value
* @param s
* @param v
*/
public void insert(K s, V v);
/**
* put a value to the cache if there was not an entry before
* do not return a previous content value
* @param s
* @param v
*/
public void insertIfAbsent(K s, V v);
/**
* put a value to the cache.
* @param s
* @param v
*/
public V put(K s, V v);
/**
* get a value from the cache.

@ -89,7 +89,17 @@ public final class ConcurrentARC<K, V> extends AbstractMap<K, V> implements Map<
* @param v
*/
public final void insert(final K s, final V v) {
this.arc[getPartition(s)].put(s, v);
this.arc[getPartition(s)].insert(s, v);
}
/**
 * Store a value in the cache only if the key has no entry yet.
 * The call is forwarded to the partition selected for the key;
 * no previous value is returned.
 * @param s the key
 * @param v the value to store
 */
public void insertIfAbsent(K s, V v) {
    final int partition = getPartition(s);
    this.arc[partition].insertIfAbsent(s, v);
}
/**

@ -48,7 +48,7 @@ import java.util.Set;
abstract class SimpleARC<K, V> extends AbstractMap<K, V> implements Map<K, V>, Iterable<Map.Entry<K, V>>, ARC<K, V> {
protected int cacheSize;
protected Map<K, V> levelA, levelB;
protected Map<K, V> levelA, levelB; // we can assume that these maps are synchronized
/**
* put a value to the cache.
@ -64,6 +64,32 @@ abstract class SimpleARC<K, V> extends AbstractMap<K, V> implements Map<K, V>, I
assert (this.levelA.size() <= cacheSize); // the cache should shrink automatically
}
}
/**
 * Store a value in the cache only if neither level already holds the key.
 * No previous value is returned.
 * @param s the key
 * @param v the value to store
 */
public void insertIfAbsent(K s, V v) {
    // first look without holding the lock on this object
    if (this.levelB.containsKey(s)) return;
    if (this.levelA.containsKey(s)) return;
    synchronized (this) {
        // the checks above ran outside the synchronized block, so repeat them here
        if (this.levelB.containsKey(s)) return;
        if (this.levelA.containsKey(s)) return;
        this.levelA.put(s, v);
        assert (this.levelA.size() <= cacheSize); // the cache should shrink automatically
    }
}
/**
* put a value to the cache.
@ -93,6 +119,11 @@ abstract class SimpleARC<K, V> extends AbstractMap<K, V> implements Map<K, V>, I
V v = this.levelB.get(s);
if (v != null) return v;
synchronized (this) {
// we must repeat the get here because another thread may have moved the
// entry from A to B meanwhile
v = this.levelB.get(s);
if (v != null) return v;
// now get and move the entry to B
v = this.levelA.remove(s);
if (v == null) return null;
// move value from A to B; since it was already removed from A, just put it to B

@ -138,7 +138,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
/**
* evaluation scores: count appearance of specific attributes
*/
private Evaluation.Scores evaluationScores;
private Evaluation evaluationScores;
@SuppressWarnings("unchecked")
public ContentScraper(final MultiProtocolURI root) {
@ -146,7 +146,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
// it is only the reference for relative links
super(linkTags0, linkTags1);
this.root = root;
this.evaluationScores = new Evaluation.Scores();
this.evaluationScores = new Evaluation();
this.rss = new HashMap<MultiProtocolURI, String>();
this.css = new HashMap<MultiProtocolURI, String>();
this.anchors = new HashMap<MultiProtocolURI, Properties>();
@ -165,7 +165,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.htmlFilterEventListeners = new EventListenerList();
this.lon = 0.0f;
this.lat = 0.0f;
Evaluation.match(Element.url, root.toNormalform(false, false), this.evaluationScores);
this.evaluationScores.match(Element.url, root.toNormalform(false, false));
}
public void scrapeText(final char[] newtext, final String insideTag) {
@ -173,7 +173,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
int p, pl, q, s = 0;
// match evaluation pattern
Evaluation.match(Element.text, newtext, this.evaluationScores);
this.evaluationScores.match(Element.text, newtext);
// try to find location information in text
// Opencaching:
@ -289,27 +289,29 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
}
} catch (final NumberFormatException e) {}
Evaluation.match(Element.imgpath, src, this.evaluationScores);
this.evaluationScores.match(Element.imgpath, src);
} else if(tagname.equalsIgnoreCase("base")) {
try {
root = new MultiProtocolURI(tagopts.getProperty("href", ""));
} catch (final MalformedURLException e) {}
} else if (tagname.equalsIgnoreCase("frame")) {
anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts /* with property "name" */);
frames.add(absolutePath(tagopts.getProperty("src", "")));
MultiProtocolURI src = absolutePath(tagopts.getProperty("src", ""));
anchors.put(src, tagopts /* with property "name" */);
frames.add(src);
this.evaluationScores.match(Element.framepath, src.toNormalform(true, false));
} else if (tagname.equalsIgnoreCase("body")) {
String c = tagopts.getProperty("class", "");
Evaluation.match(Element.bodyclass, c, this.evaluationScores);
this.evaluationScores.match(Element.bodyclass, c);
} else if (tagname.equalsIgnoreCase("div")) {
String id = tagopts.getProperty("id", "");
Evaluation.match(Element.divid, id, this.evaluationScores);
this.evaluationScores.match(Element.divid, id);
} else if (tagname.equalsIgnoreCase("meta")) {
String name = tagopts.getProperty("name", "");
String content = tagopts.getProperty("content","");
if (name.length() > 0) {
metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content));
if (name.equals("generator")) {
Evaluation.match(Element.metagenerator, content, this.evaluationScores);
this.evaluationScores.match(Element.metagenerator, content);
}
} else {
name = tagopts.getProperty("http-equiv", "");
@ -340,7 +342,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
rss.put(newLink, linktitle);
} else if (rel.equalsIgnoreCase("stylesheet") && type.equalsIgnoreCase("text/css")) {
css.put(newLink, rel);
Evaluation.match(Element.csspath, href, this.evaluationScores);
this.evaluationScores.match(Element.csspath, href);
} else if (!rel.equalsIgnoreCase("stylesheet") && !rel.equalsIgnoreCase("alternate stylesheet")) {
Properties p = new Properties(); p.put("name", linktitle);
anchors.put(newLink, p);
@ -377,7 +379,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
anchors.put(url, tagopts);
}
}
Evaluation.match(Element.apath, href, this.evaluationScores);
this.evaluationScores.match(Element.apath, href);
}
final String h;
if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) {
@ -413,17 +415,17 @@ public class ContentScraper extends AbstractScraper implements Scraper {
h = recursiveParse(text);
if (h.length() > 0) li.add(h);
} else if (tagname.equalsIgnoreCase("iframe")) {
String src = tagopts.getProperty("src", "");
anchors.put(absolutePath(src), tagopts /* with property "name" */);
iframes.add(absolutePath(src));
Evaluation.match(Element.iframepath, src, this.evaluationScores);
MultiProtocolURI src = absolutePath(tagopts.getProperty("src", ""));
anchors.put(src, tagopts /* with property "name" */);
iframes.add(src);
this.evaluationScores.match(Element.iframepath, src.toNormalform(true, false));
} else if (tagname.equalsIgnoreCase("script")) {
String src = tagopts.getProperty("src", "");
if (src.length() > 0) {
script.add(absolutePath(src));
Evaluation.match(Element.scriptpath, src, this.evaluationScores);
this.evaluationScores.match(Element.scriptpath, src);
} else {
Evaluation.match(Element.scriptcode, text, this.evaluationScores);
this.evaluationScores.match(Element.scriptcode, text);
}
}
@ -433,7 +435,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
public void scrapeComment(final char[] comment) {
Evaluation.match(Element.comment, comment, this.evaluationScores);
this.evaluationScores.match(Element.comment, comment);
}
private String recursiveParse(final char[] inlineHtml) {

@ -68,6 +68,7 @@ public class Evaluation {
url,
scriptpath,
scriptcode,
framepath,
iframepath,
imgpath,
apath,
@ -156,32 +157,29 @@ public class Evaluation {
}
}
public static class Scores {
private Map<String, ClusteredScoreMap<String>> modelMap; // a map from model names to attribute scores
public Scores() {
this.modelMap = new HashMap<String, ClusteredScoreMap<String>>();
}
/**
* produce all model names
* @return a set of model names
*/
public Set<String> getModelNames() {
return this.modelMap.keySet();
}
/**
* calculate the scores for a model
* the scores is a attribute/count map which count how often a specific attribute was found
* @param modelName
* @return
*/
public ClusteredScoreMap<String> getScores(String modelName) {
return this.modelMap.get(modelName);
}
private final Map<String, ClusteredScoreMap<String>> modelMap; // a map from model names to attribute scores
public Evaluation() {
this.modelMap = new HashMap<String, ClusteredScoreMap<String>>();
}
/**
* produce all model names
* @return a set of model names
*/
public Set<String> getModelNames() {
return this.modelMap.keySet();
}
/**
* calculate the scores for a model
* the scores is a attribute/count map which count how often a specific attribute was found
* @param modelName
* @return
*/
public ClusteredScoreMap<String> getScores(String modelName) {
return this.modelMap.get(modelName);
}
/**
@ -199,25 +197,24 @@ public class Evaluation {
* this will increase statistic counters for models if a model matches
* @param element - the element where a matching is made
* @param content - the content of the element which shall be matched
* @param scores - the score object where the scores are stored
*/
public static void match(Element element, String content, Scores scores) {
public void match(Element element, String content) {
if (models.isEmpty()) return; // fast return if this feature is not used
ClusteredScoreMap<String> newScores, oldScores;
for (Model pattern: models) {
newScores = pattern.match(element, content);
oldScores = scores.getScores(pattern.getName());
oldScores = this.getScores(pattern.getName());
if (oldScores == null) {
oldScores = new ClusteredScoreMap<String>();
scores.modelMap.put(pattern.getName(), oldScores);
this.modelMap.put(pattern.getName(), oldScores);
}
oldScores.inc(newScores);
}
}
public static void match(Element element, char[] content, Scores scores) {
public void match(Element element, char[] content) {
if (models.isEmpty()) return; // fast return if this feature is not used
match(element, new String(content), scores);
match(element, new String(content));
}
}

@ -160,7 +160,7 @@ public class MapHeap implements Map<byte[], Map<String, String>> {
if (MemoryControl.shortStatus()) {
cache.clear();
} else {
cache.put(key, newMap);
cache.insert(key, newMap);
}
}
}
@ -304,7 +304,7 @@ public class MapHeap implements Map<byte[], Map<String, String>> {
cache.clear();
} else {
// write map to cache
cache.put(key, map);
cache.insert(key, map);
}
}

@ -55,7 +55,7 @@ public class Word {
*/
public static final int commonHashLength = 12;
private static final int hashCacheSize = Math.max(100000, Math.min(10000000, (int) (MemoryControl.available() / 20000L)));
private static final int hashCacheSize = Math.max(200000, Math.min(10000000, (int) (MemoryControl.available() / 20000L)));
private static ARC<String, byte[]> hashCache = null;
static {
try {
@ -122,7 +122,7 @@ public class Word {
if (MemoryControl.shortStatus()) {
hashCache.clear();
} else {
hashCache.put(wordlc, h); // prevent expensive MD5 computation and encoding
hashCache.insertIfAbsent(wordlc, h); // prevent expensive MD5 computation and encoding
}
return h;
}

@ -189,7 +189,7 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
}
// put count result into cache
if (MemoryControl.shortStatus()) this.countCache.clear();
this.countCache.put(termHash, c);
this.countCache.insert(termHash, c);
return c;
}

Loading…
Cancel
Save