From 48c0d508acf49fb3ac89033bdfd2ac3140517643 Mon Sep 17 00:00:00 2001 From: orbiter Date: Sat, 25 Sep 2010 22:32:26 +0000 Subject: [PATCH] fixes for crawling of smb links (file length not always available) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7190 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/yacysearchitem.java | 6 ++-- .../anomic/crawler/retrieval/FileLoader.java | 7 +++- .../anomic/crawler/retrieval/SMBLoader.java | 7 +++- source/de/anomic/search/DocumentIndex.java | 8 ++++- source/de/anomic/yacy/yacySeed.java | 10 +++--- .../yacy/cora/document/MultiProtocolURI.java | 32 ++++++++++--------- source/net/yacy/kelondro/util/FileUtils.java | 8 ++--- 7 files changed, 50 insertions(+), 28 deletions(-) diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java index 453d26208..8180913b1 100644 --- a/htroot/yacysearchitem.java +++ b/htroot/yacysearchitem.java @@ -31,6 +31,7 @@ import java.util.TreeSet; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.DateFormatter; import net.yacy.kelondro.util.EventTracker; import net.yacy.kelondro.util.Formatter; @@ -108,6 +109,7 @@ public class yacysearchitem { if (!result.url().isLocal()) try { faviconURL = new DigestURI(result.url().getProtocol() + "://" + result.url().getHost() + ((port != -1) ? (":" + port) : "") + "/favicon.ico", null); } catch (final MalformedURLException e1) { + Log.logException(e1); faviconURL = null; } @@ -124,8 +126,8 @@ public class yacysearchitem { prop.putJSON("content_title-json", result.title()); prop.putHTML("content_link", result.urlstring()); prop.put("content_display", display); - if (isHtml) sb.loader.loadIfNotExistBackground(faviconURL.toNormalform(true, false), 1024 * 1024 * 10); - prop.putHTML("content_faviconCode", sb.licensedURLs.aquireLicense(faviconURL)); // aquire license for favicon url loading + if (faviconURL != null && isHtml) sb.loader.loadIfNotExistBackground(faviconURL.toNormalform(true, false), 1024 * 1024 * 10); + prop.putHTML("content_faviconCode", sb.licensedURLs.aquireLicense(faviconURL)); // acquire license for favicon url loading prop.put("content_urlhash", resulthashString); prop.put("content_urlhexhash", yacySeed.b64Hash2hexHash(resulthashString)); prop.putHTML("content_urlname", nxTools.shortenURLString(result.urlname(), urllength)); diff --git a/source/de/anomic/crawler/retrieval/FileLoader.java b/source/de/anomic/crawler/retrieval/FileLoader.java index 785356a55..de3b055a3 100644 --- a/source/de/anomic/crawler/retrieval/FileLoader.java +++ b/source/de/anomic/crawler/retrieval/FileLoader.java @@ -103,7 +103,12 @@ public class FileLoader { // check mime type and availability of parsers // and also check resource size and limitation of the size - long size = url.length(); + long size; + try { + size = url.length(); + } catch (Exception e) { + size = -1; + } String parserError = null; if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) || (size > maxFileSize && maxFileSize >= 0)) { diff --git a/source/de/anomic/crawler/retrieval/SMBLoader.java b/source/de/anomic/crawler/retrieval/SMBLoader.java index e755b90bb..0dc563d02 100644 --- a/source/de/anomic/crawler/retrieval/SMBLoader.java +++ b/source/de/anomic/crawler/retrieval/SMBLoader.java @@ -122,7 +122,12 @@ public class SMBLoader { // check mime type and availability of parsers // and also check resource size and limitation of the size - long size = url.length(); + long size; + try { + size = url.length(); + } catch (Exception e) { + size = -1; + } String parserError = null; if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) || (size > maxFileSize && maxFileSize >= 0)) { diff --git a/source/de/anomic/search/DocumentIndex.java b/source/de/anomic/search/DocumentIndex.java index 7ccd754c0..46992bd84 100644 --- a/source/de/anomic/search/DocumentIndex.java +++ b/source/de/anomic/search/DocumentIndex.java @@ -121,8 +121,14 @@ public class DocumentIndex extends Segment { if (url.isDirectory()) throw new IOException("file should be a document, not a path"); if (!url.canRead()) throw new IOException("cannot read file"); Document[] documents; + long length; try { - documents = TextParser.parseSource(url, null, null, url.length(), url.getInputStream(null, -1)); + length = url.length(); + } catch (Exception e) { + length = -1; + } + try { + documents = TextParser.parseSource(url, null, null, length, url.getInputStream(null, -1)); } catch (Exception e) { throw new IOException("cannot parse " + url.toString() + ": " + e.getMessage()); } diff --git a/source/de/anomic/yacy/yacySeed.java b/source/de/anomic/yacy/yacySeed.java index ff1b5e311..7b9f72d3e 100644 --- a/source/de/anomic/yacy/yacySeed.java +++ b/source/de/anomic/yacy/yacySeed.java @@ -657,6 +657,10 @@ public class yacySeed implements Cloneable { public final boolean isOnline(final String type) { return type.equals(yacySeed.PEERTYPE_SENIOR) || type.equals(yacySeed.PEERTYPE_PRINCIPAL); } + + public long nextLong(Random random, long n) { + return Math.abs(random.nextLong()) % n; + } private static byte[] bestGap(final yacySeedDB seedDB) { byte[] randomHash = randomHash(); @@ -678,10 +682,8 @@ public class yacySeed implements Cloneable { // find dht position and size of gap long left = FlatWordPartitionScheme.std.dhtPosition(interval.substring(0, 12).getBytes(), null); long right = FlatWordPartitionScheme.std.dhtPosition(interval.substring(12).getBytes(), null); - final long gap4 = FlatWordPartitionScheme.dhtDistance(left, right) >> 2; // a quarter of a gap - long gapx = gap4; - if (random.nextBoolean()) gapx += gap4; - if (random.nextBoolean()) gapx += gap4; + final long gap8 = FlatWordPartitionScheme.dhtDistance(left, right) >> 3; // 1/8 of a gap + long gapx = gap8 + (Math.abs(random.nextLong()) % (6 * gap8)); long gappos = (Long.MAX_VALUE - left >= gapx) ? left + gapx : (left - Long.MAX_VALUE) + gapx; byte[] computedHash = FlatWordPartitionScheme.positionToHash(gappos); // the computed hash is the perfect position (modulo gap4 population and gap alternatives) diff --git a/source/net/yacy/cora/document/MultiProtocolURI.java b/source/net/yacy/cora/document/MultiProtocolURI.java index 1c761d6fd..c58856a03 100644 --- a/source/net/yacy/cora/document/MultiProtocolURI.java +++ b/source/net/yacy/cora/document/MultiProtocolURI.java @@ -59,6 +59,7 @@ public class MultiProtocolURI implements Serializable { private static final Pattern patternBackSlash = Pattern.compile("\\\\"); private static final Pattern patternAmp = Pattern.compile("&"); private static final Pattern patternMail = Pattern.compile("^[a-z]+:.*?"); + //private static final Pattern patternSpace = Pattern.compile("%20"); // session id handling private static final Collator insensitiveCollator = Collator.getInstance(Locale.US); @@ -116,6 +117,7 @@ public class MultiProtocolURI implements Serializable { // identify protocol assert (url != null); url = url.trim(); + //url = patternSpace.matcher(url).replaceAll(" "); if (url.startsWith("\\\\")) { url = "smb://" + patternBackSlash.matcher(url.substring(2)).replaceAll("/"); } @@ -857,9 +859,9 @@ public class MultiProtocolURI implements Serializable { if (isSMB()) try { return getSmbFile().exists(); } catch (SmbException e) { - throw new IOException("SMB.exists SmbException for " + this.toString() + ": " + e.getMessage()); + throw new IOException("SMB.exists SmbException (" + e.getMessage() + ") for " + this.toString()); } catch (MalformedURLException e) { - throw new IOException("SMB.exists MalformedURLException for " + this.toString() + ": " + e.getMessage()); + throw new IOException("SMB.exists MalformedURLException (" + e.getMessage() + ") for " + this.toString()); } return false; } @@ -869,9 +871,9 @@ public class MultiProtocolURI implements Serializable { if (isSMB()) try { return getSmbFile().canRead(); } catch (SmbException e) { - throw new IOException("SMB.canRead SmbException for " + this.toString() + ": " + e.getMessage()); + throw new IOException("SMB.canRead SmbException (" + e.getMessage() + ") for " + this.toString()); } catch (MalformedURLException e) { - throw new IOException("SMB.canRead MalformedURLException for " + this.toString() + ": " + e.getMessage()); + throw new IOException("SMB.canRead MalformedURLException (" + e.getMessage() + ") for " + this.toString()); } return false; } @@ -881,9 +883,9 @@ public class MultiProtocolURI implements Serializable { if (isSMB()) try { return getSmbFile().canWrite(); } catch (SmbException e) { - throw new IOException("SMB.canWrite SmbException for " + this.toString() + ": " + e.getMessage()); + throw new IOException("SMB.canWrite SmbException (" + e.getMessage() + ") for " + this.toString()); } catch (MalformedURLException e) { - throw new IOException("SMB.canWrite MalformedURLException for " + this.toString() + ": " + e.getMessage()); + throw new IOException("SMB.canWrite MalformedURLException (" + e.getMessage() + ") for " + this.toString()); } return false; } @@ -893,9 +895,9 @@ public class MultiProtocolURI implements Serializable { if (isSMB()) try { return getSmbFile().isHidden(); } catch (SmbException e) { - throw new IOException("SMB.isHidden SmbException for " + this.toString() + ": " + e.getMessage()); + throw new IOException("SMB.isHidden SmbException (" + e.getMessage() + ") for " + this.toString()); } catch (MalformedURLException e) { - throw new IOException("SMB.isHidden MalformedURLException for " + this.toString() + ": " + e.getMessage()); + throw new IOException("SMB.isHidden MalformedURLException (" + e.getMessage() + ") for " + this.toString()); } return false; } @@ -905,9 +907,9 @@ public class MultiProtocolURI implements Serializable { if (isSMB()) try { return getSmbFile().isDirectory(); } catch (SmbException e) { - throw new IOException("SMB.isDirectory SmbException for " + this.toString() + ": " + e.getMessage()); + throw new IOException("SMB.isDirectory SmbException (" + e.getMessage() + ") for " + this.toString()); } catch (MalformedURLException e) { - throw new IOException("SMB.isDirectory MalformedURLException for " + this.toString() + ": " + e.getMessage()); + throw new IOException("SMB.isDirectory MalformedURLException (" + e.getMessage() + ") for " + this.toString()); } return false; } @@ -917,9 +919,9 @@ public class MultiProtocolURI implements Serializable { if (isSMB()) try { return getSmbFile().length(); } catch (SmbException e) { - throw new IOException("SMB.length SmbException for " + this.toString() + ": " + e.getMessage()); + throw new IOException("SMB.length SmbException (" + e.getMessage() + ") for " + this.toString()); } catch (MalformedURLException e) { - throw new IOException("SMB.length MalformedURLException for " + this.toString() + ": " + e.getMessage()); + throw new IOException("SMB.length MalformedURLException (" + e.getMessage() + ") for " + this.toString()); } return 0; } @@ -929,9 +931,9 @@ public class MultiProtocolURI implements Serializable { if (isSMB()) try { return getSmbFile().lastModified(); } catch (SmbException e) { - throw new IOException("SMB.lastModified SmbException for " + this.toString() + ": " + e.getMessage()); + throw new IOException("SMB.lastModified SmbException (" + e.getMessage() + ") for " + this.toString()); } catch (MalformedURLException e) { - throw new IOException("SMB.lastModified MalformedURLException for " + this.toString() + ": " + e.getMessage()); + throw new IOException("SMB.lastModified MalformedURLException (" + e.getMessage() + ") for " + this.toString()); } return 0; } @@ -941,7 +943,7 @@ public class MultiProtocolURI implements Serializable { if (isSMB()) try { return getSmbFile().getName(); } catch (MalformedURLException e) { - throw new IOException("SMB.getName MalformedURLException for " + this.toString() + ": " + e.getMessage()); + throw new IOException("SMB.getName MalformedURLException (" + e.getMessage() + ") for " + this.toString() ); } return null; } diff --git a/source/net/yacy/kelondro/util/FileUtils.java b/source/net/yacy/kelondro/util/FileUtils.java index 1f90bbb19..e7a2a986a 100644 --- a/source/net/yacy/kelondro/util/FileUtils.java +++ b/source/net/yacy/kelondro/util/FileUtils.java @@ -85,8 +85,8 @@ public final class FileUtils { * @see #copy(File source, File dest) */ public static long copy(final InputStream source, final OutputStream dest, final long count) throws IOException { - assert count == -1 || count > 0 : "precondition violated: count == " + count + " (nothing to copy)"; - if(count == 0) { + assert count < 0 || count > 0 : "precondition violated: count == " + count + " (nothing to copy)"; + if (count == 0) { // no bytes to copy return 0; } @@ -101,7 +101,7 @@ public final class FileUtils { total += c; if (count > 0) { - chunkSize = (int) Math.min(count-total, DEFAULT_BUFFER_SIZE); + chunkSize = (int) Math.min(count - total, DEFAULT_BUFFER_SIZE); if (chunkSize == 0) break; } @@ -275,7 +275,7 @@ public final class FileUtils { return b; } final ByteArrayOutputStream baos = new ByteArrayOutputStream(512); - copy(source, baos, count); + copy(source, baos); baos.close(); return baos.toByteArray(); }