Merge branch 'master' of github.com:yacy/yacy_search_server

pull/23/head
Michael Peter Christen 10 years ago
commit a44cc774d0

@ -85,7 +85,7 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
<td valign="top" align="left"><input type="checkbox" name="item_#[count]#" value="mark_#[pk]#" /></td>
<td valign="top">#[type]#
#(isCrawlerStart)#::<br/><br/>
<a href="#[url]#" title="clone"><img src="env/grafics/doc.gif"><img src="env/grafics/right.gif"><img src="env/grafics/doc.gif"></a>::<br/><br/>
<a href="#[url]#" title="clone" target="_parent"><img src="env/grafics/doc.gif"><img src="env/grafics/right.gif"><img src="env/grafics/doc.gif"></a>::<br/><br/>
<script>
var f = document.createElement("form");
@ -93,7 +93,9 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
f.setAttribute("enctype", "multipart/form-data");
f.setAttribute("accept-charset", "UTF-8");
f.setAttribute("action", "#[servlet]#");
f.setAttribute("target", "_parent");
f.setAttribute("id", "#[pk]#");
f.setAttribute("name", "#[pk]#");
#{attr}#
var e = document.createElement("input");
e.setAttribute("type", "hidden");

@ -316,7 +316,7 @@ public class yacysearchitem {
final String license = URLLicense.aquireLicense(image.imageUrl); // this is just the license key to get the image forwarded through the YaCy thumbnail viewer, not an actual lawful license
//sb.loader.loadIfNotExistBackground(image.imageUrl, 1024 * 1024 * 10, null, ClientIdentification.yacyIntranetCrawlerAgent);
prop.putHTML("content_item_hrefCache", "ViewImage." + ("gif.png.svg".contains(imageUrlExt) ? imageUrlExt : "png") + "?maxwidth=128&maxheight=128&code="+license+"&isStatic=true&quadratic=&url=" + imageUrlstring);
prop.putHTML("content_item_hrefCache", "ViewImage." + (!imageUrlExt.isEmpty() && "gif.png.svg".contains(imageUrlExt) ? imageUrlExt : "png") + "?maxwidth=128&maxheight=128&code="+license+"&isStatic=true&quadratic=&url=" + imageUrlstring);
prop.putHTML("content_item_href", imageUrlstring);
prop.putHTML("content_item_target", target);
prop.put("content_item_code", license);

@ -945,6 +945,14 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
return this.searchpart;
}
/**
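* Returns the search part parameters as a key=value map.
* Keys and values are returned in the internal url-encoded format;
* for unescaped (clear text) values use the method referenced below.
* @see #getAttributes()
*
* @return map of parameter name to parameter value, or null if there is no search part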
*/
public Map<String, String> getSearchpartMap() {
if (this.searchpart == null) return null;
this.searchpart = this.searchpart.replaceAll("&amp;", "&");
@ -1027,6 +1035,10 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
/**
* Evaluates url search part and returns attribute '=' value pairs
* the returned values are in clear text (without urlencoding).
*
* To get the parameter map with url-encoded keys and values
* @see #getSearchpartMap()
*
* @return map key=attribute name, value=string after '='
*/
@ -1037,9 +1049,9 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
for (final String element : questp) {
int p = element.indexOf('=');
if (p != -1) {
map.put(element.substring(0, p), element.substring(p + 1));
map.put(unescape(element.substring(0, p)), unescape(element.substring(p + 1)));
} else {
if (!element.isEmpty()) map.put(element, "");
if (!element.isEmpty()) map.put(unescape(element), "");
}
}
return map;

@ -35,6 +35,7 @@ import net.yacy.crawler.data.NoticedURL;
import net.yacy.crawler.retrieval.Request;
import net.yacy.kelondro.workflow.AbstractBusyThread;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.search.schema.CollectionSchema;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
@ -53,10 +54,11 @@ public class RecrawlBusyThread extends AbstractBusyThread {
private String currentQuery = CollectionSchema.fresh_date_dt.getSolrFieldName()+":[* TO NOW/DAY-1DAY]"; // current query
private boolean includefailed = false; // flag if docs with httpstatus_i <> 200 shall be recrawled
private int chunkstart = 0;
private int chunksize = 200;
private final int chunksize;
final Switchboard sb;
private final Set<DigestURL> urlstack; // buffer of urls to recrawl
public long urlsfound = 0;
private String solrSortBy;
public RecrawlBusyThread(Switchboard xsb) {
super(3000, 1000); // set lower limits of cycle delay
@ -66,6 +68,10 @@ public class RecrawlBusyThread extends AbstractBusyThread {
this.sb = xsb;
urlstack = new HashSet<DigestURL>();
// workaround to prevent solr exception on existing index (not fully reindexed) since intro of schema with docvalues
// org.apache.solr.core.SolrCore java.lang.IllegalStateException: unexpected docvalues type NONE for field 'load_date_dt' (expected=NUMERIC). Use UninvertingReader or index with docvalues.
solrSortBy = null; // CollectionSchema.load_date_dt.getSolrFieldName() + " asc";
this.chunksize = sb.getConfigInt(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 200);
}
/**
@ -142,8 +148,8 @@ public class RecrawlBusyThread extends AbstractBusyThread {
*/
@Override
public boolean job() {
// other crawls are running, do nothing
if (sb.crawlQueues.coreCrawlJobSize() > 0) {
// more than chunksize crawls are running, do nothing
if (sb.crawlQueues.coreCrawlJobSize() > this.chunksize) {
return false;
}
@ -168,7 +174,7 @@ public class RecrawlBusyThread extends AbstractBusyThread {
try {
// query all or only httpstatus=200 depending on includefailed flag
docList = solrConnector.getDocumentListByQuery(this.includefailed ? currentQuery : currentQuery + " AND (" + CollectionSchema.httpstatus_i.name() + ":200)",
CollectionSchema.load_date_dt.getSolrFieldName() + " asc", this.chunkstart, this.chunksize, CollectionSchema.sku.getSolrFieldName());
this.solrSortBy, this.chunkstart, this.chunksize, CollectionSchema.sku.getSolrFieldName());
this.urlsfound = docList.getNumFound();
} catch (Throwable e) {
this.urlsfound = 0;

@ -57,7 +57,6 @@ import net.yacy.document.SentenceReader;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.htmlParser;
import net.yacy.document.parser.html.Evaluation.Element;
import net.yacy.document.parser.images.genericImageParser;
import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.ISO639;
@ -552,12 +551,6 @@ public class ContentScraper extends AbstractScraper implements Scraper {
href = CharacterCoding.html2unicode(href);
AnchorURL url;
if ((href.length() > 0) && ((url = absolutePath(href)) != null)) {
final String ext = MultiProtocolURL.getFileExtension(url.getFileName());
if (genericImageParser.SUPPORTED_EXTENSIONS.contains(ext)) {
// special handling of such urls: put them to the image urls
final ImageEntry ie = new ImageEntry(url, recursiveParse(url, tag.content.getChars()), -1, -1, -1);
this.images.add(ie);
} else {
if (followDenied()) {
String rel = tag.opts.getProperty("rel", EMPTY_STRING);
if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow";
@ -569,7 +562,6 @@ public class ContentScraper extends AbstractScraper implements Scraper {
recursiveParse(url, tag.content.getChars());
this.anchors.add(url);
}
}
this.evaluationScores.match(Element.apath, href);
}
final String h;

@ -291,8 +291,7 @@ public final class TransformerWriter extends Writer {
}
if (this.transformer != null && this.transformer.isTag0(tagname)) {
// this single tag is collected at once here
char[] b = new char[0];
b = this.transformer.transformTag0(tag, quotechar);
char[] b = this.transformer.transformTag0(tag, quotechar);
return b;
} else if ((this.scraper != null && this.scraper.isTag1(tagname)) ||
(this.transformer != null && this.transformer.isTag1(tagname))) {

@ -65,18 +65,17 @@ import com.drew.metadata.Metadata;
import com.drew.metadata.Tag;
import com.drew.metadata.exif.GpsDirectory;
public class genericImageParser extends AbstractParser implements Parser {
/**
* a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes()
* Parser for images: bmp, jpeg and all formats supported by the Java Image I/O API.
* By default Java ImageIO supports bmp, gif, jpg, jpeg, png, wbmp (tif if jai-imageio is in classpath/registered)
* http://download.java.net/media/jai-imageio/javadoc/1.1/overview-summary.html
*/
public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
static {
public class genericImageParser extends AbstractParser implements Parser {
public genericImageParser() {
super("Generic Image Parser");
SUPPORTED_EXTENSIONS.add("bmp");
// by default java ImageIO supports bmp, gif, jpg, jpeg, png, wbmp (tif if jai-imageio is in classpath/registered)
// http://download.java.net/media/jai-imageio/javadoc/1.1/overview-summary.html
SUPPORTED_EXTENSIONS.add("jpe"); // not listed in the ImageIO extensions but sometimes used for jpeg
SUPPORTED_EXTENSIONS.addAll(Arrays.asList(ImageIO.getReaderFileSuffixes()));
@ -85,10 +84,6 @@ public class genericImageParser extends AbstractParser implements Parser {
SUPPORTED_MIME_TYPES.addAll(Arrays.asList(ImageIO.getReaderMIMETypes()));
}
public genericImageParser() {
super("Generic Image Parser");
}
@Override
public Document[] parse(
final AnchorURL location,
@ -129,6 +124,13 @@ public class genericImageParser extends AbstractParser implements Parser {
byte[] b;
try {
b = FileUtils.read(source);
// check jpeg file signature (magic number FF D8 FF)
if (b.length < 3
|| (b[0] != (byte) 0xFF) // cast to signed byte (-1)
|| (b[1] != (byte) 0xD8) //cast to signed byte (-40)
|| (b[2] != (byte) 0xFF)) {
throw new Parser.Failure("File has no jpeg signature", location);
}
} catch (final IOException e) {
ConcurrentLog.logException(e);
throw new Parser.Failure(e.getMessage(), location);
@ -226,7 +228,7 @@ public class genericImageParser extends AbstractParser implements Parser {
return SUPPORTED_EXTENSIONS;
}
public static ImageInfo parseJavaImage(
private ImageInfo parseJavaImage(
final AnchorURL location,
final InputStream sourceStream) throws Parser.Failure {
BufferedImage image = null;
@ -241,7 +243,7 @@ public class genericImageParser extends AbstractParser implements Parser {
return parseJavaImage(location, image);
}
public static ImageInfo parseJavaImage(
private ImageInfo parseJavaImage(
final AnchorURL location,
final BufferedImage image) {
final ImageInfo ii = new ImageInfo(location);
@ -278,7 +280,7 @@ public class genericImageParser extends AbstractParser implements Parser {
return ii;
}
public static class ImageInfo {
private class ImageInfo {
public AnchorURL location;
public BufferedImage image;
public StringBuilder info;

@ -917,14 +917,22 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if (document.getContentDomain() == ContentDomain.IMAGE) {
// add image pixel size if known
Iterator<ImageEntry> imgit = document.getImages().values().iterator();
if (imgit.hasNext()) {
List<Integer> heights = new ArrayList<>();
List<Integer> widths = new ArrayList<>();
List<Integer> pixels = new ArrayList<>();
while (imgit.hasNext()) {
ImageEntry img = imgit.next();
int imgpixels = (img.height() < 0 || img.width() < 0) ? -1 : img.height() * img.width();
if (imgpixels > 0) {
if (allAttr || contains(CollectionSchema.images_height_val)) add(doc, CollectionSchema.images_height_val, img.height());
if (allAttr || contains(CollectionSchema.images_width_val)) add(doc, CollectionSchema.images_width_val, img.width());
if (allAttr || contains(CollectionSchema.images_pixel_val)) add(doc, CollectionSchema.images_pixel_val, imgpixels);
if (imgpixels > 0 && (allAttr || (contains(CollectionSchema.images_height_val) && contains(CollectionSchema.images_width_val) && contains(CollectionSchema.images_pixel_val)))) {
heights.add(img.height());
widths.add(img.width());
pixels.add(imgpixels);
}
}
if (heights.size() > 0) {
add(doc, CollectionSchema.images_height_val, heights);
add(doc, CollectionSchema.images_width_val, widths);
add(doc, CollectionSchema.images_pixel_val, pixels);
}
if (allAttr || contains(CollectionSchema.images_text_t)) {

Loading…
Cancel
Save