Now writing image alt texts and (camel-case-)parsed URLs into a text

search field for better image retrieval.
pull/1/head
Michael Peter Christen 12 years ago
parent c36720d45f
commit f9d859f5dc

@ -36,6 +36,7 @@ import java.io.Serializable;
import java.net.InetAddress; import java.net.InetAddress;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.util.LinkedHashMap; import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Locale; import java.util.Locale;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
@ -832,18 +833,16 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
* resulting words are not ordered by appearance, but all * resulting words are not ordered by appearance, but all
* @return * @return
*/ */
private static String toTokens(String s) { private static String toTokens(final String s) {
// unesape string
String t = s;
// remove all non-character & non-number // remove all non-character & non-number
final StringBuilder sb = new StringBuilder(t.length()); final StringBuilder sb = new StringBuilder(s.length());
char c; char c;
for (int i = 0; i < t.length(); i++) { for (int i = 0; i < s.length(); i++) {
c = t.charAt(i); c = s.charAt(i);
if ((c >= '0' && c <='9') || (c >= 'a' && c <='z') || (c >= 'A' && c <='Z')) sb.append(c); else sb.append(' '); if ((c >= '0' && c <='9') || (c >= 'a' && c <='z') || (c >= 'A' && c <='Z')) sb.append(c); else sb.append(' ');
} }
t = sb.toString();
String t = sb.toString();
// remove all double-spaces // remove all double-spaces
int p; int p;
@ -851,39 +850,39 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
// split the string into tokens and add all camel-case splitting // split the string into tokens and add all camel-case splitting
final String[] u = CommonPattern.SPACE.split(t); final String[] u = CommonPattern.SPACE.split(t);
final Map<String, Object> token = new LinkedHashMap<String, Object>(); final Set<String> token = new LinkedHashSet<String>();
for (final String r: u) { for (final String r: u) {
token.putAll(parseCamelCase(r)); token.addAll(parseCamelCase(r));
} }
// construct a String again // construct a String again
for (final String v: token.keySet()) if (v.length() > 1) s += " " + v; for (final String v: token) if (v.length() > 1) t += ' ' + v;
return s; return t;
} }
// Character classes that parseCamelCase compares to decide where one token ends and the next begins.
public static enum CharType { low, high, number; } public static enum CharType { low, high, number; }
/**
 * Splits a string into tokens at changes of character type, as classified by charType().
 * A change to CharType.low does not start a new token, so an upper-case or number run
 * stays attached to the lower-case characters that directly follow it.
 * The left (old) column of this diff collects the tokens as keys of a LinkedHashMap,
 * the right (new) column as elements of a LinkedHashSet; both keep insertion order.
 *
 * @param s the string to split
 * @return the collected tokens in order of first appearance; empty if s is empty
 */
public static Map<String, Object> parseCamelCase(String s) { public static Set<String> parseCamelCase(String s) {
final Map<String, Object> token = new LinkedHashMap<String, Object>(); final Set<String> token = new LinkedHashSet<String>();
// an empty input yields an empty result
if (s.isEmpty()) return token; if (s.isEmpty()) return token;
int p = 0; int p = 0;
CharType type = charType(s.charAt(0)), nct = type; CharType type = charType(s.charAt(0)), nct = type;
while (p < s.length()) { while (p < s.length()) {
// search for the first character whose type differs from the current run // search for the first character whose type differs from the current run
while (p < s.length() && (nct = charType(s.charAt(p))) == type) p++; while (p < s.length() && (nct = charType(s.charAt(p))) == type) p++;
// the rest of the string is a single run: it is the last token
if (p >= s.length()) { token.put(s, new Object()); break; } if (p >= s.length()) { token.add(s); break; }
// a switch to lower-case does not cut here; the scan continues as a lower-case run within the same token
if (nct == CharType.low) { if (nct == CharType.low) {
type = CharType.low; type = CharType.low;
p++; continue; p++; continue;
} }
// the char type has changed: cut off the token before p and restart on the remainder // the char type has changed: cut off the token before p and restart on the remainder
token.put(s.substring(0, p), new Object()); token.add(s.substring(0, p));
s = s.substring(p); s = s.substring(p);
p = 0; p = 0;
type = nct; type = nct;
} }
// add whatever remains; if the loop already added it before breaking, this repeats the same key/element
token.put(s, new Object()); token.add(s);
return token; return token;
} }

@ -297,6 +297,14 @@ public class HeaderFramework extends TreeMap<String, String> implements Map<Stri
return s; return s;
} }
} }
/**
 * Parses a date string with the shared {@code FORMAT_GSAFS} formatter.
 *
 * @param datestring the text to parse; may be {@code null}
 * @return the parsed date, or {@code null} if {@code datestring} is {@code null}
 *         or cannot be parsed
 */
public static final Date parseGSAFS(final String datestring) {
    // Keep the "null on failure" contract for a missing input as well; handing null to the
    // formatter would presumably raise a NullPointerException instead.
    if (datestring == null) return null;
    try {
        // NOTE(review): FORMAT_GSAFS is declared outside this view. If it is a java.text.DateFormat,
        // it is not safe for concurrent use - confirm whether this call needs synchronization.
        return FORMAT_GSAFS.parse(datestring);
    } catch (ParseException e) {
        // unparsable input is reported to the caller as null rather than as an exception
        return null;
    }
}
/** Initialization of static formats */ /** Initialization of static formats */
static { static {

@ -443,6 +443,9 @@ public final class Switchboard extends serverSwitch {
}) { }) {
SchemaConfiguration.Entry entry = solrCollectionConfigurationWork.get(field.name()); entry.setEnable(true); solrCollectionConfigurationWork.put(field.name(), entry); SchemaConfiguration.Entry entry = solrCollectionConfigurationWork.get(field.name()); entry.setEnable(true); solrCollectionConfigurationWork.put(field.name(), entry);
} }
// activate some fields that are necessary here
solrCollectionConfigurationWork.get(CollectionSchema.images_urlstub_sxt.getSolrFieldName()).setEnable(true);
solrCollectionConfigurationWork.commit(); solrCollectionConfigurationWork.commit();
} catch (IOException e) {Log.logException(e);} } catch (IOException e) {Log.logException(e);}

@ -35,6 +35,7 @@ import java.util.Date;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet; import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet; import java.util.LinkedHashSet;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
@ -65,6 +66,7 @@ import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.retrieval.Response; import net.yacy.crawler.retrieval.Response;
import net.yacy.document.Condenser; import net.yacy.document.Condenser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.SentenceReader;
import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry; import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.citation.CitationReference; import net.yacy.kelondro.data.citation.CitationReference;
@ -590,6 +592,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
final String[] imgalts = new String[imagesc.size()]; final String[] imgalts = new String[imagesc.size()];
int withalt = 0; int withalt = 0;
int i = 0; int i = 0;
LinkedHashSet<String> images_text_map = new LinkedHashSet<String>();
for (final ImageEntry ie: imagesc) { for (final ImageEntry ie: imagesc) {
final MultiProtocolURI uri = ie.url(); final MultiProtocolURI uri = ie.url();
inboundLinks.remove(uri); inboundLinks.remove(uri);
@ -601,9 +604,16 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
imgprots.add(protocol); imgprots.add(protocol);
imgstubs[i] = uri.toString().substring(protocol.length() + 3); imgstubs[i] = uri.toString().substring(protocol.length() + 3);
imgalts[i] = ie.alt(); imgalts[i] = ie.alt();
if (ie.alt() != null && ie.alt().length() > 0) withalt++; for (String it: uri.toTokens().split(" ")) images_text_map.add(it);
if (ie.alt() != null && ie.alt().length() > 0) {
SentenceReader sr = new SentenceReader(ie.alt());
while (sr.hasNext()) images_text_map.add(sr.next().toString());
withalt++;
}
i++; i++;
} }
StringBuilder images_text = new StringBuilder(images_text_map.size() * 6 + 1);
for (String s: images_text_map) images_text.append(s.trim()).append(' ');
if (allAttr || contains(CollectionSchema.imagescount_i)) add(doc, CollectionSchema.imagescount_i, imagesc.size()); if (allAttr || contains(CollectionSchema.imagescount_i)) add(doc, CollectionSchema.imagescount_i, imagesc.size());
if (allAttr || contains(CollectionSchema.images_protocol_sxt)) add(doc, CollectionSchema.images_protocol_sxt, protocolList2indexedList(imgprots)); if (allAttr || contains(CollectionSchema.images_protocol_sxt)) add(doc, CollectionSchema.images_protocol_sxt, protocolList2indexedList(imgprots));
if (allAttr || contains(CollectionSchema.images_urlstub_sxt)) add(doc, CollectionSchema.images_urlstub_sxt, imgstubs); if (allAttr || contains(CollectionSchema.images_urlstub_sxt)) add(doc, CollectionSchema.images_urlstub_sxt, imgstubs);
@ -612,6 +622,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if (allAttr || contains(CollectionSchema.images_width_val)) add(doc, CollectionSchema.images_width_val, imgwidths); if (allAttr || contains(CollectionSchema.images_width_val)) add(doc, CollectionSchema.images_width_val, imgwidths);
if (allAttr || contains(CollectionSchema.images_pixel_val)) add(doc, CollectionSchema.images_pixel_val, imgpixels); if (allAttr || contains(CollectionSchema.images_pixel_val)) add(doc, CollectionSchema.images_pixel_val, imgpixels);
if (allAttr || contains(CollectionSchema.images_withalt_i)) add(doc, CollectionSchema.images_withalt_i, withalt); if (allAttr || contains(CollectionSchema.images_withalt_i)) add(doc, CollectionSchema.images_withalt_i, withalt);
if (allAttr || contains(CollectionSchema.images_text_t)) add(doc, CollectionSchema.images_text_t, images_text.toString().trim());
// style sheets // style sheets
if (allAttr || contains(CollectionSchema.css_tag_sxt)) { if (allAttr || contains(CollectionSchema.css_tag_sxt)) {

Loading…
Cancel
Save