Now writing image alt texts and (camel-case-)parsed URLs into a text

search field for better image retrieval
pull/1/head
Michael Peter Christen 12 years ago
parent c36720d45f
commit f9d859f5dc

@ -36,6 +36,7 @@ import java.io.Serializable;
import java.net.InetAddress;
import java.net.MalformedURLException;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
@ -832,18 +833,16 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
* resulting words are not ordered by appearance, but all
* @return
*/
private static String toTokens(String s) {
// unescape string
String t = s;
private static String toTokens(final String s) {
// replace every character that is not an ASCII letter or digit with a space
final StringBuilder sb = new StringBuilder(t.length());
final StringBuilder sb = new StringBuilder(s.length());
char c;
for (int i = 0; i < t.length(); i++) {
c = t.charAt(i);
for (int i = 0; i < s.length(); i++) {
c = s.charAt(i);
if ((c >= '0' && c <='9') || (c >= 'a' && c <='z') || (c >= 'A' && c <='Z')) sb.append(c); else sb.append(' ');
}
t = sb.toString();
String t = sb.toString();
// remove all double-spaces
int p;
@ -851,39 +850,39 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
// split the string into tokens and add all camel-case splitting
final String[] u = CommonPattern.SPACE.split(t);
final Map<String, Object> token = new LinkedHashMap<String, Object>();
final Set<String> token = new LinkedHashSet<String>();
for (final String r: u) {
token.putAll(parseCamelCase(r));
token.addAll(parseCamelCase(r));
}
// construct a String again
for (final String v: token.keySet()) if (v.length() > 1) s += " " + v;
return s;
for (final String v: token) if (v.length() > 1) t += ' ' + v;
return t;
}
/**
 * Character classes that parseCamelCase compares to detect where one token ends and the next begins.
 * NOTE(review): presumably low = lower-case letter, high = upper-case letter, number = digit;
 * the charType() mapping is defined outside this view - confirm.
 */
public static enum CharType { low, high, number; }
public static Map<String, Object> parseCamelCase(String s) {
final Map<String, Object> token = new LinkedHashMap<String, Object>();
public static Set<String> parseCamelCase(String s) {
final Set<String> token = new LinkedHashSet<String>();
if (s.isEmpty()) return token;
int p = 0;
CharType type = charType(s.charAt(0)), nct = type;
while (p < s.length()) {
// advance to the first character whose type differs from the current type (e.g. an upper-case letter after lower-case ones)
while (p < s.length() && (nct = charType(s.charAt(p))) == type) p++;
if (p >= s.length()) { token.put(s, new Object()); break; }
if (p >= s.length()) { token.add(s); break; }
if (nct == CharType.low) {
type = CharType.low;
p++; continue;
}
// the char type has changed
token.put(s.substring(0, p), new Object());
token.add(s.substring(0, p));
s = s.substring(p);
p = 0;
type = nct;
}
token.put(s, new Object());
token.add(s);
return token;
}

@ -298,6 +298,14 @@ public class HeaderFramework extends TreeMap<String, String> implements Map<Stri
}
}
/**
 * Parses a date string with the shared {@code FORMAT_GSAFS} format.
 *
 * @param datestring the text to parse; may be {@code null}
 * @return the parsed date, or {@code null} if {@code datestring} is {@code null}
 *         or cannot be parsed by {@code FORMAT_GSAFS}
 */
public static final Date parseGSAFS(final String datestring) {
    // a missing value is reported the same way as an unparseable one
    if (datestring == null) return null;
    try {
        // NOTE(review): FORMAT_GSAFS is declared outside this view; if it is a
        // java.text.SimpleDateFormat it is not thread-safe and concurrent callers
        // need external synchronization - confirm against the declaration.
        return FORMAT_GSAFS.parse(datestring);
    } catch (final ParseException e) {
        // deliberately best-effort: callers receive null for unparseable input
        return null;
    }
}
/** Initialization of static formats */
static {
// 2-digit dates are automatically parsed by SimpleDateFormat,

@ -443,6 +443,9 @@ public final class Switchboard extends serverSwitch {
}) {
SchemaConfiguration.Entry entry = solrCollectionConfigurationWork.get(field.name()); entry.setEnable(true); solrCollectionConfigurationWork.put(field.name(), entry);
}
// activate some fields that are necessary here
solrCollectionConfigurationWork.get(CollectionSchema.images_urlstub_sxt.getSolrFieldName()).setEnable(true);
solrCollectionConfigurationWork.commit();
} catch (IOException e) {Log.logException(e);}

@ -35,6 +35,7 @@ import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
@ -65,6 +66,7 @@ import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.retrieval.Response;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.document.SentenceReader;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.citation.CitationReference;
@ -590,6 +592,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
final String[] imgalts = new String[imagesc.size()];
int withalt = 0;
int i = 0;
LinkedHashSet<String> images_text_map = new LinkedHashSet<String>();
for (final ImageEntry ie: imagesc) {
final MultiProtocolURI uri = ie.url();
inboundLinks.remove(uri);
@ -601,9 +604,16 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
imgprots.add(protocol);
imgstubs[i] = uri.toString().substring(protocol.length() + 3);
imgalts[i] = ie.alt();
if (ie.alt() != null && ie.alt().length() > 0) withalt++;
for (String it: uri.toTokens().split(" ")) images_text_map.add(it);
if (ie.alt() != null && ie.alt().length() > 0) {
SentenceReader sr = new SentenceReader(ie.alt());
while (sr.hasNext()) images_text_map.add(sr.next().toString());
withalt++;
}
i++;
}
StringBuilder images_text = new StringBuilder(images_text_map.size() * 6 + 1);
for (String s: images_text_map) images_text.append(s.trim()).append(' ');
if (allAttr || contains(CollectionSchema.imagescount_i)) add(doc, CollectionSchema.imagescount_i, imagesc.size());
if (allAttr || contains(CollectionSchema.images_protocol_sxt)) add(doc, CollectionSchema.images_protocol_sxt, protocolList2indexedList(imgprots));
if (allAttr || contains(CollectionSchema.images_urlstub_sxt)) add(doc, CollectionSchema.images_urlstub_sxt, imgstubs);
@ -612,6 +622,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if (allAttr || contains(CollectionSchema.images_width_val)) add(doc, CollectionSchema.images_width_val, imgwidths);
if (allAttr || contains(CollectionSchema.images_pixel_val)) add(doc, CollectionSchema.images_pixel_val, imgpixels);
if (allAttr || contains(CollectionSchema.images_withalt_i)) add(doc, CollectionSchema.images_withalt_i, withalt);
if (allAttr || contains(CollectionSchema.images_text_t)) add(doc, CollectionSchema.images_text_t, images_text.toString().trim());
// style sheets
if (allAttr || contains(CollectionSchema.css_tag_sxt)) {

Loading…
Cancel
Save