Optimize internal image query to focus on using content_type to select images

(in favor of url file extension)
pull/23/head
reger 9 years ago
parent 81f53fc83a
commit a58ee49307

@ -330,16 +330,29 @@ public class QueryGoal {
for (final byte[] b: blues) this.include_hashes.remove(b);
}
/**
* Generate the Solr filter queries used to receive valid urls.
*
* Error-urls are filtered out by requiring http status 200.
* On noimages=true, filters are added to exclude links to images
* using the content_type (as well as urls with a common image file
* extension: jpg, png or gif).
*
* @param noimages true if the filters that exclude images should be included
* @return list of Solr filter queries
*/
public List<String> collectionTextFilterQuery(boolean noimages) {
final ArrayList<String> fqs = new ArrayList<>();
// add a filter to prevent results coming from failed urls
fqs.add(CollectionSchema.httpstatus_i.getSolrFieldName() + ":200");
if (noimages) fqs.add("-" + CollectionSchema.url_file_ext_s.getSolrFieldName() + ":(jpg OR png OR gif)");
if (noimages) {
fqs.add("-" + CollectionSchema.content_type.getSolrFieldName() + ":(image/*)");
fqs.add("-" + CollectionSchema.url_file_ext_s.getSolrFieldName() + ":(jpg OR png OR gif)");
}
return fqs;
}
public StringBuilder collectionTextQuery() {
// parse special requests
@ -348,16 +361,27 @@ public class QueryGoal {
// add goal query
return getGoalQuery();
}
/**
* Generate the Solr filter queries used to receive valid image results.
*
* This filters out error-urls and includes urls with mime image/* as well
* as urls with links to images.
* We use only the mime (image/*) to find images, as the parser assigns the
* best mime to indexed documents. This also applies to parsed file systems.
* This ensures that no text urls with an image file extension are returned
* (as some large internet sites like to use such urls).
*
* @return list of Solr filter queries for image urls
*/
public List<String> collectionImageFilterQuery() {
final ArrayList<String> fqs = new ArrayList<>();
// add a filter to prevent results coming from failed urls
fqs.add(CollectionSchema.httpstatus_i.getSolrFieldName() + ":200");
fqs.add(
CollectionSchema.images_urlstub_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM + " OR " +
CollectionSchema.url_file_ext_s.getSolrFieldName() + ":(jpg OR png OR gif) OR " +
CollectionSchema.content_type.getSolrFieldName() + ":(image/*)");
CollectionSchema.content_type.getSolrFieldName() + ":(image/*) OR " +
CollectionSchema.images_urlstub_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM);
return fqs;
}

@ -419,7 +419,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
final DigestURL digestURL = document.dc_source();
boolean allAttr = this.isEmpty();
String url = addURIAttributes(doc, allAttr, digestURL);
if (allAttr || contains(CollectionSchema.content_type)) add(doc, CollectionSchema.content_type, new String[]{document.dc_format()});
add(doc, CollectionSchema.content_type, new String[]{document.dc_format()}); // content_type (mime) is defined as a schema field and we rely on it in some queries like the image query (this makes it mandatory, no need to check)
Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
String host = digestURL.getHost();
@ -2028,9 +2028,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
final SolrInputDocument doc = new SolrInputDocument();
String url = configuration.addURIAttributes(doc, allAttr, this.getDigestURL());
if (allAttr || configuration.contains(CollectionSchema.content_type)) configuration.add(doc, CollectionSchema.content_type, new String[]{Classification.url2mime(this.digestURL)});
// content_type (mime) is defined as a schema field and we rely on it in some queries like the image query (this makes it mandatory, no need to check)
CollectionSchema.content_type.add(doc, new String[]{Classification.url2mime(this.digestURL)});
if (allAttr || configuration.contains(CollectionSchema.load_date_dt)) configuration.add(doc, CollectionSchema.load_date_dt, getFailDate());
if (allAttr || configuration.contains(CollectionSchema.crawldepth_i)) configuration.add(doc, CollectionSchema.crawldepth_i, this.crawldepth);

Loading…
Cancel
Save