// MediaSnippet.java
// -----------------
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2005
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.search.snippet ;
import java.io.IOException ;
import java.util.ArrayList ;
import java.util.Comparator ;
import java.util.Date ;
import java.util.Iterator ;
import java.util.List ;
import java.util.Map ;
import java.util.SortedMap ;
import java.util.SortedSet ;
import java.util.TreeSet ;
import net.yacy.cora.document.ASCII ;
import net.yacy.cora.document.analysis.Classification ;
import net.yacy.cora.document.analysis.Classification.ContentDomain ;
import net.yacy.cora.federate.yacy.CacheStrategy ;
import net.yacy.cora.order.Base64Order ;
import net.yacy.cora.storage.HandleSet ;
import net.yacy.cora.util.NumberTools ;
import net.yacy.cora.util.SpaceExceededException ;
import net.yacy.crawler.data.ZURL.FailCategory ;
import net.yacy.crawler.retrieval.Request ;
import net.yacy.document.Document ;
import net.yacy.document.Parser ;
import net.yacy.document.WordTokenizer ;
import net.yacy.document.parser.html.ImageEntry ;
import net.yacy.kelondro.data.meta.DigestURI ;
import net.yacy.kelondro.index.RowHandleSet ;
import net.yacy.kelondro.logging.Log ;
import net.yacy.kelondro.util.ByteArray ;
import net.yacy.repository.Blacklist.BlacklistType ;
import net.yacy.search.Switchboard ;
public class MediaSnippet implements Comparable < MediaSnippet > , Comparator < MediaSnippet > {
public ContentDomain type ;
public DigestURI href , source ;
public String name , attr , mime ;
public long ranking ;
public int width , height ;
public long fileSize ;
public MediaSnippet ( final ContentDomain type , final DigestURI href , final String mime , final String name , final long fileSize , final String attr , final long ranking , final DigestURI source ) {
this . type = type ;
this . href = href ;
this . mime = mime ;
this . fileSize = fileSize ;
this . source = source ; // the web page where the media resource appeared
this . name = name ;
this . attr = attr ;
this . width = - 1 ;
this . height = - 1 ;
int p = 0 ;
if ( attr ! = null & & ( p = attr . indexOf ( " x " , 0 ) ) > 0 ) {
this . width = NumberTools . parseIntDecSubstring ( attr , 0 , p ) ;
this . height = NumberTools . parseIntDecSubstring ( attr , p + 3 ) ;
}
this . ranking = ranking ; // the smaller the better! small values should be shown first
if ( ( this . name = = null ) | | ( this . name . isEmpty ( ) ) ) this . name = "_" ;
if ( ( this . attr = = null ) | | ( this . attr . isEmpty ( ) ) ) this . attr = "_" ;
}
public MediaSnippet ( final ContentDomain type , final DigestURI href , final String mime , final String name , final long fileSize , final int width , final int height , final long ranking , final DigestURI source ) {
this . type = type ;
this . href = href ;
this . mime = mime ;
this . fileSize = fileSize ;
this . source = source ; // the web page where the media resource appeared
this . name = name ;
this . attr = width + " x " + height ;
this . width = width ;
this . height = height ;
this . ranking = ranking ; // the smaller the better! small values should be shown first
if ( ( this . name = = null ) | | ( this . name . isEmpty ( ) ) ) this . name = "_" ;
if ( ( this . attr = = null ) | | ( this . attr . isEmpty ( ) ) ) this . attr = "_" ;
}
private int hashCache = Integer . MIN_VALUE ; // if this is used in a compare method many times, a cache is useful
@Override
public int hashCode ( ) {
if ( this . hashCache = = Integer . MIN_VALUE ) {
this . hashCache = ByteArray . hashCode ( this . href . hash ( ) ) ;
}
return this . hashCache ;
}
@Override
public String toString ( ) {
return ASCII . String ( this . href . hash ( ) ) ;
}
@Override
public boolean equals ( final Object obj ) {
if ( this = = obj ) return true ;
if ( obj = = null ) return false ;
if ( ! ( obj instanceof MediaSnippet ) ) return false ;
final MediaSnippet other = ( MediaSnippet ) obj ;
return Base64Order . enhancedCoder . equal ( this . href . hash ( ) , other . href . hash ( ) ) ;
}
@Override
public int compareTo ( final MediaSnippet o ) {
return Base64Order . enhancedCoder . compare ( this . href . hash ( ) , o . href . hash ( ) ) ;
}
@Override
public int compare ( final MediaSnippet o1 , final MediaSnippet o2 ) {
return o1 . compareTo ( o2 ) ;
}
public static List < MediaSnippet > retrieveMediaSnippets ( final DigestURI url , final HandleSet queryhashes , final Classification . ContentDomain mediatype , final CacheStrategy cacheStrategy , final boolean reindexing ) {
if ( queryhashes . isEmpty ( ) ) {
Log . logFine ( "snippet fetch" , "no query hashes given for url " + url ) ;
return new ArrayList < MediaSnippet > ( ) ;
}
Document document ;
try {
document = Document . mergeDocuments ( url , null , Switchboard . getSwitchboard ( ) . loader . loadDocuments ( Switchboard . getSwitchboard ( ) . loader . request ( url , false , reindexing ) , cacheStrategy , Integer . MAX_VALUE , BlacklistType . SEARCH , TextSnippet . snippetMinLoadDelay ) ) ;
} catch ( final IOException e ) {
Log . logFine ( "snippet fetch" , "load error: " + e . getMessage ( ) ) ;
return new ArrayList < MediaSnippet > ( ) ;
} catch ( final Parser . Failure e ) {
Log . logFine ( "snippet fetch" , "parser error: " + e . getMessage ( ) ) ;
return new ArrayList < MediaSnippet > ( ) ;
}
final ArrayList < MediaSnippet > a = new ArrayList < MediaSnippet > ( ) ;
if ( document ! = null ) {
if ( ( mediatype = = ContentDomain . ALL ) | | ( mediatype = = ContentDomain . AUDIO ) ) a . addAll ( computeMediaSnippets ( url , document , queryhashes , ContentDomain . AUDIO ) ) ;
if ( ( mediatype = = ContentDomain . ALL ) | | ( mediatype = = ContentDomain . VIDEO ) ) a . addAll ( computeMediaSnippets ( url , document , queryhashes , ContentDomain . VIDEO ) ) ;
if ( ( mediatype = = ContentDomain . ALL ) | | ( mediatype = = ContentDomain . APP ) ) a . addAll ( computeMediaSnippets ( url , document , queryhashes , ContentDomain . APP ) ) ;
if ( ( mediatype = = ContentDomain . ALL ) | | ( mediatype = = ContentDomain . IMAGE ) ) a . addAll ( computeImageSnippets ( url , document , queryhashes ) ) ;
}
return a ;
}
public static List < MediaSnippet > computeMediaSnippets ( final DigestURI source , final Document document , final HandleSet queryhashes , final ContentDomain mediatype ) {
if ( document = = null ) return new ArrayList < MediaSnippet > ( ) ;
Map < DigestURI , String > media = null ;
if ( mediatype = = ContentDomain . AUDIO ) media = document . getAudiolinks ( ) ;
else if ( mediatype = = ContentDomain . VIDEO ) media = document . getVideolinks ( ) ;
else if ( mediatype = = ContentDomain . APP ) media = document . getApplinks ( ) ;
if ( media = = null ) return null ;
final Iterator < Map . Entry < DigestURI , String > > i = media . entrySet ( ) . iterator ( ) ;
Map . Entry < DigestURI , String > entry ;
DigestURI url ;
String desc ;
final List < MediaSnippet > result = new ArrayList < MediaSnippet > ( ) ;
while ( i . hasNext ( ) ) {
entry = i . next ( ) ;
url = entry . getKey ( ) ;
desc = entry . getValue ( ) ;
if ( isUrlBlacklisted ( BlacklistType . SEARCH , url ) ) continue ;
final int ranking = removeAppearanceHashes ( url . toNormalform ( true ) , queryhashes ) . size ( ) +
removeAppearanceHashes ( desc , queryhashes ) . size ( ) ;
if ( ranking < 2 * queryhashes . size ( ) ) {
result . add ( new MediaSnippet ( mediatype , url , Classification . url2mime ( url ) , desc , document . getTextLength ( ) , null , ranking , source ) ) ;
}
}
return result ;
}
public static List < MediaSnippet > computeImageSnippets ( final DigestURI source , final Document document , final HandleSet queryhashes ) {
final SortedSet < ImageEntry > images = new TreeSet < ImageEntry > ( ) ;
images . addAll ( document . getImages ( ) . values ( ) ) ; // iterates images in descending size order!
// a measurement for the size of the images can be retrieved using the htmlFilterImageEntry.hashCode()
final Iterator < ImageEntry > i = images . iterator ( ) ;
ImageEntry ientry ;
DigestURI url ;
String desc ;
final List < MediaSnippet > result = new ArrayList < MediaSnippet > ( ) ;
while ( i . hasNext ( ) ) {
ientry = i . next ( ) ;
url = ientry . url ( ) ;
final String u = url . toString ( ) ;
if ( isUrlBlacklisted ( BlacklistType . SEARCH , url ) ) continue ;
if ( u . indexOf ( ".ico" , 0 ) > = 0 | | u . indexOf ( "favicon" , 0 ) > = 0 ) continue ;
if ( ientry . height ( ) > 0 & & ientry . height ( ) < 32 ) continue ;
if ( ientry . width ( ) > 0 & & ientry . width ( ) < 32 ) continue ;
desc = ientry . alt ( ) ;
final int appcount = queryhashes . size ( ) * 2 -
removeAppearanceHashes ( url . toNormalform ( true ) , queryhashes ) . size ( ) -
removeAppearanceHashes ( desc , queryhashes ) . size ( ) ;
final long ranking = Long . MAX_VALUE - ( ientry . height ( ) + 1 ) * ( ientry . width ( ) + 1 ) * ( appcount + 1 ) ;
result . add ( new MediaSnippet ( ContentDomain . IMAGE , url , Classification . url2mime ( url ) , desc , ientry . fileSize ( ) , ientry . width ( ) , ientry . height ( ) , ranking , source ) ) ;
}
return result ;
}
/ * *
* removed all word hashes that can be computed as tokens from a given sentence from a given hash set
* @param sentence
* @param queryhashes
* @return the given hash set minus the hashes from the tokenization of the given sentence
* /
private static HandleSet removeAppearanceHashes ( final String sentence , final HandleSet queryhashes ) {
// remove all hashes that appear in the sentence
if ( sentence = = null ) return queryhashes ;
final SortedMap < byte [ ] , Integer > hs = WordTokenizer . hashSentence ( sentence , null , 100 ) ;
final Iterator < byte [ ] > j = queryhashes . iterator ( ) ;
byte [ ] hash ;
Integer pos ;
final HandleSet remaininghashes = new RowHandleSet ( queryhashes . keylen ( ) , queryhashes . comparator ( ) , queryhashes . size ( ) ) ;
while ( j . hasNext ( ) ) {
hash = j . next ( ) ;
pos = hs . get ( hash ) ;
if ( pos = = null ) {
try {
remaininghashes . put ( hash ) ;
} catch ( final SpaceExceededException e ) {
Log . logException ( e ) ;
}
}
}
return remaininghashes ;
}
/ * *
* Checks wether given URL is in blacklist for given blacklist type
*
* @param url The URL to check
* @param blacklistType Type of blacklist ( see class Blacklist , BLACKLIST_FOO )
* @return isBlacklisted Wether the given URL is blacklisted
* /
private static boolean isUrlBlacklisted ( final BlacklistType blacklistType , final DigestURI url ) {
// Default is not blacklisted
boolean isBlacklisted = false ;
// check if url is in blacklist
if ( Switchboard . urlBlacklist . isListed ( blacklistType , url . getHost ( ) . toLowerCase ( ) , url . getFile ( ) ) ) {
Switchboard . getSwitchboard ( ) . crawlQueues . errorURL . push ( new Request ( url , null ) , ASCII . getBytes ( Switchboard . getSwitchboard ( ) . peers . mySeed ( ) . hash ) , new Date ( ) , 1 , FailCategory . FINAL_LOAD_CONTEXT , "url in blacklist" , - 1 ) ;
Log . logFine ( "snippet fetch" , "MEDIA-SNIPPET Rejecting URL '" + url . toString ( ) + "'. URL is in blacklist." ) ;
isBlacklisted = true ;
}
// Return result
return isBlacklisted ;
}
}