@ -47,6 +47,7 @@
import java.io.IOException ;
import java.io.InputStream ;
import java.io.UnsupportedEncodingException ;
import java.net.MalformedURLException ;
import java.net.URLDecoder ;
import java.net.URLEncoder ;
import java.util.Enumeration ;
@ -55,6 +56,7 @@ import de.anomic.data.wikiCode;
import de.anomic.http.httpHeader ;
import de.anomic.http.httpc ;
import de.anomic.index.indexURLEntry ;
import de.anomic.net.URL ;
import de.anomic.plasma.plasmaHTCache ;
import de.anomic.plasma.plasmaParserDocument ;
import de.anomic.plasma.plasmaSwitchboard ;
@ -87,24 +89,21 @@ public class ViewFile {
serverObjects prop = new serverObjects ( ) ;
plasmaSwitchboard sb = ( plasmaSwitchboard ) env ;
if ( post ! = null & & post . containsKey ( "words" ) )
try {
prop . put ( "error_words" , URLEncoder . encode ( ( String ) post . get ( "words" ) , "UTF-8" ) ) ;
} catch ( UnsupportedEncodingException e1 ) {
// ignore this; it should not occur because UTF-8 is always supported
}
// getting the url hash from which the content should be loaded
String urlHash = post . get ( "urlHash" , "" ) ;
if ( urlHash . equals ( "" ) ) {
prop . put ( "error" , 1 ) ;
prop . put ( "viewMode" , VIEW_MODE_NO_TEXT ) ;
return prop ;
}
String viewMode = post . get ( "viewMode" , "sentences" ) ;
if ( post ! = null & & post . containsKey ( "words" ) ) try {
prop . put ( "error_words" , URLEncoder . encode ( ( String ) post . get ( "words" ) , "UTF-8" ) ) ;
} catch ( UnsupportedEncodingException e1 ) {
// ignore this; it should not occur because UTF-8 is always supported
}
String viewMode = post . get ( "viewMode" , "sentences" ) ;
URL url = null ;
String descr = "" ;
int wordCount = 0 ;
int size = 0 ;
// getting the url hash from which the content should be loaded
String urlHash = post . get ( "urlHash" , "" ) ;
if ( urlHash . length ( ) > 0 ) {
// getting the urlEntry that belongs to the url hash
indexURLEntry urlEntry = null ;
urlEntry = sb . urlPool . loadedURL . load ( urlHash , null ) ;
@ -113,196 +112,238 @@ public class ViewFile {
prop . put ( "viewMode" , VIEW_MODE_NO_TEXT ) ;
return prop ;
}
// getting the url that belongs to the entry
// getting the url that belongs to the entry
indexURLEntry . Components comp = urlEntry . comp ( ) ;
if ( ( comp = = null ) | | ( comp . url ( ) = = null ) ) {
prop . put ( "error" , 3 ) ;
prop . put ( "viewMode" , VIEW_MODE_NO_TEXT ) ;
prop . put ( "error" , 3 ) ;
prop . put ( "viewMode" , VIEW_MODE_NO_TEXT ) ;
return prop ;
}
url = comp . url ( ) ;
descr = comp . descr ( ) ;
urlEntry . wordCount ( ) ;
size = urlEntry . size ( ) ;
}
// alternatively, get the url simply from a url String
// this can be used as a simple tool to test the text parser
String urlString = post . get ( "url" , "" ) ;
if ( urlString . length ( ) > 0 ) try {
url = new URL ( urlString ) ;
} catch ( MalformedURLException e ) { }
if ( url = = null ) {
prop . put ( "error" , 1 ) ;
prop . put ( "viewMode" , VIEW_MODE_NO_TEXT ) ;
return prop ;
}
// loading the resource content as byte array
InputStream resource = null ;
long resourceLength = - 1 ;
IResourceInfo resInfo = null ;
String resMime = null ;
try {
// trying to load the resource body
resource = sb . cacheManager . getResourceContentStream ( url ) ;
resourceLength = sb . cacheManager . getResourceContentLength ( url ) ;
// if the resource body was not cached we try to load it from web
if ( resource = = null ) {
plasmaHTCache . Entry entry = null ;
try {
entry = sb . snippetCache . loadResourceFromWeb ( url , 5000 , false ) ;
} catch ( plasmaCrawlerException e ) {
prop . put ( "error" , 4 ) ;
prop . put ( "error_errorText" , e . getMessage ( ) ) ;
prop . put ( "viewMode" , VIEW_MODE_NO_TEXT ) ;
return prop ;
}
// loading the resource content as byte array
InputStream resource = null ;
long resourceLength = - 1 ;
IResourceInfo resInfo = null ;
String resMime = null ;
try {
// trying to load the resource body
resource = sb . cacheManager . getResourceContentStream ( comp . url ( ) ) ;
resourceLength = sb . cacheManager . getResourceContentLength ( comp . url ( ) ) ;
if ( entry ! = null ) {
resInfo = entry . getDocumentInfo ( ) ;
resource = sb . cacheManager . getResourceContentStream ( url ) ;
resourceLength = sb . cacheManager . getResourceContentLength ( url ) ;
}
// if the resource body was not cached we try to load it from web
if ( resource = = null ) {
plasmaHTCache . Entry entry = null ;
try {
entry = sb . snippetCache . loadResourceFromWeb ( comp . url ( ) , 5000 , false ) ;
} catch ( plasmaCrawlerException e ) {
prop . put ( "error" , 4 ) ;
prop . put ( "error_errorText" , e . getMessage ( ) ) ;
prop . put ( "viewMode" , VIEW_MODE_NO_TEXT ) ;
return prop ;
}
prop . put ( "error" , 4 ) ;
prop . put ( "error_errorText" , "No resource available" ) ;
prop . put ( "viewMode" , VIEW_MODE_NO_TEXT ) ;
return prop ;
}
}
if ( entry ! = null ) {
resInfo = entry . getDocumentInfo ( ) ;
resource = sb . cacheManager . getResourceContentStream ( comp . url ( ) ) ;
resourceLength = sb . cacheManager . getResourceContentLength ( comp . url ( ) ) ;
}
// try to load resource metadata
if ( resInfo = = null ) {
if ( resource = = null ) {
prop . put ( "error" , 4 ) ;
prop . put ( "error_errorText" , "No resource available" ) ;
prop . put ( "viewMode" , VIEW_MODE_NO_TEXT ) ;
return prop ;
}
// try to load the metadata from cache
try {
resInfo = sb . cacheManager . loadResourceInfo ( url ) ;
} catch ( Exception e ) {
/* ignore this */
}
// try to load resource metadata
// if the metadata was not cached, try to load it from the web
if ( resInfo = = null ) {
String protocol = url . getProtocol ( ) ;
if ( ! ( ( protocol . equals ( "http" ) | | protocol . equals ( "https" ) ) ) ) {
prop . put ( "error" , 6 ) ;
prop . put ( "viewMode" , VIEW_MODE_NO_TEXT ) ;
return prop ;
}
// try to load the metadata from cache
httpHeader responseHeader = httpc . whead ( url , url . getHost ( ) , 5000 , null , null , sb . remoteProxyConfig ) ;
if ( responseHeader = = null ) {
prop . put ( "error" , 4 ) ;
prop . put ( "error_errorText" , "Unable to load resource metadata." ) ;
prop . put ( "viewMode" , VIEW_MODE_NO_TEXT ) ;
return prop ;
}
try {
resInfo = sb . cacheManager . loadResourceInfo ( comp . url ( ) ) ;
} catch ( Exception e ) { /* ignore this */ }
// if the metadata was not cached, try to load it from the web
if ( resInfo = = null ) {
String protocol = comp . url ( ) . getProtocol ( ) ;
if ( ! ( ( protocol . equals ( "http" ) | | protocol . equals ( "https" ) ) ) ) {
prop . put ( "error" , 6 ) ;
prop . put ( "viewMode" , VIEW_MODE_NO_TEXT ) ;
return prop ;
}
httpHeader responseHeader = httpc . whead ( comp . url ( ) , comp . url ( ) . getHost ( ) , 5000 , null , null , sb . remoteProxyConfig ) ;
if ( responseHeader = = null ) {
prop . put ( "error" , 4 ) ;
prop . put ( "error_errorText" , "Unable to load resource metadata." ) ;
prop . put ( "viewMode" , VIEW_MODE_NO_TEXT ) ;
return prop ;
}
try {
resInfo = sb . cacheManager . getResourceInfoFactory ( ) . buildResourceInfoObj ( comp . url ( ) , responseHeader ) ;
} catch ( Exception e ) {
prop . put ( "error" , 4 ) ;
prop . put ( "error_errorText" , e . getMessage ( ) ) ;
prop . put ( "viewMode" , VIEW_MODE_NO_TEXT ) ;
return prop ;
}
resMime = responseHeader . mime ( ) ;
resInfo = sb . cacheManager . getResourceInfoFactory ( ) . buildResourceInfoObj ( url , responseHeader ) ;
} catch ( Exception e ) {
prop . put ( "error" , 4 ) ;
prop . put ( "error_errorText" , e . getMessage ( ) ) ;
prop . put ( "viewMode" , VIEW_MODE_NO_TEXT ) ;
return prop ;
}
} else {
resMime = resInfo . getMimeType ( ) ;
resMime = responseHeader . mime ( ) ;
}
} catch ( IOException e ) {
if ( resource ! = null ) try { resource . close ( ) ; } catch ( Exception ex ) { /* ignore this */ }
prop . put ( "error" , 4 ) ;
prop . put ( "error_errorText" , e . getMessage ( ) ) ;
prop . put ( "viewMode" , VIEW_MODE_NO_TEXT ) ;
return prop ;
}
if ( viewMode . equals ( "plain" ) ) {
// TODO: how to handle very large files here ?
String content ;
} else {
resMime = resInfo . getMimeType ( ) ;
}
} catch ( IOException e ) {
if ( resource ! = null )
try {
content = new String ( serverFileUtils . read ( resource ) , "UTF-8" ) ;
} catch ( Exception e ) {
prop . put ( "error" , 4 ) ;
prop . put ( "error_errorText" , e . getMessage ( ) ) ;
prop . put ( "viewMode" , VIEW_MODE_NO_TEXT ) ;
return prop ;
} finally {
if ( resource ! = null ) try { resource . close ( ) ; } catch ( Exception e ) { /* ignore this */ }
resource . close ( ) ;
} catch ( Exception ex ) {
/* ignore this */
}
content = content . replaceAll ( "<" , "<" )
. replaceAll ( ">" , ">" )
. replaceAll ( "\"" , """ )
. replaceAll ( "\n" , "<br>" )
. replaceAll ( "\t" , " " ) ;
prop . put ( "error" , 0 ) ;
prop . put ( "viewMode" , VIEW_MODE_AS_PLAIN_TEXT ) ;
prop . put ( "viewMode_plainText" , content ) ;
} else if ( viewMode . equals ( "iframe" ) ) {
prop. put ( "viewMode" , VIEW_MODE_AS_IFRAME ) ;
prop . put ( "viewMode_url" , comp . url ( ) . toNormalform ( ) ) ;
} else if ( viewMode . equals ( "parsed" ) | | viewMode . equals ( "sentences" ) ) {
// parsing the resource content
p lasmaParserDocument document = null ;
try {
document = sb . snippetCache . parseDocument ( comp . url ( ) , resourceLength , resource , resInfo ) ;
if ( document = = null ) {
prop . put ( "error" , 5 ) ;
prop. put ( "error_errorText" , "Unknown error" ) ;
prop . put ( "viewMode" , VIEW_MODE_NO_TEXT ) ;
return prop ;
prop . put ( "error" , 4 ) ;
prop . put ( "error_errorText" , e . getMessage ( ) ) ;
prop . put ( "viewMode" , VIEW_MODE_NO_TEXT ) ;
return prop ;
}
if ( viewMode . equals ( "plain" ) ) {
// TODO: how to handle very large files here ?
String content ;
try {
content = new String ( serverFileUtils . read ( resource ) , "UTF-8" ) ;
} catch ( Exception e ) {
prop . put ( "error" , 4 ) ;
prop . put ( "error_errorText" , e . getMessage ( ) ) ;
p rop. put ( "viewMode" , VIEW_MODE_NO_TEXT ) ;
return prop ;
} finally {
if ( resource ! = null )
try {
resource. close ( ) ;
} catch ( Exception e ) {
/* ignore this */
}
} catch ( ParserException e ) {
prop . put ( "error" , 5 ) ;
prop . put ( "error_errorText" , e . getMessage ( ) ) ;
prop . put ( "viewMode" , VIEW_MODE_NO_TEXT ) ;
return prop ;
} finally {
if ( resource ! = null ) try { resource . close ( ) ; } catch ( Exception e ) { /* ignore this */ }
}
content = content . replaceAll ( "<" , "<" ) . replaceAll ( ">" , ">" )
. replaceAll ( "\"" , """ ) . replaceAll ( "\n" , "<br>" )
. replaceAll ( "\t" , " " ) ;
prop . put ( "error" , 0 ) ;
prop . put ( "viewMode" , VIEW_MODE_AS_PLAIN_TEXT ) ;
prop . put ( "viewMode_plainText" , content ) ;
} else if ( viewMode . equals ( "iframe" ) ) {
prop . put ( "viewMode" , VIEW_MODE_AS_IFRAME ) ;
prop . put ( "viewMode_url" , url . toNormalform ( ) ) ;
} else if ( viewMode . equals ( "parsed" ) | | viewMode . equals ( "sentences" ) ) {
// parsing the resource content
plasmaParserDocument document = null ;
try {
document = sb . snippetCache . parseDocument ( url , resourceLength , resource , resInfo ) ;
if ( document = = null ) {
prop . put ( "error" , 5 ) ;
prop . put ( "error_errorText" , "Unknown error" ) ;
prop . put ( "viewMode" , VIEW_MODE_NO_TEXT ) ;
return prop ;
}
resMime = document . getMimeType ( ) ;
if ( viewMode . equals ( "parsed" ) ) {
String content = new String ( document . getTextBytes ( ) ) ;
content = wikiCode . replaceHTML ( content ) ; //added by Marc Nause
content = content . replaceAll ( "\n" , "<br>" )
. replaceAll ( "\t" , " " ) ;
prop . put ( "viewMode" , VIEW_MODE_AS_PARSED_TEXT ) ;
prop . put ( "viewMode_parsedText" , content ) ;
} else {
prop . put ( "viewMode" , VIEW_MODE_AS_PARSED_SENTENCES ) ;
final Enumeration sentences = document . getSentences ( null ) ; // FIXME: apply correct charset
boolean dark = true ;
int i = 0 ;
if ( sentences ! = null ) while ( sentences . hasMoreElements ( ) ) {
String currentSentence = wikiCode . replaceHTML ( ( String ) sentences . nextElement ( ) ) ;
} catch ( ParserException e ) {
prop . put ( "error" , 5 ) ;
prop . put ( "error_errorText" , e . getMessage ( ) ) ;
prop . put ( "viewMode" , VIEW_MODE_NO_TEXT ) ;
return prop ;
} finally {
if ( resource ! = null )
try {
resource . close ( ) ;
} catch ( Exception e ) {
/* ignore this */
}
}
resMime = document . getMimeType ( ) ;
if ( viewMode . equals ( "parsed" ) ) {
String content = new String ( document . getTextBytes ( ) ) ;
content = wikiCode . replaceHTML ( content ) ; // added by Marc Nause
content = content . replaceAll ( "\n" , "<br>" ) . replaceAll ( "\t" , " " ) ;
prop . put ( "viewMode" , VIEW_MODE_AS_PARSED_TEXT ) ;
prop . put ( "viewMode_parsedText" , content ) ;
} else {
prop . put ( "viewMode" , VIEW_MODE_AS_PARSED_SENTENCES ) ;
final Enumeration sentences = document . getSentences ( null ) ; // FIXME: apply correct charset
boolean dark = true ;
int i = 0 ;
if ( sentences ! = null )
while ( sentences . hasMoreElements ( ) ) {
String currentSentence = wikiCode
. replaceHTML ( ( String ) sentences . nextElement ( ) ) ;
// Search word highlighting
String words = post . get ( "words" , null ) ;
String words = post . get ( "words" , null ) ;
if ( words ! = null ) {
try {
words = URLDecoder . decode ( words , "UTF-8" ) ;
} catch ( UnsupportedEncodingException e ) { }
String [ ] wordArray = words . substring ( 1 , words . length ( ) - 1 ) . split ( "," ) ;
for ( int j = 0 ; j < wordArray . length ; j + + ) {
String currentWord = wordArray [ j ] . trim ( ) ;
currentSentence = currentSentence . replaceAll ( currentWord ,
"<b style=\"color: black; background-color: rgb(" + highlightingColors [ j % 6 ] + ");\">" + currentWord + "</b>" ) ;
words = URLDecoder . decode ( words , "UTF-8" ) ;
} catch ( UnsupportedEncodingException e ) {
}
String [ ] wordArray = words . substring ( 1 ,
words . length ( ) - 1 ) . split ( "," ) ;
for ( int j = 0 ; j < wordArray . length ; j + + ) {
String currentWord = wordArray [ j ] . trim ( ) ;
currentSentence = currentSentence . replaceAll (
currentWord ,
"<b style=\"color: black; background-color: rgb("
+ highlightingColors [ j % 6 ]
+ ");\">" + currentWord
+ "</b>" ) ;
}
}
prop . put ( "viewMode_sentences_" + i + "_nr" , Integer . toString ( i + 1 ) ) ;
prop . put ( "viewMode_sentences_" + i + "_text" , currentSentence ) ;
prop . put ( "viewMode_sentences_" + i + "_dark" , ( ( dark ) ? 1 : 0 ) ) ; dark = ! dark ;
prop . put ( "viewMode_sentences_" + i + "_nr" , Integer . toString ( i + 1 ) ) ;
prop . put ( "viewMode_sentences_" + i + "_text" , currentSentence ) ;
prop . put ( "viewMode_sentences_" + i + "_dark" , ( ( dark ) ? 1 : 0 ) ) ;
dark = ! dark ;
i + + ;
}
prop . put ( "viewMode_sentences" , i ) ;
prop . put ( "viewMode_sentences" , i ) ;
}
if ( document ! = null ) document . close ( ) ;
}
prop . put ( "error" , 0 ) ;
prop . put ( "error_url" , comp . url ( ) . toNormalform ( ) ) ;
prop . put ( "error_hash" , urlHash ) ;
prop . put ( "error_wordCount" , Integer . toString ( urlEntry . wordCount ( ) ) ) ;
prop . put ( "error_desc" , comp . descr ( ) ) ;
prop . put ( "error_size" , urlEntry . size ( ) ) ;
prop . put ( "error_mimeType" , resMime ) ;
return prop ;
if ( document ! = null ) document . close ( ) ;
}
prop . put ( "error" , 0 ) ;
prop . put ( "error_url" , url . toNormalform ( ) ) ;
prop . put ( "error_hash" , urlHash ) ;
prop . put ( "error_wordCount" , Integer . toString ( wordCount ) ) ;
prop . put ( "error_desc" , descr ) ;
prop . put ( "error_size" , size ) ;
prop . put ( "error_mimeType" , resMime ) ;
return prop ;
}
}