Support parsing audio URLs without file extension

Also added a JUnit test for the audio tag parser.
6 years ago · e90405b6f0
parent 42c8a251c8
commit e90405b6f0
10 changed files with 197 additions and 32 deletions
--- a/source/net/yacy/document/parser/audioTagParser.java
+++ b/source/net/yacy/document/parser/audioTagParser.java
@ -63,9 +63,9 @@ import net.yacy.kelondro.util.FileUtils;
 public class audioTagParser extends AbstractParser implements Parser {
 	
 	/**
-	 * Enumeration of internet media types supported by the {@link audioTagParser}.
+	 * Enumeration of audio formats supported by the {@link audioTagParser}.
 	 */
-	public enum SupportedAudioMediaType {
+	public enum SupportedAudioFormat {

 		AIF("audio/aiff", new String[] { "audio/x-aiff" }, new String[] { SupportedFileFormat.AIF.getFilesuffix(),
 				SupportedFileFormat.AIFC.getFilesuffix(), SupportedFileFormat.AIFF.getFilesuffix() }),
@ -118,7 +118,7 @@ public class audioTagParser extends AbstractParser implements Parser {
 		 * @param mediaType the media type, formatted as "type/subtype"
 		 * @param fileExtensions a set of file extensions matching the given media type
 		 */
-		private SupportedAudioMediaType(final String mediaType, final String[] fileExtensions) {
+		private SupportedAudioFormat(final String mediaType, final String[] fileExtensions) {
 			this(mediaType, new String[] {}, fileExtensions);
 		}

@ -127,7 +127,7 @@ public class audioTagParser extends AbstractParser implements Parser {
 		 * @param alternateMediaTypes alternate flavors of the main media type, all formatted as "type/subtype"
 		 * @param fileExtensions a set of file extensions matching the given media type
 		 */
-		private SupportedAudioMediaType(final String mediaType, final String[] alternateMediaTypes, final String[] fileExtensions) {
+		private SupportedAudioFormat(final String mediaType, final String[] alternateMediaTypes, final String[] fileExtensions) {
 			this.mediaType = mediaType.toLowerCase(Locale.ROOT);
 			Set<String> alternates = new HashSet<>();
 			for (final String alternateMediaType : alternateMediaTypes) {
@ -179,7 +179,7 @@ public class audioTagParser extends AbstractParser implements Parser {
 		 */
 		public static Set<String> getAllMediaTypes() {
 			final Set<String> mediaTypes = new HashSet<>();
-			for(final SupportedAudioMediaType mediaType : SupportedAudioMediaType.values()) {
+			for(final SupportedAudioFormat mediaType : SupportedAudioFormat.values()) {
 				mediaTypes.add(mediaType.getMediaType());
 				for(final String mediaTypeString : mediaType.getAlternateMediaTypes()) {
 					mediaTypes.add(mediaTypeString);	
@ -193,15 +193,18 @@ public class audioTagParser extends AbstractParser implements Parser {
 		 */
 		public static Set<String> getAllFileExtensions() {
 			final Set<String> extensions = new HashSet<>();
-			for(final SupportedAudioMediaType mediaType : SupportedAudioMediaType.values()) {
+			for(final SupportedAudioFormat mediaType : SupportedAudioFormat.values()) {
 				extensions.addAll(mediaType.getFileExtensions());
 			}
 			return extensions;
 		}
 	}
 	
-	/** Map from each supported audio file extensions to a single audio media type */
-	private final Map<String, SupportedAudioMediaType> ext2NormalMediaType;
+	/** Map from each supported audio file extension to its audio format */
+	private final Map<String, SupportedAudioFormat> ext2Format;
+	
+	/** Map from each supported audio media type to audio format */
+	private final Map<String, SupportedAudioFormat> mediaType2Format;
 	
 	/** Space character */
 	private static final char SPACE_CHAR = ' ';
@ -217,18 +220,25 @@ public class audioTagParser extends AbstractParser implements Parser {
    public audioTagParser() {
        super("Audio File Meta-Tag Parser");
        
-        final Map<String, SupportedAudioMediaType> normalMap = new HashMap<>();
+        final Map<String, SupportedAudioFormat> ext2Formats = new HashMap<>();
+        
+        final Map<String, SupportedAudioFormat> mediaType2Formats = new HashMap<>();
        
-        for(final SupportedAudioMediaType mediaType : SupportedAudioMediaType.values()) {
+        for(final SupportedAudioFormat mediaType : SupportedAudioFormat.values()) {
        	this.SUPPORTED_MIME_TYPES.add(mediaType.getMediaType());
        	this.SUPPORTED_MIME_TYPES.addAll(mediaType.getAlternateMediaTypes());
        	this.SUPPORTED_EXTENSIONS.addAll(mediaType.getFileExtensions());
        	for(final String fileExtension : mediaType.getFileExtensions()) {
-        		normalMap.put(fileExtension, mediaType);
+        		ext2Formats.put(fileExtension, mediaType);
+        	}
+        	mediaType2Formats.put(mediaType.getMediaType(), mediaType);
+        	for(final String mediaTypeStr : mediaType.getAlternateMediaTypes()) {
+        		mediaType2Formats.put(mediaTypeStr, mediaType);
        	}
        }
        
-        this.ext2NormalMediaType = Collections.unmodifiableMap(normalMap);
+        this.ext2Format = Collections.unmodifiableMap(ext2Formats);
+        this.mediaType2Format = Collections.unmodifiableMap(mediaType2Formats);
    }

    @Override
@ -246,9 +256,34 @@ public class audioTagParser extends AbstractParser implements Parser {
    @Override
    public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset, final VocabularyScraper scraper,
    		final int timezoneOffset, final InputStream source, final int maxLinks, final long maxBytes)
-    		throws UnsupportedOperationException, Failure, InterruptedException {
+    		throws Failure, InterruptedException {
        String filename = location.getFileName();
-        final String fileext = MultiProtocolURL.getFileExtension(filename);
+        String fileExt = MultiProtocolURL.getFileExtension(filename);
+        
+
+        SupportedAudioFormat audioFormat = null;
+        if(fileExt != null) {
+        	audioFormat = this.ext2Format.get(fileExt);
+        }
+        if(audioFormat == null) {
+        	audioFormat =  this.mediaType2Format.get(mimeType);
+        }
+        
+        String normalizedMediaType = mimeType;
+    	if(audioFormat != null) {
+        	/* Normalize to a single Media Type. Advantages : 
+        	 * - index document with the right media type when HTTP response header "Content-Type" is missing or has a wrong value
+        	 * - for easier search by CollectionSchema.content_type in the index
+             */
+    		normalizedMediaType = audioFormat.getMediaType();
+    		
+            if(fileExt == null || fileExt.isEmpty() || !this.ext2Format.containsKey(fileExt)) {
+            	/* Missing or unknown extension (null-safe check) : normalize it to one known by jaudiotagger */
+            	fileExt = audioFormat.getFileExtensions().iterator().next();
+            }
+    	}
+        
+        
        filename = filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename);
   	    
    	File tempFile = null;
@ -260,7 +295,7 @@ public class audioTagParser extends AbstractParser implements Parser {
        		f = AudioFileIO.read(location.getFSFile());
        	} else {
            	// create a temporary file, as jaudiotagger requires a file rather than an input stream 
-        		tempFile = File.createTempFile(filename, "." + fileext);
+        		tempFile = File.createTempFile(filename, "." + fileExt);
        		long bytesCopied = FileUtils.copy(source, tempFile, maxBytes);
        		partiallyParsed = bytesCopied == maxBytes && source.read() != -1;
                f = AudioFileIO.read(tempFile);
@ -316,21 +351,9 @@ public class audioTagParser extends AbstractParser implements Parser {
 				detectedUrls = Collections.emptySet();
 			}
            
-        	/* normalize to a single Media Type. Advantages : 
-        	 * - index document with the right media type when HTTP response header "Content-Type" is missing or has a wrong value
-        	 * - for easier search by CollectionSchema.content_type in the index
-             */
-            String mime = mimeType;
-            if(fileext != null && !fileext.isEmpty() ) {
-            	final SupportedAudioMediaType mediaType = this.ext2NormalMediaType.get(fileext);
-            	if(mediaType != null) {
-            		mime = mediaType.getMediaType();
-            	}
-            }
-
            final Document doc = new Document(
                    location,
-                    mime,
+                    normalizedMediaType,
                    charset,
                    this,
                    lang, // languages
--- a/source/net/yacy/migration.java
+++ b/source/net/yacy/migration.java
@ -441,11 +441,11 @@ public class migration {
 			 * All old audio file extensions and media types are denied : we add newly
 			 * supported ones to these deny lists
 			 */
-			deniedExtensions.addAll(audioTagParser.SupportedAudioMediaType.getAllFileExtensions());
+			deniedExtensions.addAll(audioTagParser.SupportedAudioFormat.getAllFileExtensions());
 			
 			sb.setConfig(SwitchboardConstants.PARSER_EXTENSIONS_DENY, deniedExtensions);
 			
-			deniedMediaTypes.addAll(audioTagParser.SupportedAudioMediaType.getAllMediaTypes());
+			deniedMediaTypes.addAll(audioTagParser.SupportedAudioFormat.getAllMediaTypes());
 			
 			sb.setConfig(SwitchboardConstants.PARSER_MIME_DENY, deniedMediaTypes);
 			
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@ -887,8 +887,8 @@ public final class Switchboard extends serverSwitch {
    	
    	/* audioTagParser is disabled by default as it needs a temporary file (because of the JAudiotagger implementation) for each parsed document */
    	if (!enableAudioTags) {
-			denyExt.addAll(audioTagParser.SupportedAudioMediaType.getAllFileExtensions());
-			denyMime.addAll(audioTagParser.SupportedAudioMediaType.getAllMediaTypes());
+			denyExt.addAll(audioTagParser.SupportedAudioFormat.getAllFileExtensions());
+			denyMime.addAll(audioTagParser.SupportedAudioFormat.getAllMediaTypes());
        	
        	setConfig(SwitchboardConstants.PARSER_EXTENSIONS_DENY, denyExt);
        	setConfig(SwitchboardConstants.PARSER_MIME_DENY, denyMime);
--- a/test/java/net/yacy/document/parser/audioTagParserTest.java
+++ b/test/java/net/yacy/document/parser/audioTagParserTest.java
@ -0,0 +1,142 @@
+// audioTagParserTest.java
+// ---------------------------
+// Copyright 2019 by luccioman; https://github.com/luccioman
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+package net.yacy.document.parser;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.Collection;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import net.yacy.cora.document.id.AnchorURL;
+import net.yacy.cora.document.id.DigestURL;
+import net.yacy.document.Document;
+import net.yacy.document.Parser.Failure;
+import net.yacy.document.VocabularyScraper;
+
+/**
+ * Unit tests for the {@link audioTagParser} class, run against the small
+ * sample audio files stored in the parser test folder.
+ */
+public class audioTagParserTest {
+
+	/** Folder containing test files */
+	private static final File TEST_FOLDER = new File("test", "parsertest");
+
+	/** The parser under test */
+	private audioTagParser parser;
+
+	@Before
+	public void before() {
+		this.parser = new audioTagParser();
+	}
+
+	/**
+	 * Unit test for the
+	 * {@link audioTagParser#parse(DigestURL, String, String, VocabularyScraper, int, java.io.InputStream)}
+	 * function with some small (1 second length) test files.
+	 * The same "audio/ogg" media type is deliberately passed for every file.
+	 * @throws Failure              when a file could not be parsed
+	 * @throws InterruptedException when the test was interrupted before its
+	 *                              termination
+	 * @throws IOException          when a read/write error occurred
+	 */
+	@Test
+	public void testParse() throws Failure, InterruptedException, IOException {
+		final String[] fileNames = { "umlaute_windows.aiff", "umlaute_windows.flac", "umlaute_windows.m4a",
+				"umlaute_windows.mp3", "umlaute_windows.ogg", "umlaute_windows.wav" };
+
+		for (final String fileName : fileNames) {
+			final DigestURL location = new DigestURL("http://localhost/" + fileName);
+			try (final FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName))) {
+				final Document[] documents = this.parser.parse(location, "audio/ogg", StandardCharsets.UTF_8.name(),
+						new VocabularyScraper(), 0, inStream);
+				assertNotNull("Parser result must not be null for file " + fileName, documents);
+				assertNotNull("Parsed text must not be null for file " + fileName, documents[0].getTextString());
+				assertTrue("Parsed text must contain test word with umlaut char for file " + fileName,
+						documents[0].getTextString().contains("Maßkrügen"));
+				final Collection<AnchorURL> anchors = documents[0].getAnchors();
+				assertNotNull("Detected URLS must not be null for file " + fileName, anchors);
+				assertEquals("One URL must have been detected for file " + fileName, 1, anchors.size());
+				assertEquals("https://yacy.net/", anchors.iterator().next().toString());
+			}
+		}
+	}
+
+	/**
+	 * Test support for parsing an audio document with a proper Media Type but
+	 * with no extension, or an unrelated extension, in its file name.
+	 * 
+	 * @throws Failure              when the file could not be parsed
+	 * @throws InterruptedException when the test was interrupted before its
+	 *                              termination
+	 * @throws IOException          when a read/write error occurred
+	 */
+	@Test
+	public void testParseDocUrlWithoutFileExt() throws Failure, InterruptedException, IOException {
+		final String testFileName = "umlaute_windows.ogg";
+		final String[] locations = { "http://localhost/audioTrack", "http://localhost/example.audio" };
+
+		for (final String locationStr : locations) {
+			final DigestURL location = new DigestURL(locationStr);
+			try (final FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, testFileName))) {
+				final Document[] documents = this.parser.parse(location, "audio/ogg", StandardCharsets.UTF_8.name(),
+						new VocabularyScraper(), 0, inStream);
+				assertNotNull("Parser result must not be null for URL " + location, documents);
+			}
+		}
+
+	}
+
+	/**
+	 * Test support for parsing an audio document with an unknown or generic Media Type
+	 * 
+	 * @throws Failure              when the file could not be parsed
+	 * @throws InterruptedException when the test was interrupted before its
+	 *                              termination
+	 * @throws IOException          when a read/write error occurred
+	 */
+	@Test
+	public void testParseUnkownMediaType() throws Failure, InterruptedException, IOException {
+		final String testFileName = "umlaute_windows.ogg";
+		final DigestURL location = new DigestURL("http://localhost/" + testFileName);
+		final String[] mediaTypes = { null, "application/octet-stream" };
+
+		for (final String mediaType : mediaTypes) {
+			try (final FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, testFileName))) {
+				final Document[] documents = this.parser.parse(location, mediaType, StandardCharsets.UTF_8.name(),
+						new VocabularyScraper(), 0, inStream);
+				assertNotNull("Parser result must not be null for Media Type " + mediaType, documents);
+			}
+		}
+
+	}
+
+}
--- a/test/parsertest/umlaute_windows.aiff
+++ b/test/parsertest/umlaute_windows.aiff
--- a/test/parsertest/umlaute_windows.flac
+++ b/test/parsertest/umlaute_windows.flac
--- a/test/parsertest/umlaute_windows.m4a
+++ b/test/parsertest/umlaute_windows.m4a
--- a/test/parsertest/umlaute_windows.mp3
+++ b/test/parsertest/umlaute_windows.mp3
--- a/test/parsertest/umlaute_windows.ogg
+++ b/test/parsertest/umlaute_windows.ogg
--- a/test/parsertest/umlaute_windows.wav
+++ b/test/parsertest/umlaute_windows.wav