aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorErik Rigtorp <erik@rigtorp.se>2020-05-04 20:19:10 -0700
committerErik Rigtorp <erik@rigtorp.se>2020-05-07 15:47:59 -0700
commit8bd356ab2077b5c1a90510e3e73b11698eca0331 (patch)
tree06710bae6d3a0f7019bbc536a06393127e85e109
parentd016a9772fedfaae303e1a6225c875390d5c26dc (diff)
Reduce number of TMDB lookups if filenames have punctuation chars
Previosly TMDB would be queried with the raw name and always fail, then retry with the cleaned name. Now non-word chars are always cleaned out first. If first query fails, retry with more aggressive cleaning.
-rw-r--r--MediaBrowser.Providers/Tmdb/Movies/TmdbSearch.cs67
1 files changed, 40 insertions, 27 deletions
diff --git a/MediaBrowser.Providers/Tmdb/Movies/TmdbSearch.cs b/MediaBrowser.Providers/Tmdb/Movies/TmdbSearch.cs
index 223cef086..08c1afec2 100644
--- a/MediaBrowser.Providers/Tmdb/Movies/TmdbSearch.cs
+++ b/MediaBrowser.Providers/Tmdb/Movies/TmdbSearch.cs
@@ -5,6 +5,7 @@ using System.Linq;
using System.Net;
using System.Threading;
using System.Threading.Tasks;
+using System.Text.RegularExpressions;
using MediaBrowser.Common.Net;
using MediaBrowser.Controller.Library;
using MediaBrowser.Controller.Providers;
@@ -19,6 +20,20 @@ namespace MediaBrowser.Providers.Tmdb.Movies
public class TmdbSearch
{
private static readonly CultureInfo EnUs = new CultureInfo("en-US");
+
+ private static readonly Regex cleanEnclosed = new Regex(@"\p{Ps}.*\p{Pe}", RegexOptions.Compiled);
+ private static readonly Regex cleanNonWord = new Regex(@"[\W_]+", RegexOptions.Compiled);
+ private static readonly Regex cleanStopWords = new Regex(@"\b( # Start at word boundary
+ 19[0-9]{2}|20[0-9]{2}| # 1900-2099
+ S[0-9]{2}| # Season
+ E[0-9]{2}| # Episode
+ (2160|1080|720|576|480)[ip]?| # Resolution
+ [xh]?264| # Encoding
+ (web|dvd|bd|hdtv|hd)rip| # *Rip
+ web|hdtv|mp4|bluray|ktr|dl|single|imageset|internal|doku|dubbed|retail|xxx|flac
+ ).* # Match rest of string",
+ RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace | RegexOptions.IgnoreCase);
+
private const string Search3 = TmdbUtils.BaseTmdbApiUrl + @"3/search/{3}?api_key={1}&query={0}&language={2}";
private readonly ILogger _logger;
@@ -61,19 +76,18 @@ namespace MediaBrowser.Providers.Tmdb.Movies
var tmdbImageUrl = tmdbSettings.images.GetImageUrl("original");
- if (!string.IsNullOrWhiteSpace(name))
- {
- var parsedName = _libraryManager.ParseName(name);
- var yearInName = parsedName.Year;
- name = parsedName.Name;
- year = year ?? yearInName;
- }
+ // Does this mean we are reparsing already parsed ItemLookupInfo?
+ var parsedName = _libraryManager.ParseName(name);
+ var yearInName = parsedName.Year;
+ name = parsedName.Name;
+ year = year ?? yearInName;
- _logger.LogInformation("MovieDbProvider: Finding id for item: " + name);
+ _logger.LogInformation("TmdbSearch: Finding id for item: {0} ({1})", name, year);
var language = idInfo.MetadataLanguage.ToLowerInvariant();
- //nope - search for it
- //var searchType = item is BoxSet ? "collection" : "movie";
+ // Replace sequences of non-word characters with space
+ // TMDB expects a space separated list of words make sure that is the case
+ name = cleanNonWord.Replace(name, " ").Trim();
var results = await GetSearchResults(name, searchType, year, language, tmdbImageUrl, cancellationToken).ConfigureAwait(false);
@@ -86,36 +100,35 @@ namespace MediaBrowser.Providers.Tmdb.Movies
}
}
+ // Ideally retrying alternatives should be done outside the search
+ // provider so that the retry logic can be common for all search
+ // providers
if (results.Count == 0)
{
- // try with dot and _ turned to space
- var originalName = name;
-
- name = name.Replace(",", " ");
- name = name.Replace(".", " ");
- name = name.Replace("_", " ");
- name = name.Replace("-", " ");
- name = name.Replace("!", " ");
- name = name.Replace("?", " ");
-
- var parenthIndex = name.IndexOf('(');
- if (parenthIndex != -1)
- {
- name = name.Substring(0, parenthIndex);
- }
+ name = parsedName.Name;
+
+ // Remove things enclosed in []{}() etc
+ name = cleanEnclosed.Replace(name, string.Empty);
+ // Replace sequences of non-word characters with space
+ name = cleanNonWord.Replace(name, " ");
+
+ // Clean based on common stop words / tokens
+ name = cleanStopWords.Replace(name, string.Empty);
+
+ // Trim whitespace
name = name.Trim();
// Search again if the new name is different
- if (!string.Equals(name, originalName))
+ if (!string.Equals(name, parsedName.Name) && !string.IsNullOrWhiteSpace(name))
{
+ _logger.LogInformation("TmdbSearch: Finding id for item: {0} ({1})", name, year);
results = await GetSearchResults(name, searchType, year, language, tmdbImageUrl, cancellationToken).ConfigureAwait(false);
if (results.Count == 0 && !string.Equals(language, "en", StringComparison.OrdinalIgnoreCase))
{
//one more time, in english
results = await GetSearchResults(name, searchType, year, "en", tmdbImageUrl, cancellationToken).ConfigureAwait(false);
-
}
}
}