Add cache of match failures
This change introduces a SQLite database that stores the track_id, DB insertion time, and TTL for each cached entry. The TTL starts at one week and increases exponentially by a factor of 2 each time the same track_id is re-added to the database. This significantly reduces the script's execution time when many match failures have accumulated, since they do not need to be re-checked on every run.
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -1,7 +1,7 @@
|
|||||||
# Config and cache files
|
# Config and cache files
|
||||||
config.yml
|
config.yml
|
||||||
config.yaml
|
config.yaml
|
||||||
.cache-*
|
.cache*
|
||||||
.session.yml
|
.session.yml
|
||||||
|
|
||||||
# Byte-compiled / optimized / DLL files
|
# Byte-compiled / optimized / DLL files
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ dependencies = [
|
|||||||
"tidalapi~=0.7",
|
"tidalapi~=0.7",
|
||||||
"pyyaml~=6.0",
|
"pyyaml~=6.0",
|
||||||
"tqdm~=4.64",
|
"tqdm~=4.64",
|
||||||
|
"sqlalchemy~=2.0"
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
63
src/spotify_to_tidal/database.py
Normal file
63
src/spotify_to_tidal/database.py
Normal file
@@ -0,0 +1,63 @@
|
|||||||
|
import datetime
|
||||||
|
import sqlalchemy
|
||||||
|
from sqlalchemy import Table, Column, String, DateTime, MetaData, insert, select, update, delete
|
||||||
|
|
||||||
|
|
||||||
|
class Database:
    """SQLite-backed cache of track-match failures.

    Each failed track_id is stored with the time it was first inserted and a
    `next_retry` timestamp; lookups before `next_retry` are treated as cache
    hits so the expensive remote search can be skipped. The retry interval
    starts at one week and doubles on every repeated failure.
    """

    def __init__(self, filename='.cache.db'):
        """Open (or create) the sqlite cache file and ensure the schema exists.

        :param filename: path of the sqlite database file backing the cache
        """
        # FIX: the original had the filename garbled out of the URL
        # (f"sqlite:///(unknown)"); interpolate the parameter as intended.
        self.engine = sqlalchemy.create_engine(f"sqlite:///{filename}")
        meta = MetaData()
        self.match_failures = Table('match_failures', meta,
                                    Column('track_id', String,
                                           primary_key=True),
                                    Column('insert_time', DateTime),
                                    Column('next_retry', DateTime),
                                    sqlite_autoincrement=False)
        # Idempotent: only creates tables that do not already exist.
        meta.create_all(self.engine)

    def _get_next_retry_time(self, insert_time=None):
        """Return the timestamp after which matching may be retried.

        :param insert_time: time the failure was first recorded; when given,
            the interval is twice the time elapsed since then (exponential
            backoff). When omitted (a fresh failure), the interval is one week.
        """
        if insert_time:
            # double interval on each retry
            interval = 2 * (datetime.datetime.now() - insert_time)
        else:
            interval = datetime.timedelta(days=7)
        return datetime.datetime.now() + interval

    def cache_match_failure(self, track_id):
        """ notifies that matching failed for the given track_id """
        fetch_statement = select(self.match_failures).where(
            self.match_failures.c.track_id == track_id)
        with self.engine.connect() as connection:
            with connection.begin():
                # Either update the next_retry time if track_id already exists, otherwise create a new entry
                existing_failure = connection.execute(
                    fetch_statement).fetchone()
                if existing_failure:
                    # FIX: pass the original insert_time so the retry interval
                    # actually doubles on each repeated failure. The original
                    # called _get_next_retry_time() with no argument, which
                    # reset the TTL to a flat one week every time, defeating
                    # the exponential backoff described in the commit message.
                    update_statement = update(self.match_failures).where(
                        self.match_failures.c.track_id == track_id).values(
                        next_retry=self._get_next_retry_time(existing_failure.insert_time))
                    connection.execute(update_statement)
                else:
                    connection.execute(insert(self.match_failures), {
                        "track_id": track_id,
                        "insert_time": datetime.datetime.now(),
                        "next_retry": self._get_next_retry_time()})

    def has_match_failure(self, track_id):
        """ checks if there was a recent search for which matching failed with the given track_id """
        statement = select(self.match_failures.c.next_retry).where(
            self.match_failures.c.track_id == track_id)
        with self.engine.connect() as connection:
            match_failure = connection.execute(statement).fetchone()
            if match_failure:
                # Still within the backoff window -> treat as a cached failure.
                return match_failure.next_retry > datetime.datetime.now()
            return False

    def remove_match_failure(self, track_id):
        """ removes match failure from the database """
        statement = delete(self.match_failures).where(
            self.match_failures.c.track_id == track_id)
        with self.engine.connect() as connection:
            with connection.begin():
                connection.execute(statement)
|
|
||||||
|
|
||||||
|
# Main singleton instance shared by the rest of the package.
# NOTE: instantiating at import time opens/creates the sqlite file
# (default '.cache.db') as a side effect of importing this module.
failure_cache = Database()
||||||
@@ -1,5 +1,6 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
from .database import failure_cache
|
||||||
from functools import partial
|
from functools import partial
|
||||||
from typing import Sequence, Set, Mapping
|
from typing import Sequence, Set, Mapping
|
||||||
from multiprocessing import Pool
|
from multiprocessing import Pool
|
||||||
@@ -89,10 +90,12 @@ def match(tidal_track, spotify_track) -> bool:
|
|||||||
and artist_match(tidal_track, spotify_track)
|
and artist_match(tidal_track, spotify_track)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def tidal_search(spotify_track_and_cache, tidal_session: tidalapi.Session) -> tidalapi.Track | None:
|
def tidal_search(spotify_track_and_cache, tidal_session: tidalapi.Session) -> tidalapi.Track | None:
|
||||||
spotify_track, cached_tidal_track = spotify_track_and_cache
|
spotify_track, cached_tidal_track = spotify_track_and_cache
|
||||||
if cached_tidal_track: return cached_tidal_track
|
if cached_tidal_track: return cached_tidal_track
|
||||||
|
if spotify_track['id'] is None: return None
|
||||||
|
if failure_cache.has_match_failure(spotify_track['id']):
|
||||||
|
return None
|
||||||
# search for album name and first album artist
|
# search for album name and first album artist
|
||||||
if 'album' in spotify_track and 'artists' in spotify_track['album'] and len(spotify_track['album']['artists']):
|
if 'album' in spotify_track and 'artists' in spotify_track['album'] and len(spotify_track['album']['artists']):
|
||||||
album_result = tidal_session.search(simple(spotify_track['album']['name']) + " " + simple(spotify_track['album']['artists'][0]['name']), models=[tidalapi.album.Album])
|
album_result = tidal_session.search(simple(spotify_track['album']['name']) + " " + simple(spotify_track['album']['artists'][0]['name']), models=[tidalapi.album.Album])
|
||||||
@@ -101,11 +104,14 @@ def tidal_search(spotify_track_and_cache, tidal_session: tidalapi.Session) -> ti
|
|||||||
if len(album_tracks) >= spotify_track['track_number']:
|
if len(album_tracks) >= spotify_track['track_number']:
|
||||||
track = album_tracks[spotify_track['track_number'] - 1]
|
track = album_tracks[spotify_track['track_number'] - 1]
|
||||||
if match(track, spotify_track):
|
if match(track, spotify_track):
|
||||||
|
failure_cache.remove_match_failure(spotify_track['id'])
|
||||||
return track
|
return track
|
||||||
# if that fails then search for track name and first artist
|
# if that fails then search for track name and first artist
|
||||||
for track in tidal_session.search(simple(spotify_track['name']) + ' ' + simple(spotify_track['artists'][0]['name']), models=[tidalapi.media.Track])['tracks']:
|
for track in tidal_session.search(simple(spotify_track['name']) + ' ' + simple(spotify_track['artists'][0]['name']), models=[tidalapi.media.Track])['tracks']:
|
||||||
if match(track, spotify_track):
|
if match(track, spotify_track):
|
||||||
|
failure_cache.remove_match_failure(spotify_track['id'])
|
||||||
return track
|
return track
|
||||||
|
failure_cache.cache_match_failure(spotify_track['id'])
|
||||||
|
|
||||||
def get_tidal_playlists_dict(tidal_session: tidalapi.Session) -> Mapping[str, tidalapi.Playlist]:
|
def get_tidal_playlists_dict(tidal_session: tidalapi.Session) -> Mapping[str, tidalapi.Playlist]:
|
||||||
# a dictionary of name --> playlist
|
# a dictionary of name --> playlist
|
||||||
@@ -284,4 +290,4 @@ def get_playlists_from_spotify(spotify_session: spotipy.Spotify, config):
|
|||||||
|
|
||||||
def get_playlists_from_config(config):
    """Read the playlist sync mappings from the configuration.

    Returns a list of (spotify_id, tidal_id) tuples, one per entry in the
    config's 'sync_playlists' section.
    """
    pairs = []
    for entry in config['sync_playlists']:
        pairs.append((entry['spotify_id'], entry['tidal_id']))
    return pairs
||||||
|
|||||||
Reference in New Issue
Block a user