From 40aac6203b867dd188754b57f288b120c866e4f1 Mon Sep 17 00:00:00 2001
From: Victor <annimon119@gmail.com>
Date: Wed, 14 Apr 2021 23:23:56 +0300
Subject: [PATCH] Initial commit

---
 .gitignore       |  5 +++
 database.py      | 38 +++++++++++++++++++
 iqdb.py          | 17 +++++++++
 library.py       | 54 ++++++++++++++++++++++++++
 metadata.py      | 98 ++++++++++++++++++++++++++++++++++++++++++++++++
 picsorter.py     | 68 +++++++++++++++++++++++++++++++++
 requirements.txt |  3 ++
 tags.py          | 29 ++++++++++++++
 8 files changed, 312 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 database.py
 create mode 100644 iqdb.py
 create mode 100644 library.py
 create mode 100644 metadata.py
 create mode 100644 picsorter.py
 create mode 100644 requirements.txt
 create mode 100644 tags.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..5d62c19
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+__pycache__
+input
+library
+logs
+images.db
\ No newline at end of file
diff --git a/database.py b/database.py
new file mode 100644
index 0000000..f44b4d5
--- /dev/null
+++ b/database.py
@@ -0,0 +1,56 @@
+from datetime import datetime
+import sqlite3
+import logging
+
+class Database:
+    """SQLite-backed record of the Danbooru posts that were already stored."""
+
+    def __init__(self):
+        """Set the database file name and make sure the table exists."""
+        self.db_name = 'images.db'
+        self.__create_tables()
+
+    def __create_tables(self):
+        """Create the ``danbooru`` table if it is not present yet."""
+        conn = sqlite3.connect(self.db_name)
+        try:
+            conn.executescript("""
+                CREATE TABLE IF NOT EXISTS danbooru (
+                    id INTEGER PRIMARY KEY NOT NULL UNIQUE,
+                    tags TEXT NOT NULL,
+                    created_at TIMESTAMP
+                );
+                """)
+            conn.commit()
+        finally:
+            # Release the handle even when the statement raises.
+            conn.close()
+
+    def is_exists(self, id) -> bool:
+        """Return True when a row with post id ``id`` is stored."""
+        conn = sqlite3.connect(self.db_name)
+        try:
+            c = conn.cursor()
+            c.execute("SELECT EXISTS(SELECT 1 FROM danbooru WHERE id=?)", (id, ))
+            return bool(c.fetchone()[0])
+        finally:
+            conn.close()
+
+    def add(self, id, tags):
+        """Insert post ``id`` with its tag string and the current local time.
+
+        Raises sqlite3.IntegrityError if ``id`` is already stored, because
+        ``id`` is the table's primary key.
+        """
+        sql = 'INSERT INTO danbooru(id, tags, created_at) VALUES (?,?,?)'
+        # Pass the timestamp as text rather than a datetime object: this is
+        # the same space-separated ISO string the default sqlite3 adapter
+        # produced, without relying on that (deprecated) adapter.
+        created_at = datetime.now().isoformat(' ')
+        conn = sqlite3.connect(self.db_name)
+        try:
+            conn.execute(sql, (id, tags, created_at))
+            conn.commit()
+        finally:
+            conn.close()
+    
\ No newline at end of file
diff --git a/iqdb.py b/iqdb.py
new file mode 100644
index 0000000..7b553c4
--- /dev/null
+++ b/iqdb.py
@@ -0,0 +1,28 @@
+from bs4 import BeautifulSoup
+import logging
+import requests
+
+class Iqdb:
+    """Reverse image search against https://iqdb.org/."""
+
+    def search(self, file):
+        """Upload ``file`` to iqdb.org and return the first Danbooru post link.
+
+        Returns the link as a string, prefixed with ``https:`` when the page
+        gives it in protocol-relative form, or None when no ``.image a``
+        link points at ``danbooru.donmai.us/posts``.
+        """
+        logging.info('Searching %s', file)
+        # The with-block closes the upload handle even if the request raises.
+        with open(file, 'rb') as image:
+            files = {'file': image}
+            resp = requests.post('https://iqdb.org/', files=files, timeout=10)
+        doc = BeautifulSoup(resp.text, 'html.parser')
+        for tag in doc.select(".image a"):
+            # An anchor without href yields None, which cannot be searched.
+            url = tag.get("href") or ""
+            if "danbooru.donmai.us/posts" in url:
+                if url.startswith("//"):
+                    url = "https:" + url
+                return url
+        return None
diff --git a/library.py b/library.py
new file mode 100644
index 0000000..d2068c9
--- /dev/null
+++ b/library.py
@@ -0,0 +1,67 @@
+from tags import Tags
+from pathlib import Path
+import shutil
+import logging
+import os
+
+class Library:
+    """Sorted picture library rooted at ``dir_root``."""
+
+    def __init__(self, dir_root: Path):
+        """Store the root and create ``<dir_root>/orphan`` (with parents)."""
+        self.dir_root = dir_root
+        self.dir_orphan = Path(dir_root, 'orphan')
+        self.dir_orphan.mkdir(exist_ok=True, parents=True)
+
+    def move_to_orphan(self, p: Path):
+        """Move ``p`` into the orphan directory."""
+        logging.info("%s move to orphan", p)
+        source, target = os.fspath(p), os.fspath(self.dir_orphan)
+        shutil.move(source, target)
+
+    def move(self, p: Path, tags: Tags):
+        """Move ``p`` into the directory derived from ``tags``, creating it first."""
+        destination = self.__compute_path(tags)
+        destination.mkdir(exist_ok=True, parents=True)
+        logging.info("%s move to %s", p.name, destination)
+        shutil.move(os.fspath(p), os.fspath(destination))
+
+    def __compute_path(self, tags: Tags) -> Path:
+        """Return the target directory for ``tags``.
+
+        * copyrights == 'original': ``_originals/<first artist>``
+        * otherwise: ``<first copyright>/<character>``, where several
+          characters collapse into ``_multiple``; empty parts are left out.
+        """
+        if tags.copyrights == 'original':
+            # Originals are grouped by artist
+            base = self.dir_root / "_originals"
+            if tags.artists == "":
+                return base
+            first_artist = tags.artists.split(" ")[0]
+            return base / self.__sanitize(first_artist)
+
+        # Main section
+        base = self.dir_root
+        series = ""
+        if tags.copyrights != "":
+            series = tags.copyrights.split(" ")[0]
+            base = base / self.__sanitize(series)
+        if tags.characters == "":
+            return base
+
+        # Characters section
+        names = tags.characters.split(" ")
+        if len(names) != 1:
+            return base / "_multiple"
+        # Strip the series name and any leftover parentheses from the tag
+        name = names[0]
+        for fragment in (series, "(" + series + ")", "()"):
+            name = name.replace(fragment, "")
+        return base / self.__sanitize(name.strip())
+
+    def __sanitize(self, s: str) -> str:
+        """Keep alphanumerics and ``._-()``, then turn underscores into spaces."""
+        allowed = "._-()"
+        kept = [ch for ch in s if ch.isalnum() or ch in allowed]
+        return "".join(kept).replace("_", " ").strip()
diff --git a/metadata.py b/metadata.py
new file mode 100644
index 0000000..cdb516b
--- /dev/null
+++ b/metadata.py
@@ -0,0 +1,142 @@
+from typing import Optional, Union
+from tags import Tags
+from pathlib import Path
+from datetime import datetime
+import fluentpy as _
+import logging
+import json
+import re
+import requests
+import subprocess
+
+class Metadata:
+    """Download a post's image and embed its tags.
+
+    The ``magick`` and ``exiftool`` executables are run through
+    ``subprocess``; both work on ``<dir_tmp>/tmp.jpg``.
+    """
+
+    def __init__(self, dir_tmp: Path):
+        """Use ``dir_tmp`` as scratch space for the image and the tag file."""
+        self.dir_tmp = dir_tmp
+        self.tmp_image_file = Path(self.dir_tmp, "tmp.jpg")
+
+    def process(self, url: str) -> Optional[tuple]:
+        """Fetch metadata for the post at ``url``, download and tag the image.
+
+        Returns ``(result_file, tags)`` (a Path and a Tags instance), or None
+        when the image was rejected or could not be downloaded.
+        """
+        logging.info("Retrieving metadata for %s", url)
+        meta = self.__get_metadata(url)
+        status = self.__download_file(meta)
+        if not status:
+            logging.warning("Download failed")
+            return None
+        return self.__write_tags(url, meta)
+
+    def __get_metadata(self, url: str) -> dict:
+        """Return the JSON document served at ``url`` + ``.json``."""
+        # Without a timeout a stalled connection would block the run forever.
+        return requests.get(url + ".json", timeout=10).json()
+
+    def __download_file(self, r: dict) -> bool:
+        """Download the image described by metadata ``r``.
+
+        Returns False without downloading when the extension is not
+        jpg/jpeg/png/webp, a dimension is zero or missing, or no file URL
+        is given; otherwise returns whether the download succeeded.
+        """
+        ext = r.get("file_ext", "")
+        # ``or 0`` also covers keys that are present but null.
+        w = int(r.get("image_width") or 0)
+        h = int(r.get("image_height") or 0)
+        if (ext not in ["jpg", "jpeg", "png", "webp"]) or w == 0 or h == 0:
+            return False
+        file_url = r.get("file_url")
+        if not file_url:
+            # Nothing to hand to magick; None would make subprocess raise.
+            return False
+        file_size_kb = int(r.get('file_size') or 0) / 1024
+
+        logging.info("Downloading image")
+        recompress = self.__need_recompress(ext, w, h, file_size_kb)
+        return self.__download(file_url, recompress=recompress)
+
+    def __need_recompress(self, ext, w, h, size_kb):
+        """Return True for a jpg over 1400 KB with both sides under 2500 px."""
+        return ext == 'jpg' and size_kb > 1400 and w < 2500 and h < 2500
+
+    def __download(self, img_url: str, recompress: bool = False):
+        """Let ``magick`` fetch ``img_url`` into the temp image file.
+
+        The image is shrunk to fit 2500x2500 (``>`` only shrinks larger
+        images) and, with ``recompress``, saved at quality 80.
+        Returns True when magick exits with status 0.
+        """
+        opt_args = []
+        if recompress:
+            # Arguments must be strings; an int here raises TypeError.
+            opt_args = ['-quality', '80']
+        # DEVNULL instead of PIPE: nobody reads the pipe, so a chatty child
+        # could fill it and block forever.
+        ret = subprocess.call([
+            'magick', img_url,
+            '-resize', '2500x2500>',
+            *opt_args, str(self.tmp_image_file)
+        ], stdout=subprocess.DEVNULL)
+        return ret == 0
+
+    def __write_tags(self, url: str, r: dict) -> tuple:
+        """Embed the tags and source ``url`` into the temp image and rename it.
+
+        Returns ``(result_file, tags)``.
+        """
+        tags = Tags(
+            r.get('tag_string_general', ""),
+            r.get('tag_string_copyright', ""),
+            r.get('tag_string_character', ""),
+            r.get('tag_string_artist', ""),
+        )
+
+        # One exiftool argument per line, passed via ``-@`` below.
+        lines = ["-IPTC:keywords=" + s for s in tags.tags]
+        lines += [
+            "-Exif:ImageDescription=" + url,
+            "-Iptc:Caption-Abstract=" + url,
+            "-Xmp:Description=" + url,
+        ]
+        tags_file = Path(self.dir_tmp, "tags.txt")
+        with open(tags_file, "w") as f:
+            f.write("\n".join(lines))
+
+        logging.info("Writing tags")
+        ret = subprocess.call([
+            'exiftool', '-q', '-overwrite_original',
+            '-@', str(tags_file),
+            str(self.tmp_image_file)
+        ], stdout=subprocess.DEVNULL)
+        if ret != 0:
+            # Keep going as before, but leave a trace of the failure.
+            logging.warning("exiftool exited with status %d", ret)
+
+        filename = self.__format_filename(tags)
+        result_file = Path(self.tmp_image_file.parent, filename)
+        self.tmp_image_file.rename(result_file)
+        return result_file, tags
+
+    def __format_filename(self, tags: Tags):
+        """Build ``<copyright> <characters> by <artist> at <timestamp>.jpg``.
+
+        Uses the first copyright, up to two characters and the first artist.
+        """
+        filename = '{} {} by {} at {}.jpg'.format(
+            tags.copyrights.split(" ")[0] or "",
+            ", ".join(tags.characters.split(" ")[:2]),
+            tags.artists.split(" ")[0] or "",
+            datetime.now().strftime('%Y%m%d_%H%M%S')
+        )
+        # Tag text is used unvalidated; a path separator or another
+        # character that is illegal in file names would break the rename.
+        filename = re.sub(r'[\\/:*?"<>|]', '_', filename)
+        return re.sub(r'\s+', ' ', filename).strip()
diff --git a/picsorter.py b/picsorter.py
new file mode 100644
index 0000000..5071fa1
--- /dev/null
+++ b/picsorter.py
@@ -0,0 +1,85 @@
+from iqdb import Iqdb
+from library import Library
+from metadata import Metadata
+from database import Database
+
+from datetime import datetime
+from pathlib import Path
+import re
+import logging
+import time
+
+class PicSorter:
+    """Look up every input picture on iqdb and file its Danbooru copy."""
+
+    def __init__(self):
+        """Set the working directories, create them and configure logging."""
+        self.dir_tmp = Path('R:/')
+        self.dir_input = Path('./input')
+        self.dir_logs = Path('./logs')
+        self.dir_library = Path('./library')
+        self.setup_folders()
+        self.setup_logging()
+
+    def setup_folders(self):
+        """Create the temp, log and library directories if missing."""
+        self.dir_tmp.mkdir(exist_ok=True)
+        self.dir_logs.mkdir(exist_ok=True)
+        self.dir_library.mkdir(exist_ok=True)
+
+    def setup_logging(self):
+        """Log at INFO level to ``<dir_logs>/<YYYY-MM-DD>.log``."""
+        logfile = Path(self.dir_logs, datetime.now().strftime('%Y-%m-%d.log'))
+        logging.basicConfig(
+            filename=logfile,
+            level=logging.INFO,
+            format='%(asctime)s %(levelname)s %(module)s: %(message)s',
+            datefmt='%H:%M:%S',
+        )
+
+    def process_folder(self):
+        """Process every ``*.jpg`` below the input directory.
+
+        Pictures without an iqdb match are moved to the library's orphan
+        directory. For a match, the post is skipped when its id is already
+        in the database; otherwise the image is downloaded, tagged, moved
+        into the library and recorded. Any exception is logged and re-raised.
+        """
+        iqdb = Iqdb()
+        library = Library(self.dir_library)
+        metadata = Metadata(self.dir_tmp)
+        db = Database()
+        # Snapshot the listing: files are moved away while we iterate.
+        for filename in list(self.dir_input.rglob('*.jpg')):
+            print("Process ", filename)
+            try:
+                url = iqdb.search(filename)
+                if url is None:
+                    logging.warning("%s not found", filename)
+                    library.move_to_orphan(filename)
+                    continue
+
+                m = re.search(r".*posts\/(\d{3,})", url)
+                if not m:
+                    logging.warning("No post id found in %s", url)
+                    continue
+                post_id = int(m.group(1))
+                if db.is_exists(post_id):
+                    logging.info("Skipping exists post %d", post_id)
+                    continue
+
+                meta_result = metadata.process(url)
+                if meta_result is None:
+                    continue
+                image_path, tags = meta_result
+                library.move(image_path, tags)
+                db.add(post_id, tags.tags_string)
+                # Pause between successfully processed posts.
+                time.sleep(5)
+            except Exception:
+                # Record which file failed, then propagate unchanged.
+                logging.exception("Failed to process %s", filename)
+                raise
+
+if __name__ == '__main__':
+    PicSorter().process_folder()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..8a84da7
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+fluentpy==2.0
+PyYAML==5.4.1
+requests==2.24.0
diff --git a/tags.py b/tags.py
new file mode 100644
index 0000000..40f3d5e
--- /dev/null
+++ b/tags.py
@@ -0,0 +1,37 @@
+import fluentpy as _
+from dataclasses import dataclass, field
+
+@dataclass
+class Tags:
+    """Tag strings of one post plus their combined, prefixed form.
+
+    The four inputs are space-separated tag strings. ``tags`` holds the
+    general tags followed by the copyright, character and artist tags,
+    prefixed with ``copyright_``, ``character_`` and ``artist_``;
+    ``tags_string`` is ``tags`` joined with single spaces.
+    """
+    general: str
+    copyrights: str
+    characters: str
+    artists: str
+    tags: list = field(init=False)
+    tags_string: str = field(init=False)
+
+    def __post_init__(self):
+        """Derive ``tags`` and ``tags_string`` from the four inputs."""
+        self.tags = self.__union_tags()
+        self.tags_string = " ".join(self.tags)
+
+    def __union_tags(self):
+        """Return general tags followed by the prefixed category tags."""
+        # Drop empty items as __prefix_tags does; "".split(" ") is [""],
+        # which used to add an empty tag when there were no general tags.
+        tags = [s for s in self.general.split(" ") if s != ""]
+        tags += self.__prefix_tags(self.copyrights, 'copyright_')
+        tags += self.__prefix_tags(self.characters, 'character_')
+        tags += self.__prefix_tags(self.artists, 'artist_')
+        return tags
+
+    def __prefix_tags(self, tags, prefix):
+        """Split ``tags`` on spaces and prepend ``prefix`` to each non-empty tag."""
+        return [prefix + s.strip() for s in tags.split(" ") if s != ""]