"""Picture sorter: match local images to Danbooru posts and file them by tag.

Readable single-module rendering of the "Initial commit" patch
(commit 40aac6203b867dd188754b57f288b120c866e4f1, Author: Victor,
Wed Apr 14 23:23:56 2021 +0300). The patch added database.py, iqdb.py,
library.py, metadata.py, picsorter.py and tags.py; their classes appear
below in dependency order so the module loads on its own.

Non-Python files added by the same patch:
  .gitignore        __pycache__, input, library, logs, images.db
  requirements.txt  fluentpy==2.0, PyYAML==5.4.1, requests==2.24.0

NOTE(review): Iqdb needs bs4, which requirements.txt does not list, and
nothing in the patch imports yaml -- confirm the intended dependency set.
fluentpy is no longer used by this rendering (plain comprehensions replace
the fluent chains). requests and bs4 are imported inside the methods that
use them, so the tag/path logic can be used without them installed.
"""

from contextlib import closing
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Optional, Tuple
import logging
import os
import re
import shutil
import sqlite3
import subprocess
import time


# ---------------------------------------------------------------- tags.py

@dataclass
class Tags:
    """Tag strings of one post plus the combined keyword list.

    The four constructor arguments are whitespace-separated tag strings.
    ``tags`` holds the general tags followed by the copyright, character and
    artist tags prefixed with ``copyright_``, ``character_`` and ``artist_``;
    ``tags_string`` is that list joined with single spaces.
    """

    general: str
    copyrights: str
    characters: str
    artists: str
    tags: list = field(init=False)
    tags_string: str = field(init=False)

    def __post_init__(self):
        self.tags = self.__union_tags()
        self.tags_string = " ".join(self.tags)

    def __union_tags(self):
        """Return all tags as one list, category tags carrying their prefix."""
        # split() without a separator drops empty items, so an empty or
        # double-spaced general string no longer yields "" keywords.
        tags = self.general.split()
        tags += self.__prefix_tags(self.copyrights, 'copyright_')
        tags += self.__prefix_tags(self.characters, 'character_')
        tags += self.__prefix_tags(self.artists, 'artist_')
        return tags

    def __prefix_tags(self, tags, prefix):
        """Split ``tags`` on whitespace and put ``prefix`` before each item."""
        return [prefix + tag for tag in tags.split()]


# ------------------------------------------------------------ database.py

class Database:
    """SQLite table ``danbooru`` recording which post ids were imported."""

    def __init__(self, db_name: str = 'images.db'):
        """Open (creating if needed) the database file ``db_name``."""
        self.db_name = db_name
        self.__create_tables()

    def __connect(self):
        # sqlite3's own context manager only commits or rolls back; closing()
        # releases the connection even when a statement raises.
        return closing(sqlite3.connect(self.db_name))

    def __create_tables(self):
        with self.__connect() as conn:
            conn.executescript("""
                CREATE TABLE IF NOT EXISTS danbooru (
                    id INTEGER PRIMARY KEY NOT NULL UNIQUE,
                    tags TEXT NOT NULL,
                    created_at TIMESTAMP
                );
            """)
            conn.commit()

    def is_exists(self, id) -> bool:
        """Return True when a row with this post ``id`` is stored."""
        with self.__connect() as conn:
            row = conn.execute(
                "SELECT EXISTS(SELECT 1 FROM danbooru WHERE id=?)", (id,)
            ).fetchone()
        return bool(row[0])

    def add(self, id, tags):
        """Insert post ``id`` with its ``tags`` string and the current time.

        Raises:
            sqlite3.IntegrityError: if ``id`` is already present.
        """
        sql = 'INSERT INTO danbooru(id, tags, created_at) VALUES (?,?,?)'
        # Same text the implicit datetime adapter produced; that adapter is
        # deprecated since Python 3.12, so format explicitly.
        created_at = datetime.now().isoformat(" ")
        with self.__connect() as conn:
            conn.execute(sql, (id, tags, created_at))
            conn.commit()


# ---------------------------------------------------------------- iqdb.py

class Iqdb:
    """Client for the image lookup form at https://iqdb.org/."""

    def search(self, file):
        """Upload ``file`` and return the first Danbooru post URL found.

        Returns:
            The post URL (``https:`` is prepended to scheme-relative links),
            or None when the response contains no Danbooru post link.

        Raises:
            requests.HTTPError: on an HTTP error status, so a server failure
                is not mistaken for "no match".
        """
        import requests
        from bs4 import BeautifulSoup

        logging.info('Searching %s', file)
        with open(file, 'rb') as image:  # closed as soon as the upload ends
            resp = requests.post(
                'https://iqdb.org/', files={'file': image}, timeout=10)
        resp.raise_for_status()
        doc = BeautifulSoup(resp.text, 'html.parser')
        for tag in doc.select(".image a"):
            url = tag.get("href")
            # An anchor without href yields None; skip it instead of crashing.
            if url and "danbooru.donmai.us/posts" in url:
                if url.startswith("//"):
                    url = "https:" + url
                return url
        return None


# ------------------------------------------------------------- library.py

class Library:
    """Directory tree that files are sorted into, rooted at ``dir_root``."""

    def __init__(self, dir_root: Path):
        """Remember ``dir_root`` and create its ``orphan`` subdirectory."""
        self.dir_root = dir_root
        self.dir_orphan = Path(dir_root, 'orphan')
        self.dir_orphan.mkdir(exist_ok=True, parents=True)

    def move_to_orphan(self, p: Path):
        """Move ``p`` into the ``orphan`` directory."""
        logging.info("%s move to orphan", p)
        shutil.move(os.fspath(p), os.fspath(self.dir_orphan))

    def move(self, p: Path, tags: Tags):
        """Move ``p`` into the directory derived from ``tags``, creating it."""
        new_path = self.__compute_path(tags)
        new_path.mkdir(exist_ok=True, parents=True)
        logging.info("%s move to %s", p.name, new_path)
        shutil.move(os.fspath(p), os.fspath(new_path))

    def __compute_path(self, tags: Tags) -> Path:
        """Return the target directory for ``tags``.

        * copyrights == 'original': ``_originals/<first artist>``
        * otherwise ``<first copyright>``, followed by ``<character>`` when
          there is exactly one character, or ``_multiple`` when several.
        """
        p = self.dir_root
        if tags.copyrights == 'original':
            # Originals are grouped by artist.
            p = p / "_originals"
            if tags.artists != "":
                p = p / self.__sanitize(tags.artists.split(" ")[0])
            return p

        # Main section: first copyright tag ("" when there is none).
        series = tags.copyrights.split(" ")[0] if tags.copyrights != "" else ""
        p = p / self.__sanitize(series)
        if tags.characters == "":
            return p

        # Characters section.
        characters = tags.characters.split(" ")
        if len(characters) != 1:
            return p / "_multiple"
        character = characters[0]
        if series:
            # Drop only the "(series)" qualifier. Removing every occurrence
            # of the series name mangled names that merely contain it
            # (e.g. "uzumaki_naruto" under "naruto").
            character = character.replace("(" + series + ")", "")
        return p / self.__sanitize(character.strip())

    def __sanitize(self, s: str) -> str:
        """Keep alphanumerics and ``._-()``, turn ``_`` into spaces.

        Leading/trailing spaces and dots are stripped so a tag such as ".."
        cannot become a parent-directory path component.
        """
        kept = "".join(ch for ch in s if ch.isalnum() or ch in "._-()")
        return kept.replace("_", " ").strip(" .")


# ------------------------------------------------------------ metadata.py

class Metadata:
    """Fetch a post's JSON, download its image and write the tags into it."""

    # Extensions __download_file accepts.
    __IMAGE_EXTS = ("jpg", "jpeg", "png", "webp")
    # Characters replaced in generated filenames ('/' would add a directory).
    __BAD_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')

    def __init__(self, dir_tmp: Path):
        """Use ``dir_tmp`` for the working files ``tmp.jpg`` and ``tags.txt``."""
        self.dir_tmp = dir_tmp
        self.tmp_image_file = Path(self.dir_tmp, "tmp.jpg")

    def process(self, url: str) -> Optional[Tuple[Path, Tags]]:
        """Download and tag the image of the post at ``url``.

        Returns:
            ``(image_path, tags)`` on success, None when the download was
            refused or failed.
        """
        logging.info("Retrieving metadata for %s", url)
        meta = self.__get_metadata(url)
        if not self.__download_file(meta):
            logging.warning("Download failed")
            return None
        return self.__write_tags(url, meta)

    def __get_metadata(self, url: str) -> dict:
        """Return the parsed JSON served at ``url + ".json"``."""
        import requests

        # Timeout matches Iqdb.search; without one a stalled server hangs
        # the whole run.
        resp = requests.get(url + ".json", timeout=10)
        resp.raise_for_status()
        return resp.json()

    def __download_file(self, r: dict) -> bool:
        """Download the image described by post JSON ``r``.

        Returns False without downloading when the extension is not an
        accepted image type, a dimension is missing/zero, or ``file_url`` is
        missing or not an http(s) URL.
        """
        ext = r.get("file_ext", "")
        # "or 0" also covers keys that are present but null.
        w = int(r.get("image_width") or 0)
        h = int(r.get("image_height") or 0)
        file_url = r.get("file_url")
        if ext not in self.__IMAGE_EXTS or w == 0 or h == 0:
            return False
        # file_url comes from a remote response and is handed to `magick`,
        # so only plain web URLs are accepted.
        if not file_url or not file_url.startswith(("http://", "https://")):
            return False
        file_size_kb = int(r.get('file_size') or 0) / 1024

        logging.info("Downloading image")
        recompress = self.__need_recompress(ext, w, h, file_size_kb)
        return self.__download(file_url, recompress=recompress)

    def __need_recompress(self, ext, w, h, size_kb):
        """True for a 'jpg' above 1400 KB whose sides are both below 2500."""
        return ext == 'jpg' and size_kb > 1400 and w < 2500 and h < 2500

    def __download(self, img_url: str, recompress: bool = False):
        """Run ``magick`` to fetch ``img_url`` into the temp image file.

        Returns:
            True when ``magick`` exits with status 0.
        """
        # '2500x2500>' : the '>' suffix asks ImageMagick to shrink only.
        args = ['magick', img_url, '-resize', '2500x2500>']
        if recompress:
            # Every argv item must be a string; the int 80 raised TypeError.
            args += ['-quality', '80']
        args.append(os.fspath(self.tmp_image_file))
        # DEVNULL rather than PIPE: nothing reads the pipe, so a chatty
        # child could block forever on a full pipe buffer.
        return subprocess.call(args, stdout=subprocess.DEVNULL) == 0

    def __write_tags(self, url: str, r: dict) -> tuple:
        """Write tags and ``url`` into the temp image with ``exiftool``.

        Returns:
            ``(result_file, tags)`` where ``result_file`` is the temp image
            renamed to its generated filename.
        """
        tags = Tags(
            r.get('tag_string_general', ""),
            r.get('tag_string_copyright', ""),
            r.get('tag_string_character', ""),
            r.get('tag_string_artist', ""),
        )

        lines = ["-IPTC:keywords=" + tag for tag in tags.tags]
        lines += [
            "-Exif:ImageDescription=" + url,
            "-Iptc:Caption-Abstract=" + url,
            "-Xmp:Description=" + url,
        ]
        tags_file = Path(self.dir_tmp, "tags.txt")
        # Explicit encoding so the file does not depend on the locale.
        tags_file.write_text("\n".join(lines), encoding="utf-8")

        logging.info("Writing tags")
        status = subprocess.call([
            'exiftool', '-q', '-overwrite_original',
            '-@', os.fspath(tags_file),
            os.fspath(self.tmp_image_file),
        ], stdout=subprocess.DEVNULL)
        if status != 0:
            logging.warning("exiftool exited with status %d", status)

        result_file = Path(
            self.tmp_image_file.parent, self.__format_filename(tags))
        self.tmp_image_file.rename(result_file)
        return result_file, tags

    def __format_filename(self, tags: Tags):
        """Build ``<copyright> <char>, <char> by <artist> at <timestamp>.jpg``.

        Uses the first copyright, the first two characters and the first
        artist. Characters that cannot appear in a filename are replaced
        with ``_`` and runs of whitespace are collapsed.
        """
        def first(value):
            return (value.split() or [""])[0]

        filename = '{} {} by {} at {}.jpg'.format(
            first(tags.copyrights),
            ", ".join(tags.characters.split()[:2]),
            first(tags.artists),
            datetime.now().strftime('%Y%m%d_%H%M%S'),
        )
        filename = self.__BAD_FILENAME_CHARS.sub('_', filename)
        return re.sub(r'\s+', ' ', filename).strip()


# ----------------------------------------------------------- picsorter.py

class PicSorter:
    """Command-line driver: process every ``*.jpg`` below the input folder."""

    def __init__(self,
                 dir_tmp: Path = Path('R:/'),
                 dir_input: Path = Path('./input'),
                 dir_logs: Path = Path('./logs'),
                 dir_library: Path = Path('./library')):
        """Create the working folders and configure file logging.

        The defaults reproduce the original hard-coded locations.
        NOTE(review): 'R:/' looks like a machine-specific scratch drive --
        confirm before running elsewhere.
        """
        self.dir_tmp = dir_tmp
        self.dir_input = dir_input
        self.dir_logs = dir_logs
        self.dir_library = dir_library
        self.setup_folders()
        self.setup_logging()

    def setup_folders(self):
        """Create the tmp, input, logs and library folders if missing."""
        for folder in (self.dir_tmp, self.dir_input,
                       self.dir_logs, self.dir_library):
            folder.mkdir(exist_ok=True)

    def setup_logging(self):
        """Log at INFO level to ``<dir_logs>/<YYYY-MM-DD>.log``."""
        logfile = Path(self.dir_logs, datetime.now().strftime('%Y-%m-%d.log'))
        logging.basicConfig(
            filename=logfile,
            level=logging.INFO,
            format='%(asctime)s %(levelname)s %(module)s: %(message)s',
            datefmt='%H:%M:%S',
        )

    def process_folder(self):
        """Look up, download, tag and file every ``*.jpg`` under the input.

        Files with no match are moved to the library's orphan folder. Posts
        already in the database, and URLs without a post id, are skipped.
        Any exception aborts the run and propagates to the caller.
        """
        iqdb = Iqdb()
        library = Library(self.dir_library)
        metadata = Metadata(self.dir_tmp)
        db = Database()
        post_id_re = re.compile(r".*posts\/(\d{3,})")
        # Snapshot the listing: orphaned files are moved out of dir_input
        # while the loop is running.
        for filename in list(self.dir_input.rglob('*.jpg')):
            print("Process ", filename)
            url = iqdb.search(filename)
            if url is None:
                logging.warning("%s not found", filename)
                library.move_to_orphan(filename)
                continue

            m = post_id_re.search(url)
            if not m:
                continue
            post_id = int(m.group(1))
            if db.is_exists(post_id):
                logging.info("Skipping exists post %d", post_id)
                continue

            meta_result = metadata.process(url)
            if meta_result is None:
                continue
            image_path, tags = meta_result
            library.move(image_path, tags)
            db.add(post_id, tags.tags_string)
            time.sleep(5)  # pause between successfully processed posts


if __name__ == '__main__':
    PicSorter().process_folder()