diff --git a/script.subtitles.zimukux/addon.xml b/script.subtitles.zimukux/addon.xml index 49803dd9b1..9fb951b2bf 100644 --- a/script.subtitles.zimukux/addon.xml +++ b/script.subtitles.zimukux/addon.xml @@ -1,5 +1,5 @@  - + @@ -20,8 +20,15 @@ resources/fanart.png -v0.2.1(2023/12/11) -- Merged https://github.com/pizzamx/zimuku_for_kodi/pull/23(thanks @jiangpengcheng for solving the cookie issue) +v0.3(2025/06/30) +- Hardcoded OCR mechanism, credits to @realaboo (and behind-the-scenes Gemini Code Assist) + +v0.2c(2024/10/28) +- Add new OCR vendor (Tencent), ref: https://console.cloud.tencent.com/ocr/overview + +v0.2b(2024/4/6) +- Need to resolve Captcha issue again +- Do change OCR service URL to your own, ref: https://cloud.baidu.com/doc/OCR/s/dk3iqnq51 v0.2.0(2023/5/30) - Merged https://github.com/pizzamx/zimuku_for_kodi/pull/18 (big thanks to lm317379829 for solving the captcha issue) diff --git a/script.subtitles.zimukux/resources/language/resource.language.en_GB/strings.po b/script.subtitles.zimukux/resources/language/resource.language.en_GB/strings.po index 1d191b1c47..8646eb193e 100644 --- a/script.subtitles.zimukux/resources/language/resource.language.en_GB/strings.po +++ b/script.subtitles.zimukux/resources/language/resource.language.en_GB/strings.po @@ -9,10 +9,6 @@ msgctxt "#30101" msgid "Site URL" msgstr "" -msgctxt "#301011" -msgid "OCR URL" -msgstr "OCR API" - msgctxt "#30200" msgid "Sub preference" msgstr "" diff --git a/script.subtitles.zimukux/resources/language/resource.language.zh_CN/strings.po b/script.subtitles.zimukux/resources/language/resource.language.zh_CN/strings.po index 46f9184fd1..9c9765737f 100644 --- a/script.subtitles.zimukux/resources/language/resource.language.zh_CN/strings.po +++ b/script.subtitles.zimukux/resources/language/resource.language.zh_CN/strings.po @@ -9,10 +9,6 @@ msgctxt "#30101" msgid "Site URL" msgstr "网址" -msgctxt "#301011" -msgid "OCR URL" -msgstr "OCR API 的地址(请勿随意修改)" - msgctxt "#30200" msgid "Sub preference" 
msgstr "字幕下载偏好" diff --git a/script.subtitles.zimukux/resources/lib/ocr.py b/script.subtitles.zimukux/resources/lib/ocr.py new file mode 100644 index 0000000000..fc7ce7b564 --- /dev/null +++ b/script.subtitles.zimukux/resources/lib/ocr.py @@ -0,0 +1,207 @@ +# -*- coding: utf-8 -*- +""" +Subtitle add-on for Kodi 19+ derived from https://github.com/taxigps/xbmc-addons-chinese/tree/master/service.subtitles.zimuku +Copyright (C) <2021> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +""" + +""" +A simple OCR script to recognize 5 digits from a Base64 encoded BMP image, +using only standard Python libraries. + +This script is designed to work in restricted environments like Kodi add-ons. +""" +import base64 +import struct +import sys +from typing import Tuple, List + + +class BmpOcr: + """ + Performs OCR on a specific 100x27 24-bit BMP image of 5 digits. + + The method uses template matching based on a few sample pixels + for each character. + """ + # Image properties + IMG_WIDTH = 100 + IMG_HEIGHT = 27 + CHAR_WIDTH = 20 + NUM_CHARS = 5 + + # BMP header constants + PIXEL_DATA_OFFSET = 54 # For 24-bit BMP without a color palette + + # OCR sampling points relative to the top-left of a 20x27 character box. + # These points are chosen to effectively distinguish between digits 0-9. 
+ # (x, y) + SAMPLE_POINTS = [ + (10, 7), # P0: Top-center + (7, 8), # P1: Top-left + (12, 8), # P2: Top-right + (10, 13), # P3: Center + (7, 19), # P4: Bottom-left + (12, 19), # P5: Bottom-right + (10, 20), # P6: Bottom-center + (6, 13), # P7: Middle-left + (14, 13) # P8: Middle-right + ] + + # Pre-defined feature vectors for digits 0-9. + # 1 represents a foreground (dark) pixel, 0 represents a background (light) pixel. + # The vectors for '1', '2', '6', '7', '8' are derived from the sample image. + # The rest are crafted based on a standard digital font shape. + DIGIT_TEMPLATES = { + '0': [1, 1, 1, 1, 1, 1, 1, 1, 0],# + '1': [0, 1, 0, 0, 0, 0, 1, 0, 0],# + '2': [1, 0, 1, 0, 1, 0, 1, 0, 0],# + '3': [1, 0, 1, 1, 0, 1, 1, 0, 0],# + '4': [0, 0, 1, 0, 0, 1, 0, 0, 0],# + '5': [1, 1, 0, 0, 0, 1, 1, 0, 0],# + '6': [1, 0, 1, 1, 1, 1, 1, 1, 0],# + '7': [1, 0, 1, 0, 0, 0, 0, 0, 0],# + '8': [1, 1, 1, 1, 1, 1, 1, 0, 0],# + '9': [1, 1, 1, 0, 1, 0, 1, 0, 0],# + } + + def __init__(self, b64_string: str): + """ + Initializes the OCR with a Base64 encoded BMP string. + """ + try: + self.image_data = base64.b64decode(b64_string) + except (ValueError, TypeError): + raise ValueError("Invalid Base64 string provided.") + + # Basic validation of the BMP header + if len(self.image_data) < self.PIXEL_DATA_OFFSET or self.image_data[0:2] != b'BM': + raise ValueError("Data is not a valid BMP.") + + width = struct.unpack_from(' Tuple[int, int, int]: + """ + Gets the (B, G, R) color tuple for a pixel at (x, y). + Handles the bottom-up row order of BMP files. + """ + # BMP rows are stored bottom-up + bmp_y = self.IMG_HEIGHT - 1 - y + offset = self.PIXEL_DATA_OFFSET + (bmp_y * self.row_stride) + (x * 3) + b, g, r = self.image_data[offset:offset + 3] + return b, g, r + + def _is_foreground(self, x: int, y: int, threshold: int = 70) -> bool: + """ + Determines if a pixel is part of the foreground (a digit). + The digits are dark gray/black, background is light gray. 
+ """ + b, g, r = self._get_pixel(x, y) + # Use average brightness to determine if it's foreground + return (r + g + b) / 3 < threshold + + def _match_digit(self, feature_vector: List[int]) -> str: + """ + Finds the best matching digit for a given feature vector. + It calculates the Hamming distance between the input vector and each template. + """ + min_diff = float('inf') + found_digit = '?' + + for digit_char, template_vector in self.DIGIT_TEMPLATES.items(): + # Calculate Hamming distance (number of differing bits) + diff = sum(v1 != v2 for v1, v2 in zip(feature_vector, template_vector)) + + if diff < min_diff: + min_diff = diff + found_digit = digit_char + + # Perfect match, no need to check further + if min_diff == 0: + break + + return found_digit + + def recognize(self) -> str: + """ + Recognizes all 5 digits in the image and returns them as a string. + """ + result = [] + one_offset = 0 + for i in range(self.NUM_CHARS): + char_x_offset = i * self.CHAR_WIDTH + + # Generate the feature vector for the current character + feature_vector = [ + 1 if self._is_foreground(char_x_offset + px - one_offset, py) else 0 + for px, py in self.SAMPLE_POINTS + ] + print(feature_vector) + + # Find the best match for the vector + recognized_char = self._match_digit(feature_vector) + if recognized_char == '1': + one_offset += 1 + elif recognized_char == '4': + one_offset -= 1 + + result.append(recognized_char) + + return "".join(result) + + +def main(): + """ + Main function to run the OCR process. 
+ """ + + # parse file path from first argument + if len(sys.argv) < 2: + b64_file_path = 'base64.txt' + else: + b64_file_path = sys.argv[1] + + try: + with open(b64_file_path, 'r') as f: + b64_data = f.read().strip() + except FileNotFoundError: + print(f"Error: The file '{b64_file_path}' was not found.", file=sys.stderr) + sys.exit(1) + + if not b64_data: + print(f"Error: The file '{b64_file_path}' is empty.", file=sys.stderr) + sys.exit(1) + + try: + ocr = BmpOcr(b64_data) + result = ocr.recognize() + print(f"Input {b64_file_path}") + print(f"Recognized digits: {result}") + except ValueError as e: + print(f"An error occurred: {e}", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/script.subtitles.zimukux/resources/lib/sub_provider_service.py b/script.subtitles.zimukux/resources/lib/sub_provider_service.py index 5673382ddc..7ab18fe214 100644 --- a/script.subtitles.zimukux/resources/lib/sub_provider_service.py +++ b/script.subtitles.zimukux/resources/lib/sub_provider_service.py @@ -229,11 +229,10 @@ def run(): else __addon__.getSetting("proxy_server")) os.environ["HTTP_PROXY"] = os.environ["HTTPS_PROXY"] = proxy - ocrUrl= __addon__.getSetting("ocr_url") # 查询 agent = zmkagnt.Zimuku_Agent(zimuku_base_url, __temp__, logger, Unpacker(), - {'subtype': tpe, 'sublang': lang}, ocrUrl) + {'subtype': tpe, 'sublang': lang}) handle_params(params) xbmcplugin.endOfDirectory(int(sys.argv[1])) diff --git a/script.subtitles.zimukux/resources/lib/zimuku_agent.py b/script.subtitles.zimukux/resources/lib/zimuku_agent.py index 9e37e37dab..4d6bcf6f74 100644 --- a/script.subtitles.zimukux/resources/lib/zimuku_agent.py +++ b/script.subtitles.zimukux/resources/lib/zimuku_agent.py @@ -21,15 +21,14 @@ import os import sys import time -import json -import base64 import urllib import requests from bs4 import BeautifulSoup +from ocr import BmpOcr class Zimuku_Agent: - def __init__(self, base_url, dl_location, logger, unpacker, 
settings, ocrUrl=''): + def __init__(self, base_url, dl_location, logger, unpacker, settings): self.ua = 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)' self.ZIMUKU_BASE = base_url # self.ZIMUKU_API = '%s/search?q=%%s&vertoken=%%s' % base_url @@ -43,7 +42,6 @@ def __init__(self, base_url, dl_location, logger, unpacker, settings, ocrUrl='') self.plugin_settings = settings self.session = requests.Session() self.vertoken = '' - self.ocrUrl = ocrUrl # 一次性调用,获取必需的cookies,验证机制可能之后会变 self.init_site() @@ -92,6 +90,9 @@ def get_page(self, url, **kwargs): s.get(url, headers=request_headers) http_response = s.get(url, headers=request_headers) """ + if 'class="verifyimg"' in str(http_response.content): + self.verify(url) + http_response = s.get(url, headers=request_headers) headers = http_response.headers http_body = http_response.content @@ -101,38 +102,30 @@ def get_page(self, url, **kwargs): return headers, http_body - def verify(self, url, append): + def verify(self, url): headers = None http_body = None - s = self.session + session = self.session try: request_headers = {'User-Agent': self.ua} a = requests.adapters.HTTPAdapter(max_retries=3) - s.mount('https://', a) + session.mount('https://', a) self.logger.log(sys._getframe().f_code.co_name, '[CHALLENGE VERI-CODE] requests GET [%s]' % (url), level=3) - http_response = s.get(url, headers=request_headers) + http_response = session.get(url, headers=request_headers) if http_response.status_code != 200: soup = BeautifulSoup(http_response.content, 'html.parser') - content = soup.find_all(attrs={'class': 'verifyimg'})[ + imgSrc = soup.find_all(attrs={'class': 'verifyimg'})[ 0].get('src') - if content is not None: + if imgSrc is not None: + base64 = imgSrc.split('data:image/bmp;base64,')[1] # 处理编码 - ocrurl = self.ocrUrl - payload = {'imgdata': content} - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.54 Safari/537.36' - } - 
response = requests.request( - "POST", ocrurl, headers=headers, json=payload) - result_json = json.loads(response.text) - text = '' - if result_json['code'] == 1: - text = result_json['result'] + ocr = BmpOcr(base64) + text = ocr.recognize() str1 = '' i = 0 for ch in text: @@ -141,13 +134,12 @@ def verify(self, url, append): else: str1 += hex(ord(text[i])) i = i + 1 - # 使用带验证码的访问 + sep_char = '&' if '?' in url else '?' get_cookie_url = '%s%s&%s' % ( - url, append, 'security_verify_img=' + str1.replace('0x', '')) - http_response = s.get( + url, sep_char, 'security_verify_img=' + str1.replace('0x', '')) + http_response = session.get( get_cookie_url, headers=request_headers) - a = 1 except Exception as e: self.logger.log(sys._getframe().f_code.co_name, @@ -288,13 +280,13 @@ def search(self, title, items): # self.get_page(get_cookie_url) # 处理验证码逻辑 - # self.verify(url, '&chost=zimuku.org') + #self.verify(url, '&chost=zimuku.org') # 真正的搜索 self.logger.log(sys._getframe().f_code.co_name, "Search API url: %s" % (url)) - url += '&chost=zimuku.org' + #url += '&chost=zimuku.org' _, data = self.get_page(url) soup = BeautifulSoup(data, 'html.parser') except Exception as e: @@ -302,8 +294,10 @@ def search(self, title, items): (Exception, e), level=3) return [] - s_e = 'S%02dE%02d' % (int(items['season']), int(items['episode']) - ) if items['season'] != '' and items['episode'] != '' else 'N/A' + s_e = s_e_CN = 'N/A' + if items['season'] != '' and items['episode'] != '': + s_e = 'S%02dE%02d' % (int(items['season']), int(items['episode'])) + s_e_CN = '第%d季第%d集' % (int(items['season']), int(items['episode'])) if s_e != 'N/A': # 1. 
从搜索结果中看看是否能直接找到 sub_list = soup.find_all('tr') @@ -311,7 +305,7 @@ def search(self, title, items): s_e, [ep.a.text for ep in sub_list])) for sub in reversed(sub_list): sub_name = sub.a.text - if s_e in sub_name.upper(): + if s_e in sub_name.upper() or s_e_CN in sub_name: subtitle_list.append(self.extract_sub_info(sub, 1)) # break 还是全列出来吧 @@ -362,8 +356,8 @@ def search(self, title, items): subtitle = self.extract_sub_info(sub, 2) unfiltered_sub_list.append(subtitle) sub_name = sub.a.text - if s_e in sub_name.upper(): - subtitle_list.append(subtitle) + if s_e in sub_name.upper() or s_e_CN in sub_name: + subtitle_list.append(self.extract_sub_info(sub, 2)) # 如果匹配到了季,那就得返回了,没有就是没有 # 如果没有匹配到,可能整季度的字幕被打包到一个文件中了,那就把所有的结果都返回让用户自己选择 if len(subtitle_list) > 0: @@ -511,7 +505,7 @@ def download(self, url): ".gz", ".xz", ".iso", ".tgz", ".tbz2", ".cbr") try: # 处理验证码逻辑 - # self.verify(url, '?') + #self.verify(url, '?') # Subtitle detail page. headers, data = self.get_page(url) @@ -522,7 +516,7 @@ def download(self, url): url = urllib.parse.urljoin(self.ZIMUKU_BASE, url) # 处理验证码逻辑 - # self.verify(url, '?') + #self.verify(url, '&chost=zimuku.org') # Subtitle download-list page. headers, data = self.get_page(url) @@ -641,7 +635,7 @@ def download_links(self, links, referer): "DOWNLOAD SUBTITLE: %s" % (url)) # 处理验证码逻辑 - # self.verify(url, '?') + #self.verify(url, '?') # Download subtitle one by one until success. headers, data = self.get_page(url, Referer=referer) diff --git a/script.subtitles.zimukux/resources/settings.xml b/script.subtitles.zimukux/resources/settings.xml index bc64d3ad85..214d6f8e32 100644 --- a/script.subtitles.zimukux/resources/settings.xml +++ b/script.subtitles.zimukux/resources/settings.xml @@ -13,16 +13,6 @@ false - - - 0 - - 301011 - - - - false -