"""Recognise motor-vehicle sales invoices with Tencent Cloud OCR and export to Excel.

This module is the readable, final-state form of a two-commit patch that added
``motor_vehicle_invoice_identification.py`` and ``utils.py``.  The helper
functions that lived in ``utils.py`` (``process_zip``, ``get_base64_by_img``,
``get_tencent_secret``) are defined here directly, so the former
``from utils import *`` is no longer needed.
"""

import base64
import json
import os
import os.path
import shutil
import tempfile
import zipfile

import pandas as pd

# Bit 11 of a zip entry's general-purpose flags: the file name is UTF-8.
_ZIP_UTF8_FLAG = 0x800


def _decode_zip_name(info):
    """Return the human-readable name of a zip entry.

    ``zipfile`` decodes names as CP437 unless the UTF-8 flag is set.  Archives
    created on Chinese Windows store GBK bytes without that flag, so those
    names are re-decoded as GBK.  Names that are already UTF-8, or that cannot
    be converted, are returned unchanged instead of raising.
    """
    if info.flag_bits & _ZIP_UTF8_FLAG:
        return info.filename
    try:
        return info.filename.encode('cp437').decode('gbk')
    except (UnicodeEncodeError, UnicodeDecodeError):
        return info.filename


def process_zip(zip_path, target_path='./file'):
    """Extract a zip archive and return the path of its single top-level folder.

    Args:
        zip_path: Path of the zip file to extract.
        target_path: Directory the extracted folder is moved into.  It is
            created when missing.

    Returns:
        ``os.path.join(target_path, <top-level folder name>)``.

    Raises:
        ValueError: If an entry would be written outside the extraction
            directory, or if the archive does not contain exactly one
            top-level directory.
    """
    # A private temporary directory is removed automatically, even on error,
    # so a failed run cannot poison the next one with leftovers.
    with tempfile.TemporaryDirectory() as temp_dir:
        root = os.path.realpath(temp_dir)
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            for info in zip_ref.infolist():
                fixed_name = _decode_zip_name(info)
                fixed_path = os.path.realpath(os.path.join(root, fixed_name))
                # Refuse entries such as "../x" or "/abs/x" (zip-slip).
                if os.path.commonpath([root, fixed_path]) != root:
                    raise ValueError(f"zip文件包含非法路径: {fixed_name}")
                if info.is_dir():
                    os.makedirs(fixed_path, exist_ok=True)
                    continue
                # Archives are not required to contain explicit directory
                # entries, so make sure the parent exists before writing.
                os.makedirs(os.path.dirname(fixed_path), exist_ok=True)
                with zip_ref.open(info) as src, open(fixed_path, 'wb') as dst:
                    shutil.copyfileobj(src, dst)

        items = os.listdir(root)
        if len(items) != 1 or not os.path.isdir(os.path.join(root, items[0])):
            raise ValueError("zip文件格式有误")
        single_folder = items[0]

        os.makedirs(target_path, exist_ok=True)
        target_folder = os.path.join(target_path, single_folder)
        # shutil.move() would nest the folder inside an existing directory of
        # the same name; replace a stale copy from an interrupted run instead.
        if os.path.exists(target_folder):
            shutil.rmtree(target_folder)
        shutil.move(os.path.join(root, single_folder), target_folder)
    return target_folder


def get_base64_by_img(img_path):
    """Return the content of the file at ``img_path`` as a base64 string.

    Args:
        img_path: Path of the image file.
    """
    with open(img_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')


def get_tencent_secret(csv_path):
    """Read the API credentials from a key CSV file.

    The first data row of the ``SecretId`` and ``SecretKey`` columns is used.

    Args:
        csv_path: Path of the CSV file holding the key.

    Returns:
        ``[secret_id, secret_key]`` as stripped strings.

    Raises:
        OSError: If the file cannot be read.
        KeyError: If a required column is missing.
        IndexError: If the file has no data row.
        ValueError: If pandas cannot parse the file.
    """
    df = pd.read_csv(csv_path)
    secret_id = str(df["SecretId"].iloc[0]).strip()
    secret_key = str(df["SecretKey"].iloc[0]).strip()
    return [secret_id, secret_key]


class MotorVehicleInvoiceIdentificaion():
    """Batch and single-file recognition of motor-vehicle sales invoices."""

    def __init__(self):
        # Despite its name this attribute is the OCR service endpoint, not a
        # key; the name is kept for backward compatibility.
        self.api_key = "ocr.tencentcloudapi.com"

    def get_tencent_secret(self, csv_path):
        """Load ``[secret_id, secret_key]`` from ``csv_path``.

        Returns:
            The two-element list, or ``None`` when the file is missing,
            unreadable or lacks the expected columns/row.  Callers in this
            class test for ``None`` and print their own message.
        """
        try:
            return get_tencent_secret(csv_path)
        except (OSError, KeyError, IndexError, ValueError):
            return None

    @staticmethod
    def _invoice_to_row(invoice):
        """Map a recognised invoice object to one Excel row (label -> value)."""
        return {
            "购买方名称": invoice.Buyer,
            "发票号码": invoice.Number,
            "发票代码": invoice.Code,
            "纳税人识别号/统一社会信用代码/身份证号码": invoice.BuyerTaxID,
            "车辆类型": invoice.VehicleType,
            "厂牌型号": invoice.VehicleModel,
            "产地": invoice.Origin,
            "合格证号": invoice.CertificateNumber,
            "发动机号码": invoice.VehicleEngineCode,
            "车辆识别代号/车架号码": invoice.VIN,
            "价税合计": invoice.TotalCn,
            "价税合计(小写)": invoice.Total,
            # Seller *name*; previously filled from SellerAddress by mistake,
            # which duplicated the "地址" column.
            "销货单位名称": invoice.Seller,
            "电话": invoice.SellerTel,
            "纳税人识别号": invoice.SellerTaxID,
            "账号": invoice.SellerBankAccount,
            "地址": invoice.SellerAddress,
            "开户银行": invoice.SellerBank,
            "增值税税率或征收率": invoice.TaxRate,
            "增值税税额": invoice.Tax,
            # Either part may be None when it was not recognised.
            "主管税务机关及代码": (invoice.TaxAuthorities or "") + (invoice.TaxAuthoritiesCode or ""),
            "不含税价": invoice.PretaxAmount,
            "开票日期": invoice.Date,
            "开票人": invoice.Issuer,
            "备注": invoice.Remark
        }

    def get_img_info(self, img_path, secret_id, secret_key):
        """Send one invoice image to the OCR service and return its fields.

        Args:
            img_path: Path of the invoice image.
            secret_id: Tencent Cloud SecretId (obtainable from
                https://console.cloud.tencent.com/cam/capi).
            secret_key: Tencent Cloud SecretKey.

        Returns:
            A dict mapping the Chinese column labels to the recognised
            values, or ``None`` when the SDK reports an error, the first
            result item is not ``'OK'`` or it carries no motor-vehicle sales
            invoice.
        """
        # Imported here so the zip/Excel helpers stay usable when the Tencent
        # Cloud SDK is not installed.
        from tencentcloud.common import credential
        from tencentcloud.common.profile.client_profile import ClientProfile
        from tencentcloud.common.profile.http_profile import HttpProfile
        from tencentcloud.common.exception.tencent_cloud_sdk_exception import TencentCloudSDKException
        from tencentcloud.ocr.v20181119 import ocr_client, models

        try:
            cred = credential.Credential(secret_id, secret_key)
            httpProfile = HttpProfile()
            httpProfile.endpoint = self.api_key
            clientProfile = ClientProfile()
            clientProfile.httpProfile = httpProfile
            client = ocr_client.OcrClient(cred, "", clientProfile)
            req = models.RecognizeGeneralInvoiceRequest()
            params = {
                "ImageUrl": "",
                "Types": [12],
                "ImageBase64": get_base64_by_img(img_path)
            }
            req.from_json_string(json.dumps(params))
            resp = client.RecognizeGeneralInvoice(req)
        except TencentCloudSDKException as err:
            print(err)
            return None

        items = resp.MixedInvoiceItems or []
        if not items or items[0].Code != 'OK':
            return None
        infos = items[0].SingleInvoiceInfos
        invoice = infos.MotorVehicleSaleInvoice if infos is not None else None
        if invoice is None:
            return None
        return self._invoice_to_row(invoice)

    def copy_rename_invoices(self, source_dir, target_dir='./result/new_dir'):
        """Collect new-car invoice files into ``target_dir`` under numbered names.

        Every directory named "机动车发票" below ``source_dir`` is searched
        (non-recursively) for files whose name contains "新车发票".  Each match
        is copied to ``target_dir`` as ``新车发票_<n><ext>``, ``n`` starting at 1.
        Directories and files are visited in sorted order so the numbering is
        reproducible.

        Note: an existing ``target_dir`` is deleted and recreated.

        :param source_dir: Directory tree to search.
        :param target_dir: Directory receiving the copies.
        :return: List of the paths of the copied files.
        """
        if os.path.exists(target_dir):
            shutil.rmtree(target_dir)
        os.makedirs(target_dir, exist_ok=True)

        counter = 1
        new_file_paths = []

        for root, dirs, _files in os.walk(source_dir):
            dirs.sort()  # in-place sort makes os.walk descend deterministically
            if "机动车发票" not in dirs:
                continue
            motor_dir = os.path.join(root, "机动车发票")

            for filename in sorted(os.listdir(motor_dir)):
                file_path = os.path.join(motor_dir, filename)
                if not (os.path.isfile(file_path) and "新车发票" in filename):
                    continue

                _, ext = os.path.splitext(filename)
                new_filepath = os.path.join(target_dir, f"新车发票_{counter}{ext}")

                # copy2 keeps file metadata; a failed copy is reported and
                # skipped without consuming a number.
                try:
                    shutil.copy2(file_path, new_filepath)
                except OSError as e:
                    print(f"复制文件 {file_path} 失败: {str(e)}")
                else:
                    new_file_paths.append(new_filepath)
                    counter += 1
        return new_file_paths

    @staticmethod
    def _write_excel(rows, result_excel_file):
        """Write ``rows`` (a list of dicts) to ``result_excel_file``."""
        out_dir = os.path.dirname(result_excel_file)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        pd.DataFrame(rows).to_excel(result_excel_file, index=False)

    def main_exec(self, origin_zip_file, result_excel_file):
        """Recognise every new-car invoice inside a zip and write one Excel file.

        Args:
            origin_zip_file: Path of the zip archive to process.
            result_excel_file: Path of the Excel file to create; a file left
                over from a previous run is removed first.

        The credentials are read from ``SecretKey.csv`` in the current working
        directory.  The extracted folder is removed again when the method
        finishes, whether or not it succeeded.
        """
        if not os.path.exists(origin_zip_file):
            print("需要解压的zip文件不存在!")
            return
        if os.path.exists(result_excel_file):
            os.remove(result_excel_file)
        process_file_path = None
        try:
            process_file_path = process_zip(origin_zip_file)
            if not process_file_path:
                print("没有解压文件!")
                return
            # Key file: create a key at
            # https://console.cloud.tencent.com/cam/capi and download the CSV.
            secrets = self.get_tencent_secret("SecretKey.csv")
            if secrets is None or len(secrets) != 2:
                print("获取密钥有误,请检查密钥文件!")
                return
            new_invoice_path = self.copy_rename_invoices(process_file_path)
            result = []
            for count, new_path in enumerate(new_invoice_path, start=1):
                print(f"正在识别第{count}张文件!")
                result_info = self.get_img_info(new_path, secrets[0], secrets[1])
                if result_info is not None:
                    result.append(result_info)
            self._write_excel(result, result_excel_file)
        finally:
            if process_file_path:
                # Best effort: a cleanup failure must not hide the real error.
                shutil.rmtree(process_file_path, ignore_errors=True)

    def main_exec_single(self, origin_zip_file, result_excel_file):
        """Recognise a single invoice image and write it to an Excel file.

        Args:
            origin_zip_file: Path of the invoice *image* (the parameter name
                is historical and kept for compatibility).
            result_excel_file: Path of the Excel file to create; a file left
                over from a previous run is removed first.

        Nothing is written when recognition fails.
        """
        if not os.path.exists(origin_zip_file):
            print("需要识别的文件不存在!")
            return
        if os.path.exists(result_excel_file):
            os.remove(result_excel_file)
        # Key file: create a key at
        # https://console.cloud.tencent.com/cam/capi and download the CSV.
        secrets = self.get_tencent_secret("SecretKey.csv")
        if secrets is None or len(secrets) != 2:
            print("获取密钥有误,请检查密钥文件!")
            return
        result_info = self.get_img_info(origin_zip_file, secrets[0], secrets[1])
        if result_info is None:
            # Previously a row containing only None was written.
            print("识别失败,未生成结果文件!")
            return
        self._write_excel([result_info], result_excel_file)


# Correctly spelled alias; the original (misspelled) name stays available.
MotorVehicleInvoiceIdentification = MotorVehicleInvoiceIdentificaion


if __name__ == '__main__':
    # Relative paths below are resolved against this script's directory.
    base_dir = os.path.dirname(os.path.abspath(__file__))
    os.chdir(base_dir)
    mon = MotorVehicleInvoiceIdentificaion()
    # First argument: input path; second argument: Excel file to generate.
    mon.main_exec("file/样例.zip", "./result/output_result.xlsx")
    mon.main_exec_single("file/样例.jpg", "./result/output.xlsx")