出差时经常有打车费需要报销,到了填报销单的时候就很头疼。我用的是高德打车,他会将发票和行程分开后,多个行程开一张发票。

最后积攒多了数量会很多,如下图所示,文件数量看上去还好,但是具体到行程单会更多

公司要求一个行程一个行记录,所以每次填报销单都需要把行程找出来,然后再找这个行程对应的发票,多个行程可能对应一张发票,需要把发票号、税率等填写到对应的行程上,可以看到下图有42张行程,再加上填写区域比较狭窄,操作也不是很方便,所以填单就比较困难

所以我就想写个python脚本帮我自动识别下发票信息和行程信息,起码让我省去多个行程对应发票找发票号的过程,
这个脚本的思路就是根据pdf的后缀判断是行程单还是发票,然后分开处理,如果是行程单,就解析行程单中的多个行程,如果是发票,先优先查询一下有没有同名的xml后缀的文件,如果有就解析xml来获取发票号,如果没有就尝试解析Pdf来获取发票号,最终会讲结果保存到excel文件中,同时生成json文件(生成json是为了谷歌浏览器插件自动填单)
import pdfplumber
import re
import os
from pathlib import Path
import xml.etree.ElementTree as ET
# import xlwt
import openpyxl
def re_text(bt, text):
m1 = re.search(bt, text)
if m1:
return re_block(m1.group(1))
return ""
def re_block(text):
return re.sub(r"[ )):]", "", text)
# 读取发票
def read_invoice(workspace):
filenames = [file.name for file in os.scandir(workspace) if file.name.endswith(".pdf")]
row = 1
result = {}
for filename in filenames:
try:
file_path = os.path.join(workspace, filename)
if '发票' not in filename:
continue
filename_stem = Path(filename).stem
xml_file_path = file_path.replace(".pdf",".xml")
if os.path.exists(xml_file_path):
print("优先解析:%s.xml"%(filename_stem))
tree = ET.parse(xml_file_path)
root = tree.getroot()
invoice_number = root.find('.//TaxSupervisionInfo/InvoiceNumber').text
IssueTime = root.find('.//TaxSupervisionInfo/IssueTime').text
taxRate = root.find('.//IssuItemInformation/TaxRate').text
TotalTax_includedAmount = root.find('.//BasicInformation/TotalTax-includedAmount').text
data = {
"发票代码": '',
"发票号码": invoice_number,
"开票日期": IssueTime,
"校验码": '',
"公司": '',
"金额": TotalTax_includedAmount,
"税额": '',
"税率": taxRate
}
result[filename_stem] = (data)
else:
print("没找到【%s.xml】文件,继续解析【%s.pdf】"%(filename_stem, filename_stem))
with pdfplumber.open(file_path) as pdf:
first_page = pdf.pages[0]
pdf_text = first_page.extract_text()
# print(pdf_text)
# 获取发票信息
fapiaodaima = re_text(r'发票代码.*?(\d+)', pdf_text)
fapiaohaoma = re_text(r'发票号码.*?(\d+)', pdf_text)
kaipiaoriqi = re_text(r'开票日期(.*)', pdf_text)
jiaoyan = re_text(r'校 验 码\s*[::]\s*([a-zA-Z0-9 ]+)', pdf_text)
heji_match = re_text(r'合.*计(.*)', pdf_text)
# heji = re.split(r"¥|¥", heji_match) if heji_match else ["", ""]
heji = re.split(r"¥|¥", heji_match) if heji_match else ["", ""] #re.split(r" ", heji_match) #
jine = heji[1] if len(heji) > 1 else "".join(heji)
shuie = heji[2] if len(heji) > 2 else "".join(heji)
# 提取公司名称
company_match = re.findall(r'名.*?称\s*[::]\s*([一-龟]+)', pdf_text)
gongsi = re_block(company_match[-1]) if company_match else ""
data = {
"发票代码": fapiaodaima,
"发票号码": fapiaohaoma,
"开票日期": kaipiaoriqi,
"校验码": jiaoyan,
"公司": gongsi,
"金额": jine,
"税额": shuie,
"税率": '',
}
result[filename_stem] = (data)
except Exception as exc:
print(f"Error processing {filename}: {exc}")
return result
# 读取行程单
def read_trip(workspace):
filenames = [file.name for file in os.scandir(workspace) if file.name.endswith(".pdf")]
row = 1
result = []
for filename in filenames:
if '行程单' not in filename:
continue
filename_stem = Path(filename).stem
try:
with pdfplumber.open(os.path.join(workspace, filename)) as pdf:
first_page = pdf.pages[0]
pdf_tables = first_page.extract_tables()
print(filename)
for table in pdf_tables:
for row in table[1:]:
row_array = []
if '\n' in row[0]:
tmp = row[0].split('\n')
tmp1 = tmp[1].split(' ')
row_array = tmp1[0:7]
row_array.append(tmp[0]+tmp[2])
row_array.append(tmp1[7])
# print(row_array)
else:
row_array = re.split(r'\s+', row[0])
if row_array:
result_row = {
"日期": row_array[3],
"城市": row_array[5],
"起点": row_array[6],
"终点": row_array[7],
"金额": row_array[8],
"发票文件": filename_stem.replace("电子行程单", "电子发票")
}
result.append(result_row)
except Exception as exc:
print(f"Error processing {filename}: {exc}")
return result
def write(lst):
# 创建工作簿
wb = openpyxl.Workbook()
# 获取活动的工作表
ws = wb.active
# 设置工作表的标题
ws.title = 'Sheet 1'
for i, name in zip(range(12), ['发票代码', '发票号码', '开票日期', "校验码", '公司', '金额', '税额', '税率', '城市', '起点', '终点', '日期','发票文件']):
ws.cell(row=1, column=i+1, value=name)
# 写入Excel
for i, row_data in enumerate(lst, start=2):
for j, cell_value in enumerate(row_data, start=1):
ws.cell(row=i, column=j, value=cell_value)
wb.save('行程信息.xlsx')
workspace = '/home/gaojunxin/图片/发票'
invoice_info = read_invoice(workspace)
print("获取发票信息如下:\n %s"%(invoice_info))
# 行程信息获取
trip_info = read_trip(workspace)
print("行程信息获取如下:\n %s"%(trip_info))
array = []
for trip_item in trip_info:
filename_stem = trip_item['发票文件']
invoice_item = invoice_info[filename_stem]
item = [
invoice_item['发票代码'],
invoice_item['发票号码'],
invoice_item['开票日期'],
invoice_item['校验码'],
invoice_item['公司'],
trip_item['金额'],
invoice_item['税额'],
invoice_item['税率'],
trip_item['城市'],
trip_item['起点'],
trip_item['终点'],
trip_item['日期'],
filename_stem
]
array.append(item)
write(array)
执行日志
❯ python3 invoice.py
优先解析:及时用车-42.39元-4个行程高德打车电子发票.xml
优先解析:及时用车-24.40元-2个行程高德打车电子发票.xml
优先解析:及时用车-29.03元-1个行程高德打车电子发票.xml
优先解析:喜行约车-47.00元-4个行程高德打车电子发票.xml
没找到【添猫出行-24.91元-2个行程高德打车电子发票.xml】文件,继续解析【添猫出行-24.91元-2个行程高德打车电子发票.pdf】
优先解析:高德打车-12.12元-1个行程高德打车电子发票.xml
优先解析:365约车-21.05元-1个行程高德打车电子发票.xml
优先解析:神州专车-12.21元-1个行程高德打车电子发票.xml
优先解析:天津出行-25.96元-2个行程高德打车电子发票.xml
优先解析:神州专车-19.39元-1个行程高德打车电子发票.xml
优先解析:曹操出行-20.86元-2个行程高德打车电子发票.xml
优先解析:风韵出行-11.20元-1个行程高德打车电子发票.xml
没找到【鞍马出行-57.17元-4个行程高德打车电子发票.xml】文件,继续解析【鞍马出行-57.17元-4个行程高德打车电子发票.pdf】
优先解析:妥妥E行-11.49元-1个行程高德打车电子发票.xml
获取发票信息如下:
{'及时用车-42.39元-4个行程高德打车电子发票': {'发票代码': '', '发票号码': '24372000000253697200', '开票日期': '2024-11-26', '校验码': '', '公司': '', '金额': '42.39', '税额': '', '税率': '0.03'}, ...}
天津出行-25.96元-2个行程高德打车电子行程单.pdf
神州专车-19.39元-1个行程高德打车电子行程单.pdf
风韵出行-11.20元-1个行程高德打车电子行程单.pdf
曹操出行-20.86元-2个行程高德打车电子行程单.pdf
鞍马出行-57.17元-4个行程高德打车电子行程单.pdf
喜行约车-47.00元-4个行程高德打车电子行程单.pdf
及时用车-42.39元-4个行程高德打车电子行程单.pdf
及时用车-24.40元-2个行程高德打车电子行程单.pdf
及时用车-29.03元-1个行程高德打车电子行程单.pdf
添猫出行-24.91元-2个行程高德打车电子行程单.pdf
高德打车-12.12元-1个行程高德打车电子行程单.pdf
妥妥E行-11.49元-1个行程高德打车电子行程单.pdf
神州专车-12.21元-1个行程高德打车电子行程单.pdf
365约车-21.05元-1个行程高德打车电子行程单.pdf
行程信息获取如下:
[{'日期': '2024-10-08', '城市': '天津市', '起点': '起点xx', '终点': '终点xx', '金额': '11.49元', '发票文件': '天津出行-25.96元-2个行程高德打车电子发票'}, {'日期': '2024-10-27', '城市': '天津市', '起点': 'xxx(东进站口)西侧', '终点': '起点xx', '金额': '14.47元', '发票文件': '天津出行-25.96元-2个行程高德打车电子发票'}, ...]
生成的excel文件,可以基于此结果再做调整,会比手工处理方便很多
基于上面的操作,数据是整理出来了,但是感觉一个一个点击新增,再去输入信息也比较费劲,干脆一步到位再做个小扩展,开发了个谷歌浏览器插件来根据json自动填单
根据excel导出json的代码
import requests
import json
import os
import openpyxl
lst = []
workbook = openpyxl.load_workbook("行程信息.xlsx")
sheet = workbook.active
data = []
for row in sheet.iter_rows(min_row=2, values_only=True):
city = row[8]
trip_date = row[11].replace("-", "")
invoiceNumber = row[1]
invoiceCode = row[0]
amount = row[5]
tax = row[6]
note = row[9]+'到'+row[10]
item = {
"发票类型": "2040",
"日期": trip_date,
"地点": city,
"金额": str(amount),
"发票号码": invoiceNumber,
"发票代码": invoiceCode,
"税率编号": "01",
"备注": note,
}
data.append(item)
with open('data.json', "w") as fp:
json.dump(data, fp, ensure_ascii=False)
整体代码放在github上了:github,整个代码比较粗糙,就是为了年底填单量大,临时解决问题用的,没有太多的优化和润色,仅供参考