py匹配excel的手机号,然后将匹配到的创建时间放到匹配的表中

我爱海鲸 2026-02-28 17:51:20 暂无标签

简介excel、csv、xls、xlsx

# -*- coding: utf-8 -*-
"""
从 需要匹配的表.xlsx 读取脱敏手机号(业务号码列),
在 被匹配的表.xls 中匹配电话号码,取最后一个匹配的创建时间,
写入需要匹配的表的最后一列。
"""

import csv
import re
from pathlib import Path

import openpyxl

# ==================== 配置 ====================
BASE_DIR = Path(__file__).parent
COMMISSION_FILE = BASE_DIR / "需要匹配的表.xlsx"
VPS_FILE = BASE_DIR / "被匹配的表.xls"
# ==============================================


def masked_phone_to_pattern(masked: str) -> str | None:
    """
    将脱敏手机号转为匹配用的正则模式。
    例如: 138****1234 -> 138\\d{4}1234
    """
    if not masked or not isinstance(masked, str):
        return None
    s = str(masked).strip()
    if "*" not in s:
        return None
    pattern = ""
    i = 0
    while i < len(s):
        if s[i] == "*":
            n = 0
            while i < len(s) and s[i] == "*":
                n += 1
                i += 1
            pattern += r"\d{" + str(n) + "}"
        else:
            start = i
            while i < len(s) and s[i] != "*":
                i += 1
            pattern += re.escape(s[start:i])
    return pattern if pattern else None


def normalize_phone(phone) -> str:
    """将电话号码转为字符串,去除空格等"""
    if phone is None:
        return ""
    # Excel 可能将数字读成 13812341234.0
    if isinstance(phone, float) and phone == int(phone):
        phone = int(phone)
    s = str(phone).strip().replace(" ", "").replace("-", "")
    return s


def load_vps_data(vps_path: Path) -> list[dict]:
    """从 被匹配的表.xls 加载数据(支持 .xls 实为 CSV 的情况)"""
    if not vps_path.exists():
        raise FileNotFoundError(f"文件不存在: {vps_path}")

    # 尝试 CSV 编码
    for enc in ("utf-8-sig", "utf-8", "gbk", "gb2312"):
        try:
            with open(vps_path, "r", encoding=enc, newline="") as f:
                reader = csv.reader(f)
                header = next(reader)
                try:
                    phone_col = header.index("电话号码")
                    time_col = header.index("创建时间")
                except ValueError:
                    continue
                rows = []
                for row in reader:
                    if len(row) > max(phone_col, time_col):
                        phone = normalize_phone(row[phone_col])
                        create_time = row[time_col].strip() if row[time_col] else ""
                        rows.append({"phone": phone, "create_time": create_time})
                return rows
        except (UnicodeDecodeError, UnicodeError):
            continue
    raise ValueError("无法解析 vps 文件,请确认编码(utf-8/gbk)")


def find_last_match(phone_pattern: str, vps_rows: list[dict]) -> str | None:
    """
    在 vps 数据中按正则匹配电话号码,返回最后一个匹配的创建时间。
    """
    try:
        pat = re.compile(phone_pattern)
    except re.error:
        return None

    last_create_time = None
    for row in vps_rows:
        if pat.fullmatch(row["phone"]):
            last_create_time = row["create_time"]
    return last_create_time


def format_cell_value(val):
    """将创建时间转为可写入 Excel 的格式"""
    if val is None:
        return ""
    # vps 中创建时间通常已是字符串如 "2025-11-06 16:56:50"
    s = str(val).strip()
    return s if s else ""


def main():
    if not COMMISSION_FILE.exists():
        print(f"错误:文件不存在 {COMMISSION_FILE}")
        return 1
    if not VPS_FILE.exists():
        print(f"错误:文件不存在 {VPS_FILE}")
        return 1

    print("正在加载 被匹配的表.xls ...")
    vps_rows = load_vps_data(VPS_FILE)
    print(f"  vps 共 {len(vps_rows)} 行")

    print("正在加载 需要匹配的表.xlsx ...")
    wb = openpyxl.load_workbook(COMMISSION_FILE)
    ws = wb.active

    # 找业务号码列
    header = [ws.cell(1, c).value for c in range(1, ws.max_column + 1)]
    try:
        biz_col = header.index("业务号码") + 1
    except ValueError:
        print(f"错误:未找到列「业务号码」,表头: {header}")
        return 1

    last_col = ws.max_column
    new_col = last_col + 1
    ws.cell(1, new_col, "创建时间")  # 表头

    matched = 0
    not_matched = 0
    for row in range(2, ws.max_row + 1):
        masked = ws.cell(row, biz_col).value
        pattern = masked_phone_to_pattern(masked)
        if not pattern:
            ws.cell(row, new_col, "")
            not_matched += 1
            continue

        create_time = find_last_match(pattern, vps_rows)
        if create_time is not None:
            val = format_cell_value(create_time)
            ws.cell(row, new_col, val)
            matched += 1
        else:
            ws.cell(row, new_col, "")
            not_matched += 1

    out_path = COMMISSION_FILE
    wb.save(out_path)
    print(f"完成:匹配 {matched} 行,未匹配 {not_matched} 行")
    print(f"已保存到 {out_path},新列「创建时间」在第 {new_col} 列")
    return 0


if __name__ == "__main__":
    exit(main())

你好:我的2025