# -*- coding: utf-8 -*-
"""
诊断 national_sites.db 里 Location 字段的"市"级归属情况

策略：
    1. 从 admin_division.db 读入每个省级行政区的"市级名"与"县级名"清单
    2. 对 national_sites.db 每条记录：
       - 先去掉 Province 前缀得到 tail
       - 若 tail 以本省某市级名（市/州/盟/地区）开头 → 分类 A：已含市
       - 否则尝试用本省所有县级名（长度倒序）匹配 tail 开头：
           - 匹配成功 → 分类 B：命中县，可回填市
           - 匹配失败 → 分类 C：无法自动识别，需人工
    3. 输出三类统计；分类 C 单独写入 CSV 供人工核对

用法：
    python 诊断Location匹配.py

只读，不修改任何数据库；输出到同目录 Location诊断_未命中.csv
"""

import csv
import os
import re
import sqlite3

NS_DB = r'd:\BYP\Project\BYP\BYPsNotes\db\national_sites.db'
AD_DB = r'd:\BYP\Project\BYP\BYPsNotes\db\admin_division.db'
OUT_CSV = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    'Location诊断_未命中.csv'
)

# 省名统一（处理数据库里的变体/错别字）
PROVINCE_ALIAS = {
    '新疆维吾尔自区': '新疆维吾尔自治区',
}


def load_division(ad_db):
    """读行政区划库，按省名组织

    返回:
        provinces: {省名: {
            'code': 省code,
            'cities': [市名按长度倒序],
            'city_code': {市名: 市code},
            'counties': [县名按长度倒序],
            'county_info': {县名: (县code, 父级市名 or 省名, 父级code)},
        }}
    """
    conn = sqlite3.connect(ad_db)
    conn.row_factory = sqlite3.Row
    cur = conn.cursor()

    provinces = {}

    # 1. 省级
    cur.execute('SELECT code, name FROM administrative_division WHERE level = 1')
    for row in cur.fetchall():
        provinces[row['name']] = {
            'code': row['code'],
            'cities': [],
            'city_code': {},
            'counties': [],
            'county_info': {},
        }

    # 2. 市级：按 parent_code 找所属省
    cur.execute('SELECT code, name, parent_code FROM administrative_division WHERE level = 2')
    city_to_prov = {}  # 市code → 省名
    for row in cur.fetchall():
        # 反查父级省名
        cur2 = conn.cursor()
        cur2.execute('SELECT name FROM administrative_division WHERE code = ?', (row['parent_code'],))
        pr = cur2.fetchone()
        if not pr:
            continue
        prov_name = pr['name']
        if prov_name not in provinces:
            continue
        provinces[prov_name]['cities'].append(row['name'])
        provinces[prov_name]['city_code'][row['name']] = row['code']
        city_to_prov[row['code']] = prov_name

    # 3. 县级：parent 可能是市（含地级市/州/盟），也可能是省（直辖市下区、省直辖县级市）
    cur.execute('SELECT code, name, parent_code FROM administrative_division WHERE level = 3')
    for row in cur.fetchall():
        pcode = row['parent_code']
        if pcode in city_to_prov:
            prov_name = city_to_prov[pcode]
            cur2 = conn.cursor()
            cur2.execute('SELECT name FROM administrative_division WHERE code = ?', (pcode,))
            parent_name = cur2.fetchone()['name']
        else:
            # parent 是省
            cur2 = conn.cursor()
            cur2.execute('SELECT name FROM administrative_division WHERE code = ?', (pcode,))
            pr = cur2.fetchone()
            if not pr:
                continue
            prov_name = pr['name']
            parent_name = prov_name  # 无市级，市字段留空或等于省名
        if prov_name not in provinces:
            continue
        provinces[prov_name]['counties'].append(row['name'])
        provinces[prov_name]['county_info'][row['name']] = (row['code'], parent_name, pcode)

    conn.close()

    # 按长度倒序，确保"鄂伦春自治旗"先于"春旗"之类的短匹配
    for p in provinces.values():
        p['cities'].sort(key=lambda x: -len(x))
        p['counties'].sort(key=lambda x: -len(x))

    return provinces


def strip_province(location, province):
    """去掉 Location 开头的省名前缀（兼容别名/短名）"""
    if not location:
        return ''
    loc = location.strip()
    candidates = [province]
    # 简短变体
    simple = {
        '内蒙古自治区': ['内蒙古'],
        '广西壮族自治区': ['广西'],
        '西藏自治区': ['西藏'],
        '宁夏回族自治区': ['宁夏'],
        '新疆维吾尔自治区': ['新疆'],
    }
    candidates.extend(simple.get(province, []))
    # 按长度倒序
    candidates.sort(key=lambda x: -len(x))
    for c in candidates:
        if loc.startswith(c):
            return loc[len(c):]
    return loc


def diagnose_one(location, province, provinces):
    """对单条记录判定其 Location 归属

    返回 (category, hit_name, city_name):
        category: 'A'已含市, 'B'命中县可回填, 'C'未命中
        hit_name: 匹配到的市名或县名
        city_name: 归属的市名（分类C时为空）
    """
    # 省名标准化
    prov = PROVINCE_ALIAS.get(province, province)
    if prov not in provinces:
        return 'C', '', ''

    p = provinces[prov]
    tail = strip_province(location, prov)
    if not tail:
        return 'C', '', ''

    # 分类 A：以某个市级名开头
    for city in p['cities']:
        if tail.startswith(city):
            return 'A', city, city

    # 分类 B：以某个县级名开头
    for county in p['counties']:
        if tail.startswith(county):
            _, parent_name, _ = p['county_info'][county]
            # parent_name 可能是市名，也可能是省名（直辖市的区）
            city_name = parent_name if parent_name != prov else prov
            return 'B', county, city_name

    return 'C', '', tail[:12]  # 返回前 12 字供核对


def main():
    print(f'加载行政区划库：{AD_DB}')
    provinces = load_division(AD_DB)
    print(f'  覆盖省级 {len(provinces)} 个')
    sample = next(iter(provinces.values()))
    print(f'  示例：某省市级 {len(sample["cities"])}，县级 {len(sample["counties"])}\n')

    print(f'扫描国保库：{NS_DB}')
    conn = sqlite3.connect(NS_DB)
    conn.row_factory = sqlite3.Row
    cur = conn.cursor()
    cur.execute('''
        SELECT Id, NameC, Province, Location, Serial
        FROM NationalHistoricalandCulturalSites
        ORDER BY Id ASC
    ''')
    records = cur.fetchall()
    conn.close()
    print(f'  共 {len(records)} 条\n')

    stats_total = {'A': 0, 'B': 0, 'C': 0}
    stats_by_batch = {}  # {batch: {'A':n,'B':n,'C':n}}
    unmatched = []

    for r in records:
        cat, hit, city = diagnose_one(r['Location'] or '', r['Province'] or '', provinces)
        stats_total[cat] += 1
        b = r['Serial'] or '（未知）'
        stats_by_batch.setdefault(b, {'A': 0, 'B': 0, 'C': 0})[cat] += 1
        if cat == 'C':
            unmatched.append({
                'Id': r['Id'],
                'Serial': r['Serial'],
                'Province': r['Province'],
                'NameC': r['NameC'],
                'Location': r['Location'],
                'Tail': hit,  # 去省名后的前 12 字
            })

    total = len(records)
    print('=' * 70)
    print('总体分类统计')
    print('=' * 70)
    print(f'  A 已含市      : {stats_total["A"]:>5}  ({stats_total["A"]/total*100:>5.1f}%)')
    print(f'  B 命中县(可回填): {stats_total["B"]:>5}  ({stats_total["B"]/total*100:>5.1f}%)')
    print(f'  C 未命中(需人工): {stats_total["C"]:>5}  ({stats_total["C"]/total*100:>5.1f}%)')
    print(f'  合计          : {total:>5}')

    print()
    print('=' * 70)
    print('分批次统计')
    print('=' * 70)
    batch_order = {'第一批': 1, '第二批': 2, '第三批': 3, '第四批': 4,
                   '第五批': 5, '第六批': 6, '第七批': 7, '第八批': 8}
    print(f'  {"批次":<8}{"A已含市":>10}{"B命中县":>10}{"C未命中":>10}{"合计":>8}')
    print('  ' + '-' * 46)
    for b in sorted(stats_by_batch.keys(), key=lambda x: batch_order.get(x, 999)):
        s = stats_by_batch[b]
        total_b = s['A'] + s['B'] + s['C']
        print(f'  {b:<8}{s["A"]:>10}{s["B"]:>10}{s["C"]:>10}{total_b:>8}')

    # 未命中清单写 CSV
    if unmatched:
        with open(OUT_CSV, 'w', encoding='utf-8-sig', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=['Id', 'Serial', 'Province', 'NameC', 'Location', 'Tail'])
            writer.writeheader()
            writer.writerows(unmatched)
        print(f'\n未命中清单已写入：{OUT_CSV}')
        print(f'（共 {len(unmatched)} 条，供人工核对）')

        # 额外提示：未命中清单中按省份再分组
        by_prov = {}
        for u in unmatched:
            by_prov.setdefault(u['Province'], 0)
            by_prov[u['Province']] += 1
        print('\n未命中按省份分布（Top 10）：')
        for prov, cnt in sorted(by_prov.items(), key=lambda x: -x[1])[:10]:
            print(f'  {prov:<14} {cnt}')

    print('\n完成。')


if __name__ == '__main__':
    main()
