chinese-wordle-tool/main.py

351 lines
14 KiB
Python
Raw Permalink Normal View History

2022-02-09 17:57:00 +08:00
import getopt
2022-02-10 10:42:26 +08:00
import json
2022-02-11 16:53:31 +08:00
import math
import os
import re
import sys
import traceback
2022-02-24 11:03:54 +08:00
from numpy import NaN
import requests, time
2022-02-11 16:53:31 +08:00
import pandas as pd
from flask import Flask, jsonify, render_template, request
2022-02-10 10:13:41 +08:00
from flask_cors import CORS
2022-02-11 16:53:31 +08:00
from pypinyin import Style, lazy_pinyin, style
2022-02-09 17:57:00 +08:00
2022-02-10 10:13:41 +08:00
app = Flask(__name__)
app.config['JSON_AS_ASCII'] = False
CORS(app)
2022-02-10 16:51:58 +08:00
TONE_TABLE = [
set(['ā', 'ē', 'ī', 'ō', 'ū', 'ǖ']),
set(['á', 'é', 'í', 'ó', 'ú', 'ǘ']),
set(['ǎ', 'ě', 'ǐ', 'ǒ', 'ǔ', 'ǚ']),
set(['à', 'è', 'ì', 'ò', 'ù', 'ǜ']),
2022-02-24 11:03:54 +08:00
set(['a', 'e', 'i', 'o', 'u', 'ü'])
2022-02-10 16:51:58 +08:00
]
2022-02-24 11:03:54 +08:00
TONE_MAP = {
'ā': 'a', 'ē': 'e', 'ī': 'i', 'ō': 'o', 'ū': 'u', 'ǖ': 'v',
'á': 'a', 'é': 'e', 'í': 'i', 'ó': 'o', 'ú': 'u', 'ǘ': 'v',
'ǎ': 'a', 'ě': 'e', 'ǐ': 'i', 'ǒ': 'o', 'ǔ': 'u', 'ǚ': 'v',
'à': 'a', 'è': 'e', 'ì': 'i', 'ò': 'o', 'ù': 'u', 'ǜ': 'v',
'a': 'a', 'e': 'e', 'i': 'i', 'o': 'o', 'u': 'u', 'ü': 'v'
}
INITIAL_WEIGHT = {"y":12789,"j":9523,"sh":9043,"zh":7735,"x":7656,"b":7649,"l":7240,"d":6712,"h":5992,"q":5711,"g":5614,"w":5475,"m":5111,"f":5059,"ch":4965,"t":4680,"r":3906,"z":3186,"s":3149,"c":2236,"k":2210,"":2142,"n":2079,"p":1866,"":549,"":1}
FINAL_WEIGHT = {"i":21826,"u":18074,"an":7550,"ing":5519,"ian":5083,"ou":4657,"en":4652,"eng":4648,"ong":4567,"ang":4460,"ao":4450,"e":4446,"in":3827,"ai":3721,"uan":3337,"uo":3279,"ei":3205,"a":3141,"ui":2936,"un":2366,"iao":2299,"iu":1937,"iang":1845,"ie":1786,"ue":1367,"o":1356,"ia":1231,"uang":1061,"er":1008,"ua":864,"":549,"v":417,"iong":381,"uai":373,"ve":59,"":1}
2022-02-10 16:51:58 +08:00
def tone_only(pinyin, **kwargs):
2022-02-24 11:03:54 +08:00
for i in range(5):
2022-02-10 16:51:58 +08:00
if len(set(pinyin).intersection(TONE_TABLE[i])) > 0:
return str(i + 1)
2022-02-24 11:03:54 +08:00
return pinyin
def pinyin_to_tone(pinyin, **kwargs):
pinyin = ''.join(filter(str.isdigit, pinyin))
if(len(pinyin) <= 0):
return '5'
if(pinyin[0] == '1'):
return '1'
elif (pinyin[0] == '2'):
return '2'
elif (pinyin[0] == '3'):
return '3'
elif (pinyin[0] == '4'):
return '4'
2022-02-10 16:51:58 +08:00
return '5'
2022-02-24 11:03:54 +08:00
def pinyin_only(pinyin, **kwargs):
for i in range(5):
if len(set(pinyin).intersection(TONE_TABLE[i])) > 0:
for s in TONE_TABLE[i]:
pinyin = pinyin.replace(s, TONE_MAP[s])
return pinyin
def merge_pinyin_tone(pinyin, tone):
plist = pinyin.split()
for i, t in enumerate(tone):
if(tone != '5'):
plist[i] = plist[i] + t
return ' '.join(plist)
2022-02-10 16:51:58 +08:00
style.register('TONE_ONLY', tone_only)
2022-02-09 17:57:00 +08:00
def trim_space(s):
return s.replace(' ', '').replace(',', '').replace('', '')
def remove_at(s, i):
return s[0:i] + s[i+1:]
def compute_pinyin(s, style=None):
if style is None:
return ' '.join(lazy_pinyin(s))
else:
return ' '.join(lazy_pinyin(s, style=style))
2022-02-10 17:01:28 +08:00
all_idiom = None
2022-02-24 11:03:54 +08:00
def query_word(word):
# {"tipsList":[{"fpindex":"andmkbblbeoojhcp","fpinyin":"bā miàn zhāng luo","fname":"八面张罗"}],"found":"true"}
ret = requests.get("https://cy.hwxnet.com/searchtips.do?wd=%s&timestamp=%d&qt=1" % (word,int(time.time() * 1000)))
time.sleep(0.1)
data = ret.json()
if len(data["tipsList"]) > 0:
return data["tipsList"][0]['fpinyin']
return ''
2022-02-10 17:01:28 +08:00
def init():
global all_idiom
2022-02-09 17:57:00 +08:00
if os.path.exists("all_idiom.csv"):
all_idiom = pd.read_csv('all_idiom.csv')
2022-02-24 11:26:05 +08:00
# all_idiom["pinyin_initials"].str.split(",").explode("splited").value_counts().to_json()
# all_idiom["pinyin_finals"].str.split(",").explode("splited").value_counts().to_json()
# four_idiom = all_idiom[all_idiom['word'].str.len() == 4]
# four_idiom["tone_len"] = four_idiom.apply(lambda x: len(set(x['pinyin_tone'])), axis=1)
# four_idiom = four_idiom[four_idiom['tone_len'] == 4]
# four_idiom["pinyin_tone"].value_counts().to_json()
2022-02-09 17:57:00 +08:00
else:
all_idiom = pd.read_json('idiom.json')
idiom_frequency = pd.read_csv('idiom_frequency.csv')
all_idiom = all_idiom.merge(idiom_frequency, how='outer', on='word')
all_idiom['frequency'] = all_idiom['frequency'].fillna(1).astype(int)
2022-02-24 11:03:54 +08:00
all_idiom['pinyin'] = all_idiom.apply(lambda x: compute_pinyin(x['word'], style=Style.TONE) if x['pinyin'] != NaN else x['pinyin'], axis=1)
all_idiom['pinyin_r'] = all_idiom.apply(lambda x: pinyin_only(x['pinyin']) if x['pinyin_r'] != NaN else x['pinyin_r'], axis=1)
2022-02-10 17:35:40 +08:00
all_idiom['pinyin_rt'] = all_idiom.apply(lambda x: int(''.join(map(lambda y: str(len(y)), re.split('[ ,]',x['pinyin_r'])))), axis=1)
2022-02-24 11:03:54 +08:00
all_idiom['pinyin_tone'] = all_idiom.apply(lambda x: ''.join([tone_only(y) for y in x['pinyin'].split()]), axis=1)
2022-02-10 17:12:31 +08:00
all_idiom['pinyin_initials' ] = all_idiom.apply(lambda x: ','.join(list(lazy_pinyin(x['word'], style=Style.INITIALS, strict=False))), axis=1)
all_idiom['pinyin_finals'] = all_idiom.apply(lambda x: ','.join(list(lazy_pinyin(x['word'], style=Style.FINALS, strict=False))), axis=1)
2022-02-09 17:57:00 +08:00
all_idiom.to_csv("all_idiom.csv")
2022-02-10 17:01:28 +08:00
2022-02-11 16:53:31 +08:00
def fetch_next_input(parameter):
parameter_rst = parameter.split(';', 1)
if len(parameter_rst) > 1:
parameter_rst = parameter_rst[1]
else:
parameter_rst = ''
parameter = parameter.split(';')[0]
return parameter,parameter_rst
def filter_group_mode1(parameter, group, hits):
for i in range(len(parameter)):
key0 = 'pinyin0_%d' % i
key1 = 'pinyin1_%d' % i
key2 = 'pinyin2_%d' % i
group[key0] = group.apply(lambda x: ''.join(set(trim_space(x['pinyin_r']))), axis=1)
group[key1] = group.apply(lambda x: ''.join(set(remove_at(trim_space(x['pinyin_r']), i))), axis=1)
group[key2] = group.apply(lambda x: trim_space(x['pinyin_r'])[i], axis=1)
includes = set()
for i, hit in enumerate(list(trim_space(hits))):
if hit == '2':
includes.add(parameter[i])
for i, hit in enumerate(list(trim_space(hits))):
key0 = 'pinyin0_%d' % i
key1 = 'pinyin1_%d' % i
key2 = 'pinyin2_%d' % i
target = parameter[i]
if hit == '0' and target not in includes:
group = group[~group[key0].str.contains(target, regex=False, case=True, na=False)]
elif hit == '1':
group = group[(group[key2] != target) & (group[key1].str.contains(target, regex=False, case=True, na=False))]
includes.add(target)
elif hit == '2':
group = group[group[key2] == target]
if len(group) <= 1:
break
return group
def filter_with_target_field(group, field_name, values, value_hits):
includes_value = set()
for i in range(4):
if value_hits[i] == '2':
includes_value.add(values[i])
for i in range(4):
if value_hits[i] == '0' and values[i] not in includes_value:
group = group[~group[field_name].str.contains(values[i], regex=False, case=True, na=False)]
elif value_hits[i] == '1':
group = group[(group[field_name].str[i] != values[i]) & group[field_name].str.contains(values[i], regex=False, case=True, na=False)]
includes_value.add(values[i])
elif value_hits[i] == '2':
group = group[group[field_name].str[i] == values[i]]
if len(group) <= 1:
break
return group
def filter_group_model2(parameter, group, hits, tones, tone_hits, word_hits):
group = filter_with_target_field(group, 'pinyin_tone', tones, tone_hits)
if len(group) <= 1:
return group
group = filter_with_target_field(group, 'word', parameter, word_hits)
if len(group) <= 1:
return group
hits = hits.split()
for i in range(4):
target = parameter[i]
targets = list()
targets.append(lazy_pinyin(target, style=Style.INITIALS, strict=False)[0])
targets.append(lazy_pinyin(target, style=Style.FINALS, strict=False)[0])
pinyin_hit = hits[i]
for j,name in enumerate(['initials', 'finals']):
key = 'pinyin_%s' % name
if pinyin_hit[j] == '0':
group = group[group[key].str.count('(^|[,])%s([,]|$)' + targets[j]) == 0]
elif pinyin_hit[j] == '1':
group = group[(group[key].str.count('(^|[,])%s([,]|$)' % targets[j]) > 0) & (group[key].str.count(('^(\w*,){%d}%s([,]|$)' % (i, targets[j]))) == 0)]
elif pinyin_hit[j] == '2':
group = group[group[key].str.count(('^(\w*,){%d}%s([,]|$)' % (i, targets[j]))) > 0]
if len(group) <= 1:
return group
return group
2022-02-10 17:01:28 +08:00
def filter_logic(mode, parameter):
global all_idiom
2022-02-09 17:57:00 +08:00
if mode == '0':
groups = all_idiom.groupby(by='pinyin_rt')
2022-02-10 17:35:40 +08:00
group = groups.get_group(int(parameter)).copy()
2022-02-10 10:13:41 +08:00
return all_idiom, group
2022-02-09 17:57:00 +08:00
elif mode == '1':
2022-02-11 16:53:31 +08:00
parameter, parameter_rst = fetch_next_input(parameter)
2022-02-09 17:57:00 +08:00
hits = parameter.split(',')[1]
parameter = trim_space(parameter.split(',')[0])
count = ''.join([str(len(x)) for x in hits.split()])
groups = all_idiom.groupby(by='pinyin_rt')
2022-02-10 17:35:40 +08:00
group = groups.get_group(int(count)).copy()
2022-02-09 17:57:00 +08:00
while(True):
group = filter_group_mode1(parameter, group, hits)
if(len(group) > 1 and len(parameter_rst) > 0):
2022-02-11 17:55:11 +08:00
parameter, parameter_rst = fetch_next_input(parameter_rst)
2022-02-11 16:53:31 +08:00
2022-02-09 17:57:00 +08:00
hits = parameter.split(',')[1]
parameter = trim_space(parameter.split(',')[0])
elif len(group) <= 0:
2022-02-10 10:13:41 +08:00
break
2022-02-09 17:57:00 +08:00
else:
break
2022-02-10 10:13:41 +08:00
return all_idiom, group
2022-02-09 17:57:00 +08:00
elif mode == '2':
2022-02-10 17:01:28 +08:00
four_idiom = all_idiom[all_idiom['word'].str.len() == 4]
group = four_idiom[four_idiom['pinyin_tone'].str.startswith(parameter)].copy()
return four_idiom, group
2022-02-09 17:57:00 +08:00
elif mode == '3':
2022-02-10 17:01:28 +08:00
four_idiom = all_idiom[all_idiom['word'].str.len() == 4]
2022-02-11 16:53:31 +08:00
parameter, parameter_rst = fetch_next_input(parameter)
2022-02-09 17:57:00 +08:00
hits = parameter.split(',')[1]
parameter = parameter.split(',')[0]
2022-02-11 16:53:31 +08:00
tones = ''.join(lazy_pinyin(parameter, style="TONE_ONLY"))
2022-02-09 17:57:00 +08:00
tone_hits=hits[-4:]
2022-02-10 09:36:32 +08:00
word_hits=hits[-9:-5]
hits=hits[:-10]
2022-02-09 17:57:00 +08:00
2022-02-10 17:01:28 +08:00
group = four_idiom.copy()
2022-02-09 17:57:00 +08:00
while(True):
2022-02-10 09:36:32 +08:00
group = filter_group_model2(parameter, group, hits, tones, tone_hits, word_hits)
2022-02-09 17:57:00 +08:00
if(len(group) > 1 and len(parameter_rst) > 0):
2022-02-11 17:55:11 +08:00
parameter, parameter_rst = fetch_next_input(parameter_rst)
2022-02-09 17:57:00 +08:00
hits = parameter.split(',')[1]
parameter = parameter.split(',')[0]
2022-02-11 16:53:31 +08:00
tones = ''.join(lazy_pinyin(parameter, style="TONE_ONLY"))
2022-02-09 17:57:00 +08:00
tone_hits=hits[-4:]
2022-02-10 09:36:32 +08:00
word_hits=hits[-9:-5]
hits=hits[:-10]
2022-02-09 17:57:00 +08:00
elif len(group) <= 0:
2022-02-10 10:13:41 +08:00
break
2022-02-09 17:57:00 +08:00
else:
break
2022-02-10 17:01:28 +08:00
return four_idiom, group
2022-02-11 09:19:18 +08:00
elif mode == '4':
four_idiom = all_idiom[all_idiom['word'].str.len() == 4]
group = four_idiom[four_idiom['word'].str.count(parameter) > 0].copy()
return four_idiom, group
elif mode == '5':
four_idiom = all_idiom[all_idiom['word'].str.len() == 4]
group = four_idiom[four_idiom['pinyin_r'].str.count(parameter) > 0].copy()
return four_idiom, group
2022-02-10 10:13:41 +08:00
return None, None
2022-02-09 17:57:00 +08:00
def print_max_group(all_idiom, group, num):
2022-02-10 10:13:41 +08:00
for item in get_max_group(all_idiom, group, num):
print(item)
2022-02-24 11:03:54 +08:00
def get_pinyin_weight(initials, finals):
initials = set(initials.split(','))
weight = sum([INITIAL_WEIGHT[w] for w in initials])
finals = set(finals.split(','))
weight += sum([FINAL_WEIGHT[w] for w in finals])
return weight
2022-02-10 10:13:41 +08:00
def get_max_group(all_idiom, group, num):
2022-02-11 09:04:52 +08:00
result = list()
if(len(group) == 0):
return result
2022-02-24 11:03:54 +08:00
group['pinyin_c'] = group.apply(lambda x: (math.log(x['frequency'], 2)/16 + 1) * get_pinyin_weight(x['pinyin_initials'], x['pinyin_finals']), axis=1)
2022-02-10 10:13:41 +08:00
ret_list = group.nlargest(num, ['pinyin_c', 'frequency']).index.tolist()
for i in ret_list:
2022-02-10 10:42:26 +08:00
result.append(json.loads(all_idiom.loc[i].to_json(orient = 'index',force_ascii=False)))
2022-02-10 10:13:41 +08:00
return result
2022-02-09 17:57:00 +08:00
2022-02-10 10:13:41 +08:00
@app.route('/predict', methods=['GET'])
def predict():
2022-02-11 16:53:31 +08:00
try:
mode = request.args.get('mode')
parameter = request.args.get('parameter')
if len(trim_space(parameter)) == 0:
return jsonify({'status': 1, 'message': '请输入参数', 'result': []})
all_idiom, group = filter_logic(mode, parameter)
result = get_max_group(all_idiom, group, 6)
return jsonify({'status': 0, 'message': 'success', 'result': result})
except:
traceback.print_exc()
return jsonify({'status': 2, 'message': '执行出错', 'result': []})
2022-02-10 10:13:41 +08:00
2022-02-10 15:00:59 +08:00
@app.route('/', methods=['GET'])
def index():
return render_template('index.html')
2022-02-11 16:53:31 +08:00
def main(argv):
# mode = '0'
# parameter = '3323'
# mode = '1'
# parameter = 'bai vvv vv vvv,012 000 00 000;bai tou er xin,012 010 10 001'
2022-02-24 11:03:54 +08:00
mode = '2'
2022-02-24 11:26:05 +08:00
parameter = '1243'
2022-02-24 11:03:54 +08:00
# mode = '3'
# parameter = '行之有效,00 11 10 00 0100 1101;各执一词,00 11 12 22 0010 1022'
2022-02-11 16:53:31 +08:00
num = 6
try:
opts, args = getopt.getopt(argv, "hm:p:n:", ["mode=", "parameter=", "num="])
except getopt.GetoptError:
print('test.py -m <mode> -p <parameter> -n <num>')
sys.exit(2)
for opt, arg in opts:
if opt == '-h':
print('test.py -i <mode> -o <parameter>')
sys.exit()
elif opt in ("-m", "--mode"):
mode = arg
elif opt in ("-p", "--parameter"):
parameter = arg
elif opt in ("-n", "--num"):
num = int(arg)
all_idiom, group = filter_logic(mode, parameter)
if all_idiom is not None and len(group) > 0:
print_max_group(all_idiom, group, num)
else:
print('未找到匹配项')
2022-02-09 17:57:00 +08:00
if __name__ == '__main__':
current_work_dir = os.path.dirname(__file__)
os.chdir(current_work_dir)
2022-02-10 17:01:28 +08:00
init()
2022-02-24 11:26:47 +08:00
# main(sys.argv[1:])
app.run(debug=True, host="0.0.0.0")