添加词频调序输出
This commit is contained in:
parent
5d7318e5f8
commit
e818c25acf
|
@ -0,0 +1,7 @@
|
||||||
|
# 拼音猜成语工具
|
||||||
|
## 说明
|
||||||
|
模式0:输入拼音字数,提供候选
|
||||||
|
模式1:输入猜测历史,提供筛选结果
|
||||||
|
|
||||||
|
## 引用
|
||||||
|
[成语数据库来源](https://github.com/crazywhalecc/idiom-database)
|
File diff suppressed because it is too large
Load Diff
33
stat.py
33
stat.py
|
@ -2,6 +2,7 @@ import sys
|
||||||
import getopt
|
import getopt
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import re
|
import re
|
||||||
|
import math
|
||||||
|
|
||||||
|
|
||||||
def trim_space(s):
|
def trim_space(s):
|
||||||
|
@ -15,10 +16,11 @@ def main(argv):
|
||||||
parameter = '3323'
|
parameter = '3323'
|
||||||
mode = '1'
|
mode = '1'
|
||||||
parameter = 'bai vvv vv vvv,012 000 00 000;bai tou er xin,012 010 10 001'
|
parameter = 'bai vvv vv vvv,012 000 00 000;bai tou er xin,012 010 10 001'
|
||||||
|
num = 3
|
||||||
try:
|
try:
|
||||||
opts, args = getopt.getopt(argv, "hm:p:", ["mode=", "parameter="])
|
opts, args = getopt.getopt(argv, "hm:p:n:", ["mode=", "parameter=", "num="])
|
||||||
except getopt.GetoptError:
|
except getopt.GetoptError:
|
||||||
print('test.py -m <mode> -p <parameter>')
|
print('test.py -m <mode> -p <parameter> -n <num>')
|
||||||
sys.exit(2)
|
sys.exit(2)
|
||||||
for opt, arg in opts:
|
for opt, arg in opts:
|
||||||
if opt == '-h':
|
if opt == '-h':
|
||||||
|
@ -28,13 +30,17 @@ def main(argv):
|
||||||
mode = arg
|
mode = arg
|
||||||
elif opt in ("-p", "--parameter"):
|
elif opt in ("-p", "--parameter"):
|
||||||
parameter = arg
|
parameter = arg
|
||||||
df = pd.read_json('idiom.json')
|
elif opt in ("-n", "--num"):
|
||||||
|
num = int(arg)
|
||||||
|
all_idiom = pd.read_json('idiom.json')
|
||||||
|
idiom_frequency = pd.read_csv('idiom_frequency.csv')
|
||||||
|
all_idiom = all_idiom.merge(idiom_frequency, how='left', on='word')
|
||||||
|
all_idiom['frequency'] = all_idiom['frequency'].fillna(1).astype(int)
|
||||||
if mode == '0':
|
if mode == '0':
|
||||||
df['pinyin_rt'] = df.apply(lambda x: ''.join(map(lambda y: str(len(y)), re.split('[ ,,]',x['pinyin_r']))), axis=1)
|
all_idiom['pinyin_rt'] = all_idiom.apply(lambda x: ''.join(map(lambda y: str(len(y)), re.split('[ ,,]',x['pinyin_r']))), axis=1)
|
||||||
groups = df.groupby(by='pinyin_rt')
|
groups = all_idiom.groupby(by='pinyin_rt')
|
||||||
group = groups.get_group(parameter).copy()
|
group = groups.get_group(parameter).copy()
|
||||||
group['pinyin_c'] = group.apply(lambda x: len(set(trim_space(x['pinyin_r']))), axis=1)
|
print_max_group(all_idiom, group, num)
|
||||||
print(df.loc[group['pinyin_c'].idxmax()])
|
|
||||||
elif mode == '1':
|
elif mode == '1':
|
||||||
parameter_rst = parameter.split(';', 1)
|
parameter_rst = parameter.split(';', 1)
|
||||||
if len(parameter_rst) > 1:
|
if len(parameter_rst) > 1:
|
||||||
|
@ -47,8 +53,8 @@ def main(argv):
|
||||||
parameter = trim_space(parameter.split(',')[0])
|
parameter = trim_space(parameter.split(',')[0])
|
||||||
|
|
||||||
count = ''.join([str(len(x)) for x in hits.split()])
|
count = ''.join([str(len(x)) for x in hits.split()])
|
||||||
df['pinyin_rt'] = df.apply(lambda x: ''.join(map(lambda y: str(len(y)), re.split('[ ,,]',x['pinyin_r']))), axis=1)
|
all_idiom['pinyin_rt'] = all_idiom.apply(lambda x: ''.join(map(lambda y: str(len(y)), re.split('[ ,,]',x['pinyin_r']))), axis=1)
|
||||||
groups = df.groupby(by='pinyin_rt')
|
groups = all_idiom.groupby(by='pinyin_rt')
|
||||||
group = groups.get_group(count).copy()
|
group = groups.get_group(count).copy()
|
||||||
|
|
||||||
while(True):
|
while(True):
|
||||||
|
@ -67,8 +73,13 @@ def main(argv):
|
||||||
return
|
return
|
||||||
else:
|
else:
|
||||||
break
|
break
|
||||||
group['pinyin_c'] = group.apply(lambda x: len(set(trim_space(x['pinyin_r']))), axis=1)
|
print_max_group(all_idiom, group, num)
|
||||||
print(df.loc[group['pinyin_c'].idxmax()])
|
|
||||||
|
def print_max_group(all_idiom, group, num):
    """Print the ``num`` highest-scoring rows of ``group``.

    Every row of ``group`` gets a score stored in a ``pinyin_c`` column:
    ``(log2(frequency) / 16 + 1) * len(set(trim_space(pinyin_r)))``.
    The rows are then ranked by ``pinyin_c`` with ``frequency`` as the
    tie-breaker, and the matching rows of ``all_idiom`` are printed in
    ranking order.

    Args:
        all_idiom: DataFrame the printed rows are looked up in, by index label.
        group: DataFrame of candidates with ``frequency`` and ``pinyin_r``
            columns. It is modified in place (``pinyin_c`` is added).
        num: Maximum number of rows to print.
    """
    if group.empty:
        # A row-wise apply() on an empty frame yields an empty DataFrame,
        # which cannot be assigned as a column; there is nothing to print.
        return

    def score(row):
        # log() is undefined for values <= 0, so clamp the frequency to 1,
        # which gives the neutral weight of 1.0.
        frequency = max(row['frequency'], 1)
        weight = math.log(frequency, 2) / 16 + 1
        return weight * len(set(trim_space(row['pinyin_r'])))

    group['pinyin_c'] = group.apply(score, axis=1)
    best_labels = group.nlargest(num, ['pinyin_c', 'frequency']).index.tolist()
    for label in best_labels:
        print(all_idiom.loc[label])
|
||||||
|
|
||||||
def filter_group_mode1(parameter, group, hits):
|
def filter_group_mode1(parameter, group, hits):
|
||||||
for i in range(len(parameter)):
|
for i in range(len(parameter)):
|
||||||
|
|
Loading…
Reference in New Issue