From 1b2fd4868bd851167ff16074727b08f36b461e0f Mon Sep 17 00:00:00 2001 From: zhiyang7 Date: Wed, 9 Feb 2022 17:57:00 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E5=AF=B9=E6=B1=89=E5=85=9C?= =?UTF-8?q?=E7=9A=84=E6=94=AF=E6=8C=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- __pycache__/main.cpython-38.pyc | Bin 0 -> 7474 bytes all_idiom.csv | 31857 ++++++++++++++++++++++++++++++ main.py | 233 + requirements.txt | 1 + stat.py | 114 - 5 files changed, 32091 insertions(+), 114 deletions(-) create mode 100644 __pycache__/main.cpython-38.pyc create mode 100644 all_idiom.csv create mode 100644 main.py delete mode 100644 stat.py diff --git a/__pycache__/main.cpython-38.pyc b/__pycache__/main.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1b43d96780b1184a6d427f049e3cfd4bc37b1ee4 GIT binary patch literal 7474 zcmcIp>u(%a6`wn=oqhQg$Bq*>-K1&jZR6B-QYfhtTIhp_YFkLkV@O$Td?)tWyR+*% z<1}8)3XoHpR0Nq)Ap{8R3JKLvLk=BaA)#rTfNCXCwQQ@Jw3F48oni{txPDEs)7;=DV20n%6WqGS?5xwnlRU-K zJj1iRXVzR{=;`Hsyq^#7eS9$P+0TdgFh9UY_`$g65Fh1t@WcF0o}->w-R|W__))Ch z$M53D0Q>pf{2ssoelH&b+~*AP{DLa}?DRXz{C<8PXD_HD$_fL0?7WgczK+u>g}fG; z^|HHMcEfa~)L1UY4dcAOTybU?A;o`G*YSNEpErskP+C}vt8Eofi=j>s>l=mJEEg)b z+3_U}xPSlU*YYe(2&Y~t%{ZtsZ%q-z`rG$De){6`HL>8;F>=u^xz5FBmi@V!OTE0} zT&$ML?uq(xnDRxrT9jxO(V(JGn8=` zVm+$ZhuRXc=C@yoz&9iCV+nqDGXhcT_uqPlFn`mIz*+=;5`mvb;MNkM{Qlb!___qY z{chA+jljzhcqIbgc}K3vlu0-t23kBYCXC{3`j#W)!G znDX4`nkrYDny+$gh1w4_bzk%KKwV)i!#4wcL}{}X#`OhV9&WvBKxnb(U(}l;27aslCQ;&ycsa|(n4m4kg0b0w5 zG|&`|w^Z>%Ru8qgvM)%aJEd;TI)2fwxy~UBdP67_X0jfpvkW`P1^`W27o>>W_&SUZ zx<{l}-UXb2vnEs#ywm`7Km*VOOaNMd$)?sA^>OP1Wy6ZiYHjVJ+SHrIXBBr$hivBA zLQ2drg?^Ao4_O>vYw5g~c=~z8RX?W$I;2#7F1i>QSTM!+ z1MDwb`_Os~EmgJ#(QWFpq}PKRF}QOs3iky zJ`*G(3wu**T7kt6jVP@wA8oT%k8iXY^-L?k>m|E;O@aNr^QMa4T-rDci&{qRFG;71i1XIeQl+g_6GwiNC(;q^l?>p-^5B9 zR#L^=f!f;7?*V;kGvf~h8EEt9f{s1j3Nn0jL}_MRo!{HcM0jn+$Iz$BRvsG z)x;@z))U*#`T)rYvVLGhnLpS%MDl@lbW2EcN3l)<>zLx1JtU-wOS_zAlLsW%&e^ug zhor|RqKB&48)Q56KJ+Vmk{&SNWIOe&qr=&HT{~ zCFr}ZjKB)uJi53K@=V7ZxFhJLkv8i^*-X2aV@%RbUCeD_5>|w}pmooFB$_=I&nEus zVs+QqMERyBG05p%x)F=7u4;?#25Fq=)Rq&a`Gb%c^?qnu?}yO~{^DkVPZDPR_`Iv^ zxeGY4N)$Ky{lh`Of9HITKg2%*doTcf8h{RufkHzZh)0jaqx*uaoHfWFYVOB+y^??b zy21GA>&kU@SshZEL&4y6=#1p0A-;M&TFVp%(_}bg&;Xd|S33gM8lJde1nyGiM7So3QnhC#Oz^Y_jq2rBXR}<;oR&1ddNk zPUR*hCh!rEqke5E=ZM_ZvKx&~PUP@!Vq!96Q?@=ib@EimPA(nAtF``vumA1!mG$qw zy57E(qk-caE9Isz34iPKxHS@U)zkIU?&*e6t#Ri}BULYnQq{q8e5RqhOVu-tyzh8E ze9YWcU8BU`Feie+A|RZq-#R~o6PI`7ro#y(juUKY;clH<-S6M}8j;OvP| zCoeg#+S*+0d0T~Rk?f#J4`th%Pfjke|&Z zaS~JVOuCWqP1)5hM`XHEs$Sxyv!4LsKk+GzA?dkKS86k*ig)(J2ErP%_0rX{CmsY+ z7kT7gN%Ba-5h>!}26;Z4hkJ4p0Y-;Ny&y;=l0MOgT3(Yx3R=k4$+|c;McSep98hHC=j^p;j%` zLsdB92rar7UphP5$qo7Yd1kAZmT$9%4@h#D`JcariJbO&&FcHD|woVUHwgBD`xH*vW?tv~9$Yvj3k& zC>|!V=@{Y+6}#xzMe3J2_ED-mLIo+lc$A90B+!|)L;=BHzxSiR-gxsb*I)n3*WOrv z;pTe#jjn_4(6Z=yNG(&N3$n&!2(aglg2*8{?Q|@2iYnIMeeb>Z*8hz{9LCsxzpZ$Z z)+1Z6=Z-JXAnn+-DupFdg=o*{;`u6$1Ow%3^ExG1ZX(p9n0d`82vS3N|DV8pqR==r z5lWS3d<0U&K9Qq>I3$fiXq_vZyKwH&r_S$5+A860V@BC6`~M4_;*(f%4|>ce&K2H| zns0Qd$;6_$Ty{&9Cj{NoMl7zodGT)EMIl|tefYdV6p>Q2l(s?+47nZPAJl`?5$e$~ z3E(1ynjxFtVK0Ax-rXhkf%}MOXzr(|kUIZqs>$opvBp2#Sp7o&XlNp?t<`-y{iKAC z3sDoHQ5T4!5hyQvq3&Eon5|J;t)W>j`E#L(PS5v3wa#rzI3-@BmlAm!)aWG*jjAJN z9oxKIu2kHTZI+N@sXL*85WKQ%n|_TZgo#;ETdH4Lwv!ZH%SNcr)R2Sfl@t`KrK?eQ z#56MBG7l9Q*#WGMr#ZqX(&IlV&3u^h{F3m!=ga<_IEp!;HB)naq_@0H?Gy!C>}eGF z;ZQG$*(;$=ON7ae-w7i=Bu!MTCq*&6DhhH9AXcoBB*4zB?`rDJ)aA)%e^ELaps zT zCPCvYI4-?;-ak+%$W840sce+>A!pTx42FeY3ZEH&DfDI36#Dwu0XBpm)rZjAgB}{~ zVJTHdzN?1~sDyzYErS(%Hpkc?S{Bn)+Jj~v=$cE|@~6@smYhp#OyE0+k7iLnGHdwy zupd!OIhTlnqv#)H_sNoW(=o%uC&6 z&sq#|A2Q0b$gj@B|3Flu1sd`w1ITv47nwIG&qZ+Q`(IbZzLlUSWf@7@V6^m?Nf8toKokvR~42%s2iMBcoWP8IMosrb!P4PwahL&3)-K6AO+2zjF5cilo z2k|V;0a-IJybGv{Diw27g_bCF3%@H8Sw zCXgQr%!P!kA+a84h&m}#YQ<43bDsfjM&dq(aTdT&4lG2djC6DZS4O8yQiqvLaSuim zW8E)qnME-uL)si?!kdHk4==ujCHsx6%KO{bb0r7b%E}*c@ddcM_ zPe@Omth?~3+o~@|>Ka9@#A@l3M*Ng=Bi2V^yoYLJtL3{PjZkNQ7a0uf@6f0` 0: + result.append(str(i + 1)) + hit = True + break + if not hit: + result.append('5') + return ''.join(result) + +def main(argv): + mode = '0' + parameter = '3323' + mode = '1' + parameter = 'bai vvv vv vvv,012 000 00 000;bai tou er xin,012 010 10 001' + mode = '2' + parameter = '1234' + mode = '3' + parameter = '风调雨顺 1234,00 00 00 20 1111;无所不包 2341,00 00 00 00 2121;得心应手 2143,01 00 00 20 2222' + num = 100 + try: + opts, args = getopt.getopt(argv, "hm:p:n:", ["mode=", "parameter=", "num="]) + except getopt.GetoptError: + print('test.py -m -p -n ') + sys.exit(2) + for opt, arg in opts: + if opt == '-h': + print('test.py -i -o ') + sys.exit() + elif opt in ("-m", "--mode"): + mode = arg + elif opt in ("-p", "--parameter"): + parameter = arg + elif opt in ("-n", "--num"): + num = int(arg) + if os.path.exists("all_idiom.csv"): + all_idiom = pd.read_csv('all_idiom.csv') + else: + all_idiom = pd.read_json('idiom.json') + idiom_frequency = pd.read_csv('idiom_frequency.csv') + all_idiom = all_idiom.merge(idiom_frequency, how='outer', on='word') + all_idiom['frequency'] = all_idiom['frequency'].fillna(1).astype(int) + all_idiom['pinyin_r'] = all_idiom.apply(lambda x: compute_pinyin(x['word']) if x['pinyin_r'] else x['pinyin_r'], axis=1) + all_idiom['pinyin'] = all_idiom.apply(lambda x: compute_pinyin(x['word'], style=Style.TONE) if x['pinyin'] else x['pinyin'], axis=1) + all_idiom.to_csv("all_idiom.csv") + if mode == '0': + all_idiom['pinyin_rt'] = all_idiom.apply(lambda x: ''.join(map(lambda y: str(len(y)), re.split('[ ,,]',x['pinyin_r']))), axis=1) + groups = all_idiom.groupby(by='pinyin_rt') + group = groups.get_group(parameter).copy() + print_max_group(all_idiom, group, num) + elif mode == '1': + parameter_rst = parameter.split(';', 1) + if len(parameter_rst) > 1: + parameter_rst = parameter_rst[1] + else: + parameter_rst = '' + parameter = parameter.split(';')[0] + + hits = parameter.split(',')[1] + parameter = trim_space(parameter.split(',')[0]) + + count = ''.join([str(len(x)) for x in hits.split()]) + all_idiom['pinyin_rt'] = all_idiom.apply(lambda x: ''.join(map(lambda y: str(len(y)), re.split('[ ,,]',x['pinyin_r']))), axis=1) + groups = all_idiom.groupby(by='pinyin_rt') + group = groups.get_group(count).copy() + + while(True): + group = filter_group_mode1(parameter, group, hits) + if(len(group) > 1 and len(parameter_rst) > 0): + parameter = parameter_rst.split(';')[0] + parameter_rst = parameter_rst.split(';', 1) + if len(parameter_rst) > 1: + parameter_rst = parameter_rst[1] + else: + parameter_rst = '' + hits = parameter.split(',')[1] + parameter = trim_space(parameter.split(',')[0]) + elif len(group) <= 0: + print('未找到匹配项') + return + else: + break + print_max_group(all_idiom, group, num) + elif mode == '2': + all_idiom['pinyin_tone'] = all_idiom.apply(lambda x: get_tone(x['pinyin']), axis=1) + group = all_idiom[all_idiom['pinyin_tone'].str.startswith(parameter)].copy() + print_max_group(all_idiom, group, num) + elif mode == '3': + parameter_rst = parameter.split(';', 1) + if len(parameter_rst) > 1: + parameter_rst = parameter_rst[1] + else: + parameter_rst = '' + parameter = parameter.split(';')[0] + + hits = parameter.split(',')[1] + parameter = parameter.split(',')[0] + tones = parameter[-4:] + tone_hits=hits[-4:] + parameter = parameter[:-5] + hits=hits[:-5] + + all_idiom = all_idiom[all_idiom['word'].str.len() == 4] + all_idiom['pinyin_tone'] = all_idiom.apply(lambda x: get_tone(x['pinyin']), axis=1) + group = all_idiom.copy() + while(True): + group = filter_group_model2(parameter, group, hits, tones, tone_hits) + if(len(group) > 1 and len(parameter_rst) > 0): + parameter = parameter_rst.split(';')[0] + parameter_rst = parameter_rst.split(';', 1) + if len(parameter_rst) > 1: + parameter_rst = parameter_rst[1] + else: + parameter_rst = '' + + hits = parameter.split(',')[1] + parameter = parameter.split(',')[0] + tones = parameter[-4:] + tone_hits=hits[-4:] + parameter = parameter[:-5] + hits=hits[:-5] + elif len(group) <= 0: + print('未找到匹配项') + return + else: + break + print_max_group(all_idiom, group, num) + +def filter_group_model2(parameter, group, hits, tones, tone_hits): + for i in range(4): + if tone_hits[i] == '0': + group = group[~group['pinyin_tone'].str.contains(tones[i])] + elif tone_hits[i] == '1': + group = group[(group['pinyin_tone'].str[i] != tones[i]) & group['pinyin_tone'].str.contains(tones[i])] + elif tone_hits[i] == '2': + group = group[group['pinyin_tone'].str[i] == tones[i]] + if len(group) <= 1: + break + group['pinyin_0' ] = group.apply(lambda x: ','.join(list(lazy_pinyin(x['word'], style=Style.INITIALS, strict=False))), axis=1) + group['pinyin_1'] = group.apply(lambda x: ','.join(list(lazy_pinyin(x['word'], style=Style.FINALS, strict=False))), axis=1) + hits = hits.split() + for i in range(4): + target = parameter[i] + targets = list() + targets.append(lazy_pinyin(target, style=Style.INITIALS, strict=False)[0]) + targets.append(lazy_pinyin(target, style=Style.FINALS, strict=False)[0]) + pinyin_hit = hits[i] + outer_break = False + for j in range(2): + if pinyin_hit[j] == '0': + group = group[group['pinyin_%d' % j].str.count('(^|[,])%s([,]|$)' + targets[j]) == 0] + elif pinyin_hit[j] == '1': + group = group[(group['pinyin_%d' % j].str.count('(^|[,])%s([,]|$)' % targets[j]) > 0) & (group['pinyin_%d' % j].str.count(('^(\w*,){%d}%s([,]|$)' % (i, targets[j]))) == 0)] + elif pinyin_hit[j] == '2': + group = group[group['pinyin_%d' % j].str.count(('^(\w*,){%d}%s([,]|$)' % (i, targets[j]))) > 0] + if len(group) <= 1: + outer_break = True + break + if outer_break: + break + return group + +def print_max_group(all_idiom, group, num): + group['pinyin_c'] = group.apply(lambda x: (math.log(x['frequency'], 2)/16 + 1) * len(set(trim_space(x['pinyin_r']))), axis=1) + list = group.nlargest(num, ['pinyin_c', 'frequency']).index.tolist() + for i in list: + print(all_idiom.loc[i]) + +''' +parameter: xxx xx xx xxx +hits: 010 02 22 001 +''' +def filter_group_mode1(parameter, group, hits): + for i in range(len(parameter)): + key0 = 'pinyin0_%d' % i + key1 = 'pinyin1_%d' % i + key2 = 'pinyin2_%d' % i + group[key0] = group.apply(lambda x: ''.join(set(trim_space(x['pinyin_r']))), axis=1) + group[key1] = group.apply(lambda x: ''.join(set(remove_at(trim_space(x['pinyin_r']), i))), axis=1) + group[key2] = group.apply(lambda x: trim_space(x['pinyin_r'])[i], axis=1) + includes = set() + for i, hit in enumerate(list(trim_space(hits))): + if hit == '2': + includes.add(parameter[i]) + for i, hit in enumerate(list(trim_space(hits))): + key0 = 'pinyin0_%d' % i + key1 = 'pinyin1_%d' % i + key2 = 'pinyin2_%d' % i + target = parameter[i] + if hit == '0' and target not in includes: + group = group[~group[key0].str.contains(target)] + elif hit == '1': + group = group[(group[key2] != target) & (group[key1].str.contains(target))] + includes.add(target) + elif hit == '2': + group = group[group[key2] == target] + if len(group) <= 1: + break + return group + +if __name__ == '__main__': + current_work_dir = os.path.dirname(__file__) + os.chdir(current_work_dir) + main(sys.argv[1:]) diff --git a/requirements.txt b/requirements.txt index fb6c7ed..218fca9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ pandas +pypinyin diff --git a/stat.py b/stat.py deleted file mode 100644 index 779f3e9..0000000 --- a/stat.py +++ /dev/null @@ -1,114 +0,0 @@ -import sys -import getopt -import pandas as pd -import re -import math - - -def trim_space(s): - return s.replace(' ', '').replace(',', '').replace(',', '') - -def remove_at(s, i): - return s[0:i] + s[i+1:] - -def main(argv): - mode = '0' - parameter = '3323' - mode = '1' - parameter = 'bai vvv vv vvv,012 000 00 000;bai tou er xin,012 010 10 001' - num = 3 - try: - opts, args = getopt.getopt(argv, "hm:p:n:", ["mode=", "parameter=", "num="]) - except getopt.GetoptError: - print('test.py -m -p -n ') - sys.exit(2) - for opt, arg in opts: - if opt == '-h': - print('test.py -i -o ') - sys.exit() - elif opt in ("-m", "--mode"): - mode = arg - elif opt in ("-p", "--parameter"): - parameter = arg - elif opt in ("-n", "--num"): - num = int(arg) - all_idiom = pd.read_json('idiom.json') - idiom_frequency = pd.read_csv('idiom_frequency.csv') - all_idiom = all_idiom.merge(idiom_frequency, how='left', on='word') - all_idiom['frequency'] = all_idiom['frequency'].fillna(1).astype(int) - if mode == '0': - all_idiom['pinyin_rt'] = all_idiom.apply(lambda x: ''.join(map(lambda y: str(len(y)), re.split('[ ,,]',x['pinyin_r']))), axis=1) - groups = all_idiom.groupby(by='pinyin_rt') - group = groups.get_group(parameter).copy() - print_max_group(all_idiom, group, num) - elif mode == '1': - parameter_rst = parameter.split(';', 1) - if len(parameter_rst) > 1: - parameter_rst = parameter_rst[1] - else: - parameter_rst = '' - parameter = parameter.split(';')[0] - - hits = parameter.split(',')[1] - parameter = trim_space(parameter.split(',')[0]) - - count = ''.join([str(len(x)) for x in hits.split()]) - all_idiom['pinyin_rt'] = all_idiom.apply(lambda x: ''.join(map(lambda y: str(len(y)), re.split('[ ,,]',x['pinyin_r']))), axis=1) - groups = all_idiom.groupby(by='pinyin_rt') - group = groups.get_group(count).copy() - - while(True): - group = filter_group_mode1(parameter, group, hits) - if(len(group) > 1 and len(parameter_rst) > 0): - parameter = parameter_rst.split(';')[0] - parameter_rst = parameter_rst.split(';', 1) - if len(parameter_rst) > 1: - parameter_rst = parameter_rst[1] - else: - parameter_rst = '' - hits = parameter.split(',')[1] - parameter = trim_space(parameter.split(',')[0]) - elif len(group) < 0: - print('未找到匹配项') - return - else: - break - print_max_group(all_idiom, group, num) - -def print_max_group(all_idiom, group, num): - group['pinyin_c'] = group.apply(lambda x: (math.log(x['frequency'], 2)/16 + 1) * len(set(trim_space(x['pinyin_r']))), axis=1) - list = group.nlargest(num, ['pinyin_c', 'frequency']).index.tolist() - for i in list: - print(all_idiom.loc[i]) - -def filter_group_mode1(parameter, group, hits): - for i in range(len(parameter)): - key0 = 'pinyin0_%d' % i - key1 = 'pinyin1_%d' % i - key2 = 'pinyin2_%d' % i - group[key0] = group.apply(lambda x: ''.join(set(trim_space(x['pinyin_r']))), axis=1) - group[key1] = group.apply(lambda x: ''.join(set(remove_at(trim_space(x['pinyin_r']), i))), axis=1) - group[key2] = group.apply(lambda x: trim_space(x['pinyin_r'])[i], axis=1) - includes = set() - for i, hit in enumerate(list(trim_space(hits))): - if hit == '2': - includes.add(parameter[i]) - for i, hit in enumerate(list(trim_space(hits))): - key0 = 'pinyin0_%d' % i - key1 = 'pinyin1_%d' % i - key2 = 'pinyin2_%d' % i - target = parameter[i] - if hit == '0' and target not in includes: - group = group[~group[key0].str.contains(target)] - elif hit == '1': - group = group[(group[key2] != target) & (group[key1].str.contains(target))] - includes.add(target) - elif hit == '2': - group = group[group[key2] == target] - if len(group) == 1: - break - return group - -if __name__ == '__main__': - main(sys.argv[1:]) -