From 14e5568890f8153d3508833a507d5150b5efe6bd Mon Sep 17 00:00:00 2001
From: zzhou
Date: Sat, 11 Oct 2025 14:18:27 +0800
Subject: [PATCH] 2025/10/11 Added a web service deployment feature
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 audio.json                                 |    1 +
 audio.wav                                  |  Bin 0 -> 67530 bytes
 v2w_service.py                             |   95 +
 voice2word.py                              |   40 +
 whisper/audio.py                           |   13 +-
 whisper/tokenizer.py                       |   13 +-
 whisper_service_deploy/v2w_service.py      |   94 +
 whisper_service_deploy/whisper/__init__.py |  161 ++
 whisper_service_deploy/whisper/__main__.py |    3 +
 whisper_service_deploy/whisper/audio.py    |  157 ++
 whisper_service_deploy/whisper/decoding.py |  826 ++++++++
 whisper_service_deploy/whisper/model.py    |  345 ++++
 .../whisper/normalizers/__init__.py        |    2 +
 .../whisper/normalizers/basic.py           |   80 +
 .../whisper/normalizers/english.json       | 1741 +++++++++++++++++
 .../whisper/normalizers/english.py         |  550 ++++++
 whisper_service_deploy/whisper/version.py  |    1 +
 17 files changed, 4120 insertions(+), 2 deletions(-)
 create mode 100644 audio.json
 create mode 100644 audio.wav
 create mode 100644 v2w_service.py
 create mode 100644 voice2word.py
 create mode 100644 whisper_service_deploy/v2w_service.py
 create mode 100644 whisper_service_deploy/whisper/__init__.py
 create mode 100644 whisper_service_deploy/whisper/__main__.py
 create mode 100644 whisper_service_deploy/whisper/audio.py
 create mode 100644 whisper_service_deploy/whisper/decoding.py
 create mode 100644 whisper_service_deploy/whisper/model.py
 create mode 100644 whisper_service_deploy/whisper/normalizers/__init__.py
 create mode 100644 whisper_service_deploy/whisper/normalizers/basic.py
 create mode 100644 whisper_service_deploy/whisper/normalizers/english.json
 create mode 100644 whisper_service_deploy/whisper/normalizers/english.py
 create mode 100644 whisper_service_deploy/whisper/version.py

diff --git a/audio.json b/audio.json
new file mode 100644
index 0000000..258fdce
--- /dev/null
+++ b/audio.json
@@ -0,0 +1 @@
+{"text": "\u4eca\u5929\u4e2d\u5348\u5403\u4ec0\u4e48", "segments": [{"id": 0, "seek": 0, "start": 0.0, "end": 2.0, "text": "\u4eca\u5929\u4e2d\u5348\u5403\u4ec0\u4e48", "tokens": [50364, 12074, 5975, 44237, 10123, 10440, 50464], "temperature": 0.0, "avg_logprob": -0.5544378757476807, "compression_ratio": 0.65625, "no_speech_prob": 0.1877238005399704}], "language": "zh"}
\ No newline at end of file
diff --git a/audio.wav b/audio.wav
new file mode 100644
index 0000000000000000000000000000000000000000..24d93142f223243357f6626ad789bbd95b93d773
GIT binary patch
literal 67530
[base85-encoded binary data omitted]
zYHPSYzw}Q=)~cn&?#<5F`=*yeLw3`U;#mXjy~Uey#q?QFen;=9eg|5d!xj7^q@0QM zaPWGguk&l#U||HT)U(R0cd$Hx?2L4xc9`y?MvTRKS6R6kr>z>Ee3IN!Hz%0xPI-U0 zlY6_V$ud`D;Zc*su^*5;%5`>19|h}_bb#;mCsa`^vDq#)SCmS02cB4(AAUt*?B6B= z2Q27{l{1V&duPsu7?@*I*O5s9ud3yk|3{2adF~PZrI3$(d14S@!R#kr8ZSWoB(<<1 z9z~aJON%6zWI{{{O(1zyI!k415Z1FW<5xjqU%P1rruLUl|M189FC6jJ6|cVM*ulv` z8Rq`Qdy*LxIEF=pTj)3MbjjZlo93#^QMGyl`of_sEW5CMwWy|A(8Jp2M25=({rO6N z=KDgV4wdSycgF=y^x&B-{Tw4td(Y-T@7k~PrqY5x&=RWIwiCAzSivhb>pcUs%&K52 z`EY5GXjSVlUY_pV@@)U2ykyzhQb}=DR%W_b%@K^!LIy9XQR^Su+_9$Agep~WGlWi6 zOd{jwh6ReWqAC#}S5;&B%BmXK2)xe4lHMM0kr|F4rrBtlE^}7&$aAcV=c-D^c}9wI zgO6ZxU#F5+U47BiC*Y{YD?N75yw^rogXqF}Wej5iuJkohRLJ1#Cg0-2TmP9bt(_`2zbJ~f&ZZ;7RP3Oq z-y`7%PG#W}mg30DXm-~0O%WZ$MmGt@4Og7|oI~X6McBs5 zW0baY&C$EM-qhQgi*l=h7;mlIHHf6#=Y_^5-fZ8D{i(_U1$M4;>pKr%znF8w&T{g8 z(sQX-JdYetk<)HQ)>``2IMdDcnQ_&$>Xg$ITg0-o01N#CP32rrOunkpiKKJU)IgU+ zY7G1`s-QB=9(xvtpCYJbACv@aW-r8FFAe^B9{pA)t;4#~8{TX>4<4lr8O^>@%(JMF z$6sXiNZiB|d486PLiYkJf66FE6LRnz)pM(i_%rqwLw-$DtkM^5PYZjvGJ{Gm4Bb4@ zUW&bW8ggP76EPfjH8u2IFvwp^IZq985tJJ;(=fg*o1-IQ^@zPP_p*M|LewYDI(3nU z1qWXB$7)C=R<5TN;q`4qs!?q(<$qK!RGY7 zyUoci?l5>Oh+@(&gy>|UsM5cpt1b`co7yh#*TT_uqwgZ%E6gKaNI0rQWtpArC$sD< z{Um=%(6ySGZB(tdY#N`pUHCAGXxrZU&Th?|vp`A%hd0)YJ4HwW!hsGL?rv?1gaMrw zX)HPN;>*lTYDG#4BQ}%2Ms`}Iex5Q0on>ad=38~z4f)nwdpsG^lU3`cPv4_iqe^(A zU^(Rod|2@{UX(}2#a_+*;rmOUlLdQTuQFwj>j zSyYuBXS-THlkB1d>_yPmnbm>ibcsltro%keHq8@m8=36Yf0O17ya}yzq+|w|QY+@Ilb6)Q=$Z-PhE0 zsal1x32^d_QxD@EJK3gT<;`5R!M9)2*w3r!L9boU)RSs_xO7Iv4@rlN^#ns+lQUEM z6;Nn2()d&|@KW;(tO&Qy?<9-!rg9M-?HZbwkx_@5f;M-RUg+*Yt+X8K`JD5*)CVBU zlKv^9RJpylQCFU7<(u?)U6kJ|l^UPl)Q%SBN)xZ9XLbePsZZFe-D1d^cO+v1t)`2U z5n}$S%irb^-;e*@F?|>HcRVf^Cs#XUB;3Ei{;PxfuMTpOdxtY{XyX*{I6kT4m)ewc zgTrgG=Tt;052-_~eR8_&06|%3eK}LmK`qJElAU0INwXEz%wO+ zu|5xE!Rf-zxp8M?gEJTe!8}->9Byg(z^?^Xe7Ff>QB{f-S%B{|<2ye~=C$eJi(iq} zGU=J5qs@0uD6Yw>(8qK2|BeHk5+?bpMa6oymd6H}S-r6&?sLwTtNCI(cY{enSY`Xq zxlB6f3vNS_`HnUxRDyXeLLHqj>DF}mvyAxl2Ff4*>`s~4fVw$7)DG}j!Ih+0_q8hP zE;0%}pi{dlvSB=pfmosMm0H0Jp^6bBY2KknO+(#!E;IS8v&~!LDEz_Np06)#)dX3U z_O_w(ya%wtemtB+4f~&E-AC>*yc9kjhuq2&BL3-V-d1NCxVpLq67Rj%T5|iTQGdaA z&RyuNnIH_ZSHGr%?<7`e*6S=@)`dsokA4lQJ=|Lz*GzSs;q0o5b!QF5`H`p4#I46J z1KBJz$7)HVBv(>{u#hQ~zW~m3MLV3YoWIpSuqBhyRlmiwcDFpK>oiQF*!_yirAZ5rCLcw`rJ^F}1bM_gMP{W(F~phCz~$IbiI!m!j=?-Sa=IXM zc8l-H`HF|0Zyd`Hm99tu;h-?843HM5OcT|(LV2XoNgM*zhTydwz1vaJ@Cc>sOUY@EAPvy3wC}HaKgHzpu zogaIu%sW4u2=LdStB!O=5{s9pm(c-0DOlx-i5L)-%;X&`Jw4pv%)CRAk64nxC>(zv z1O|Fmy1Q!~M^{yX{|gWRDorlS-|>Q2lCRfDOHt7m2??DAFhg#DS+TKo`ZT_jLlNkbzC-W#(+^jOaZ0`0*nZ(4qn@p)e5=I`D4TzprS( z3<_QlIx_ICK#p>Dak<9_P+tKRpo@4v+_#979Df-Vhux9k5x@4w;$%ltQd1ik+sIsU@u9{V5gBHsCbpF_R{@VtQWKMK%CfKZr$ zN(o$c1yn$PC;%-)Y86C!7G&W5As+x0hy#NRq)S0w04f3x=%Y{r?JuALKc;&YK>8IF zT41~mR74*isDK>Ehd`YIDnJ8~Vg+$<4RM(5p8g%s2kN~X2w8yV5`_tvL-5`F8|qGGSCr#kq-_rwh zb}t)ZEr4F{1OVAgwPGJPQ(GhdwCEv z0fHz99*7F)^b}_LBT0{Z#_yD+vv`I)vBDF|JMu0v($s!>k()l3b5g`sz z-v1Z0b2W7_0`R$Q&F)*owa@=*&;TRogqZwm!LYJ%b^#{X5&txS7I?Uja5dzCG&3gl cfE}|?v$8b torch.Tensor: """ assert n_mels in {80, 128}, f"Unsupported n_mels: {n_mels}" - filters_path = os.path.join(os.path.dirname(__file__), "assets", "mel_filters.npz") + # 使用 pathlib 处理路径,支持开发环境和打包环境 + if getattr(sys, 'frozen', False): + # 打包后的 exe 环境 + exe_dir = Path(sys.executable).parent + filters_path = exe_dir / "whisper" / "assets" / "mel_filters.npz" + else: + # 开发环境 + filters_path = Path(__file__).parent / "assets" / "mel_filters.npz" + + print(f"filters_path: {filters_path}") with np.load(filters_path, allow_pickle=False) as f: return torch.from_numpy(f[f"mel_{n_mels}"]).to(device) diff --git a/whisper/tokenizer.py 
diff --git a/whisper/tokenizer.py b/whisper/tokenizer.py
index 2af8375..5842f1c 100644
--- a/whisper/tokenizer.py
+++ b/whisper/tokenizer.py
@@ -1,8 +1,10 @@
 import base64
 import os
 import string
+import sys
 from dataclasses import dataclass, field
 from functools import cached_property, lru_cache
+from pathlib import Path
 from typing import Dict, List, Optional, Tuple
 
 import tiktoken
@@ -329,7 +331,16 @@ class Tokenizer:
 
 @lru_cache(maxsize=None)
 def get_encoding(name: str = "gpt2", num_languages: int = 99):
-    vocab_path = os.path.join(os.path.dirname(__file__), "assets", f"{name}.tiktoken")
+    # Use pathlib for path handling, supporting both development and packaged environments
+    if getattr(sys, 'frozen', False):
+        # Packaged exe environment
+        exe_dir = Path(sys.executable).parent
+        vocab_path = exe_dir / "whisper" / "assets" / f"{name}.tiktoken"
+    else:
+        # Development environment
+        vocab_path = Path(__file__).parent / "assets" / f"{name}.tiktoken"
+
+    print(f"vocab_path: {vocab_path}")
     ranks = {
         base64.b64decode(token): int(rank)
         for token, rank in (line.split() for line in open(vocab_path) if line)
diff --git a/whisper_service_deploy/v2w_service.py b/whisper_service_deploy/v2w_service.py
new file mode 100644
index 0000000..372a460
--- /dev/null
+++ b/whisper_service_deploy/v2w_service.py
@@ -0,0 +1,94 @@
+import whisper
+import time
+import datetime
+from flask import Flask, request, jsonify
+import threading
+
+app = Flask(__name__)
+app.config['JSON_AS_ASCII'] = False  # Ensure Chinese text is returned unescaped
+
+# Global variable holding the model
+model = None
+
+
+def load_model():
+    global model
+    print("Loading the Whisper model...")
+    start_time = time.time()
+
+    # Manually specify the model storage path
+    model_path = "./models"  # You can change this to any path
+    model = whisper.load_model("medium", download_root=model_path)
+
+    load_time = time.time() - start_time
+    print(f"Model loaded in {str(datetime.timedelta(seconds=load_time))}")
+    print(f"Model storage path: {model_path}")
+
+
+# Load the model when the application starts
+@app.before_request
+def before_first_request():
+    global model
+    if model is None:
+        print("First request; loading the model...")
+        load_model()
+
+
+@app.route('/transcribe', methods=['POST'])
+def transcribe():
+    if model is None:
+        return jsonify({"error": "Model not loaded yet"}), 503
+
+    if 'audio' not in request.files:
+        return jsonify({"error": "No audio file provided"}), 400
+
+    audio_file = request.files['audio']
+    audio_path = f"/{audio_file.filename}"
+    audio_file.save(audio_path)
+
+    # Start transcribing
+    start_time = time.time()
+    result = model.transcribe(audio_path, language="zh")
+    transcription_time = time.time() - start_time
+
+    return jsonify({
+        "text": result["text"],
+        "processing_time": transcription_time
+    })
+
+
+@app.route('/transcribe_text', methods=['POST'])
+def transcribe_text():
+    """Return the transcription as plain text, convenient for viewing on the command line"""
+    if model is None:
+        return "Model not loaded yet", 503
+
+    if 'audio' not in request.files:
+        return "No audio file provided", 400
+
+    audio_file = request.files['audio']
+    audio_path = f"/{audio_file.filename}"
+    audio_file.save(audio_path)
+
+    # Start transcribing
+    start_time = time.time()
+    result = model.transcribe(audio_path, language="zh")
+    transcription_time = time.time() - start_time
+
+    # Return plain text
+    return f"{result['text']}\r\nProcessing time: {transcription_time:.2f}s"
+
+
+@app.route('/health', methods=['GET'])
+def health_check():
+    return jsonify({
+        "status": "ok",
+        "model_loaded": model is not None
+    })
+
+
+if __name__ == '__main__':
+    # Preload the model before starting the app
+    print("Preloading the model before starting the service...")
+    load_model()
+    app.run(host='0.0.0.0', port=5000, threaded=True)
\ No newline at end of file
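For reference, a minimal client sketch for the service above, assuming it is running and reachable at localhost:5000; the requests dependency and the test.wav filename are placeholders, not part of this patch. Note also that the handlers save uploads to f"/{audio_file.filename}", i.e. the filesystem root, which usually requires elevated privileges; a writable temporary directory would be a safer target.

# Hypothetical client, not part of this patch: POST a WAV file to /transcribe.
import requests

with open("test.wav", "rb") as f:  # placeholder filename
    resp = requests.post(
        "http://localhost:5000/transcribe",
        files={"audio": f},  # field name must be "audio"; the handler reads request.files['audio']
        timeout=600,  # transcription can be slow, especially on CPU
    )
resp.raise_for_status()
data = resp.json()
print(data["text"], f"({data['processing_time']:.2f}s)")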
diff --git a/whisper_service_deploy/whisper/__init__.py b/whisper_service_deploy/whisper/__init__.py
new file mode 100644
index 0000000..f284ec0
--- /dev/null
+++ b/whisper_service_deploy/whisper/__init__.py
@@ -0,0 +1,161 @@
+import hashlib
+import io
+import os
+import urllib
+import warnings
+from typing import List, Optional, Union
+
+import torch
+from tqdm import tqdm
+
+from .audio import load_audio, log_mel_spectrogram, pad_or_trim
+from .decoding import DecodingOptions, DecodingResult, decode, detect_language
+from .model import ModelDimensions, Whisper
+from .transcribe import transcribe
+from .version import __version__
+
+_MODELS = {
+    "tiny.en": "https://openaipublic.azureedge.net/main/whisper/models/d3dd57d32accea0b295c96e26691aa14d8822fac7d9d27d5dc00b4ca2826dd03/tiny.en.pt",
+    "tiny": "https://openaipublic.azureedge.net/main/whisper/models/65147644a518d12f04e32d6f3b26facc3f8dd46e5390956a9424a650c0ce22b9/tiny.pt",
+    "base.en": "https://openaipublic.azureedge.net/main/whisper/models/25a8566e1d0c1e2231d1c762132cd20e0f96a85d16145c3a00adf5d1ac670ead/base.en.pt",
+    "base": "https://openaipublic.azureedge.net/main/whisper/models/ed3a0b6b1c0edf879ad9b11b1af5a0e6ab5db9205f891f668f8b0e6c6326e34e/base.pt",
+    "small.en": "https://openaipublic.azureedge.net/main/whisper/models/f953ad0fd29cacd07d5a9eda5624af0f6bcf2258be67c92b79389873d91e0872/small.en.pt",
+    "small": "https://openaipublic.azureedge.net/main/whisper/models/9ecf779972d90ba49c06d968637d720dd632c55bbf19d441fb42bf17a411e794/small.pt",
+    "medium.en": "https://openaipublic.azureedge.net/main/whisper/models/d7440d1dc186f76616474e0ff0b3b6b879abc9d1a4926b7adfa41db2d497ab4f/medium.en.pt",
+    "medium": "https://openaipublic.azureedge.net/main/whisper/models/345ae4da62f9b3d59415adc60127b97c714f32e89e936602e85993674d08dcb1/medium.pt",
+    "large-v1": "https://openaipublic.azureedge.net/main/whisper/models/e4b87e7e0bf463eb8e6956e646f1e277e901512310def2c24bf0e11bd3c28e9a/large-v1.pt",
+    "large-v2": "https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt",
+    "large-v3": "https://openaipublic.azureedge.net/main/whisper/models/e5b1a55b89c1367dacf97e3e19bfd829a01529dbfdeefa8caeb59b3f1b81dadb/large-v3.pt",
+    "large": "https://openaipublic.azureedge.net/main/whisper/models/e5b1a55b89c1367dacf97e3e19bfd829a01529dbfdeefa8caeb59b3f1b81dadb/large-v3.pt",
+    "large-v3-turbo": "https://openaipublic.azureedge.net/main/whisper/models/aff26ae408abcba5fbf8813c21e62b0941638c5f6eebfb145be0c9839262a19a/large-v3-turbo.pt",
+    "turbo": "https://openaipublic.azureedge.net/main/whisper/models/aff26ae408abcba5fbf8813c21e62b0941638c5f6eebfb145be0c9839262a19a/large-v3-turbo.pt",
+}
+
+# base85-encoded (n_layers, n_heads) boolean arrays indicating the cross-attention heads that are
+# highly correlated to the word-level timing, i.e. the alignment between audio and text tokens.
+_ALIGNMENT_HEADS = { + "tiny.en": b"ABzY8J1N>@0{>%R00Bk>$p{7v037`oCl~+#00", + "tiny": b"ABzY8bu8Lr0{>%RKn9Fp%m@SkK7Kt=7ytkO", + "base.en": b"ABzY8;40c<0{>%RzzG;p*o+Vo09|#PsxSZm00", + "base": b"ABzY8KQ!870{>%RzyTQH3`Q^yNP!>##QT-?_)10{>%RpeA61k&I|OI3I$65C{;;pbCHh0B{qLQ;+}v00", + "small": b"ABzY8DmU6=0{>%Rpa?J`kvJ6qF(V^F86#Xh7JUGMK}P%R7%R7}kK1fFL7w6%<-Pf*t^=N)Qr&0RR9", + "large-v1": b"ABzY8r9j$a0{>%R7#4sLmoOs{s)o3~84-RPdcFk!JR%R7=D0pU<_bnWW*tkYAhobTNnu$jnkEkXqp)j;w1Tzk)UH3X%SZd&fFZ2fC2yj", + "large-v3": b"ABzY8gWO1E0{>%R7(9S+Kn!D~%ngiGaR?*L!iJG9p-nab0JQ=-{D1-g00", + "large": b"ABzY8gWO1E0{>%R7(9S+Kn!D~%ngiGaR?*L!iJG9p-nab0JQ=-{D1-g00", + "large-v3-turbo": b"ABzY8j^C+e0{>%RARaKHP%t(lGR*)0g!tONPyhe`", + "turbo": b"ABzY8j^C+e0{>%RARaKHP%t(lGR*)0g!tONPyhe`", +} + + +def _download(url: str, root: str, in_memory: bool) -> Union[bytes, str]: + os.makedirs(root, exist_ok=True) + + expected_sha256 = url.split("/")[-2] + download_target = os.path.join(root, os.path.basename(url)) + + if os.path.exists(download_target) and not os.path.isfile(download_target): + raise RuntimeError(f"{download_target} exists and is not a regular file") + + if os.path.isfile(download_target): + with open(download_target, "rb") as f: + model_bytes = f.read() + if hashlib.sha256(model_bytes).hexdigest() == expected_sha256: + return model_bytes if in_memory else download_target + else: + warnings.warn( + f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file" + ) + + with urllib.request.urlopen(url) as source, open(download_target, "wb") as output: + with tqdm( + total=int(source.info().get("Content-Length")), + ncols=80, + unit="iB", + unit_scale=True, + unit_divisor=1024, + ) as loop: + while True: + buffer = source.read(8192) + if not buffer: + break + + output.write(buffer) + loop.update(len(buffer)) + + model_bytes = open(download_target, "rb").read() + if hashlib.sha256(model_bytes).hexdigest() != expected_sha256: + raise RuntimeError( + "Model has been downloaded but the SHA256 checksum does not not match. Please retry loading the model." + ) + + return model_bytes if in_memory else download_target + + +def available_models() -> List[str]: + """Returns the names of available models""" + return list(_MODELS.keys()) + + +def load_model( + name: str, + device: Optional[Union[str, torch.device]] = None, + download_root: str = None, + in_memory: bool = False, +) -> Whisper: + """ + Load a Whisper ASR model + + Parameters + ---------- + name : str + one of the official model names listed by `whisper.available_models()`, or + path to a model checkpoint containing the model dimensions and the model state_dict. 
+ device : Union[str, torch.device] + the PyTorch device to put the model into + download_root: str + path to download the model files; by default, it uses "~/.cache/whisper" + in_memory: bool + whether to preload the model weights into host memory + + Returns + ------- + model : Whisper + The Whisper ASR model instance + """ + + if device is None: + device = "cuda" if torch.cuda.is_available() else "cpu" + if download_root is None: + default = os.path.join(os.path.expanduser("~"), ".cache") + download_root = os.path.join(os.getenv("XDG_CACHE_HOME", default), "whisper") + + if name in _MODELS: + checkpoint_file = _download(_MODELS[name], download_root, in_memory) + alignment_heads = _ALIGNMENT_HEADS[name] + elif os.path.isfile(name): + checkpoint_file = open(name, "rb").read() if in_memory else name + alignment_heads = None + else: + raise RuntimeError( + f"Model {name} not found; available models = {available_models()}" + ) + + with ( + io.BytesIO(checkpoint_file) if in_memory else open(checkpoint_file, "rb") + ) as fp: + kwargs = {"weights_only": True} if torch.__version__ >= "1.13" else {} + checkpoint = torch.load(fp, map_location=device, **kwargs) + del checkpoint_file + + dims = ModelDimensions(**checkpoint["dims"]) + model = Whisper(dims) + model.load_state_dict(checkpoint["model_state_dict"]) + + if alignment_heads is not None: + model.set_alignment_heads(alignment_heads) + + return model.to(device) diff --git a/whisper_service_deploy/whisper/__main__.py b/whisper_service_deploy/whisper/__main__.py new file mode 100644 index 0000000..d14f205 --- /dev/null +++ b/whisper_service_deploy/whisper/__main__.py @@ -0,0 +1,3 @@ +from .transcribe import cli + +cli() diff --git a/whisper_service_deploy/whisper/audio.py b/whisper_service_deploy/whisper/audio.py new file mode 100644 index 0000000..826250f --- /dev/null +++ b/whisper_service_deploy/whisper/audio.py @@ -0,0 +1,157 @@ +import os +from functools import lru_cache +from subprocess import CalledProcessError, run +from typing import Optional, Union + +import numpy as np +import torch +import torch.nn.functional as F + +from .utils import exact_div + +# hard-coded audio hyperparameters +SAMPLE_RATE = 16000 +N_FFT = 400 +HOP_LENGTH = 160 +CHUNK_LENGTH = 30 +N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE # 480000 samples in a 30-second chunk +N_FRAMES = exact_div(N_SAMPLES, HOP_LENGTH) # 3000 frames in a mel spectrogram input + +N_SAMPLES_PER_TOKEN = HOP_LENGTH * 2 # the initial convolutions has stride 2 +FRAMES_PER_SECOND = exact_div(SAMPLE_RATE, HOP_LENGTH) # 10ms per audio frame +TOKENS_PER_SECOND = exact_div(SAMPLE_RATE, N_SAMPLES_PER_TOKEN) # 20ms per audio token + + +def load_audio(file: str, sr: int = SAMPLE_RATE): + """ + Open an audio file and read as mono waveform, resampling as necessary + + Parameters + ---------- + file: str + The audio file to open + + sr: int + The sample rate to resample the audio if necessary + + Returns + ------- + A NumPy array containing the audio waveform, in float32 dtype. + """ + + # This launches a subprocess to decode audio while down-mixing + # and resampling as necessary. Requires the ffmpeg CLI in PATH. 
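+    # For reference, the command assembled below is equivalent to running
+    # (with <file> as a placeholder and the default rate of 16000 Hz):
+    #
+    #   ffmpeg -nostdin -threads 0 -i <file> -f s16le -ac 1 -acodec pcm_s16le -ar 16000 -
+    #
+    # which streams raw little-endian 16-bit mono PCM to stdout; the samples are
+    # rescaled to float32 in [-1.0, 1.0) before being returned.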
+    # fmt: off
+    cmd = [
+        "ffmpeg",
+        "-nostdin",
+        "-threads", "0",
+        "-i", file,
+        "-f", "s16le",
+        "-ac", "1",
+        "-acodec", "pcm_s16le",
+        "-ar", str(sr),
+        "-"
+    ]
+    # fmt: on
+    try:
+        out = run(cmd, capture_output=True, check=True).stdout
+    except CalledProcessError as e:
+        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
+
+    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
+
+
+def pad_or_trim(array, length: int = N_SAMPLES, *, axis: int = -1):
+    """
+    Pad or trim the audio array to N_SAMPLES, as expected by the encoder.
+    """
+    if torch.is_tensor(array):
+        if array.shape[axis] > length:
+            array = array.index_select(
+                dim=axis, index=torch.arange(length, device=array.device)
+            )
+
+        if array.shape[axis] < length:
+            pad_widths = [(0, 0)] * array.ndim
+            pad_widths[axis] = (0, length - array.shape[axis])
+            array = F.pad(array, [pad for sizes in pad_widths[::-1] for pad in sizes])
+    else:
+        if array.shape[axis] > length:
+            array = array.take(indices=range(length), axis=axis)
+
+        if array.shape[axis] < length:
+            pad_widths = [(0, 0)] * array.ndim
+            pad_widths[axis] = (0, length - array.shape[axis])
+            array = np.pad(array, pad_widths)
+
+    return array
+
+
+@lru_cache(maxsize=None)
+def mel_filters(device, n_mels: int) -> torch.Tensor:
+    """
+    load the mel filterbank matrix for projecting STFT into a Mel spectrogram.
+    Allows decoupling librosa dependency; saved using:
+
+        np.savez_compressed(
+            "mel_filters.npz",
+            mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80),
+            mel_128=librosa.filters.mel(sr=16000, n_fft=400, n_mels=128),
+        )
+    """
+    assert n_mels in {80, 128}, f"Unsupported n_mels: {n_mels}"
+
+    filters_path = os.path.join(os.path.dirname(__file__), "assets", "mel_filters.npz")
+    with np.load(filters_path, allow_pickle=False) as f:
+        return torch.from_numpy(f[f"mel_{n_mels}"]).to(device)
+
+
+def log_mel_spectrogram(
+    audio: Union[str, np.ndarray, torch.Tensor],
+    n_mels: int = 80,
+    padding: int = 0,
+    device: Optional[Union[str, torch.device]] = None,
+):
+    """
+    Compute the log-Mel spectrogram of the given audio
+
+    Parameters
+    ----------
+    audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
+        The path to an audio file, or a NumPy array or Tensor containing the audio waveform in 16 kHz
+
+    n_mels: int
+        The number of Mel-frequency filters, only 80 and 128 are supported
+
+    padding: int
+        Number of zero samples to pad to the right
+
+    device: Optional[Union[str, torch.device]]
+        If given, the audio tensor is moved to this device before STFT
+
+    Returns
+    -------
+    torch.Tensor, shape = (n_mels, n_frames)
+        A Tensor that contains the Mel spectrogram
+    """
+    if not torch.is_tensor(audio):
+        if isinstance(audio, str):
+            audio = load_audio(audio)
+        audio = torch.from_numpy(audio)
+
+    if device is not None:
+        audio = audio.to(device)
+    if padding > 0:
+        audio = F.pad(audio, (0, padding))
+    window = torch.hann_window(N_FFT).to(audio.device)
+    stft = torch.stft(audio, N_FFT, HOP_LENGTH, window=window, return_complex=True)
+    magnitudes = stft[..., :-1].abs() ** 2
+
+    filters = mel_filters(audio.device, n_mels)
+    mel_spec = filters @ magnitudes
+
+    log_spec = torch.clamp(mel_spec, min=1e-10).log10()
+    log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
+    log_spec = (log_spec + 4.0) / 4.0
+    return log_spec
diff --git a/whisper_service_deploy/whisper/decoding.py b/whisper_service_deploy/whisper/decoding.py
new file mode 100644
index 0000000..49485d0
--- /dev/null
+++ b/whisper_service_deploy/whisper/decoding.py
@@ -0,0 +1,826 @@
+from
dataclasses import dataclass, field, replace +from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Sequence, Tuple, Union + +import numpy as np +import torch +import torch.nn.functional as F +from torch import Tensor +from torch.distributions import Categorical + +from .audio import CHUNK_LENGTH +from .tokenizer import Tokenizer, get_tokenizer +from .utils import compression_ratio + +if TYPE_CHECKING: + from .model import Whisper + + +@torch.no_grad() +def detect_language( + model: "Whisper", mel: Tensor, tokenizer: Tokenizer = None +) -> Tuple[Tensor, List[dict]]: + """ + Detect the spoken language in the audio, and return them as list of strings, along with the ids + of the most probable language tokens and the probability distribution over all language tokens. + This is performed outside the main decode loop in order to not interfere with kv-caching. + + Returns + ------- + language_tokens : Tensor, shape = (n_audio,) + ids of the most probable language tokens, which appears after the startoftranscript token. + language_probs : List[Dict[str, float]], length = n_audio + list of dictionaries containing the probability distribution over all languages. + """ + if tokenizer is None: + tokenizer = get_tokenizer( + model.is_multilingual, num_languages=model.num_languages + ) + if ( + tokenizer.language is None + or tokenizer.language_token not in tokenizer.sot_sequence + ): + raise ValueError( + "This model doesn't have language tokens so it can't perform lang id" + ) + + single = mel.ndim == 2 + if single: + mel = mel.unsqueeze(0) + + # skip encoder forward pass if already-encoded audio features were given + if mel.shape[-2:] != (model.dims.n_audio_ctx, model.dims.n_audio_state): + mel = model.encoder(mel) + + # forward pass using a single token, startoftranscript + n_audio = mel.shape[0] + x = torch.tensor([[tokenizer.sot]] * n_audio).to(mel.device) # [n_audio, 1] + logits = model.logits(x, mel)[:, 0] + + # collect detected languages; suppress all non-language tokens + mask = torch.ones(logits.shape[-1], dtype=torch.bool) + mask[list(tokenizer.all_language_tokens)] = False + logits[:, mask] = -np.inf + language_tokens = logits.argmax(dim=-1) + language_token_probs = logits.softmax(dim=-1).cpu() + language_probs = [ + { + c: language_token_probs[i, j].item() + for j, c in zip(tokenizer.all_language_tokens, tokenizer.all_language_codes) + } + for i in range(n_audio) + ] + + if single: + language_tokens = language_tokens[0] + language_probs = language_probs[0] + + return language_tokens, language_probs + + +@dataclass(frozen=True) +class DecodingOptions: + # whether to perform X->X "transcribe" or X->English "translate" + task: str = "transcribe" + + # language that the audio is in; uses detected language if None + language: Optional[str] = None + + # sampling-related options + temperature: float = 0.0 + sample_len: Optional[int] = None # maximum number of tokens to sample + best_of: Optional[int] = None # number of independent sample trajectories, if t > 0 + beam_size: Optional[int] = None # number of beams in beam search, if t == 0 + patience: Optional[float] = None # patience in beam search (arxiv:2204.05424) + + # "alpha" in Google NMT, or None for length norm, when ranking generations + # to select which to return among the beams or best-of-N samples + length_penalty: Optional[float] = None + + # text or tokens to feed as the prompt or the prefix; for more info: + # https://github.com/openai/whisper/discussions/117#discussioncomment-3727051 + prompt: Optional[Union[str, 
List[int]]] = None # for the previous context + prefix: Optional[Union[str, List[int]]] = None # to prefix the current context + + # list of tokens ids (or comma-separated token ids) to suppress + # "-1" will suppress a set of symbols as defined in `tokenizer.non_speech_tokens()` + suppress_tokens: Optional[Union[str, Iterable[int]]] = "-1" + suppress_blank: bool = True # this will suppress blank outputs + + # timestamp sampling options + without_timestamps: bool = False # use <|notimestamps|> to sample text tokens only + max_initial_timestamp: Optional[float] = 1.0 + + # implementation details + fp16: bool = True # use fp16 for most of the calculation + + +@dataclass(frozen=True) +class DecodingResult: + audio_features: Tensor + language: str + language_probs: Optional[Dict[str, float]] = None + tokens: List[int] = field(default_factory=list) + text: str = "" + avg_logprob: float = np.nan + no_speech_prob: float = np.nan + temperature: float = np.nan + compression_ratio: float = np.nan + + +class Inference: + def logits(self, tokens: Tensor, audio_features: Tensor) -> Tensor: + """Perform a forward pass on the decoder and return per-token logits""" + raise NotImplementedError + + def rearrange_kv_cache(self, source_indices) -> None: + """Update the key-value cache according to the updated beams""" + raise NotImplementedError + + def cleanup_caching(self) -> None: + """Clean up any resources or hooks after decoding is finished""" + pass + + +class PyTorchInference(Inference): + def __init__(self, model: "Whisper", initial_token_length: int): + self.model: "Whisper" = model + self.initial_token_length = initial_token_length + self.kv_cache = {} + self.hooks = [] + + key_modules = [block.attn.key for block in self.model.decoder.blocks] + value_modules = [block.attn.value for block in self.model.decoder.blocks] + self.kv_modules = key_modules + value_modules + + def logits(self, tokens: Tensor, audio_features: Tensor) -> Tensor: + if not self.kv_cache: + self.kv_cache, self.hooks = self.model.install_kv_cache_hooks() + + if tokens.shape[-1] > self.initial_token_length: + # only need to use the last token except in the first forward pass + tokens = tokens[:, -1:] + + return self.model.decoder(tokens, audio_features, kv_cache=self.kv_cache) + + def cleanup_caching(self): + for hook in self.hooks: + hook.remove() + + self.kv_cache = {} + self.hooks = [] + + def rearrange_kv_cache(self, source_indices): + if source_indices != list(range(len(source_indices))): + for module in self.kv_modules: + # update the key/value cache to contain the selected sequences + self.kv_cache[module] = self.kv_cache[module][source_indices].detach() + + +class SequenceRanker: + def rank( + self, tokens: List[List[Tensor]], sum_logprobs: List[List[float]] + ) -> List[int]: + """ + Given a list of groups of samples and their cumulative log probabilities, + return the indices of the samples in each group to select as the final result + """ + raise NotImplementedError + + +class MaximumLikelihoodRanker(SequenceRanker): + """ + Select the sample with the highest log probabilities, penalized using either + a simple length normalization or Google NMT paper's length penalty + """ + + def __init__(self, length_penalty: Optional[float]): + self.length_penalty = length_penalty + + def rank(self, tokens: List[List[Tensor]], sum_logprobs: List[List[float]]): + def scores(logprobs, lengths): + result = [] + for logprob, length in zip(logprobs, lengths): + if self.length_penalty is None: + penalty = length + else: + # from the Google 
NMT paper + penalty = ((5 + length) / 6) ** self.length_penalty + result.append(logprob / penalty) + return result + + # get the sequence with the highest score + lengths = [[len(t) for t in s] for s in tokens] + return [np.argmax(scores(p, l)) for p, l in zip(sum_logprobs, lengths)] + + +class TokenDecoder: + def reset(self): + """Initialize any stateful variables for decoding a new sequence""" + + def update( + self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor + ) -> Tuple[Tensor, bool]: + """Specify how to select the next token, based on the current trace and logits + + Parameters + ---------- + tokens : Tensor, shape = (n_batch, current_sequence_length) + all tokens in the context so far, including the prefix and sot_sequence tokens + + logits : Tensor, shape = (n_batch, vocab_size) + per-token logits of the probability distribution at the current step + + sum_logprobs : Tensor, shape = (n_batch) + cumulative log probabilities for each sequence + + Returns + ------- + tokens : Tensor, shape = (n_batch, current_sequence_length + 1) + the tokens, appended with the selected next token + + completed : bool + True if all sequences has reached the end of text + + """ + raise NotImplementedError + + def finalize( + self, tokens: Tensor, sum_logprobs: Tensor + ) -> Tuple[Sequence[Sequence[Tensor]], List[List[float]]]: + """Finalize search and return the final candidate sequences + + Parameters + ---------- + tokens : Tensor, shape = (n_audio, n_group, current_sequence_length) + all tokens in the context so far, including the prefix and sot_sequence + + sum_logprobs : Tensor, shape = (n_audio, n_group) + cumulative log probabilities for each sequence + + Returns + ------- + tokens : Sequence[Sequence[Tensor]], length = n_audio + sequence of Tensors containing candidate token sequences, for each audio input + + sum_logprobs : List[List[float]], length = n_audio + sequence of cumulative log probabilities corresponding to the above + + """ + raise NotImplementedError + + +class GreedyDecoder(TokenDecoder): + def __init__(self, temperature: float, eot: int): + self.temperature = temperature + self.eot = eot + + def update( + self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor + ) -> Tuple[Tensor, bool]: + if self.temperature == 0: + next_tokens = logits.argmax(dim=-1) + else: + next_tokens = Categorical(logits=logits / self.temperature).sample() + + logprobs = F.log_softmax(logits.float(), dim=-1) + current_logprobs = logprobs[torch.arange(logprobs.shape[0]), next_tokens] + sum_logprobs += current_logprobs * (tokens[:, -1] != self.eot) + + next_tokens[tokens[:, -1] == self.eot] = self.eot + tokens = torch.cat([tokens, next_tokens[:, None]], dim=-1) + + completed = (tokens[:, -1] == self.eot).all() + return tokens, completed + + def finalize(self, tokens: Tensor, sum_logprobs: Tensor): + # make sure each sequence has at least one EOT token at the end + tokens = F.pad(tokens, (0, 1), value=self.eot) + return tokens, sum_logprobs.tolist() + + +class BeamSearchDecoder(TokenDecoder): + def __init__( + self, + beam_size: int, + eot: int, + inference: Inference, + patience: Optional[float] = None, + ): + self.beam_size = beam_size + self.eot = eot + self.inference = inference + self.patience = patience or 1.0 + self.max_candidates: int = round(beam_size * self.patience) + self.finished_sequences = None + + assert ( + self.max_candidates > 0 + ), f"Invalid beam size ({beam_size}) or patience ({patience})" + + def reset(self): + self.finished_sequences = None + + def update( + self, tokens: 
Tensor, logits: Tensor, sum_logprobs: Tensor + ) -> Tuple[Tensor, bool]: + if tokens.shape[0] % self.beam_size != 0: + raise ValueError(f"{tokens.shape}[0] % {self.beam_size} != 0") + + n_audio = tokens.shape[0] // self.beam_size + if self.finished_sequences is None: # for the first update + self.finished_sequences = [{} for _ in range(n_audio)] + + logprobs = F.log_softmax(logits.float(), dim=-1) + next_tokens, source_indices, finished_sequences = [], [], [] + for i in range(n_audio): + scores, sources, finished = {}, {}, {} + + # STEP 1: calculate the cumulative log probabilities for possible candidates + for j in range(self.beam_size): + idx = i * self.beam_size + j + prefix = tokens[idx].tolist() + for logprob, token in zip(*logprobs[idx].topk(self.beam_size + 1)): + new_logprob = (sum_logprobs[idx] + logprob).item() + sequence = tuple(prefix + [token.item()]) + scores[sequence] = new_logprob + sources[sequence] = idx + + # STEP 2: rank the candidates and keep the top beam_size sequences for each audio + saved = 0 + for sequence in sorted(scores, key=scores.get, reverse=True): + if sequence[-1] == self.eot: + finished[sequence] = scores[sequence] + else: + sum_logprobs[len(next_tokens)] = scores[sequence] + next_tokens.append(sequence) + source_indices.append(sources[sequence]) + + saved += 1 + if saved == self.beam_size: + break + + finished_sequences.append(finished) + + tokens = torch.tensor(next_tokens, device=tokens.device) + self.inference.rearrange_kv_cache(source_indices) + + # add newly finished sequences to self.finished_sequences + assert len(self.finished_sequences) == len(finished_sequences) + for previously_finished, newly_finished in zip( + self.finished_sequences, finished_sequences + ): + for seq in sorted(newly_finished, key=newly_finished.get, reverse=True): + if len(previously_finished) >= self.max_candidates: + break # the candidate list is full + previously_finished[seq] = newly_finished[seq] + + # mark as completed if all audio has enough number of samples + completed = all( + len(sequences) >= self.max_candidates + for sequences in self.finished_sequences + ) + return tokens, completed + + def finalize(self, preceding_tokens: Tensor, sum_logprobs: Tensor): + # collect all finished sequences, including patience, and add unfinished ones if not enough + sum_logprobs = sum_logprobs.cpu() + for i, sequences in enumerate(self.finished_sequences): + if ( + len(sequences) < self.beam_size + ): # when not enough sequences are finished + for j in list(np.argsort(sum_logprobs[i]))[::-1]: + sequence = preceding_tokens[i, j].tolist() + [self.eot] + sequences[tuple(sequence)] = sum_logprobs[i][j].item() + if len(sequences) >= self.beam_size: + break + + tokens: List[List[Tensor]] = [ + [torch.tensor(seq) for seq in sequences.keys()] + for sequences in self.finished_sequences + ] + sum_logprobs: List[List[float]] = [ + list(sequences.values()) for sequences in self.finished_sequences + ] + return tokens, sum_logprobs + + +class LogitFilter: + def apply(self, logits: Tensor, tokens: Tensor) -> None: + """Apply any filtering or masking to logits in-place + + Parameters + ---------- + logits : Tensor, shape = (n_batch, vocab_size) + per-token logits of the probability distribution at the current step + + tokens : Tensor, shape = (n_batch, current_sequence_length) + all tokens in the context so far, including the prefix and sot_sequence tokens + + """ + raise NotImplementedError + + +class SuppressBlank(LogitFilter): + def __init__(self, tokenizer: Tokenizer, sample_begin: int): + 
self.tokenizer = tokenizer + self.sample_begin = sample_begin + + def apply(self, logits: Tensor, tokens: Tensor): + if tokens.shape[1] == self.sample_begin: + logits[:, self.tokenizer.encode(" ") + [self.tokenizer.eot]] = -np.inf + + +class SuppressTokens(LogitFilter): + def __init__(self, suppress_tokens: Sequence[int]): + self.suppress_tokens = list(suppress_tokens) + + def apply(self, logits: Tensor, tokens: Tensor): + logits[:, self.suppress_tokens] = -np.inf + + +class ApplyTimestampRules(LogitFilter): + def __init__( + self, + tokenizer: Tokenizer, + sample_begin: int, + max_initial_timestamp_index: Optional[int], + ): + self.tokenizer = tokenizer + self.sample_begin = sample_begin + self.max_initial_timestamp_index = max_initial_timestamp_index + + def apply(self, logits: Tensor, tokens: Tensor): + # suppress <|notimestamps|> which is handled by without_timestamps + if self.tokenizer.no_timestamps is not None: + logits[:, self.tokenizer.no_timestamps] = -np.inf + + # timestamps have to appear in pairs, except directly before EOT; mask logits accordingly + for k in range(tokens.shape[0]): + sampled_tokens = tokens[k, self.sample_begin :] + seq = [t for t in sampled_tokens.tolist()] + last_was_timestamp = ( + len(seq) >= 1 and seq[-1] >= self.tokenizer.timestamp_begin + ) + penultimate_was_timestamp = ( + len(seq) < 2 or seq[-2] >= self.tokenizer.timestamp_begin + ) + + if last_was_timestamp: + if penultimate_was_timestamp: # has to be non-timestamp + logits[k, self.tokenizer.timestamp_begin :] = -np.inf + else: # cannot be normal text tokens + logits[k, : self.tokenizer.eot] = -np.inf + + timestamps = sampled_tokens[ + sampled_tokens.ge(self.tokenizer.timestamp_begin) + ] + if timestamps.numel() > 0: + # timestamps shouldn't decrease; forbid timestamp tokens smaller than the last + # also force each segment to have a nonzero length, to prevent infinite looping + if last_was_timestamp and not penultimate_was_timestamp: + timestamp_last = timestamps[-1] + else: + timestamp_last = timestamps[-1] + 1 + logits[k, self.tokenizer.timestamp_begin : timestamp_last] = -np.inf + + if tokens.shape[1] == self.sample_begin: + # suppress generating non-timestamp tokens at the beginning + logits[:, : self.tokenizer.timestamp_begin] = -np.inf + + # apply the `max_initial_timestamp` option + if self.max_initial_timestamp_index is not None: + last_allowed = ( + self.tokenizer.timestamp_begin + self.max_initial_timestamp_index + ) + logits[:, last_allowed + 1 :] = -np.inf + + # if sum of probability over timestamps is above any other token, sample timestamp + logprobs = F.log_softmax(logits.float(), dim=-1) + for k in range(tokens.shape[0]): + timestamp_logprob = logprobs[k, self.tokenizer.timestamp_begin :].logsumexp( + dim=-1 + ) + max_text_token_logprob = logprobs[k, : self.tokenizer.timestamp_begin].max() + if timestamp_logprob > max_text_token_logprob: + logits[k, : self.tokenizer.timestamp_begin] = -np.inf + + +class DecodingTask: + inference: Inference + sequence_ranker: SequenceRanker + decoder: TokenDecoder + logit_filters: List[LogitFilter] + + def __init__(self, model: "Whisper", options: DecodingOptions): + self.model = model + + language = options.language or "en" + tokenizer = get_tokenizer( + model.is_multilingual, + num_languages=model.num_languages, + language=language, + task=options.task, + ) + self.tokenizer: Tokenizer = tokenizer + self.options: DecodingOptions = self._verify_options(options) + + self.n_group: int = options.beam_size or options.best_of or 1 + self.n_ctx: int = 
model.dims.n_text_ctx + self.sample_len: int = options.sample_len or model.dims.n_text_ctx // 2 + + self.sot_sequence: Tuple[int] = tokenizer.sot_sequence + if self.options.without_timestamps: + self.sot_sequence = tokenizer.sot_sequence_including_notimestamps + + self.initial_tokens: Tuple[int] = self._get_initial_tokens() + self.sample_begin: int = len(self.initial_tokens) + self.sot_index: int = self.initial_tokens.index(tokenizer.sot) + + # inference: implements the forward pass through the decoder, including kv caching + self.inference = PyTorchInference(model, len(self.initial_tokens)) + + # sequence ranker: implements how to rank a group of sampled sequences + self.sequence_ranker = MaximumLikelihoodRanker(options.length_penalty) + + # decoder: implements how to select the next tokens, given the autoregressive distribution + if options.beam_size is not None: + self.decoder = BeamSearchDecoder( + options.beam_size, tokenizer.eot, self.inference, options.patience + ) + else: + self.decoder = GreedyDecoder(options.temperature, tokenizer.eot) + + # logit filters: applies various rules to suppress or penalize certain tokens + self.logit_filters = [] + if self.options.suppress_blank: + self.logit_filters.append(SuppressBlank(self.tokenizer, self.sample_begin)) + if self.options.suppress_tokens: + self.logit_filters.append(SuppressTokens(self._get_suppress_tokens())) + if not options.without_timestamps: + precision = CHUNK_LENGTH / model.dims.n_audio_ctx # usually 0.02 seconds + max_initial_timestamp_index = None + if options.max_initial_timestamp: + max_initial_timestamp_index = round( + self.options.max_initial_timestamp / precision + ) + self.logit_filters.append( + ApplyTimestampRules( + tokenizer, self.sample_begin, max_initial_timestamp_index + ) + ) + + def _verify_options(self, options: DecodingOptions) -> DecodingOptions: + if options.beam_size is not None and options.best_of is not None: + raise ValueError("beam_size and best_of can't be given together") + if options.temperature == 0: + if options.best_of is not None: + raise ValueError("best_of with greedy sampling (T=0) is not compatible") + if options.patience is not None and options.beam_size is None: + raise ValueError("patience requires beam_size to be given") + if options.length_penalty is not None and not ( + 0 <= options.length_penalty <= 1 + ): + raise ValueError("length_penalty (alpha) should be a value between 0 and 1") + + return options + + def _get_initial_tokens(self) -> Tuple[int]: + tokens = list(self.sot_sequence) + + if prefix := self.options.prefix: + prefix_tokens = ( + self.tokenizer.encode(" " + prefix.strip()) + if isinstance(prefix, str) + else prefix + ) + if self.sample_len is not None: + max_prefix_len = self.n_ctx // 2 - self.sample_len + prefix_tokens = prefix_tokens[-max_prefix_len:] + tokens = tokens + prefix_tokens + + if prompt := self.options.prompt: + prompt_tokens = ( + self.tokenizer.encode(" " + prompt.strip()) + if isinstance(prompt, str) + else prompt + ) + tokens = ( + [self.tokenizer.sot_prev] + + prompt_tokens[-(self.n_ctx // 2 - 1) :] + + tokens + ) + + return tuple(tokens) + + def _get_suppress_tokens(self) -> Tuple[int]: + suppress_tokens = self.options.suppress_tokens + + if isinstance(suppress_tokens, str): + suppress_tokens = [int(t) for t in suppress_tokens.split(",")] + + if -1 in suppress_tokens: + suppress_tokens = [t for t in suppress_tokens if t >= 0] + suppress_tokens.extend(self.tokenizer.non_speech_tokens) + elif suppress_tokens is None or len(suppress_tokens) == 0: + 
suppress_tokens = []  # interpret empty string as an empty list
+        else:
+            assert isinstance(suppress_tokens, list), "suppress_tokens must be a list"
+
+        suppress_tokens.extend(
+            [
+                self.tokenizer.transcribe,
+                self.tokenizer.translate,
+                self.tokenizer.sot,
+                self.tokenizer.sot_prev,
+                self.tokenizer.sot_lm,
+            ]
+        )
+        if self.tokenizer.no_speech is not None:
+            # no-speech probability is collected separately
+            suppress_tokens.append(self.tokenizer.no_speech)
+
+        return tuple(sorted(set(suppress_tokens)))
+
+    def _get_audio_features(self, mel: Tensor):
+        if self.options.fp16:
+            mel = mel.half()
+
+        if mel.shape[-2:] == (
+            self.model.dims.n_audio_ctx,
+            self.model.dims.n_audio_state,
+        ):
+            # encoded audio features are given; skip audio encoding
+            audio_features = mel
+        else:
+            audio_features = self.model.encoder(mel)
+
+        if audio_features.dtype != (
+            torch.float16 if self.options.fp16 else torch.float32
+        ):
+            raise TypeError(
+                f"audio_features has an incorrect dtype: {audio_features.dtype}"
+            )
+
+        return audio_features
+
+    def _detect_language(self, audio_features: Tensor, tokens: Tensor):
+        languages = [self.options.language] * audio_features.shape[0]
+        lang_probs = None
+
+        if self.options.language is None or self.options.task == "lang_id":
+            lang_tokens, lang_probs = self.model.detect_language(
+                audio_features, self.tokenizer
+            )
+            languages = [max(probs, key=probs.get) for probs in lang_probs]
+            if self.options.language is None:
+                tokens[:, self.sot_index + 1] = lang_tokens  # write language tokens
+
+        return languages, lang_probs
+
+    def _main_loop(self, audio_features: Tensor, tokens: Tensor):
+        n_batch = tokens.shape[0]
+        sum_logprobs: Tensor = torch.zeros(n_batch, device=audio_features.device)
+        no_speech_probs = [np.nan] * n_batch
+
+        try:
+            for i in range(self.sample_len):
+                logits = self.inference.logits(tokens, audio_features)
+
+                if (
+                    i == 0 and self.tokenizer.no_speech is not None
+                ):  # save no_speech_probs
+                    probs_at_sot = logits[:, self.sot_index].float().softmax(dim=-1)
+                    no_speech_probs = probs_at_sot[:, self.tokenizer.no_speech].tolist()
+
+                # now we need to consider the logits at the last token only
+                logits = logits[:, -1]
+
+                # apply the logit filters, e.g.
for suppressing or applying penalty to + for logit_filter in self.logit_filters: + logit_filter.apply(logits, tokens) + + # expand the tokens tensor with the selected next tokens + tokens, completed = self.decoder.update(tokens, logits, sum_logprobs) + + if completed or tokens.shape[-1] > self.n_ctx: + break + finally: + self.inference.cleanup_caching() + + return tokens, sum_logprobs, no_speech_probs + + @torch.no_grad() + def run(self, mel: Tensor) -> List[DecodingResult]: + self.decoder.reset() + tokenizer: Tokenizer = self.tokenizer + n_audio: int = mel.shape[0] + + audio_features: Tensor = self._get_audio_features(mel) # encoder forward pass + tokens: Tensor = torch.tensor([self.initial_tokens]).repeat(n_audio, 1) + + # detect language if requested, overwriting the language token + languages, language_probs = self._detect_language(audio_features, tokens) + if self.options.task == "lang_id": + return [ + DecodingResult( + audio_features=features, language=language, language_probs=probs + ) + for features, language, probs in zip( + audio_features, languages, language_probs + ) + ] + + # repeat text tensors by the group size, for beam search or best-of-n sampling + tokens = tokens.repeat_interleave(self.n_group, dim=0).to(audio_features.device) + + # call the main sampling loop + tokens, sum_logprobs, no_speech_probs = self._main_loop(audio_features, tokens) + + # reshape the tensors to have (n_audio, n_group) as the first two dimensions + audio_features = audio_features[:: self.n_group] + no_speech_probs = no_speech_probs[:: self.n_group] + assert audio_features.shape[0] == len(no_speech_probs) == n_audio + + tokens = tokens.reshape(n_audio, self.n_group, -1) + sum_logprobs = sum_logprobs.reshape(n_audio, self.n_group) + + # get the final candidates for each group, and slice between the first sampled token and EOT + tokens, sum_logprobs = self.decoder.finalize(tokens, sum_logprobs) + tokens: List[List[Tensor]] = [ + [t[self.sample_begin : (t == tokenizer.eot).nonzero()[0, 0]] for t in s] + for s in tokens + ] + + # select the top-ranked sample in each group + selected = self.sequence_ranker.rank(tokens, sum_logprobs) + tokens: List[List[int]] = [t[i].tolist() for i, t in zip(selected, tokens)] + texts: List[str] = [tokenizer.decode(t).strip() for t in tokens] + + sum_logprobs: List[float] = [lp[i] for i, lp in zip(selected, sum_logprobs)] + avg_logprobs: List[float] = [ + lp / (len(t) + 1) for t, lp in zip(tokens, sum_logprobs) + ] + + fields = ( + texts, + languages, + tokens, + audio_features, + avg_logprobs, + no_speech_probs, + ) + if len(set(map(len, fields))) != 1: + raise RuntimeError(f"inconsistent result lengths: {list(map(len, fields))}") + + return [ + DecodingResult( + audio_features=features, + language=language, + tokens=tokens, + text=text, + avg_logprob=avg_logprob, + no_speech_prob=no_speech_prob, + temperature=self.options.temperature, + compression_ratio=compression_ratio(text), + ) + for text, language, tokens, features, avg_logprob, no_speech_prob in zip( + *fields + ) + ] + + +@torch.no_grad() +def decode( + model: "Whisper", + mel: Tensor, + options: DecodingOptions = DecodingOptions(), + **kwargs, +) -> Union[DecodingResult, List[DecodingResult]]: + """ + Performs decoding of 30-second audio segment(s), provided as Mel spectrogram(s). 
+ + Parameters + ---------- + model: Whisper + the Whisper model instance + + mel: torch.Tensor, shape = (80, 3000) or (*, 80, 3000) + A tensor containing the Mel spectrogram(s) + + options: DecodingOptions + A dataclass that contains all necessary options for decoding 30-second segments + + Returns + ------- + result: Union[DecodingResult, List[DecodingResult]] + The result(s) of decoding contained in `DecodingResult` dataclass instance(s) + """ + if single := mel.ndim == 2: + mel = mel.unsqueeze(0) + + if kwargs: + options = replace(options, **kwargs) + + result = DecodingTask(model, options).run(mel) + + return result[0] if single else result diff --git a/whisper_service_deploy/whisper/model.py b/whisper_service_deploy/whisper/model.py new file mode 100644 index 0000000..e537447 --- /dev/null +++ b/whisper_service_deploy/whisper/model.py @@ -0,0 +1,345 @@ +import base64 +import gzip +from contextlib import contextmanager +from dataclasses import dataclass +from typing import Dict, Iterable, Optional, Tuple + +import numpy as np +import torch +import torch.nn.functional as F +from torch import Tensor, nn + +from .decoding import decode as decode_function +from .decoding import detect_language as detect_language_function +from .transcribe import transcribe as transcribe_function + +try: + from torch.nn.functional import scaled_dot_product_attention + + SDPA_AVAILABLE = True +except (ImportError, RuntimeError, OSError): + scaled_dot_product_attention = None + SDPA_AVAILABLE = False + + +@dataclass +class ModelDimensions: + n_mels: int + n_audio_ctx: int + n_audio_state: int + n_audio_head: int + n_audio_layer: int + n_vocab: int + n_text_ctx: int + n_text_state: int + n_text_head: int + n_text_layer: int + + +class LayerNorm(nn.LayerNorm): + def forward(self, x: Tensor) -> Tensor: + return super().forward(x.float()).type(x.dtype) + + +class Linear(nn.Linear): + def forward(self, x: Tensor) -> Tensor: + return F.linear( + x, + self.weight.to(x.dtype), + None if self.bias is None else self.bias.to(x.dtype), + ) + + +class Conv1d(nn.Conv1d): + def _conv_forward( + self, x: Tensor, weight: Tensor, bias: Optional[Tensor] + ) -> Tensor: + return super()._conv_forward( + x, weight.to(x.dtype), None if bias is None else bias.to(x.dtype) + ) + + +def sinusoids(length, channels, max_timescale=10000): + """Returns sinusoids for positional embedding""" + assert channels % 2 == 0 + log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1) + inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2)) + scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :] + return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1) + + +@contextmanager +def disable_sdpa(): + prev_state = MultiHeadAttention.use_sdpa + try: + MultiHeadAttention.use_sdpa = False + yield + finally: + MultiHeadAttention.use_sdpa = prev_state + + +class MultiHeadAttention(nn.Module): + use_sdpa = True + + def __init__(self, n_state: int, n_head: int): + super().__init__() + self.n_head = n_head + self.query = Linear(n_state, n_state) + self.key = Linear(n_state, n_state, bias=False) + self.value = Linear(n_state, n_state) + self.out = Linear(n_state, n_state) + + def forward( + self, + x: Tensor, + xa: Optional[Tensor] = None, + mask: Optional[Tensor] = None, + kv_cache: Optional[dict] = None, + ): + q = self.query(x) + + if kv_cache is None or xa is None or self.key not in kv_cache: + # hooks, if installed (i.e. 
kv_cache is not None), will prepend the cached kv tensors; + # otherwise, perform key/value projections for self- or cross-attention as usual. + k = self.key(x if xa is None else xa) + v = self.value(x if xa is None else xa) + else: + # for cross-attention, calculate keys and values once and reuse in subsequent calls. + k = kv_cache[self.key] + v = kv_cache[self.value] + + wv, qk = self.qkv_attention(q, k, v, mask) + return self.out(wv), qk + + def qkv_attention( + self, q: Tensor, k: Tensor, v: Tensor, mask: Optional[Tensor] = None + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + n_batch, n_ctx, n_state = q.shape + scale = (n_state // self.n_head) ** -0.25 + q = q.view(*q.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) + k = k.view(*k.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) + v = v.view(*v.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) + + if SDPA_AVAILABLE and MultiHeadAttention.use_sdpa: + a = scaled_dot_product_attention( + q, k, v, is_causal=mask is not None and n_ctx > 1 + ) + out = a.permute(0, 2, 1, 3).flatten(start_dim=2) + qk = None + else: + qk = (q * scale) @ (k * scale).transpose(-1, -2) + if mask is not None: + qk = qk + mask[:n_ctx, :n_ctx] + qk = qk.float() + + w = F.softmax(qk, dim=-1).to(q.dtype) + out = (w @ v).permute(0, 2, 1, 3).flatten(start_dim=2) + qk = qk.detach() + + return out, qk + + +class ResidualAttentionBlock(nn.Module): + def __init__(self, n_state: int, n_head: int, cross_attention: bool = False): + super().__init__() + + self.attn = MultiHeadAttention(n_state, n_head) + self.attn_ln = LayerNorm(n_state) + + self.cross_attn = ( + MultiHeadAttention(n_state, n_head) if cross_attention else None + ) + self.cross_attn_ln = LayerNorm(n_state) if cross_attention else None + + n_mlp = n_state * 4 + self.mlp = nn.Sequential( + Linear(n_state, n_mlp), nn.GELU(), Linear(n_mlp, n_state) + ) + self.mlp_ln = LayerNorm(n_state) + + def forward( + self, + x: Tensor, + xa: Optional[Tensor] = None, + mask: Optional[Tensor] = None, + kv_cache: Optional[dict] = None, + ): + x = x + self.attn(self.attn_ln(x), mask=mask, kv_cache=kv_cache)[0] + if self.cross_attn: + x = x + self.cross_attn(self.cross_attn_ln(x), xa, kv_cache=kv_cache)[0] + x = x + self.mlp(self.mlp_ln(x)) + return x + + +class AudioEncoder(nn.Module): + def __init__( + self, n_mels: int, n_ctx: int, n_state: int, n_head: int, n_layer: int + ): + super().__init__() + self.conv1 = Conv1d(n_mels, n_state, kernel_size=3, padding=1) + self.conv2 = Conv1d(n_state, n_state, kernel_size=3, stride=2, padding=1) + self.register_buffer("positional_embedding", sinusoids(n_ctx, n_state)) + + self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList( + [ResidualAttentionBlock(n_state, n_head) for _ in range(n_layer)] + ) + self.ln_post = LayerNorm(n_state) + + def forward(self, x: Tensor): + """ + x : torch.Tensor, shape = (batch_size, n_mels, n_ctx) + the mel spectrogram of the audio + """ + x = F.gelu(self.conv1(x)) + x = F.gelu(self.conv2(x)) + x = x.permute(0, 2, 1) + + assert x.shape[1:] == self.positional_embedding.shape, "incorrect audio shape" + x = (x + self.positional_embedding).to(x.dtype) + + for block in self.blocks: + x = block(x) + + x = self.ln_post(x) + return x + + +class TextDecoder(nn.Module): + def __init__( + self, n_vocab: int, n_ctx: int, n_state: int, n_head: int, n_layer: int + ): + super().__init__() + + self.token_embedding = nn.Embedding(n_vocab, n_state) + self.positional_embedding = nn.Parameter(torch.empty(n_ctx, n_state)) + + self.blocks: Iterable[ResidualAttentionBlock] = 
nn.ModuleList( + [ + ResidualAttentionBlock(n_state, n_head, cross_attention=True) + for _ in range(n_layer) + ] + ) + self.ln = LayerNorm(n_state) + + mask = torch.empty(n_ctx, n_ctx).fill_(-np.inf).triu_(1) + self.register_buffer("mask", mask, persistent=False) + + def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = None): + """ + x : torch.LongTensor, shape = (batch_size, <= n_ctx) + the text tokens + xa : torch.Tensor, shape = (batch_size, n_audio_ctx, n_audio_state) + the encoded audio features to be attended on + """ + offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0 + x = ( + self.token_embedding(x) + + self.positional_embedding[offset : offset + x.shape[-1]] + ) + x = x.to(xa.dtype) + + for block in self.blocks: + x = block(x, xa, mask=self.mask, kv_cache=kv_cache) + + x = self.ln(x) + logits = ( + x @ torch.transpose(self.token_embedding.weight.to(x.dtype), 0, 1) + ).float() + + return logits + + +class Whisper(nn.Module): + def __init__(self, dims: ModelDimensions): + super().__init__() + self.dims = dims + self.encoder = AudioEncoder( + self.dims.n_mels, + self.dims.n_audio_ctx, + self.dims.n_audio_state, + self.dims.n_audio_head, + self.dims.n_audio_layer, + ) + self.decoder = TextDecoder( + self.dims.n_vocab, + self.dims.n_text_ctx, + self.dims.n_text_state, + self.dims.n_text_head, + self.dims.n_text_layer, + ) + # use the last half among the decoder layers for time alignment by default; + # to use a specific set of heads, see `set_alignment_heads()` below. + all_heads = torch.zeros( + self.dims.n_text_layer, self.dims.n_text_head, dtype=torch.bool + ) + all_heads[self.dims.n_text_layer // 2 :] = True + self.register_buffer("alignment_heads", all_heads.to_sparse(), persistent=False) + + def set_alignment_heads(self, dump: bytes): + array = np.frombuffer( + gzip.decompress(base64.b85decode(dump)), dtype=bool + ).copy() + mask = torch.from_numpy(array).reshape( + self.dims.n_text_layer, self.dims.n_text_head + ) + self.register_buffer("alignment_heads", mask.to_sparse(), persistent=False) + + def embed_audio(self, mel: torch.Tensor): + return self.encoder(mel) + + def logits(self, tokens: torch.Tensor, audio_features: torch.Tensor): + return self.decoder(tokens, audio_features) + + def forward( + self, mel: torch.Tensor, tokens: torch.Tensor + ) -> Dict[str, torch.Tensor]: + return self.decoder(tokens, self.encoder(mel)) + + @property + def device(self): + return next(self.parameters()).device + + @property + def is_multilingual(self): + return self.dims.n_vocab >= 51865 + + @property + def num_languages(self): + return self.dims.n_vocab - 51765 - int(self.is_multilingual) + + def install_kv_cache_hooks(self, cache: Optional[dict] = None): + """ + The `MultiHeadAttention` module optionally accepts `kv_cache` which stores the key and value + tensors calculated for the previous positions. This method returns a dictionary that stores + all caches, and the necessary hooks for the key and value projection modules that save the + intermediate tensors to be reused during later calculations. 
+ + Returns + ------- + cache : Dict[nn.Module, torch.Tensor] + A dictionary object mapping the key/value projection modules to its cache + hooks : List[RemovableHandle] + List of PyTorch RemovableHandle objects to stop the hooks to be called + """ + cache = {**cache} if cache is not None else {} + hooks = [] + + def save_to_cache(module, _, output): + if module not in cache or output.shape[1] > self.dims.n_text_ctx: + # save as-is, for the first token or cross attention + cache[module] = output + else: + cache[module] = torch.cat([cache[module], output], dim=1).detach() + return cache[module] + + def install_hooks(layer: nn.Module): + if isinstance(layer, MultiHeadAttention): + hooks.append(layer.key.register_forward_hook(save_to_cache)) + hooks.append(layer.value.register_forward_hook(save_to_cache)) + + self.decoder.apply(install_hooks) + return cache, hooks + + detect_language = detect_language_function + transcribe = transcribe_function + decode = decode_function diff --git a/whisper_service_deploy/whisper/normalizers/__init__.py b/whisper_service_deploy/whisper/normalizers/__init__.py new file mode 100644 index 0000000..896d5e3 --- /dev/null +++ b/whisper_service_deploy/whisper/normalizers/__init__.py @@ -0,0 +1,2 @@ +from .basic import BasicTextNormalizer as BasicTextNormalizer +from .english import EnglishTextNormalizer as EnglishTextNormalizer diff --git a/whisper_service_deploy/whisper/normalizers/basic.py b/whisper_service_deploy/whisper/normalizers/basic.py new file mode 100644 index 0000000..8690ae7 --- /dev/null +++ b/whisper_service_deploy/whisper/normalizers/basic.py @@ -0,0 +1,80 @@ +import re +import unicodedata + +import regex + +# non-ASCII letters that are not separated by "NFKD" normalization +ADDITIONAL_DIACRITICS = { + "œ": "oe", + "Œ": "OE", + "ø": "o", + "Ø": "O", + "æ": "ae", + "Æ": "AE", + "ß": "ss", + "ẞ": "SS", + "đ": "d", + "Đ": "D", + "ð": "d", + "Ð": "D", + "þ": "th", + "Þ": "th", + "ł": "l", + "Ł": "L", +} + + +def remove_symbols_and_diacritics(s: str, keep=""): + """ + Replace any other markers, symbols, and punctuations with a space, + and drop any diacritics (category 'Mn' and some manual mappings) + """ + return "".join( + ( + c + if c in keep + else ( + ADDITIONAL_DIACRITICS[c] + if c in ADDITIONAL_DIACRITICS + else ( + "" + if unicodedata.category(c) == "Mn" + else " " if unicodedata.category(c)[0] in "MSP" else c + ) + ) + ) + for c in unicodedata.normalize("NFKD", s) + ) + + +def remove_symbols(s: str): + """ + Replace any other markers, symbols, punctuations with a space, keeping diacritics + """ + return "".join( + " " if unicodedata.category(c)[0] in "MSP" else c + for c in unicodedata.normalize("NFKC", s) + ) + + +class BasicTextNormalizer: + def __init__(self, remove_diacritics: bool = False, split_letters: bool = False): + self.clean = ( + remove_symbols_and_diacritics if remove_diacritics else remove_symbols + ) + self.split_letters = split_letters + + def __call__(self, s: str): + s = s.lower() + s = re.sub(r"[<\[][^>\]]*[>\]]", "", s) # remove words between brackets + s = re.sub(r"\(([^)]+?)\)", "", s) # remove words between parenthesis + s = self.clean(s).lower() + + if self.split_letters: + s = " ".join(regex.findall(r"\X", s, regex.U)) + + s = re.sub( + r"\s+", " ", s + ) # replace any successive whitespace characters with a space + + return s diff --git a/whisper_service_deploy/whisper/normalizers/english.json b/whisper_service_deploy/whisper/normalizers/english.json new file mode 100644 index 0000000..74a1c35 --- /dev/null +++ 
b/whisper_service_deploy/whisper/normalizers/english.json @@ -0,0 +1,1741 @@ +{ + "accessorise": "accessorize", + "accessorised": "accessorized", + "accessorises": "accessorizes", + "accessorising": "accessorizing", + "acclimatisation": "acclimatization", + "acclimatise": "acclimatize", + "acclimatised": "acclimatized", + "acclimatises": "acclimatizes", + "acclimatising": "acclimatizing", + "accoutrements": "accouterments", + "aeon": "eon", + "aeons": "eons", + "aerogramme": "aerogram", + "aerogrammes": "aerograms", + "aeroplane": "airplane", + "aeroplanes": "airplanes", + "aesthete": "esthete", + "aesthetes": "esthetes", + "aesthetic": "esthetic", + "aesthetically": "esthetically", + "aesthetics": "esthetics", + "aetiology": "etiology", + "ageing": "aging", + "aggrandisement": "aggrandizement", + "agonise": "agonize", + "agonised": "agonized", + "agonises": "agonizes", + "agonising": "agonizing", + "agonisingly": "agonizingly", + "almanack": "almanac", + "almanacks": "almanacs", + "aluminium": "aluminum", + "amortisable": "amortizable", + "amortisation": "amortization", + "amortisations": "amortizations", + "amortise": "amortize", + "amortised": "amortized", + "amortises": "amortizes", + "amortising": "amortizing", + "amphitheatre": "amphitheater", + "amphitheatres": "amphitheaters", + "anaemia": "anemia", + "anaemic": "anemic", + "anaesthesia": "anesthesia", + "anaesthetic": "anesthetic", + "anaesthetics": "anesthetics", + "anaesthetise": "anesthetize", + "anaesthetised": "anesthetized", + "anaesthetises": "anesthetizes", + "anaesthetising": "anesthetizing", + "anaesthetist": "anesthetist", + "anaesthetists": "anesthetists", + "anaesthetize": "anesthetize", + "anaesthetized": "anesthetized", + "anaesthetizes": "anesthetizes", + "anaesthetizing": "anesthetizing", + "analogue": "analog", + "analogues": "analogs", + "analyse": "analyze", + "analysed": "analyzed", + "analyses": "analyzes", + "analysing": "analyzing", + "anglicise": "anglicize", + "anglicised": "anglicized", + "anglicises": "anglicizes", + "anglicising": "anglicizing", + "annualised": "annualized", + "antagonise": "antagonize", + "antagonised": "antagonized", + "antagonises": "antagonizes", + "antagonising": "antagonizing", + "apologise": "apologize", + "apologised": "apologized", + "apologises": "apologizes", + "apologising": "apologizing", + "appal": "appall", + "appals": "appalls", + "appetiser": "appetizer", + "appetisers": "appetizers", + "appetising": "appetizing", + "appetisingly": "appetizingly", + "arbour": "arbor", + "arbours": "arbors", + "archeological": "archaeological", + "archaeologically": "archeologically", + "archaeologist": "archeologist", + "archaeologists": "archeologists", + "archaeology": "archeology", + "ardour": "ardor", + "armour": "armor", + "armoured": "armored", + "armourer": "armorer", + "armourers": "armorers", + "armouries": "armories", + "armoury": "armory", + "artefact": "artifact", + "artefacts": "artifacts", + "authorise": "authorize", + "authorised": "authorized", + "authorises": "authorizes", + "authorising": "authorizing", + "axe": "ax", + "backpedalled": "backpedaled", + "backpedalling": "backpedaling", + "bannister": "banister", + "bannisters": "banisters", + "baptise": "baptize", + "baptised": "baptized", + "baptises": "baptizes", + "baptising": "baptizing", + "bastardise": "bastardize", + "bastardised": "bastardized", + "bastardises": "bastardizes", + "bastardising": "bastardizing", + "battleax": "battleaxe", + "baulk": "balk", + "baulked": "balked", + "baulking": "balking", + 
"baulks": "balks", + "bedevilled": "bedeviled", + "bedevilling": "bedeviling", + "behaviour": "behavior", + "behavioural": "behavioral", + "behaviourism": "behaviorism", + "behaviourist": "behaviorist", + "behaviourists": "behaviorists", + "behaviours": "behaviors", + "behove": "behoove", + "behoved": "behooved", + "behoves": "behooves", + "bejewelled": "bejeweled", + "belabour": "belabor", + "belaboured": "belabored", + "belabouring": "belaboring", + "belabours": "belabors", + "bevelled": "beveled", + "bevvies": "bevies", + "bevvy": "bevy", + "biassed": "biased", + "biassing": "biasing", + "bingeing": "binging", + "bougainvillaea": "bougainvillea", + "bougainvillaeas": "bougainvilleas", + "bowdlerise": "bowdlerize", + "bowdlerised": "bowdlerized", + "bowdlerises": "bowdlerizes", + "bowdlerising": "bowdlerizing", + "breathalyse": "breathalyze", + "breathalysed": "breathalyzed", + "breathalyser": "breathalyzer", + "breathalysers": "breathalyzers", + "breathalyses": "breathalyzes", + "breathalysing": "breathalyzing", + "brutalise": "brutalize", + "brutalised": "brutalized", + "brutalises": "brutalizes", + "brutalising": "brutalizing", + "busses": "buses", + "bussing": "busing", + "caesarean": "cesarean", + "caesareans": "cesareans", + "calibre": "caliber", + "calibres": "calibers", + "calliper": "caliper", + "callipers": "calipers", + "callisthenics": "calisthenics", + "canalise": "canalize", + "canalised": "canalized", + "canalises": "canalizes", + "canalising": "canalizing", + "cancelation": "cancellation", + "cancelations": "cancellations", + "cancelled": "canceled", + "cancelling": "canceling", + "candour": "candor", + "cannibalise": "cannibalize", + "cannibalised": "cannibalized", + "cannibalises": "cannibalizes", + "cannibalising": "cannibalizing", + "canonise": "canonize", + "canonised": "canonized", + "canonises": "canonizes", + "canonising": "canonizing", + "capitalise": "capitalize", + "capitalised": "capitalized", + "capitalises": "capitalizes", + "capitalising": "capitalizing", + "caramelise": "caramelize", + "caramelised": "caramelized", + "caramelises": "caramelizes", + "caramelising": "caramelizing", + "carbonise": "carbonize", + "carbonised": "carbonized", + "carbonises": "carbonizes", + "carbonising": "carbonizing", + "carolled": "caroled", + "carolling": "caroling", + "catalogue": "catalog", + "catalogued": "cataloged", + "catalogues": "catalogs", + "cataloguing": "cataloging", + "catalyse": "catalyze", + "catalysed": "catalyzed", + "catalyses": "catalyzes", + "catalysing": "catalyzing", + "categorise": "categorize", + "categorised": "categorized", + "categorises": "categorizes", + "categorising": "categorizing", + "cauterise": "cauterize", + "cauterised": "cauterized", + "cauterises": "cauterizes", + "cauterising": "cauterizing", + "cavilled": "caviled", + "cavilling": "caviling", + "centigramme": "centigram", + "centigrammes": "centigrams", + "centilitre": "centiliter", + "centilitres": "centiliters", + "centimetre": "centimeter", + "centimetres": "centimeters", + "centralise": "centralize", + "centralised": "centralized", + "centralises": "centralizes", + "centralising": "centralizing", + "centre": "center", + "centred": "centered", + "centrefold": "centerfold", + "centrefolds": "centerfolds", + "centrepiece": "centerpiece", + "centrepieces": "centerpieces", + "centres": "centers", + "channelled": "channeled", + "channelling": "channeling", + "characterise": "characterize", + "characterised": "characterized", + "characterises": "characterizes", + "characterising": 
"characterizing", + "cheque": "check", + "chequebook": "checkbook", + "chequebooks": "checkbooks", + "chequered": "checkered", + "cheques": "checks", + "chilli": "chili", + "chimaera": "chimera", + "chimaeras": "chimeras", + "chiselled": "chiseled", + "chiselling": "chiseling", + "circularise": "circularize", + "circularised": "circularized", + "circularises": "circularizes", + "circularising": "circularizing", + "civilise": "civilize", + "civilised": "civilized", + "civilises": "civilizes", + "civilising": "civilizing", + "clamour": "clamor", + "clamoured": "clamored", + "clamouring": "clamoring", + "clamours": "clamors", + "clangour": "clangor", + "clarinettist": "clarinetist", + "clarinettists": "clarinetists", + "collectivise": "collectivize", + "collectivised": "collectivized", + "collectivises": "collectivizes", + "collectivising": "collectivizing", + "colonisation": "colonization", + "colonise": "colonize", + "colonised": "colonized", + "coloniser": "colonizer", + "colonisers": "colonizers", + "colonises": "colonizes", + "colonising": "colonizing", + "colour": "color", + "colourant": "colorant", + "colourants": "colorants", + "coloured": "colored", + "coloureds": "coloreds", + "colourful": "colorful", + "colourfully": "colorfully", + "colouring": "coloring", + "colourize": "colorize", + "colourized": "colorized", + "colourizes": "colorizes", + "colourizing": "colorizing", + "colourless": "colorless", + "colours": "colors", + "commercialise": "commercialize", + "commercialised": "commercialized", + "commercialises": "commercializes", + "commercialising": "commercializing", + "compartmentalise": "compartmentalize", + "compartmentalised": "compartmentalized", + "compartmentalises": "compartmentalizes", + "compartmentalising": "compartmentalizing", + "computerise": "computerize", + "computerised": "computerized", + "computerises": "computerizes", + "computerising": "computerizing", + "conceptualise": "conceptualize", + "conceptualised": "conceptualized", + "conceptualises": "conceptualizes", + "conceptualising": "conceptualizing", + "connexion": "connection", + "connexions": "connections", + "contextualise": "contextualize", + "contextualised": "contextualized", + "contextualises": "contextualizes", + "contextualising": "contextualizing", + "cosier": "cozier", + "cosies": "cozies", + "cosiest": "coziest", + "cosily": "cozily", + "cosiness": "coziness", + "cosy": "cozy", + "councillor": "councilor", + "councillors": "councilors", + "counselled": "counseled", + "counselling": "counseling", + "counsellor": "counselor", + "counsellors": "counselors", + "crenelated": "crenellated", + "criminalise": "criminalize", + "criminalised": "criminalized", + "criminalises": "criminalizes", + "criminalising": "criminalizing", + "criticise": "criticize", + "criticised": "criticized", + "criticises": "criticizes", + "criticising": "criticizing", + "crueller": "crueler", + "cruellest": "cruelest", + "crystallisation": "crystallization", + "crystallise": "crystallize", + "crystallised": "crystallized", + "crystallises": "crystallizes", + "crystallising": "crystallizing", + "cudgelled": "cudgeled", + "cudgelling": "cudgeling", + "customise": "customize", + "customised": "customized", + "customises": "customizes", + "customising": "customizing", + "cypher": "cipher", + "cyphers": "ciphers", + "decentralisation": "decentralization", + "decentralise": "decentralize", + "decentralised": "decentralized", + "decentralises": "decentralizes", + "decentralising": "decentralizing", + "decriminalisation": 
"decriminalization", + "decriminalise": "decriminalize", + "decriminalised": "decriminalized", + "decriminalises": "decriminalizes", + "decriminalising": "decriminalizing", + "defence": "defense", + "defenceless": "defenseless", + "defences": "defenses", + "dehumanisation": "dehumanization", + "dehumanise": "dehumanize", + "dehumanised": "dehumanized", + "dehumanises": "dehumanizes", + "dehumanising": "dehumanizing", + "demeanour": "demeanor", + "demilitarisation": "demilitarization", + "demilitarise": "demilitarize", + "demilitarised": "demilitarized", + "demilitarises": "demilitarizes", + "demilitarising": "demilitarizing", + "demobilisation": "demobilization", + "demobilise": "demobilize", + "demobilised": "demobilized", + "demobilises": "demobilizes", + "demobilising": "demobilizing", + "democratisation": "democratization", + "democratise": "democratize", + "democratised": "democratized", + "democratises": "democratizes", + "democratising": "democratizing", + "demonise": "demonize", + "demonised": "demonized", + "demonises": "demonizes", + "demonising": "demonizing", + "demoralisation": "demoralization", + "demoralise": "demoralize", + "demoralised": "demoralized", + "demoralises": "demoralizes", + "demoralising": "demoralizing", + "denationalisation": "denationalization", + "denationalise": "denationalize", + "denationalised": "denationalized", + "denationalises": "denationalizes", + "denationalising": "denationalizing", + "deodorise": "deodorize", + "deodorised": "deodorized", + "deodorises": "deodorizes", + "deodorising": "deodorizing", + "depersonalise": "depersonalize", + "depersonalised": "depersonalized", + "depersonalises": "depersonalizes", + "depersonalising": "depersonalizing", + "deputise": "deputize", + "deputised": "deputized", + "deputises": "deputizes", + "deputising": "deputizing", + "desensitisation": "desensitization", + "desensitise": "desensitize", + "desensitised": "desensitized", + "desensitises": "desensitizes", + "desensitising": "desensitizing", + "destabilisation": "destabilization", + "destabilise": "destabilize", + "destabilised": "destabilized", + "destabilises": "destabilizes", + "destabilising": "destabilizing", + "dialled": "dialed", + "dialling": "dialing", + "dialogue": "dialog", + "dialogues": "dialogs", + "diarrhoea": "diarrhea", + "digitise": "digitize", + "digitised": "digitized", + "digitises": "digitizes", + "digitising": "digitizing", + "disc": "disk", + "discolour": "discolor", + "discoloured": "discolored", + "discolouring": "discoloring", + "discolours": "discolors", + "discs": "disks", + "disembowelled": "disemboweled", + "disembowelling": "disemboweling", + "disfavour": "disfavor", + "dishevelled": "disheveled", + "dishonour": "dishonor", + "dishonourable": "dishonorable", + "dishonourably": "dishonorably", + "dishonoured": "dishonored", + "dishonouring": "dishonoring", + "dishonours": "dishonors", + "disorganisation": "disorganization", + "disorganised": "disorganized", + "distil": "distill", + "distils": "distills", + "dramatisation": "dramatization", + "dramatisations": "dramatizations", + "dramatise": "dramatize", + "dramatised": "dramatized", + "dramatises": "dramatizes", + "dramatising": "dramatizing", + "draught": "draft", + "draughtboard": "draftboard", + "draughtboards": "draftboards", + "draughtier": "draftier", + "draughtiest": "draftiest", + "draughts": "drafts", + "draughtsman": "draftsman", + "draughtsmanship": "draftsmanship", + "draughtsmen": "draftsmen", + "draughtswoman": "draftswoman", + "draughtswomen": "draftswomen", + 
"draughty": "drafty", + "drivelled": "driveled", + "drivelling": "driveling", + "duelled": "dueled", + "duelling": "dueling", + "economise": "economize", + "economised": "economized", + "economises": "economizes", + "economising": "economizing", + "edoema": "edema", + "editorialise": "editorialize", + "editorialised": "editorialized", + "editorialises": "editorializes", + "editorialising": "editorializing", + "empathise": "empathize", + "empathised": "empathized", + "empathises": "empathizes", + "empathising": "empathizing", + "emphasise": "emphasize", + "emphasised": "emphasized", + "emphasises": "emphasizes", + "emphasising": "emphasizing", + "enamelled": "enameled", + "enamelling": "enameling", + "enamoured": "enamored", + "encyclopaedia": "encyclopedia", + "encyclopaedias": "encyclopedias", + "encyclopaedic": "encyclopedic", + "endeavour": "endeavor", + "endeavoured": "endeavored", + "endeavouring": "endeavoring", + "endeavours": "endeavors", + "energise": "energize", + "energised": "energized", + "energises": "energizes", + "energising": "energizing", + "enrol": "enroll", + "enrols": "enrolls", + "enthral": "enthrall", + "enthrals": "enthralls", + "epaulette": "epaulet", + "epaulettes": "epaulets", + "epicentre": "epicenter", + "epicentres": "epicenters", + "epilogue": "epilog", + "epilogues": "epilogs", + "epitomise": "epitomize", + "epitomised": "epitomized", + "epitomises": "epitomizes", + "epitomising": "epitomizing", + "equalisation": "equalization", + "equalise": "equalize", + "equalised": "equalized", + "equaliser": "equalizer", + "equalisers": "equalizers", + "equalises": "equalizes", + "equalising": "equalizing", + "eulogise": "eulogize", + "eulogised": "eulogized", + "eulogises": "eulogizes", + "eulogising": "eulogizing", + "evangelise": "evangelize", + "evangelised": "evangelized", + "evangelises": "evangelizes", + "evangelising": "evangelizing", + "exorcise": "exorcize", + "exorcised": "exorcized", + "exorcises": "exorcizes", + "exorcising": "exorcizing", + "extemporisation": "extemporization", + "extemporise": "extemporize", + "extemporised": "extemporized", + "extemporises": "extemporizes", + "extemporising": "extemporizing", + "externalisation": "externalization", + "externalisations": "externalizations", + "externalise": "externalize", + "externalised": "externalized", + "externalises": "externalizes", + "externalising": "externalizing", + "factorise": "factorize", + "factorised": "factorized", + "factorises": "factorizes", + "factorising": "factorizing", + "faecal": "fecal", + "faeces": "feces", + "familiarisation": "familiarization", + "familiarise": "familiarize", + "familiarised": "familiarized", + "familiarises": "familiarizes", + "familiarising": "familiarizing", + "fantasise": "fantasize", + "fantasised": "fantasized", + "fantasises": "fantasizes", + "fantasising": "fantasizing", + "favour": "favor", + "favourable": "favorable", + "favourably": "favorably", + "favoured": "favored", + "favouring": "favoring", + "favourite": "favorite", + "favourites": "favorites", + "favouritism": "favoritism", + "favours": "favors", + "feminise": "feminize", + "feminised": "feminized", + "feminises": "feminizes", + "feminising": "feminizing", + "fertilisation": "fertilization", + "fertilise": "fertilize", + "fertilised": "fertilized", + "fertiliser": "fertilizer", + "fertilisers": "fertilizers", + "fertilises": "fertilizes", + "fertilising": "fertilizing", + "fervour": "fervor", + "fibre": "fiber", + "fibreglass": "fiberglass", + "fibres": "fibers", + "fictionalisation": 
"fictionalization", + "fictionalisations": "fictionalizations", + "fictionalise": "fictionalize", + "fictionalised": "fictionalized", + "fictionalises": "fictionalizes", + "fictionalising": "fictionalizing", + "fillet": "filet", + "filleted": "fileted", + "filleting": "fileting", + "fillets": "filets", + "finalisation": "finalization", + "finalise": "finalize", + "finalised": "finalized", + "finalises": "finalizes", + "finalising": "finalizing", + "flautist": "flutist", + "flautists": "flutists", + "flavour": "flavor", + "flavoured": "flavored", + "flavouring": "flavoring", + "flavourings": "flavorings", + "flavourless": "flavorless", + "flavours": "flavors", + "flavoursome": "flavorsome", + "flyer / flier": "flier / flyer", + "foetal": "fetal", + "foetid": "fetid", + "foetus": "fetus", + "foetuses": "fetuses", + "formalisation": "formalization", + "formalise": "formalize", + "formalised": "formalized", + "formalises": "formalizes", + "formalising": "formalizing", + "fossilisation": "fossilization", + "fossilise": "fossilize", + "fossilised": "fossilized", + "fossilises": "fossilizes", + "fossilising": "fossilizing", + "fraternisation": "fraternization", + "fraternise": "fraternize", + "fraternised": "fraternized", + "fraternises": "fraternizes", + "fraternising": "fraternizing", + "fulfil": "fulfill", + "fulfilment": "fulfillment", + "fulfils": "fulfills", + "funnelled": "funneled", + "funnelling": "funneling", + "galvanise": "galvanize", + "galvanised": "galvanized", + "galvanises": "galvanizes", + "galvanising": "galvanizing", + "gambolled": "gamboled", + "gambolling": "gamboling", + "gaol": "jail", + "gaolbird": "jailbird", + "gaolbirds": "jailbirds", + "gaolbreak": "jailbreak", + "gaolbreaks": "jailbreaks", + "gaoled": "jailed", + "gaoler": "jailer", + "gaolers": "jailers", + "gaoling": "jailing", + "gaols": "jails", + "gasses": "gases", + "gage": "gauge", + "gaged": "gauged", + "gages": "gauges", + "gaging": "gauging", + "generalisation": "generalization", + "generalisations": "generalizations", + "generalise": "generalize", + "generalised": "generalized", + "generalises": "generalizes", + "generalising": "generalizing", + "ghettoise": "ghettoize", + "ghettoised": "ghettoized", + "ghettoises": "ghettoizes", + "ghettoising": "ghettoizing", + "gipsies": "gypsies", + "glamorise": "glamorize", + "glamorised": "glamorized", + "glamorises": "glamorizes", + "glamorising": "glamorizing", + "glamor": "glamour", + "globalisation": "globalization", + "globalise": "globalize", + "globalised": "globalized", + "globalises": "globalizes", + "globalising": "globalizing", + "glueing": "gluing", + "goitre": "goiter", + "goitres": "goiters", + "gonorrhoea": "gonorrhea", + "gramme": "gram", + "grammes": "grams", + "gravelled": "graveled", + "grey": "gray", + "greyed": "grayed", + "greying": "graying", + "greyish": "grayish", + "greyness": "grayness", + "greys": "grays", + "grovelled": "groveled", + "grovelling": "groveling", + "groyne": "groin", + "groynes": "groins", + "gruelling": "grueling", + "gruellingly": "gruelingly", + "gryphon": "griffin", + "gryphons": "griffins", + "gynaecological": "gynecological", + "gynaecologist": "gynecologist", + "gynaecologists": "gynecologists", + "gynaecology": "gynecology", + "haematological": "hematological", + "haematologist": "hematologist", + "haematologists": "hematologists", + "haematology": "hematology", + "haemoglobin": "hemoglobin", + "haemophilia": "hemophilia", + "haemophiliac": "hemophiliac", + "haemophiliacs": "hemophiliacs", + "haemorrhage": 
"hemorrhage", + "haemorrhaged": "hemorrhaged", + "haemorrhages": "hemorrhages", + "haemorrhaging": "hemorrhaging", + "haemorrhoids": "hemorrhoids", + "harbour": "harbor", + "harboured": "harbored", + "harbouring": "harboring", + "harbours": "harbors", + "harmonisation": "harmonization", + "harmonise": "harmonize", + "harmonised": "harmonized", + "harmonises": "harmonizes", + "harmonising": "harmonizing", + "homoeopath": "homeopath", + "homoeopathic": "homeopathic", + "homoeopaths": "homeopaths", + "homoeopathy": "homeopathy", + "homogenise": "homogenize", + "homogenised": "homogenized", + "homogenises": "homogenizes", + "homogenising": "homogenizing", + "honour": "honor", + "honourable": "honorable", + "honourably": "honorably", + "honoured": "honored", + "honouring": "honoring", + "honours": "honors", + "hospitalisation": "hospitalization", + "hospitalise": "hospitalize", + "hospitalised": "hospitalized", + "hospitalises": "hospitalizes", + "hospitalising": "hospitalizing", + "humanise": "humanize", + "humanised": "humanized", + "humanises": "humanizes", + "humanising": "humanizing", + "humour": "humor", + "humoured": "humored", + "humouring": "humoring", + "humourless": "humorless", + "humours": "humors", + "hybridise": "hybridize", + "hybridised": "hybridized", + "hybridises": "hybridizes", + "hybridising": "hybridizing", + "hypnotise": "hypnotize", + "hypnotised": "hypnotized", + "hypnotises": "hypnotizes", + "hypnotising": "hypnotizing", + "hypothesise": "hypothesize", + "hypothesised": "hypothesized", + "hypothesises": "hypothesizes", + "hypothesising": "hypothesizing", + "idealisation": "idealization", + "idealise": "idealize", + "idealised": "idealized", + "idealises": "idealizes", + "idealising": "idealizing", + "idolise": "idolize", + "idolised": "idolized", + "idolises": "idolizes", + "idolising": "idolizing", + "immobilisation": "immobilization", + "immobilise": "immobilize", + "immobilised": "immobilized", + "immobiliser": "immobilizer", + "immobilisers": "immobilizers", + "immobilises": "immobilizes", + "immobilising": "immobilizing", + "immortalise": "immortalize", + "immortalised": "immortalized", + "immortalises": "immortalizes", + "immortalising": "immortalizing", + "immunisation": "immunization", + "immunise": "immunize", + "immunised": "immunized", + "immunises": "immunizes", + "immunising": "immunizing", + "impanelled": "impaneled", + "impanelling": "impaneling", + "imperilled": "imperiled", + "imperilling": "imperiling", + "individualise": "individualize", + "individualised": "individualized", + "individualises": "individualizes", + "individualising": "individualizing", + "industrialise": "industrialize", + "industrialised": "industrialized", + "industrialises": "industrializes", + "industrialising": "industrializing", + "inflexion": "inflection", + "inflexions": "inflections", + "initialise": "initialize", + "initialised": "initialized", + "initialises": "initializes", + "initialising": "initializing", + "initialled": "initialed", + "initialling": "initialing", + "instal": "install", + "instalment": "installment", + "instalments": "installments", + "instals": "installs", + "instil": "instill", + "instils": "instills", + "institutionalisation": "institutionalization", + "institutionalise": "institutionalize", + "institutionalised": "institutionalized", + "institutionalises": "institutionalizes", + "institutionalising": "institutionalizing", + "intellectualise": "intellectualize", + "intellectualised": "intellectualized", + "intellectualises": "intellectualizes", + 
"intellectualising": "intellectualizing", + "internalisation": "internalization", + "internalise": "internalize", + "internalised": "internalized", + "internalises": "internalizes", + "internalising": "internalizing", + "internationalisation": "internationalization", + "internationalise": "internationalize", + "internationalised": "internationalized", + "internationalises": "internationalizes", + "internationalising": "internationalizing", + "ionisation": "ionization", + "ionise": "ionize", + "ionised": "ionized", + "ioniser": "ionizer", + "ionisers": "ionizers", + "ionises": "ionizes", + "ionising": "ionizing", + "italicise": "italicize", + "italicised": "italicized", + "italicises": "italicizes", + "italicising": "italicizing", + "itemise": "itemize", + "itemised": "itemized", + "itemises": "itemizes", + "itemising": "itemizing", + "jeopardise": "jeopardize", + "jeopardised": "jeopardized", + "jeopardises": "jeopardizes", + "jeopardising": "jeopardizing", + "jewelled": "jeweled", + "jeweller": "jeweler", + "jewellers": "jewelers", + "jewellery": "jewelry", + "judgement": "judgment", + "kilogramme": "kilogram", + "kilogrammes": "kilograms", + "kilometre": "kilometer", + "kilometres": "kilometers", + "labelled": "labeled", + "labelling": "labeling", + "labour": "labor", + "laboured": "labored", + "labourer": "laborer", + "labourers": "laborers", + "labouring": "laboring", + "labours": "labors", + "lacklustre": "lackluster", + "legalisation": "legalization", + "legalise": "legalize", + "legalised": "legalized", + "legalises": "legalizes", + "legalising": "legalizing", + "legitimise": "legitimize", + "legitimised": "legitimized", + "legitimises": "legitimizes", + "legitimising": "legitimizing", + "leukaemia": "leukemia", + "levelled": "leveled", + "leveller": "leveler", + "levellers": "levelers", + "levelling": "leveling", + "libelled": "libeled", + "libelling": "libeling", + "libellous": "libelous", + "liberalisation": "liberalization", + "liberalise": "liberalize", + "liberalised": "liberalized", + "liberalises": "liberalizes", + "liberalising": "liberalizing", + "licence": "license", + "licenced": "licensed", + "licences": "licenses", + "licencing": "licensing", + "likeable": "likable", + "lionisation": "lionization", + "lionise": "lionize", + "lionised": "lionized", + "lionises": "lionizes", + "lionising": "lionizing", + "liquidise": "liquidize", + "liquidised": "liquidized", + "liquidiser": "liquidizer", + "liquidisers": "liquidizers", + "liquidises": "liquidizes", + "liquidising": "liquidizing", + "litre": "liter", + "litres": "liters", + "localise": "localize", + "localised": "localized", + "localises": "localizes", + "localising": "localizing", + "louvre": "louver", + "louvred": "louvered", + "louvres": "louvers", + "lustre": "luster", + "magnetise": "magnetize", + "magnetised": "magnetized", + "magnetises": "magnetizes", + "magnetising": "magnetizing", + "manoeuvrability": "maneuverability", + "manoeuvrable": "maneuverable", + "manoeuvre": "maneuver", + "manoeuvred": "maneuvered", + "manoeuvres": "maneuvers", + "manoeuvring": "maneuvering", + "manoeuvrings": "maneuverings", + "marginalisation": "marginalization", + "marginalise": "marginalize", + "marginalised": "marginalized", + "marginalises": "marginalizes", + "marginalising": "marginalizing", + "marshalled": "marshaled", + "marshalling": "marshaling", + "marvelled": "marveled", + "marvelling": "marveling", + "marvellous": "marvelous", + "marvellously": "marvelously", + "materialisation": "materialization", + "materialise": 
"materialize", + "materialised": "materialized", + "materialises": "materializes", + "materialising": "materializing", + "maximisation": "maximization", + "maximise": "maximize", + "maximised": "maximized", + "maximises": "maximizes", + "maximising": "maximizing", + "meagre": "meager", + "mechanisation": "mechanization", + "mechanise": "mechanize", + "mechanised": "mechanized", + "mechanises": "mechanizes", + "mechanising": "mechanizing", + "mediaeval": "medieval", + "memorialise": "memorialize", + "memorialised": "memorialized", + "memorialises": "memorializes", + "memorialising": "memorializing", + "memorise": "memorize", + "memorised": "memorized", + "memorises": "memorizes", + "memorising": "memorizing", + "mesmerise": "mesmerize", + "mesmerised": "mesmerized", + "mesmerises": "mesmerizes", + "mesmerising": "mesmerizing", + "metabolise": "metabolize", + "metabolised": "metabolized", + "metabolises": "metabolizes", + "metabolising": "metabolizing", + "metre": "meter", + "metres": "meters", + "micrometre": "micrometer", + "micrometres": "micrometers", + "militarise": "militarize", + "militarised": "militarized", + "militarises": "militarizes", + "militarising": "militarizing", + "milligramme": "milligram", + "milligrammes": "milligrams", + "millilitre": "milliliter", + "millilitres": "milliliters", + "millimetre": "millimeter", + "millimetres": "millimeters", + "miniaturisation": "miniaturization", + "miniaturise": "miniaturize", + "miniaturised": "miniaturized", + "miniaturises": "miniaturizes", + "miniaturising": "miniaturizing", + "minibusses": "minibuses", + "minimise": "minimize", + "minimised": "minimized", + "minimises": "minimizes", + "minimising": "minimizing", + "misbehaviour": "misbehavior", + "misdemeanour": "misdemeanor", + "misdemeanours": "misdemeanors", + "misspelt": "misspelled", + "mitre": "miter", + "mitres": "miters", + "mobilisation": "mobilization", + "mobilise": "mobilize", + "mobilised": "mobilized", + "mobilises": "mobilizes", + "mobilising": "mobilizing", + "modelled": "modeled", + "modeller": "modeler", + "modellers": "modelers", + "modelling": "modeling", + "modernise": "modernize", + "modernised": "modernized", + "modernises": "modernizes", + "modernising": "modernizing", + "moisturise": "moisturize", + "moisturised": "moisturized", + "moisturiser": "moisturizer", + "moisturisers": "moisturizers", + "moisturises": "moisturizes", + "moisturising": "moisturizing", + "monologue": "monolog", + "monologues": "monologs", + "monopolisation": "monopolization", + "monopolise": "monopolize", + "monopolised": "monopolized", + "monopolises": "monopolizes", + "monopolising": "monopolizing", + "moralise": "moralize", + "moralised": "moralized", + "moralises": "moralizes", + "moralising": "moralizing", + "motorised": "motorized", + "mould": "mold", + "moulded": "molded", + "moulder": "molder", + "mouldered": "moldered", + "mouldering": "moldering", + "moulders": "molders", + "mouldier": "moldier", + "mouldiest": "moldiest", + "moulding": "molding", + "mouldings": "moldings", + "moulds": "molds", + "mouldy": "moldy", + "moult": "molt", + "moulted": "molted", + "moulting": "molting", + "moults": "molts", + "moustache": "mustache", + "moustached": "mustached", + "moustaches": "mustaches", + "moustachioed": "mustachioed", + "multicoloured": "multicolored", + "nationalisation": "nationalization", + "nationalisations": "nationalizations", + "nationalise": "nationalize", + "nationalised": "nationalized", + "nationalises": "nationalizes", + "nationalising": "nationalizing", + 
"naturalisation": "naturalization", + "naturalise": "naturalize", + "naturalised": "naturalized", + "naturalises": "naturalizes", + "naturalising": "naturalizing", + "neighbour": "neighbor", + "neighbourhood": "neighborhood", + "neighbourhoods": "neighborhoods", + "neighbouring": "neighboring", + "neighbourliness": "neighborliness", + "neighbourly": "neighborly", + "neighbours": "neighbors", + "neutralisation": "neutralization", + "neutralise": "neutralize", + "neutralised": "neutralized", + "neutralises": "neutralizes", + "neutralising": "neutralizing", + "normalisation": "normalization", + "normalise": "normalize", + "normalised": "normalized", + "normalises": "normalizes", + "normalising": "normalizing", + "odour": "odor", + "odourless": "odorless", + "odours": "odors", + "oesophagus": "esophagus", + "oesophaguses": "esophaguses", + "oestrogen": "estrogen", + "offence": "offense", + "offences": "offenses", + "omelette": "omelet", + "omelettes": "omelets", + "optimise": "optimize", + "optimised": "optimized", + "optimises": "optimizes", + "optimising": "optimizing", + "organisation": "organization", + "organisational": "organizational", + "organisations": "organizations", + "organise": "organize", + "organised": "organized", + "organiser": "organizer", + "organisers": "organizers", + "organises": "organizes", + "organising": "organizing", + "orthopaedic": "orthopedic", + "orthopaedics": "orthopedics", + "ostracise": "ostracize", + "ostracised": "ostracized", + "ostracises": "ostracizes", + "ostracising": "ostracizing", + "outmanoeuvre": "outmaneuver", + "outmanoeuvred": "outmaneuvered", + "outmanoeuvres": "outmaneuvers", + "outmanoeuvring": "outmaneuvering", + "overemphasise": "overemphasize", + "overemphasised": "overemphasized", + "overemphasises": "overemphasizes", + "overemphasising": "overemphasizing", + "oxidisation": "oxidization", + "oxidise": "oxidize", + "oxidised": "oxidized", + "oxidises": "oxidizes", + "oxidising": "oxidizing", + "paederast": "pederast", + "paederasts": "pederasts", + "paediatric": "pediatric", + "paediatrician": "pediatrician", + "paediatricians": "pediatricians", + "paediatrics": "pediatrics", + "paedophile": "pedophile", + "paedophiles": "pedophiles", + "paedophilia": "pedophilia", + "palaeolithic": "paleolithic", + "palaeontologist": "paleontologist", + "palaeontologists": "paleontologists", + "palaeontology": "paleontology", + "panelled": "paneled", + "panelling": "paneling", + "panellist": "panelist", + "panellists": "panelists", + "paralyse": "paralyze", + "paralysed": "paralyzed", + "paralyses": "paralyzes", + "paralysing": "paralyzing", + "parcelled": "parceled", + "parcelling": "parceling", + "parlour": "parlor", + "parlours": "parlors", + "particularise": "particularize", + "particularised": "particularized", + "particularises": "particularizes", + "particularising": "particularizing", + "passivisation": "passivization", + "passivise": "passivize", + "passivised": "passivized", + "passivises": "passivizes", + "passivising": "passivizing", + "pasteurisation": "pasteurization", + "pasteurise": "pasteurize", + "pasteurised": "pasteurized", + "pasteurises": "pasteurizes", + "pasteurising": "pasteurizing", + "patronise": "patronize", + "patronised": "patronized", + "patronises": "patronizes", + "patronising": "patronizing", + "patronisingly": "patronizingly", + "pedalled": "pedaled", + "pedalling": "pedaling", + "pedestrianisation": "pedestrianization", + "pedestrianise": "pedestrianize", + "pedestrianised": "pedestrianized", + "pedestrianises": 
"pedestrianizes", + "pedestrianising": "pedestrianizing", + "penalise": "penalize", + "penalised": "penalized", + "penalises": "penalizes", + "penalising": "penalizing", + "pencilled": "penciled", + "pencilling": "penciling", + "personalise": "personalize", + "personalised": "personalized", + "personalises": "personalizes", + "personalising": "personalizing", + "pharmacopoeia": "pharmacopeia", + "pharmacopoeias": "pharmacopeias", + "philosophise": "philosophize", + "philosophised": "philosophized", + "philosophises": "philosophizes", + "philosophising": "philosophizing", + "philtre": "filter", + "philtres": "filters", + "phoney": "phony", + "plagiarise": "plagiarize", + "plagiarised": "plagiarized", + "plagiarises": "plagiarizes", + "plagiarising": "plagiarizing", + "plough": "plow", + "ploughed": "plowed", + "ploughing": "plowing", + "ploughman": "plowman", + "ploughmen": "plowmen", + "ploughs": "plows", + "ploughshare": "plowshare", + "ploughshares": "plowshares", + "polarisation": "polarization", + "polarise": "polarize", + "polarised": "polarized", + "polarises": "polarizes", + "polarising": "polarizing", + "politicisation": "politicization", + "politicise": "politicize", + "politicised": "politicized", + "politicises": "politicizes", + "politicising": "politicizing", + "popularisation": "popularization", + "popularise": "popularize", + "popularised": "popularized", + "popularises": "popularizes", + "popularising": "popularizing", + "pouffe": "pouf", + "pouffes": "poufs", + "practise": "practice", + "practised": "practiced", + "practises": "practices", + "practising": "practicing", + "praesidium": "presidium", + "praesidiums": "presidiums", + "pressurisation": "pressurization", + "pressurise": "pressurize", + "pressurised": "pressurized", + "pressurises": "pressurizes", + "pressurising": "pressurizing", + "pretence": "pretense", + "pretences": "pretenses", + "primaeval": "primeval", + "prioritisation": "prioritization", + "prioritise": "prioritize", + "prioritised": "prioritized", + "prioritises": "prioritizes", + "prioritising": "prioritizing", + "privatisation": "privatization", + "privatisations": "privatizations", + "privatise": "privatize", + "privatised": "privatized", + "privatises": "privatizes", + "privatising": "privatizing", + "professionalisation": "professionalization", + "professionalise": "professionalize", + "professionalised": "professionalized", + "professionalises": "professionalizes", + "professionalising": "professionalizing", + "programme": "program", + "programmes": "programs", + "prologue": "prolog", + "prologues": "prologs", + "propagandise": "propagandize", + "propagandised": "propagandized", + "propagandises": "propagandizes", + "propagandising": "propagandizing", + "proselytise": "proselytize", + "proselytised": "proselytized", + "proselytiser": "proselytizer", + "proselytisers": "proselytizers", + "proselytises": "proselytizes", + "proselytising": "proselytizing", + "psychoanalyse": "psychoanalyze", + "psychoanalysed": "psychoanalyzed", + "psychoanalyses": "psychoanalyzes", + "psychoanalysing": "psychoanalyzing", + "publicise": "publicize", + "publicised": "publicized", + "publicises": "publicizes", + "publicising": "publicizing", + "pulverisation": "pulverization", + "pulverise": "pulverize", + "pulverised": "pulverized", + "pulverises": "pulverizes", + "pulverising": "pulverizing", + "pummelled": "pummel", + "pummelling": "pummeled", + "pyjama": "pajama", + "pyjamas": "pajamas", + "pzazz": "pizzazz", + "quarrelled": "quarreled", + "quarrelling": 
"quarreling", + "radicalise": "radicalize", + "radicalised": "radicalized", + "radicalises": "radicalizes", + "radicalising": "radicalizing", + "rancour": "rancor", + "randomise": "randomize", + "randomised": "randomized", + "randomises": "randomizes", + "randomising": "randomizing", + "rationalisation": "rationalization", + "rationalisations": "rationalizations", + "rationalise": "rationalize", + "rationalised": "rationalized", + "rationalises": "rationalizes", + "rationalising": "rationalizing", + "ravelled": "raveled", + "ravelling": "raveling", + "realisable": "realizable", + "realisation": "realization", + "realisations": "realizations", + "realise": "realize", + "realised": "realized", + "realises": "realizes", + "realising": "realizing", + "recognisable": "recognizable", + "recognisably": "recognizably", + "recognisance": "recognizance", + "recognise": "recognize", + "recognised": "recognized", + "recognises": "recognizes", + "recognising": "recognizing", + "reconnoitre": "reconnoiter", + "reconnoitred": "reconnoitered", + "reconnoitres": "reconnoiters", + "reconnoitring": "reconnoitering", + "refuelled": "refueled", + "refuelling": "refueling", + "regularisation": "regularization", + "regularise": "regularize", + "regularised": "regularized", + "regularises": "regularizes", + "regularising": "regularizing", + "remodelled": "remodeled", + "remodelling": "remodeling", + "remould": "remold", + "remoulded": "remolded", + "remoulding": "remolding", + "remoulds": "remolds", + "reorganisation": "reorganization", + "reorganisations": "reorganizations", + "reorganise": "reorganize", + "reorganised": "reorganized", + "reorganises": "reorganizes", + "reorganising": "reorganizing", + "revelled": "reveled", + "reveller": "reveler", + "revellers": "revelers", + "revelling": "reveling", + "revitalise": "revitalize", + "revitalised": "revitalized", + "revitalises": "revitalizes", + "revitalising": "revitalizing", + "revolutionise": "revolutionize", + "revolutionised": "revolutionized", + "revolutionises": "revolutionizes", + "revolutionising": "revolutionizing", + "rhapsodise": "rhapsodize", + "rhapsodised": "rhapsodized", + "rhapsodises": "rhapsodizes", + "rhapsodising": "rhapsodizing", + "rigour": "rigor", + "rigours": "rigors", + "ritualised": "ritualized", + "rivalled": "rivaled", + "rivalling": "rivaling", + "romanticise": "romanticize", + "romanticised": "romanticized", + "romanticises": "romanticizes", + "romanticising": "romanticizing", + "rumour": "rumor", + "rumoured": "rumored", + "rumours": "rumors", + "sabre": "saber", + "sabres": "sabers", + "saltpetre": "saltpeter", + "sanitise": "sanitize", + "sanitised": "sanitized", + "sanitises": "sanitizes", + "sanitising": "sanitizing", + "satirise": "satirize", + "satirised": "satirized", + "satirises": "satirizes", + "satirising": "satirizing", + "saviour": "savior", + "saviours": "saviors", + "savour": "savor", + "savoured": "savored", + "savouries": "savories", + "savouring": "savoring", + "savours": "savors", + "savoury": "savory", + "scandalise": "scandalize", + "scandalised": "scandalized", + "scandalises": "scandalizes", + "scandalising": "scandalizing", + "sceptic": "skeptic", + "sceptical": "skeptical", + "sceptically": "skeptically", + "scepticism": "skepticism", + "sceptics": "skeptics", + "sceptre": "scepter", + "sceptres": "scepters", + "scrutinise": "scrutinize", + "scrutinised": "scrutinized", + "scrutinises": "scrutinizes", + "scrutinising": "scrutinizing", + "secularisation": "secularization", + "secularise": "secularize", + 
"secularised": "secularized", + "secularises": "secularizes", + "secularising": "secularizing", + "sensationalise": "sensationalize", + "sensationalised": "sensationalized", + "sensationalises": "sensationalizes", + "sensationalising": "sensationalizing", + "sensitise": "sensitize", + "sensitised": "sensitized", + "sensitises": "sensitizes", + "sensitising": "sensitizing", + "sentimentalise": "sentimentalize", + "sentimentalised": "sentimentalized", + "sentimentalises": "sentimentalizes", + "sentimentalising": "sentimentalizing", + "sepulchre": "sepulcher", + "sepulchres": "sepulchers", + "serialisation": "serialization", + "serialisations": "serializations", + "serialise": "serialize", + "serialised": "serialized", + "serialises": "serializes", + "serialising": "serializing", + "sermonise": "sermonize", + "sermonised": "sermonized", + "sermonises": "sermonizes", + "sermonising": "sermonizing", + "sheikh": "sheik", + "shovelled": "shoveled", + "shovelling": "shoveling", + "shrivelled": "shriveled", + "shrivelling": "shriveling", + "signalise": "signalize", + "signalised": "signalized", + "signalises": "signalizes", + "signalising": "signalizing", + "signalled": "signaled", + "signalling": "signaling", + "smoulder": "smolder", + "smouldered": "smoldered", + "smouldering": "smoldering", + "smoulders": "smolders", + "snivelled": "sniveled", + "snivelling": "sniveling", + "snorkelled": "snorkeled", + "snorkelling": "snorkeling", + "snowplough": "snowplow", + "snowploughs": "snowplow", + "socialisation": "socialization", + "socialise": "socialize", + "socialised": "socialized", + "socialises": "socializes", + "socialising": "socializing", + "sodomise": "sodomize", + "sodomised": "sodomized", + "sodomises": "sodomizes", + "sodomising": "sodomizing", + "solemnise": "solemnize", + "solemnised": "solemnized", + "solemnises": "solemnizes", + "solemnising": "solemnizing", + "sombre": "somber", + "specialisation": "specialization", + "specialisations": "specializations", + "specialise": "specialize", + "specialised": "specialized", + "specialises": "specializes", + "specialising": "specializing", + "spectre": "specter", + "spectres": "specters", + "spiralled": "spiraled", + "spiralling": "spiraling", + "splendour": "splendor", + "splendours": "splendors", + "squirrelled": "squirreled", + "squirrelling": "squirreling", + "stabilisation": "stabilization", + "stabilise": "stabilize", + "stabilised": "stabilized", + "stabiliser": "stabilizer", + "stabilisers": "stabilizers", + "stabilises": "stabilizes", + "stabilising": "stabilizing", + "standardisation": "standardization", + "standardise": "standardize", + "standardised": "standardized", + "standardises": "standardizes", + "standardising": "standardizing", + "stencilled": "stenciled", + "stencilling": "stenciling", + "sterilisation": "sterilization", + "sterilisations": "sterilizations", + "sterilise": "sterilize", + "sterilised": "sterilized", + "steriliser": "sterilizer", + "sterilisers": "sterilizers", + "sterilises": "sterilizes", + "sterilising": "sterilizing", + "stigmatisation": "stigmatization", + "stigmatise": "stigmatize", + "stigmatised": "stigmatized", + "stigmatises": "stigmatizes", + "stigmatising": "stigmatizing", + "storey": "story", + "storeys": "stories", + "subsidisation": "subsidization", + "subsidise": "subsidize", + "subsidised": "subsidized", + "subsidiser": "subsidizer", + "subsidisers": "subsidizers", + "subsidises": "subsidizes", + "subsidising": "subsidizing", + "succour": "succor", + "succoured": "succored", + "succouring": 
"succoring", + "succours": "succors", + "sulphate": "sulfate", + "sulphates": "sulfates", + "sulphide": "sulfide", + "sulphides": "sulfides", + "sulphur": "sulfur", + "sulphurous": "sulfurous", + "summarise": "summarize", + "summarised": "summarized", + "summarises": "summarizes", + "summarising": "summarizing", + "swivelled": "swiveled", + "swivelling": "swiveling", + "symbolise": "symbolize", + "symbolised": "symbolized", + "symbolises": "symbolizes", + "symbolising": "symbolizing", + "sympathise": "sympathize", + "sympathised": "sympathized", + "sympathiser": "sympathizer", + "sympathisers": "sympathizers", + "sympathises": "sympathizes", + "sympathising": "sympathizing", + "synchronisation": "synchronization", + "synchronise": "synchronize", + "synchronised": "synchronized", + "synchronises": "synchronizes", + "synchronising": "synchronizing", + "synthesise": "synthesize", + "synthesised": "synthesized", + "synthesiser": "synthesizer", + "synthesisers": "synthesizers", + "synthesises": "synthesizes", + "synthesising": "synthesizing", + "syphon": "siphon", + "syphoned": "siphoned", + "syphoning": "siphoning", + "syphons": "siphons", + "systematisation": "systematization", + "systematise": "systematize", + "systematised": "systematized", + "systematises": "systematizes", + "systematising": "systematizing", + "tantalise": "tantalize", + "tantalised": "tantalized", + "tantalises": "tantalizes", + "tantalising": "tantalizing", + "tantalisingly": "tantalizingly", + "tasselled": "tasseled", + "technicolour": "technicolor", + "temporise": "temporize", + "temporised": "temporized", + "temporises": "temporizes", + "temporising": "temporizing", + "tenderise": "tenderize", + "tenderised": "tenderized", + "tenderises": "tenderizes", + "tenderising": "tenderizing", + "terrorise": "terrorize", + "terrorised": "terrorized", + "terrorises": "terrorizes", + "terrorising": "terrorizing", + "theatre": "theater", + "theatregoer": "theatergoer", + "theatregoers": "theatergoers", + "theatres": "theaters", + "theorise": "theorize", + "theorised": "theorized", + "theorises": "theorizes", + "theorising": "theorizing", + "tonne": "ton", + "tonnes": "tons", + "towelled": "toweled", + "towelling": "toweling", + "toxaemia": "toxemia", + "tranquillise": "tranquilize", + "tranquillised": "tranquilized", + "tranquilliser": "tranquilizer", + "tranquillisers": "tranquilizers", + "tranquillises": "tranquilizes", + "tranquillising": "tranquilizing", + "tranquillity": "tranquility", + "tranquillize": "tranquilize", + "tranquillized": "tranquilized", + "tranquillizer": "tranquilizer", + "tranquillizers": "tranquilizers", + "tranquillizes": "tranquilizes", + "tranquillizing": "tranquilizing", + "tranquilly": "tranquility", + "transistorised": "transistorized", + "traumatise": "traumatize", + "traumatised": "traumatized", + "traumatises": "traumatizes", + "traumatising": "traumatizing", + "travelled": "traveled", + "traveller": "traveler", + "travellers": "travelers", + "travelling": "traveling", + "travelog": "travelogue", + "travelogs": "travelogues", + "trialled": "trialed", + "trialling": "trialing", + "tricolour": "tricolor", + "tricolours": "tricolors", + "trivialise": "trivialize", + "trivialised": "trivialized", + "trivialises": "trivializes", + "trivialising": "trivializing", + "tumour": "tumor", + "tumours": "tumors", + "tunnelled": "tunneled", + "tunnelling": "tunneling", + "tyrannise": "tyrannize", + "tyrannised": "tyrannized", + "tyrannises": "tyrannizes", + "tyrannising": "tyrannizing", + "tyre": "tire", + 
"tyres": "tires", + "unauthorised": "unauthorized", + "uncivilised": "uncivilized", + "underutilised": "underutilized", + "unequalled": "unequaled", + "unfavourable": "unfavorable", + "unfavourably": "unfavorably", + "unionisation": "unionization", + "unionise": "unionize", + "unionised": "unionized", + "unionises": "unionizes", + "unionising": "unionizing", + "unorganised": "unorganized", + "unravelled": "unraveled", + "unravelling": "unraveling", + "unrecognisable": "unrecognizable", + "unrecognised": "unrecognized", + "unrivalled": "unrivaled", + "unsavoury": "unsavory", + "untrammelled": "untrammeled", + "urbanisation": "urbanization", + "urbanise": "urbanize", + "urbanised": "urbanized", + "urbanises": "urbanizes", + "urbanising": "urbanizing", + "utilisable": "utilizable", + "utilisation": "utilization", + "utilise": "utilize", + "utilised": "utilized", + "utilises": "utilizes", + "utilising": "utilizing", + "valour": "valor", + "vandalise": "vandalize", + "vandalised": "vandalized", + "vandalises": "vandalizes", + "vandalising": "vandalizing", + "vaporisation": "vaporization", + "vaporise": "vaporize", + "vaporised": "vaporized", + "vaporises": "vaporizes", + "vaporising": "vaporizing", + "vapour": "vapor", + "vapours": "vapors", + "verbalise": "verbalize", + "verbalised": "verbalized", + "verbalises": "verbalizes", + "verbalising": "verbalizing", + "victimisation": "victimization", + "victimise": "victimize", + "victimised": "victimized", + "victimises": "victimizes", + "victimising": "victimizing", + "videodisc": "videodisk", + "videodiscs": "videodisks", + "vigour": "vigor", + "visualisation": "visualization", + "visualisations": "visualizations", + "visualise": "visualize", + "visualised": "visualized", + "visualises": "visualizes", + "visualising": "visualizing", + "vocalisation": "vocalization", + "vocalisations": "vocalizations", + "vocalise": "vocalize", + "vocalised": "vocalized", + "vocalises": "vocalizes", + "vocalising": "vocalizing", + "vulcanised": "vulcanized", + "vulgarisation": "vulgarization", + "vulgarise": "vulgarize", + "vulgarised": "vulgarized", + "vulgarises": "vulgarizes", + "vulgarising": "vulgarizing", + "waggon": "wagon", + "waggons": "wagons", + "watercolour": "watercolor", + "watercolours": "watercolors", + "weaselled": "weaseled", + "weaselling": "weaseling", + "westernisation": "westernization", + "westernise": "westernize", + "westernised": "westernized", + "westernises": "westernizes", + "westernising": "westernizing", + "womanise": "womanize", + "womanised": "womanized", + "womaniser": "womanizer", + "womanisers": "womanizers", + "womanises": "womanizes", + "womanising": "womanizing", + "woollen": "woolen", + "woollens": "woolens", + "woollies": "woolies", + "woolly": "wooly", + "worshipped": "worshiped", + "worshipping": "worshiping", + "worshipper": "worshiper", + "yodelled": "yodeled", + "yodelling": "yodeling", + "yoghourt": "yogurt", + "yoghourts": "yogurts", + "yoghurt": "yogurt", + "yoghurts": "yogurts", + "mhm": "hmm", + "mmm": "hmm" +} \ No newline at end of file diff --git a/whisper_service_deploy/whisper/normalizers/english.py b/whisper_service_deploy/whisper/normalizers/english.py new file mode 100644 index 0000000..4932042 --- /dev/null +++ b/whisper_service_deploy/whisper/normalizers/english.py @@ -0,0 +1,550 @@ +import json +import os +import re +from fractions import Fraction +from typing import Iterator, List, Match, Optional, Union + +from more_itertools import windowed + +from .basic import remove_symbols_and_diacritics + + 
+class EnglishNumberNormalizer:
+    """
+    Convert any spelled-out numbers into Arabic numbers, while handling:
+
+    - remove any commas
+    - keep the suffixes such as: `1960s`, `274th`, `32nd`, etc.
+    - put currency symbols in front of the number. e.g. `twenty million dollars` -> `$20000000`
+    - spell out `one` and `ones`
+    - interpret successive single-digit numbers as nominal: `one oh one` -> `101`
+    """
+
+    def __init__(self):
+        super().__init__()
+
+        self.zeros = {"o", "oh", "zero"}
+        self.ones = {
+            name: i
+            for i, name in enumerate(
+                [
+                    "one",
+                    "two",
+                    "three",
+                    "four",
+                    "five",
+                    "six",
+                    "seven",
+                    "eight",
+                    "nine",
+                    "ten",
+                    "eleven",
+                    "twelve",
+                    "thirteen",
+                    "fourteen",
+                    "fifteen",
+                    "sixteen",
+                    "seventeen",
+                    "eighteen",
+                    "nineteen",
+                ],
+                start=1,
+            )
+        }
+        self.ones_plural = {
+            "sixes" if name == "six" else name + "s": (value, "s")
+            for name, value in self.ones.items()
+        }
+        self.ones_ordinal = {
+            "zeroth": (0, "th"),
+            "first": (1, "st"),
+            "second": (2, "nd"),
+            "third": (3, "rd"),
+            "fifth": (5, "th"),
+            "twelfth": (12, "th"),
+            **{
+                name + ("h" if name.endswith("t") else "th"): (value, "th")
+                for name, value in self.ones.items()
+                if value > 3 and value != 5 and value != 12
+            },
+        }
+        self.ones_suffixed = {**self.ones_plural, **self.ones_ordinal}
+
+        self.tens = {
+            "twenty": 20,
+            "thirty": 30,
+            "forty": 40,
+            "fifty": 50,
+            "sixty": 60,
+            "seventy": 70,
+            "eighty": 80,
+            "ninety": 90,
+        }
+        self.tens_plural = {
+            name.replace("y", "ies"): (value, "s") for name, value in self.tens.items()
+        }
+        self.tens_ordinal = {
+            name.replace("y", "ieth"): (value, "th")
+            for name, value in self.tens.items()
+        }
+        self.tens_suffixed = {**self.tens_plural, **self.tens_ordinal}
+
+        self.multipliers = {
+            "hundred": 100,
+            "thousand": 1_000,
+            "million": 1_000_000,
+            "billion": 1_000_000_000,
+            "trillion": 1_000_000_000_000,
+            "quadrillion": 1_000_000_000_000_000,
+            "quintillion": 1_000_000_000_000_000_000,
+            "sextillion": 1_000_000_000_000_000_000_000,
+            "septillion": 1_000_000_000_000_000_000_000_000,
+            "octillion": 1_000_000_000_000_000_000_000_000_000,
+            "nonillion": 1_000_000_000_000_000_000_000_000_000_000,
+            "decillion": 1_000_000_000_000_000_000_000_000_000_000_000,
+        }
+        self.multipliers_plural = {
+            name + "s": (value, "s") for name, value in self.multipliers.items()
+        }
+        self.multipliers_ordinal = {
+            name + "th": (value, "th") for name, value in self.multipliers.items()
+        }
+        self.multipliers_suffixed = {
+            **self.multipliers_plural,
+            **self.multipliers_ordinal,
+        }
+        self.decimals = {*self.ones, *self.tens, *self.zeros}
+
+        self.preceding_prefixers = {
+            "minus": "-",
+            "negative": "-",
+            "plus": "+",
+            "positive": "+",
+        }
+        self.following_prefixers = {
+            "pound": "£",
+            "pounds": "£",
+            "euro": "€",
+            "euros": "€",
+            "dollar": "$",
+            "dollars": "$",
+            "cent": "¢",
+            "cents": "¢",
+        }
+        self.prefixes = set(
+            list(self.preceding_prefixers.values())
+            + list(self.following_prefixers.values())
+        )
+        self.suffixers = {
+            "per": {"cent": "%"},
+            "percent": "%",
+        }
+        self.specials = {"and", "double", "triple", "point"}
+
+        self.words = set(
+            [
+                key
+                for mapping in [
+                    self.zeros,
+                    self.ones,
+                    self.ones_suffixed,
+                    self.tens,
+                    self.tens_suffixed,
+                    self.multipliers,
+                    self.multipliers_suffixed,
+                    self.preceding_prefixers,
+                    self.following_prefixers,
+                    self.suffixers,
+                    self.specials,
+                ]
+                for key in mapping
+            ]
+        )
+        self.literal_words = {"one", "ones"}
+
+    def process_words(self, words: List[str]) -> Iterator[str]:
+        prefix: Optional[str] = None
+        value: Optional[Union[str, int]] = None
+        skip = False
+
+        def to_fraction(s: str):
+            try:
+                return Fraction(s)
+            except ValueError:
+                return None
+
+        def output(result: Union[str, int]):
+            nonlocal prefix, value
+            result = str(result)
+            if prefix is not None:
+                result = prefix + result
+            value = None
+            prefix = None
+            return result
+
+        if len(words) == 0:
+            return
+
+        for prev, current, next in windowed([None] + words + [None], 3):
+            if skip:
+                skip = False
+                continue
+
+            next_is_numeric = next is not None and re.match(r"^\d+(\.\d+)?$", next)
+            has_prefix = current[0] in self.prefixes
+            current_without_prefix = current[1:] if has_prefix else current
+            if re.match(r"^\d+(\.\d+)?$", current_without_prefix):
+                # arabic numbers (potentially with signs and fractions)
+                f = to_fraction(current_without_prefix)
+                assert f is not None
+                if value is not None:
+                    if isinstance(value, str) and value.endswith("."):
+                        # concatenate decimals / ip address components
+                        value = str(value) + str(current)
+                        continue
+                    else:
+                        yield output(value)
+
+                prefix = current[0] if has_prefix else prefix
+                if f.denominator == 1:
+                    value = f.numerator  # store integers as int
+                else:
+                    value = current_without_prefix
+            elif current not in self.words:
+                # non-numeric words
+                if value is not None:
+                    yield output(value)
+                yield output(current)
+            elif current in self.zeros:
+                value = str(value or "") + "0"
+            elif current in self.ones:
+                ones = self.ones[current]
+
+                if value is None:
+                    value = ones
+                elif isinstance(value, str) or prev in self.ones:
+                    if (
+                        prev in self.tens and ones < 10
+                    ):  # replace the last zero with the digit
+                        assert value[-1] == "0"
+                        value = value[:-1] + str(ones)
+                    else:
+                        value = str(value) + str(ones)
+                elif ones < 10:
+                    if value % 10 == 0:
+                        value += ones
+                    else:
+                        value = str(value) + str(ones)
+                else:  # eleven to nineteen
+                    if value % 100 == 0:
+                        value += ones
+                    else:
+                        value = str(value) + str(ones)
+            elif current in self.ones_suffixed:
+                # ordinal or cardinal; yield the number right away
+                ones, suffix = self.ones_suffixed[current]
+                if value is None:
+                    yield output(str(ones) + suffix)
+                elif isinstance(value, str) or prev in self.ones:
+                    if prev in self.tens and ones < 10:
+                        assert value[-1] == "0"
+                        yield output(value[:-1] + str(ones) + suffix)
+                    else:
+                        yield output(str(value) + str(ones) + suffix)
+                elif ones < 10:
+                    if value % 10 == 0:
+                        yield output(str(value + ones) + suffix)
+                    else:
+                        yield output(str(value) + str(ones) + suffix)
+                else:  # eleven to nineteen
+                    if value % 100 == 0:
+                        yield output(str(value + ones) + suffix)
+                    else:
+                        yield output(str(value) + str(ones) + suffix)
+                value = None
+            elif current in self.tens:
+                tens = self.tens[current]
+                if value is None:
+                    value = tens
+                elif isinstance(value, str):
+                    value = str(value) + str(tens)
+                else:
+                    if value % 100 == 0:
+                        value += tens
+                    else:
+                        value = str(value) + str(tens)
+            elif current in self.tens_suffixed:
+                # ordinal or cardinal; yield the number right away
+                tens, suffix = self.tens_suffixed[current]
+                if value is None:
+                    yield output(str(tens) + suffix)
+                elif isinstance(value, str):
+                    yield output(str(value) + str(tens) + suffix)
+                else:
+                    if value % 100 == 0:
+                        yield output(str(value + tens) + suffix)
+                    else:
+                        yield output(str(value) + str(tens) + suffix)
+            elif current in self.multipliers:
+                multiplier = self.multipliers[current]
+                if value is None:
+                    value = multiplier
+                elif isinstance(value, str) or value == 0:
+                    f = to_fraction(value)
+                    p = f * multiplier if f is not None else None
+                    if f is not None and p.denominator == 1:
+                        value = p.numerator
+                    else:
+                        yield output(value)
+                        value = multiplier
+                else:
+                    before = value // 1000 * 1000
+                    residual = value % 1000
+                    value = before + residual * multiplier
+            elif current in self.multipliers_suffixed:
+                multiplier, suffix = self.multipliers_suffixed[current]
+                if value is None:
+                    yield output(str(multiplier) + suffix)
+                elif isinstance(value, str):
+                    f = to_fraction(value)
+                    p = f * multiplier if f is not None else None
+                    if f is not None and p.denominator == 1:
+                        yield output(str(p.numerator) + suffix)
+                    else:
+                        yield output(value)
+                        yield output(str(multiplier) + suffix)
+                else:  # int
+                    before = value // 1000 * 1000
+                    residual = value % 1000
+                    value = before + residual * multiplier
+                    yield output(str(value) + suffix)
+                value = None
+            elif current in self.preceding_prefixers:
+                # apply prefix (positive, minus, etc.) if it precedes a number
+                if value is not None:
+                    yield output(value)
+
+                if next in self.words or next_is_numeric:
+                    prefix = self.preceding_prefixers[current]
+                else:
+                    yield output(current)
+            elif current in self.following_prefixers:
+                # apply prefix (dollars, cents, etc.) only after a number
+                if value is not None:
+                    prefix = self.following_prefixers[current]
+                    yield output(value)
+                else:
+                    yield output(current)
+            elif current in self.suffixers:
+                # apply suffix symbols (percent -> '%')
+                if value is not None:
+                    suffix = self.suffixers[current]
+                    if isinstance(suffix, dict):
+                        if next in suffix:
+                            yield output(str(value) + suffix[next])
+                            skip = True
+                        else:
+                            yield output(value)
+                            yield output(current)
+                    else:
+                        yield output(str(value) + suffix)
+                else:
+                    yield output(current)
+            elif current in self.specials:
+                if next not in self.words and not next_is_numeric:
+                    # apply special handling only if the next word can be numeric
+                    if value is not None:
+                        yield output(value)
+                    yield output(current)
+                elif current == "and":
+                    # ignore "and" after hundreds, thousands, etc.
+                    if prev not in self.multipliers:
+                        if value is not None:
+                            yield output(value)
+                        yield output(current)
+                elif current == "double" or current == "triple":
+                    if next in self.ones or next in self.zeros:
+                        repeats = 2 if current == "double" else 3
+                        ones = self.ones.get(next, 0)
+                        value = str(value or "") + str(ones) * repeats
+                        skip = True
+                    else:
+                        if value is not None:
+                            yield output(value)
+                        yield output(current)
+                elif current == "point":
+                    if next in self.decimals or next_is_numeric:
+                        value = str(value or "") + "."
+                else:
+                    # should all have been covered at this point
+                    raise ValueError(f"Unexpected token: {current}")
+            else:
+                # all should have been covered at this point
+                raise ValueError(f"Unexpected token: {current}")
+
+        if value is not None:
+            yield output(value)
+
+    def preprocess(self, s: str):
+        # replace "<number> and a half" with "<number> point five"
+        results = []
+
+        segments = re.split(r"\band\s+a\s+half\b", s)
+        for i, segment in enumerate(segments):
+            if len(segment.strip()) == 0:
+                continue
+            if i == len(segments) - 1:
+                results.append(segment)
+            else:
+                results.append(segment)
+                last_word = segment.rsplit(maxsplit=2)[-1]
+                if last_word in self.decimals or last_word in self.multipliers:
+                    results.append("point five")
+                else:
+                    results.append("and a half")
+
+        s = " ".join(results)
+
+        # put a space at number/letter boundary
+        s = re.sub(r"([a-z])([0-9])", r"\1 \2", s)
+        s = re.sub(r"([0-9])([a-z])", r"\1 \2", s)
+
+        # but remove spaces which could be a suffix
+        s = re.sub(r"([0-9])\s+(st|nd|rd|th|s)\b", r"\1\2", s)
+
+        return s
+
+    def postprocess(self, s: str):
+        def combine_cents(m: Match):
+            try:
+                currency = m.group(1)
+                integer = m.group(2)
+                cents = int(m.group(3))
+                return f"{currency}{integer}.{cents:02d}"
+            except ValueError:
+                return m.string
+
+        def extract_cents(m: Match):
+            try:
+                return f"¢{int(m.group(1))}"
+            except ValueError:
+                return m.string
+
+        # apply currency postprocessing; "$2 and ¢7" -> "$2.07"
+        s = re.sub(r"([€£$])([0-9]+) (?:and )?¢([0-9]{1,2})\b", combine_cents, s)
+        s = re.sub(r"[€£$]0.([0-9]{1,2})\b", extract_cents, s)
+
+        # write "one(s)" instead of "1(s)", just for the readability
+        s = re.sub(r"\b1(s?)\b", r"one\1", s)
+
+        return s
+
+    def __call__(self, s: str):
+        s = self.preprocess(s)
+        s = " ".join(word for word in self.process_words(s.split()) if word is not None)
+        s = self.postprocess(s)
+
+        return s
+
+
+class EnglishSpellingNormalizer:
+    """
+    Applies British-American spelling mappings as listed in [1].
+
+    [1] https://www.tysto.com/uk-us-spelling-list.html
+    """
+
+    def __init__(self):
+        mapping_path = os.path.join(os.path.dirname(__file__), "english.json")
+        self.mapping = json.load(open(mapping_path))
+
+    def __call__(self, s: str):
+        return " ".join(self.mapping.get(word, word) for word in s.split())
+
+
+class EnglishTextNormalizer:
+    def __init__(self):
+        self.ignore_patterns = r"\b(hmm|mm|mhm|mmm|uh|um)\b"
+        self.replacers = {
+            # common contractions
+            r"\bwon't\b": "will not",
+            r"\bcan't\b": "can not",
+            r"\blet's\b": "let us",
+            r"\bain't\b": "aint",
+            r"\by'all\b": "you all",
+            r"\bwanna\b": "want to",
+            r"\bgotta\b": "got to",
+            r"\bgonna\b": "going to",
+            r"\bi'ma\b": "i am going to",
+            r"\bimma\b": "i am going to",
+            r"\bwoulda\b": "would have",
+            r"\bcoulda\b": "could have",
+            r"\bshoulda\b": "should have",
+            r"\bma'am\b": "madam",
+            # contractions in titles/prefixes
+            r"\bmr\b": "mister ",
+            r"\bmrs\b": "missus ",
+            r"\bst\b": "saint ",
+            r"\bdr\b": "doctor ",
+            r"\bprof\b": "professor ",
+            r"\bcapt\b": "captain ",
+            r"\bgov\b": "governor ",
+            r"\bald\b": "alderman ",
+            r"\bgen\b": "general ",
+            r"\bsen\b": "senator ",
+            r"\brep\b": "representative ",
+            r"\bpres\b": "president ",
+            r"\brev\b": "reverend ",
+            r"\bhon\b": "honorable ",
+            r"\basst\b": "assistant ",
+            r"\bassoc\b": "associate ",
+            r"\blt\b": "lieutenant ",
+            r"\bcol\b": "colonel ",
+            r"\bjr\b": "junior ",
+            r"\bsr\b": "senior ",
+            r"\besq\b": "esquire ",
+            # perfect tenses; ideally this would cover any past participle, but that's harder..
+ r"'d been\b": " had been", + r"'s been\b": " has been", + r"'d gone\b": " had gone", + r"'s gone\b": " has gone", + r"'d done\b": " had done", # "'s done" is ambiguous + r"'s got\b": " has got", + # general contractions + r"n't\b": " not", + r"'re\b": " are", + r"'s\b": " is", + r"'d\b": " would", + r"'ll\b": " will", + r"'t\b": " not", + r"'ve\b": " have", + r"'m\b": " am", + } + self.standardize_numbers = EnglishNumberNormalizer() + self.standardize_spellings = EnglishSpellingNormalizer() + + def __call__(self, s: str): + s = s.lower() + + s = re.sub(r"[<\[][^>\]]*[>\]]", "", s) # remove words between brackets + s = re.sub(r"\(([^)]+?)\)", "", s) # remove words between parenthesis + s = re.sub(self.ignore_patterns, "", s) + s = re.sub(r"\s+'", "'", s) # when there's a space before an apostrophe + + for pattern, replacement in self.replacers.items(): + s = re.sub(pattern, replacement, s) + + s = re.sub(r"(\d),(\d)", r"\1\2", s) # remove commas between digits + s = re.sub(r"\.([^0-9]|$)", r" \1", s) # remove periods not followed by numbers + s = remove_symbols_and_diacritics(s, keep=".%$¢€£") # keep numeric symbols + + s = self.standardize_numbers(s) + s = self.standardize_spellings(s) + + # now remove prefix/suffix symbols that are not preceded/followed by numbers + s = re.sub(r"[.$¢€£]([^0-9])", r" \1", s) + s = re.sub(r"([^0-9])%", r"\1 ", s) + + s = re.sub(r"\s+", " ", s) # replace any successive whitespaces with a space + + return s diff --git a/whisper_service_deploy/whisper/version.py b/whisper_service_deploy/whisper/version.py new file mode 100644 index 0000000..67426aa --- /dev/null +++ b/whisper_service_deploy/whisper/version.py @@ -0,0 +1 @@ +__version__ = "20250625"