User:CrowleyBot/task/19

数据[编辑]

俄语第一批，涉ru-nm, ru-vb, ru-aj, ru-av
技术细节[编辑]

需要处理head模板为verb/noun form的部分
需要翻译notes之类
存在标题行为“专有名词”但章节为“名词”的情况，需要手工修复
代码[编辑]

nm、pm、aj、av、vb的获取与User:CrowleyBot/task/18类似。
from prelude import  *
from botaccount import *

# Build one mwc.Site client per wiki (zh first, then en), both with the bot's
# User-Agent, then log in to each with the same bot credentials.
zh, en = (
    mwc.Site(_host, clients_useragent=UA)
    for _host in ('zh.wiktionary.org', 'en.wiktionary.org')
)
for _site in (zh, en):
    _site.login(UN, PWD)

@fct.total_ordering
class node:
    """One heading in a wikitext heading tree (first-child / next-sibling form).

    Attributes:
        kyu:   heading level (number of ``=`` signs); nodes compare by this only.
        title: heading text for a real node; for a dummy sentinel, the integer
               end offset it stands for.
        a, b:  start / end offsets of the heading line in the source text.
        z:     trailing newline run that followed the heading.
        l:     first child, or None.
        r:     next sibling; after ``gendummy()`` this is a falsy dummy
               sentinel at the end of every sibling chain.
        f:     parent node, or None for the root.

    A node is truthy only when it is a real heading (``title`` is a str), so
    ``while child:`` loops stop at both ``None`` and dummy sentinels.
    """

    def __init__(self, kyu=0, title='', a=0, b=0, z='', dummy=0):
        # Links are initialised for every node, sentinels included, so that
        # attribute access (e.g. from __repr__) never fails on a dummy.
        self.l, self.f, self.r = None, None, None
        if dummy:
            # Sentinel: ``dummy`` is the end offset; it is stored in ``title``
            # (a non-str title is what makes the node falsy) and mirrored in
            # ``a``/``b`` as a zero-width span.
            self.kyu = kyu
            self.title = dummy
            self.a, self.b, self.z = dummy, dummy, ''
            return
        self.kyu, self.title, self.a, self.b, self.z = kyu, title, a, b, z

    def __bool__(self):
        return isinstance(self.title, str)

    def __eq__(self, other):
        return self.kyu == other.kyu

    def __lt__(self, other):
        return self.kyu < other.kyu

    def __str__(self):
        return "%s%s%s" % ('=' * self.kyu, self.title, '=' * self.kyu)

    def __repr__(self):
        return "%s%s%s%s" % ('=' * self.kyu, self.title, '=' * self.kyu, self.z)

    @property
    def c(self):
        """Offset where this section ends: start of the next sibling heading,
        or the end offset carried by the dummy sentinel.

        Requires ``self.r`` to be set (i.e. ``gendummy()`` has run).
        """
        nxt = self.r
        return nxt.a if nxt else nxt.title

    @property
    def g(self):
        """Offset where this section's own body ends: start of the first
        child heading if there is one, otherwise ``self.c``."""
        first_child = self.l
        if first_child:
            return first_child.a
        return self.c

    def printtree(self, i=0):
        """Print the subtree, indenting two spaces per nesting depth ``i``."""
        print('  ' * i + str(self))
        if self.l:
            self.l.printtree(i + 1)
        if self.r:
            self.r.printtree(i)

    def gendummy(self):
        """Terminate every sibling chain below this node with a dummy sentinel.

        A missing ``r`` is replaced by a sentinel carrying the parent's end
        offset (``self.f.c``), so the root must already have its own ``r``
        assigned before this is called.
        """
        if self.r is None:
            self.r = node(kyu=self.kyu, dummy=self.f.c)
        if self.r:
            self.r.gendummy()
        if self.l:
            self.l.gendummy()

    def selectson(self, k):
        """Return the single direct child selected by ``k``.

        Args:
            k: a predicate taking a node, or a title that is compared after
               passing both sides through ``simp``.

        Raises:
            ValueError: payload ``(11, i, str(self))`` when no child matches,
                ``(12, i, str(self))`` when more than one does. ``i`` is the
                module-level page index if one is defined, else ``None``.
        """
        if callable(k):
            matches = k
        else:
            wanted = simp(k)

            def matches(child):
                return simp(child.title) == wanted

        hits = []
        child = self.l
        while child:
            if matches(child):
                hits.append(child)
            child = child.r
        if len(hits) != 1:
            # The driver script keeps the current page index in a global `i`;
            # look it up defensively so the intended ValueError is not
            # replaced by a NameError when no such global exists.
            page_index = globals().get('i')
            raise ValueError((11 if not hits else 12, page_index, str(self)))
        return hits[0]

def process2():
    """Replace the ``{{ru-XX|...}}`` / ``{{ru-XX-...}}`` placeholders of page ``i``.

    Takes no arguments and works on module-level state:

    * ``todo`` - the current batch (one of ``nm``, ``pm``, ``aj``, ``av``,
      ``vb``); ``todo[i]`` is the source wikitext for page ``i``.
      NOTE(review): presumably the English-Wiktionary section text - the code
      that fills these lists is not in this file.
    * ``i`` - index of the page being processed.
    * ``ztxtl`` - page texts being rewritten in place; ``pn`` - page names.

    The source text is parsed into a heading tree. The text from the first
    heading up to its first sub-heading (cut at the first line starting with
    ``#``) replaces ``{{ru-XX|...}}``; the 變格 (declension) or, for verbs,
    變位 (conjugation) sub-section replaces ``{{ru-XX-...}}``, or the empty
    string if that sub-section is missing or ambiguous.

    Returns None. Does nothing when ``todo[i]`` is empty.

    Raises:
        ValueError: ``(10, i)`` when a heading would become a second "first
            child" (e.g. an h2 containing an h4 followed by an h3).
    """
    # *? is the non-greedy form.
    # '=' should not appear in a legal zh.wiktionary title; elsewhere unverified.
    rx1 = re.compile(r'^(===*)\s*([^=]*?)\s*(===*)\s*?(\n+)', flags=re.MULTILINE)

    def _f1(m):
        # need to check sameness in zhwikt
        # Level = the shorter of the two '=' runs, so unbalanced headings
        # still get a level.
        return node(min(len(m[1]), len(m[3])), m[2], m.start(), m.end(), m[4])

    etxt = todo[i]
    if etxt == '':
        # Empty source text: skip silently (a ValueError((13, i)) used to be
        # considered here but was disabled).
        return

    # Build the heading tree under a level-0 root.
    ehl = list(map(_f1, rx1.finditer(etxt)))
    ert = node(0, '', 0, 0, '')
    _c = ert
    # Known limitation: fails when an h2 contains an h4 followed by an h3.
    for nd in ehl:
        # Climb until the cursor is at the same level or shallower.
        while nd < _c:
            _c = _c.f
        if nd == _c:
            # Same level: append as next sibling.
            _c.r = nd
            nd.f = _c.f
            _c = nd
        else:
            # Deeper: must be the first child of the cursor.
            if _c.l:
                raise ValueError((10, i))
            _c.l = nd
            nd.f = _c
            _c = nd
    # The root's sentinel carries the end of the text; gendummy propagates
    # end offsets to every sibling chain.
    ert.r = node(dummy=len(etxt))
    ert.gendummy()

    # Template suffix for this batch and the sub-section title to copy.
    sn = simp('變格')
    if todo is nm or todo is pm:
        ps = 'nm'
    elif todo is aj:
        ps = 'aj'
    elif todo is av:
        ps = 'av'
    else:
        ps = 'vb'
        sn = simp('變位')

    # Head part: from the first heading to its first child heading (or to the
    # end of its section), keeping only the lines before the first '#' line.
    _l = ert.l.a
    if ert.l.l:
        _r = ert.l.l.a
    else:
        _r = ert.l.c
    after1 = '\n'.join(its.takewhile(lambda x: not x.startswith('#'), etxt[_l:_r].split('\n')))
    after1 = after1.replace('|adjective form', '|形容詞變格形').replace('|verb form', '|動詞變位形式')

    # Inflection part: best effort - left empty when the sub-section cannot
    # be selected (selectson raises ValueError for "none" and "several").
    after2 = ''
    try:
        h4 = ert.l.selectson(sn)
    except ValueError:
        pass
    else:
        after2 = etxt[h4.a:h4.c]
        after2 = after2.replace('\n----', '').strip(' \n')

    # Callable replacements insert the copied wikitext literally; a string
    # replacement would have its backslashes processed as template escapes.
    ztxtl[i] = re.sub(r'\{\{ru-%s\|[^{}]*\}\}' % ps, lambda _m: after1, ztxtl[i])
    ztxtl[i] = re.sub(r'\{\{ru-%s-[^{}]*\}\}' % ps, lambda _m: after2, ztxtl[i])

    # Report pages that still contain note text needing manual translation.
    if '|note' in ztxtl[i] or 'meanings' in ztxtl[i]:
        print((i, pn[i]))

# Run process2 over every page index, once per batch, in the order
# nm, pm, aj, av, vb.  process2 takes no arguments: it reads the module-level
# names `todo` and `i`, so both loop variables must keep exactly these names.
for todo in (nm, pm, aj, av, vb):
    for i in range(n):
        process2()


# Hand-made translations of individual English notes, applied in order.
# Each entry is (index into ztxtl, English text to find, Chinese replacement);
# the indices are hard-coded for this particular run.
_MANUAL_NOTE_FIXES = (
    (204, "''in the meanings \"healthy, wholesome\"'':", "義項“健康的”："),
    (204, "''in the meanings \"strong, big\"'':", "義項“強壯的，大的”："),
    (266, "* Dated.", "* 舊"),
    (167, "* Poetic.", "* 詩歌"),
    (303, '* The alternative stress patterns in the singular "мо́ста", "мо́сту", "мо́стом", "мо́сте" are less common.', '* 在單數"мо́ста", "мо́сту", "мо́стом", "мо́сте"中，另一種重讀形式不常見。'),
    (500, "* Uneducated.", "* 未受敎育的"),
)
for _idx, _english, _chinese in _MANUAL_NOTE_FIXES:
    ztxtl[_idx] = ztxtl[_idx].replace(_english, _chinese)

# Whitespace normalisation applied to every page text, in this order:
#   1. "#text"      -> "# text"      (space after a definition marker)
#   2. "#:text"     -> "#: text"     (space after a "#:", "#::", ... marker)
#   3. ensure a blank line before a "====變..." heading
#
# The first character after the marker must not be another list-marker
# character (# * : ;), a space, or a newline.  Without that restriction
# "##x" / "#*x" would be split into "# #x" / "# *x", an empty "#" line would
# swallow the following line, and an already spaced "#:: x" would be
# rewritten to "#: : x" through regex backtracking.
_RX_DEF_NO_SPACE = re.compile(r'\n#([^#*:; \n][^\n]*)')
_RX_SUB_NO_SPACE = re.compile(r'\n(#:+)([^#*:; \n][^\n]*)')
_RX_TIGHT_HEADING = re.compile(r'([^\n])\n====變')

for i in range(n):
    # sub() returns the text unchanged when nothing matches, so no separate
    # search() guard is needed.
    _text = ztxtl[i]
    _text = _RX_DEF_NO_SPACE.sub(r'\n# \1', _text)
    _text = _RX_SUB_NO_SPACE.sub(r'\n\1 \2', _text)
    _text = _RX_TIGHT_HEADING.sub(r'\1\n\n====變', _text)
    ztxtl[i] = _text

# Save every page whose text now differs from the corresponding entry of
# ztxtlb (presumably the texts as they were before this task - confirm where
# ztxtlb is built); identical pages are skipped so no edit is made for them.
_EDIT_SUMMARY = 'Bot: [[User:CrowleyBot/task/19|为俄语条目搬运标题行]]，ru-{nm,aj,av,vb}'
for i in range(n):
    if ztxtl[i] == ztxtlb[i]:
        continue
    tryedit(zh.Pages[pn[i]], ztxtl[i], _EDIT_SUMMARY)