User:CrowleyBot/task/16

維基詞典,自由的多語言詞典

技术细节[编辑]

  • 使用fishhook为str添加新方法

错误[编辑]

  • 截至2021年6月1日,部分{{wikipedia}}模板没有添加lang=en参数。
  • 如果英文版章节不含IPA,而中文版章节有IPA(来自德文版),那么搬运后的中文版章节将不含IPA。
    • 这是因为英文版的DerbethBot会往发音章节添加发音,却没有从德文版搬运IPA的功能。
  • 如果是从德文版搬运德语IPA,会遇到wußte等拼写改革前的词,目前不能处理。

输出[编辑]

源代码[编辑]

from prelude import  *
from botaccount import *

# Wiki sessions.  mwc, UA, UN and PWD are supplied by the star imports above.
zh = mwc.Site('zh.wiktionary.org', clients_useragent=UA)
en = mwc.Site('en.wiktionary.org', clients_useragent=UA)
zh.login(UN, PWD)
en.login(UN, PWD)
SLP = 0

# Work list: every main-namespace zhwikt page that embeds Template:IPA.
focustemp = zh.Pages['Template:IPA']
zpgl = list(focustemp.embed(namespace=0))
# Alternative work list kept from the original (pages in the script-error category):
#zpgl = list(zh.categories['有脚本错误的页面'])
pn = [page.name for page in zpgl]  # page titles, indexed by the global i
n = len(pn)

#epgl = list(map(getepg, zpgl))

# Run logs.
fail = []
success = []
purifylog = []
titlelog = []
successall = []
dbg = False
dbgout = None
fail1 = []
fail2 = []

i = 0  # index into pn of the page currently being processed
j = 0  # not used as a global; effectively dead
defaultsummary = 'Bot: 自enwikt搬运发音章节,修%s' % 'IPA与audio1'
nslist = [0]

# Diagnostic collections.
s1 = set()
s2 = set()
s3 = set()
rfapde = set()      # page indices whose section contained {{rfap|de}}
customtext = set()  # page indices with free text between templates
unknown = set()     # page indices containing an unrecognised template
qualm = set()       # qualifier texts that are not in the known list
accent = set()      # argument values seen in accent/qualifier templates
audiom = []         # (page index, caption) pairs whose caption was not recognised

@fct.total_ordering
class node:
    """A section heading stored in a left-child / right-sibling tree.

    Attributes (present on every instance, including dummies):
        kyu:   heading level; callers pass the number of '=' signs and use
               0 for the root.
        title: heading text for a real node.  For a dummy (end-marker) node
               it holds the truthy ``dummy`` argument instead; callers pass
               a text offset there.
        a, b:  offsets passed by the caller (start and end of the heading
               match in the page text).
        z:     text appended after the heading by ``__repr__``.
        l, f, r: first child, parent and next sibling, or ``None``.

    A node is truthy only when ``title`` is a ``str``; a dummy whose title is
    an int is therefore falsy and ends sibling walks.  Ordering and equality
    compare ``kyu`` only.  Because ``__eq__`` is defined without ``__hash__``,
    instances are unhashable.
    """

    def __init__(s, kyu=0, title='', a=0, b=0, z='', dummy=0):
        """Create a real node, or a dummy end marker when ``dummy`` is truthy.

        A falsy ``dummy`` (including an explicit 0) yields a real node.
        """
        # Always populate every attribute so that repr(), printtree() and
        # gendummy() also work on dummy nodes.
        s.kyu, s.a, s.b, s.z = kyu, a, b, z
        s.l, s.f, s.r = None, None, None
        s.title = dummy if dummy else title

    def __bool__(s):
        return isinstance(s.title, str)

    def __eq__(x, y):
        if not isinstance(y, node):
            return NotImplemented
        return x.kyu == y.kyu

    def __lt__(x, y):
        if not isinstance(y, node):
            return NotImplemented
        return x.kyu < y.kyu

    def __str__(s):
        return "%s%s%s" % ('=' * s.kyu, s.title, '=' * s.kyu)

    def __repr__(s):
        return "%s%s%s%s" % ('=' * s.kyu, s.title, '=' * s.kyu, s.z)

    def c(s):
        """Return where this node's section ends.

        That is the ``a`` offset of the next sibling when it is a real node,
        otherwise the value stored in the dummy sibling's ``title``.
        ``s.r`` must already be set (see ``gendummy``); with ``s.r`` still
        ``None`` this raises AttributeError.
        """
        n1 = s.r
        return n1.a if n1 else n1.title

    def g(s):
        """Return the ``a`` offset of the first child, or ``c()`` if there is none."""
        if _l := s.l:
            return _l.a
        else:
            return s.c()

    def printtree(s, i=0):
        """Print this node, its children (indented one step) and its later siblings."""
        print('  ' * i + str(s))
        if s.l:
            s.l.printtree(i + 1)
        if s.r:
            s.r.printtree(i)

    def gendummy(s):
        """Give every last sibling in the subtree a dummy right neighbour.

        The dummy carries the parent's ``c()`` value, so the parent's own
        right sibling must exist first; callers set the root's ``r`` before
        calling this.  Recursion skips dummies because they are falsy.
        """
        if s.r is None:
            s.r = node(kyu=s.kyu, dummy=s.f.c())
        if s.r:
            s.r.gendummy()
        if s.l:
            s.l.gendummy()

    def selectson(s, k):
        """Return the single direct child selected by ``k``.

        ``k`` is either a predicate taking a node, or a value compared with
        ``simp(child.title)`` (``simp`` comes from module scope).

        Raises:
            ValueError: ``(11, i, str(s))`` when no child matches and
                ``(12, i, str(s))`` when several do; ``i`` is the
                module-level page index.
        """
        f = k if callable(k) else lambda x: simp(x.title) == k
        ret = []
        _c = s.l
        while _c:
            if f(_c):
                ret.append(_c)
            _c = _c.r
        if not ret:
            raise ValueError((11, i, str(s)))
        if len(ret) > 1:
            raise ValueError((12, i, str(s)))
        return ret[0]


    
def process1():
    """Return the enwikt German pronunciation section of ``pn[i]``, normalised.

    Reads the module-level page index ``i`` and title list ``pn``, fetches the
    page from ``en``, builds a heading tree and takes the text of the
    ``Pronunciation`` child of the ``German`` heading.  The section is then
    rewritten:

    * ``{{PAGENAME}}`` is replaced by the page title;
    * each ``{{rfap|de}}``, together with at most one newline on either
      side, is replaced by a single newline;
    * template aliases are renamed to ``rhymes``, ``hyphenation``,
      ``homophones`` and ``a``;
    * for ``audio`` templates whose third argument is positional, the
      caption becomes ``音频`` when it contains the page title minus its
      first character, otherwise it is looked up in a fixed caption table.

    Side effects: adds diagnostics to the module-level collections
    ``rfapde``, ``customtext``, ``audiom``, ``qualm``, ``accent`` and
    ``unknown``.

    Returns:
        str: the rewritten section wikitext.

    Raises:
        ValueError: ``(13, i)`` if the page text is empty; ``(10, i)`` if a
            heading would have to become a child of a node that already has
            children; plus the ``(11, ...)`` / ``(12, ...)`` errors from
            ``node.selectson`` for a missing or duplicated heading.
    """
    # The title group is non-greedy (*?) so surrounding whitespace is excluded.
    # '=' should in principle not occur in a legal zhwikt title; not verified elsewhere.
    rx1 = re.compile(r'^(===*)\s*([^=]*?)\s*(===*)\s*?(\n+)', flags=re.MULTILINE)

    def _f1(m):
        # Level is the shorter of the two '=' runs.
        # NOTE(review): whether both runs are always equal on zhwikt still needs checking.
        return node(min(len(m[1]), len(m[3])), m[2], m.start(), m.end(), m[4])

    epn = pn[i]
    epg = en.Pages[epn]
    etxt = epg.text()
    if etxt == '':
        raise ValueError((13, i))
    ehl = list(map(_f1, rx1.finditer(etxt)))
    ert = node(0, '', 0, 0, '')
    _c = ert
    # Fails (error 10) when an h2 is followed first by an h4 and then by an h3.
    for nd in ehl:
        # Climb until the current node is not deeper than the new heading.
        while nd < _c:
            _c = _c.f
        if nd == _c:
            # Same level: append as the next sibling.
            _c.r = nd
            nd.f = _c.f
            _c = nd
        else:
            # Deeper level: must become the first child.
            if _c.l:
                raise ValueError((10, i))
            _c.l = nd
            nd.f = _c
            _c = nd
    # End marker for the root, then for every last sibling below it.
    ert.r = node(dummy=len(etxt))
    ert.gendummy()
    esecnode = ert.selectson('German').selectson('Pronunciation')
    estxt = etxt[esecnode.b:esecnode.c()]
    estxt = estxt.replace('{{PAGENAME}}', epn)

    def _f2(m):
        # The match always contains the template text, so it is never empty
        # and the replacement is always one newline.
        return '\n'

    (estxt, _cnt) = re.subn(r'(\n?)' + re.escape('{{rfap|de}}') + r'(\n?)', _f2, estxt)
    if _cnt:
        rfapde.add(i)
    esast = wtp.parse(estxt)
    ests = esast.templates

    def _f3(l, r):
        # Flag the page when the text between templates is more than
        # whitespace, list markers and punctuation.
        if l < r and re.search(r'[^ \t\n\*,,.。;;::#]', estxt[l:r]):
            customtext.add(i)

    lastr = 0
    for t in ests:
        l, r = t._span_data[:2]
        _f3(lastr, l)
        # max() keeps the cursor past nested templates.
        lastr = max(lastr, r)
    _f3(lastr, len(estxt))

    # Loop-invariant lookup tables.
    renames = {
        'rhyme': 'rhymes', 'rhymes': 'rhymes',
        'hyph': 'hyphenation', 'hyphenation': 'hyphenation',
        'hmp': 'homophones', 'homophone': 'homophones', 'homophones': 'homophones',
    }
    captions = {'': '音频', 'austria': '音频(奥地利)', 'bavaria': '音频(巴伐利亚)', 'bavarian': '音频(巴伐利亚)', 'berlin': '音频(柏林)', 'german': '音频(德国)', 'germany': '音频(德国)'}
    qualifier_names = ['i', 'q', 'qual', 'qualifier']
    known_qualifiers = ['overall more common; particularly northern and eastern regions', 'common form in southern Germany, Austria, and Switzerland', 'standard; used naturally in western Germany and Switzerland', 'colloquial; when unstressed by regular shortening, but also used when stressed', 'standard', 'often in fluent speech, not usually in isolation']

    for t in ests:
        tn = t.name.lower()
        if tn in ('ipa', 'enpr'):
            # Recognised; the template keeps whatever name it already has.
            pass
        elif tn in renames:
            t.name = renames[tn]
        elif tn == 'audio':
            args = t.arguments
            if len(args) >= 3 and args[2].positional:
                if re.search(re.escape(epn[1:]), args[2].value):
                    args[2].value = '音频'
                else:
                    # wipecl is not a built-in str method; it is added elsewhere.
                    try:
                        args[2].value = captions[args[2].value.lower().wipecl(' ()():', ['audio'])]
                    except Exception:
                        # Best effort: leave the caption alone and log it.
                        audiom.append((i, args[2].value))
        elif tn in ['a', 'accent'] + qualifier_names:
            if tn in qualifier_names:
                # Record qualifier texts that are not in the known list.
                for x in t.arguments:
                    if x.value not in known_qualifiers:
                        qualm.add(x.value)
            else:
                # Record every label used in an accent template.
                for x in t.arguments:
                    accent.add(x.value)
            # Qualifier and accent templates are all emitted as {{a}}.
            t.name = 'a'
        else:
            unknown.add(i)

    return esast.string

def process2():
    """Split the zhwikt page ``pn[i]`` around its German pronunciation section.

    Reads the module-level page index ``i`` and title list ``pn``, fetches the
    page from ``zh``, builds a heading tree and locates the ``发音`` child of
    the ``德语`` heading.

    Returns:
        list: ``[head, tail, err, payload]`` where

        * ``head`` is the page text up to the end of the section heading match,
        * ``tail`` is the page text from the end of the section onwards,
        * ``err`` is 0 when the section may be replaced, 1 when it contains
          free text beyond whitespace, punctuation and a few allowed words,
          and 2 when it contains a template outside the recognised set,
        * ``payload`` is the section text (with ``{{PAGENAME}}`` expanded)
          when ``err`` is non-zero, otherwise ``''``.

    Raises:
        ValueError: ``(13, i)`` if the page text is empty; ``(11, i)`` if a
            heading would have to become a child of a node that already has
            children; plus the ``(11, ...)`` / ``(12, ...)`` errors from
            ``node.selectson`` for a missing or duplicated heading.
    """
    # The title group is non-greedy (*?) so surrounding whitespace is excluded.
    # '=' should in principle not occur in a legal zhwikt title; not verified elsewhere.
    rx1 = re.compile(r'^(===*)\s*([^=]*?)\s*(===*)\s*?(\n+)', flags=re.MULTILINE)

    def _f1(m):
        # Level is the shorter of the two '=' runs.
        # NOTE(review): whether both runs are always equal on zhwikt still needs checking.
        return node(min(len(m[1]), len(m[3])), m[2], m.start(), m.end(), m[4])

    zpn = pn[i]
    zpg = zh.Pages[zpn]
    ztxt = zpg.text()
    if ztxt == '':
        # Same guard as process1: with empty text node(dummy=0) would build a
        # real node and gendummy() would fail with AttributeError.
        raise ValueError((13, i))
    zhl = list(map(_f1, rx1.finditer(ztxt)))
    zrt = node(0, '', 0, 0, '')
    _c = zrt
    # Fails (error 11) when an h2 is followed first by an h4 and then by an h3.
    for nd in zhl:
        # Climb until the current node is not deeper than the new heading.
        while nd < _c:
            _c = _c.f
        if nd == _c:
            # Same level: append as the next sibling.
            _c.r = nd
            nd.f = _c.f
            _c = nd
        else:
            # Deeper level: must become the first child.
            if _c.l:
                raise ValueError((11, i))
            _c.l = nd
            nd.f = _c
            _c = nd
    # End marker for the root, then for every last sibling below it.
    zrt.r = node(dummy=len(ztxt))
    zrt.gendummy()
    zsecnode = zrt.selectson('德语').selectson('发音')
    zret = [ztxt[0:zsecnode.b], ztxt[zsecnode.c():len(ztxt)], 0, '']
    zstxt = ztxt[zsecnode.b:zsecnode.c()]
    zstxt = zstxt.replace('{{PAGENAME}}', zpn)
    zsast = wtp.parse(zstxt)
    zsts = zsast.templates

    def _f3(l, r):
        # simp and str.wipel are defined elsewhere; the listed words are
        # tolerated between templates, anything else marks the section.
        if l < r and re.search(r'[^ \t\n\*,,.。;;::#]', simp(zstxt[l:r]).wipel(['奥地利', '标准', '德语', '发音', '男', '女'])):
            zret[2] = 1

    lastr = 0
    for t in zsts:
        l, r = t._span_data[:2]
        _f3(lastr, l)
        # max() keeps the cursor past nested templates.
        lastr = max(lastr, r)
    _f3(lastr, len(zstxt))
    if zret[2]:
        zret[3] = zstxt
        return zret

    # Templates that may be overwritten by the section imported from enwikt.
    known = {
        'ipa', 'ipa1', 'ipa4',
        'enpr',
        'rhyme', 'rhymes',
        'hyph', 'hyphenation',
        'hmp', 'homophone', 'homophones',
        'audio', 'audio1',
        'a', 'accent', 'i', 'q', 'qual', 'qualifier',
    }
    if any(t.name.lower() not in known for t in zsts):
        zret[2] = 2

    if zret[2]:
        zret[3] = zstxt

    return zret

def process():
    """Handle the page ``pn[i]``: import its pronunciation section from enwikt.

    * If ``process1`` raises ValueError, the exception is appended to
      ``fail`` and nothing is edited.
    * If ``process2`` raises ValueError, its tuple payload is appended to
      ``fail`` with the leading code increased by 10, and the enwikt section
      is saved to ``User:Qnm/epg-on/<title>``.
    * If ``process2`` reports a non-zero status, both sections are saved to
      ``User:Qnm/diff/<title>`` and the merged page to
      ``User:Qnm/zpg/<title>``; the entry itself is left alone.
    * Otherwise the entry is saved with the enwikt section spliced in.
    """
    try:
        en_section = process1()
    except ValueError as exc:
        fail.append(exc)
        return

    try:
        head, tail, status, zh_section = process2()
    except ValueError as exc:
        info = exc.args[0]
        # Shift the code by 10 to tell zhwikt-side failures apart.
        fail.append((info[0] + 10,) + info[1:])
        tryedit(zh.Pages['User:Qnm/epg-on/%s' % pn[i]], en_section)
        return

    if not status:
        tryedit(zh.Pages[pn[i]], head + en_section + tail, summary=defaultsummary, fail=fail)
        return

    # The existing section cannot be replaced safely: park both versions
    # and the would-be result on user subpages instead.
    tryedit(zh.Pages['User:Qnm/diff/%s' % pn[i]], en_section + "\n\n🐝👨🏻🐮🍺\n\n" + zh_section)
    tryedit(zh.Pages['User:Qnm/zpg/%s' % pn[i]], head + en_section + tail)