User:CrowleyBot/task/4

維基詞典,自由的多語言詞典

技术细节[编辑]

  • 使用enwikt对应章节的模板替换zhwikt对应模板
  • 其实对语言限定的模板而言,不需要章节对应,但为了泛用性而保留相关代码
  • process函数只处理zh恰有一个模板、且en有一个模板或多个完全一致的模板的情况。zh有多个模板时输出至fail1,en无模板或模板不一致时输出至fail2
  • dryprocess观察zh与en指定模板,方便下一步处理
  • forceprocess使用en的第一个模板,或给定字符串替换zh的所有指定模板
  • concordprocess处理zh与en模板数相同的情况,按出现顺序逐个替换,务必仅在两边模板按顺序一一对应时使用。

源代码[编辑]

import re

from prelude import  *
from botaccount import *

# --- Sessions -----------------------------------------------------------
# One logged-in session per wiki; UA, UN and PWD come from the star imports.
zh = mwc.Site('zh.wiktionary.org', clients_useragent = UA)
en = mwc.Site('en.wiktionary.org', clients_useragent = UA)
zh.login(UN, PWD)
en.login(UN, PWD)
# NOTE(review): SLP is not read anywhere in the visible code -- presumably a
# sleep interval consumed by a helper elsewhere; confirm before removing.
SLP = 0

# --- Work list ----------------------------------------------------------
# Main-namespace zhwikt pages that embed the template being repaired.
focustemp = zh.Pages['Template:la-noun']
zpgl = list(focustemp.embed(namespace = 0))
# Alternative work list (disabled): members of a zhwikt maintenance category.
#zpgl = list(zh.categories['有脚本错误的页面'])
pn = [p.name for p in zpgl]  # page titles, indexed by the cursor ``i``
n = len(zpgl)

#epgl = list(map(getepg, zpgl))
# Result/log accumulators. Only ``fail`` (handed to tryedit by process) and
# ``fail1``/``fail2`` are written by the functions visible in this file.
fail, success, purifylog, titlelog, successall = [], [], [], [], []
dbg, dbgout = False, None
fail1, fail2 = [], []

# ``i`` is the cursor into ``pn`` that every *process function reads.
i, j = 0, 0 # j is not used as a global; it serves no purpose
focus = "la-noun"   # template name, without the "Template:" prefix
subtitle = 'Latin'  # title of the enwikt level-2 section to read from

# ``focus`` is escaped before being placed in a pattern so that template
# names containing regex metacharacters (for example "t+") match literally.
# re1: "{{" + focus with either capitalisation of its first letter,
#      followed (lookahead only) by "|" or "}".
re1 = (r'\{\{'
       + '[%s%s]' % (re.escape(focus[0].upper()), re.escape(focus[0].lower()))
       + re.escape(focus[1:])
       + r'(?=[\|\}])')
# re2: canonical spelling; used as the replacement text for re1 matches.
re2 = '{{%s' % focus
# re3: "{{" + focus exactly as spelled, followed (lookahead) by "|" or "}".
re3 = r'\{\{' + re.escape(focus) + r'(?=[\|\}])'
# re4: a complete template call that contains no nested braces.
re4 = re3 + r'[^{}]*\}\}'
defaultsummary = 'Bot: 自enwikt修%s' % focus
nslist = [0]  # namespaces the *process functions are allowed to touch

def process():
    """Replace the single zhwikt ``focus`` template on ``pn[i]`` with enwikt's.

    Works on the page titled ``pn[i]`` (``i`` is the module-level cursor).
    The page is edited only when the zhwikt text holds exactly one ``focus``
    template and the ``focus`` templates found in the enwikt level-2 section
    titled ``subtitle`` are accepted by ``its.all_equal``.

    Outcomes:
      * zhwikt page outside ``nslist`` or without the template: nothing.
      * more than one zhwikt template: ``(i, title)`` is printed and
        appended to ``fail1``.
      * no usable enwikt template: ``(2, i, title)`` is printed,
        ``(i, title)`` is appended to ``fail2`` and each enwikt candidate
        is printed.
      * otherwise ``(0, i, template)`` is printed and the new text is
        submitted through ``tryedit`` with ``defaultsummary``.
    """
    zpg = zh.Pages[pn[i]]
    if zpg.namespace not in nslist:
        return
    # Normalise the capitalisation of the template's first letter so that
    # re3 (exact spelling) finds every call.
    ztext = re.sub(re1, re2, zpg.text())
    zcount = sum(1 for _ in re.finditer(re3, ztext))
    if zcount == 0:
        return
    if zcount > 1:
        print((i, pn[i]))
        fail1.append((i, pn[i]))
        return

    # The enwikt page is requested only now that it is known to be needed.
    east = wtp.parse(en.Pages[pn[i]].text())
    etmatch = []
    for esec in east.get_sections(level = 2):
        # NOTE: the original author was not fully confident in this title
        # normalisation (stripping spaces and square brackets).
        if esec.title.strip(' []') != subtitle:
            continue
        for et in esec.templates:
            if et.name == focus:
                etmatch.append(et.string)

    if len(etmatch) >= 1 and its.all_equal(etmatch):
        etm = etmatch[0]
        print((0, i, etm))
        # Only the opening "{{focus" is swapped; the zhwikt parameters and
        # closing braces stay in place behind the inserted text.
        replacement = '{{subst:realtemplate|faketemplate=%s' % etm
        # A callable replacement inserts the text literally; a replacement
        # *string* would have its backslashes parsed as escapes/group refs.
        ntxt = re.sub(re3, lambda _m: replacement, ztext, count = 1)
        tryedit(zpg, ntxt, summary=defaultsummary, fail=fail)
    else:
        print((2, i, pn[i]))
        fail2.append((i, pn[i]))
        for x in etmatch:
            print(x)

def dryprocess():
    """Print the ``focus`` templates of ``pn[i]`` on both wikis; edit nothing.

    For the page titled ``pn[i]`` this prints a ``zh`` header followed by
    every brace-free ``focus`` template call in the zhwikt text (pattern
    ``re4``), then an ``en`` header followed by every ``focus`` template in
    the enwikt level-2 section titled ``subtitle``.  Pages whose zhwikt
    namespace is not in ``nslist`` are skipped silently.
    """
    title = pn[i]
    zh_page = zh.Pages[title]
    en_page = en.Pages[title]
    if zh_page.namespace not in nslist:
        return

    # zhwikt side: normalise first-letter capitalisation, then list the calls.
    normalised = re.sub(re1, re2, zh_page.text())
    zh_hits = [m.group(0) for m in re.finditer(re4, normalised)]
    print('zh %d [[%s]]' % (i, title))
    for hit in zh_hits:
        print(hit)

    # enwikt side: collect matching templates from the wanted section only.
    tree = wtp.parse(en_page.text())
    en_hits = []
    for section in tree.get_sections(level = 2):
        # NOTE: the original author was not fully confident in this title
        # normalisation (stripping spaces and square brackets).
        if section.title.strip(' []') == subtitle:
            en_hits.extend(t.string for t in section.templates if t.name == focus)
    print('en [[:en:%s]]' % title)
    for hit in en_hits:
        print(hit)

def forceprocess(dest = ""):
    """Overwrite every zhwikt ``focus`` template head on ``pn[i]``.

    Args:
        dest: wikitext to insert after ``faketemplate=``.  When empty, the
            first ``focus`` template found in the enwikt level-2 section
            titled ``subtitle`` is used instead.

    Every occurrence of ``{{focus`` in the zhwikt text is replaced by
    ``{{subst:realtemplate|faketemplate=<text>`` and the page is saved with
    ``defaultsummary``.  Nothing happens when the page is outside ``nslist``,
    when it has no ``focus`` template, or when ``dest`` is empty and enwikt
    offers no template.
    """
    zpg = zh.Pages[pn[i]]
    if zpg.namespace not in nslist:
        return
    # Normalise the capitalisation of the template's first letter so that
    # re3 (exact spelling) finds every call.
    ztext = re.sub(re1, re2, zpg.text())
    if re.search(re3, ztext) is None:
        return

    chosen = dest
    if not chosen:
        # enwikt is consulted only when the caller supplied no text.
        east = wtp.parse(en.Pages[pn[i]].text())
        etmatch = []
        for esec in east.get_sections(level = 2):
            # NOTE: the original author was not fully confident in this
            # title normalisation (stripping spaces and square brackets).
            if esec.title.strip(' []') != subtitle:
                continue
            for et in esec.templates:
                if et.name == focus:
                    etmatch.append(et.string)
        if not etmatch:
            return
        chosen = etmatch[0]

    replacement = '{{subst:realtemplate|faketemplate=%s' % chosen
    # A callable replacement inserts the text literally; a replacement
    # *string* would have its backslashes parsed as escapes/group refs,
    # which matters because ``dest`` is arbitrary caller input.
    zpg.edit(re.sub(re3, lambda _m: replacement, ztext), defaultsummary)

# TODO: ideally this should also print the zhwikt content
def concordprocess():
    """Pair zhwikt and enwikt ``focus`` templates by position and replace.

    For the page titled ``pn[i]``: when the zhwikt text and the enwikt
    level-2 section titled ``subtitle`` contain the same number of ``focus``
    templates, the k-th zhwikt ``{{focus`` is replaced by
    ``{{subst:realtemplate|faketemplate=`` followed by ``fake(...)`` of the
    k-th enwikt template, and the page is saved with ``defaultsummary``.
    When the counts differ, ``(i, "different length")`` is printed and the
    page is left alone.  Pages outside ``nslist`` or without the template
    are skipped silently.

    The pairing is purely positional, so call this only when the templates
    on both wikis really correspond in order.
    """
    zpg = zh.Pages[pn[i]]
    if zpg.namespace not in nslist:
        return
    # Normalise the capitalisation of the template's first letter so that
    # re3 (exact spelling) finds every call.
    ztext = re.sub(re1, re2, zpg.text())
    zcount = sum(1 for _ in re.finditer(re3, ztext))
    if zcount == 0:
        return

    # The enwikt page is requested only now that it is known to be needed.
    east = wtp.parse(en.Pages[pn[i]].text())
    etmatch = []
    for esec in east.get_sections(level = 2):
        # NOTE: the original author was not fully confident in this title
        # normalisation (stripping spaces and square brackets).
        if esec.title.strip(' []') != subtitle:
            continue
        for et in esec.templates:
            if et.name == focus:
                etmatch.append(et.string)

    if len(etmatch) != zcount:
        print((i, "different length"))
        return

    pending = iter(etmatch)

    def _swap(_match):
        # Consumes the enwikt templates in order, one per zhwikt match.
        return '{{subst:realtemplate|faketemplate=%s' % fake(next(pending))

    # A single pass guarantees each original zhwikt template is visited
    # exactly once: repeated count=1 substitutions restart at the top of the
    # text and could re-match inside text inserted by an earlier round.  The
    # callable also keeps backslashes in the inserted text literal.
    zpg.edit(re.sub(re3, _swap, ztext), defaultsummary)