# HG changeset patch
# User Marcin Cieślak
# Date 1657321633 0
#      Fri Jul 08 23:07:13 2022 +0000
# Node ID d8e371f7a09e142f3ce1a6cf6f0a0075770853ae
# Parent  6102d422b5ba5ce7f330dc8a0e9a9b005ccdac61
Związki chemiczne według pierwiastków: 2022 update, Python 3

diff --git a/plwiki/chemia/comp.py b/plwiki/chemia/comp.py
--- a/plwiki/chemia/comp.py
+++ b/plwiki/chemia/comp.py
@@ -1,8 +1,9 @@
 #! /usr/bin/env python
-# -*- coding: utf-8 -*-
-import pagegenerators
 import re
-import wikipedia
+import shelve
+import pywikibot
+from pywikibot import pagegenerators
+import wikitextparser
 
 from element import ElementSet
 
@@ -10,47 +11,54 @@
 import logging.config
 logging.config.fileConfig("/home/saper/wikipedia/log/bots.conf", disable_existing_loggers=True)
-wikipedia.logger = logging.getLogger('plwiki')
+pywikibot.logger = logging.getLogger('plwiki')
 mywiki = logging.getLogger('chemia')
 
-MAINPAGENAME = u"Związki chemiczne według pierwiastków/Wykaz"
-MAINPAGECAT = u"Kategoria:Wikiprojekt Chemia"
-ERRORPAGENAME = MAINPAGENAME + u"/Błędy"
-ERRORPAGECAT = u"Kategoria:Problemy z hasłami chemicznymi"
-MAINTEMPLATENAME =u"Szablon:Związek chemiczny infobox"
+MAINPAGENAME = "Związki chemiczne według pierwiastków/Wykaz"
+MAINPAGECAT = "Kategoria:Wikiprojekt Chemia"
+ERRORPAGENAME = MAINPAGENAME + "/Błędy"
+ERRORPAGECAT = "Kategoria:Problemy z hasłami chemicznymi"
+MAINTEMPLATENAME = "Związek chemiczny infobox"
+MAINTEMPLATEPAGE = "Szablon:" + MAINTEMPLATENAME
 
 def CompoundGenerator(site, template):
-    gen = pagegenerators.ReferringPageGenerator(
-        wikipedia.Page(site, template),
-        onlyTemplateInclusion=True)
-    return pagegenerators.PreloadingGenerator(gen, pageNumber=200)
+    gen = pywikibot.Page(site, template).embeddedin()
+    return pagegenerators.PreloadingGenerator(gen, groupsize=200)
 
-class IgnoreThis:
+class IgnoreThis(Exception):
     pass
 
-class CannotExtractError:
+class CannotExtractError(Exception):
     pass
 
 debug = 0
 
 class CompoundWikiReader:
 
-    def __init__(self, page,
-        pattern = re.compile(u"[wW]zór sumaryczny\s*=\s*(?P<comp>.*)", re.M)):
-
-        if page.namespace() <> 0:
+    def __init__(self, page):
+        if page.namespace() != 0:
             raise IgnoreThis
         s = page.get()
-        m = pattern.search(s)
-        if m:
-            if m.group("comp").find("|") == -1:
-                title = page.title()
-                templatevalue = m.group("comp")
-                self.compound = Compound(title, templatevalue)
-            else:
-                raise CannotExtractError
-        else:
+        mywiki.debug("Czytamy '%s'", page.title())
+        self.compound = None
+        for template in wikitextparser.parse(s).templates:
+            mywiki.debug(".. mamy szablon '%s'", template.name.strip())
+            if template.name.strip() == MAINTEMPLATENAME:
+
+                mywiki.debug("... jest, %d", len(template.arguments))
+                for arg in template.arguments:
+                    mywiki.debug("param: %s", arg.name)
+                    if arg.name.strip().lower() == "Wzór sumaryczny".lower():
+                        mywiki.debug(".... mamy wzór sumaryczny: '%s'", (arg.value))
+
+                        # self.compound = (page.title(), arg.value)
+                        self.compound = Compound(page.title(), arg.value)
+                        break
+                break
+
+        if not self.compound:
+            mywiki.debug("'%s' nie ma wzoru sumarycznego", page.title())
             raise CannotExtractError
 
 class Compound:
@@ -78,7 +86,10 @@
 
     def __init__(self, arg1, arg2=None):
         if arg2 is None:
-            (self.title, self.templatevalue) = arg1.decode("utf-8").split(u"\t")
+            try:
+                (self.title, self.templatevalue) = arg1.split("\t")
+            except ValueError as cant_unpack:
+                raise ValueError("Can't unpack \"%s\"" % arg1) from cant_unpack
         else:
             self.title = arg1
             self.templatevalue = arg2
@@ -97,7 +108,7 @@
         return w
 
     def __repr__(self):
-        return (u"\t".join([self.title, self.templatevalue])).encode("utf-8")
+        return "\t".join([self.title, self.templatevalue.strip()])
 
     def __str__(self):
         return self.compound.strip()
@@ -112,24 +123,25 @@
 
 def extractwiki(site, reallybadfile, compgen, elements):
     compounds = []
-    badlist = [u"==Artykuły o związkach chemicznych, w których robot nie znalazł wzoru sumarycznego==\n"]
-    for comp in compgen:
-        try:
-            c = CompoundWikiReader(comp)
-            compounds.append(c.compound)
-        except IgnoreThis:
-            pass
-        except CannotExtractError:
+    badlist = ["==Artykuły o związkach chemicznych, w których robot nie znalazł wzoru sumarycznego==\n"]
+    with shelve.open("/tmp/chemia") as db:
+        for comp in compgen:
             try:
-                badlist.append(u"\n* [[%s]]" % unicode(comp.title()))
-            except UnicodeDecodeError:
-                print >>reallybadfile, repr(comp.title)
+                c = CompoundWikiReader(comp)
+                compounds.append(c.compound)
+            except IgnoreThis:
+                pass
+            except CannotExtractError:
+                try:
+                    badlist.append("\n* [[%s]]" % str(comp.title()))
+                except UnicodeDecodeError:
+                    print(repr(comp.title), file=reallybadfile)
     return (compounds, badlist)
 
 def formatgroup(group, ch, elementopen, cntcur, cntcursiz):
     if group:
         if cntcursiz > 5:
-            group.insert(0, u"===%s%d===" % (ch.symbol, cntcur))
+            group.insert(0, "===%s%d===" % (ch.symbol, cntcur))
         else:
             if elementopen:
                 group.insert(0, "") # Additional newline
@@ -139,11 +151,11 @@
 
 def formatter(elements):
-    out = ["__NOEDITSECTION__", u"{{Spis treści}}"]
+    out = ["__NOEDITSECTION__", "{{Spis treści}}"]
     group = []
     for (ch, ac) in elements.allarticles():
-        out.append(u"==%s==" % unicode(ch.wikilink))
-        ac.sort()
+        out.append("==%s==" % str(ch.wikilink))
+        ac.sort(key=lambda pair: (pair[0], pair[1].compound.strip()))
         group = []
         cntcur = 1
         cntcursiz = 0
@@ -158,7 +170,7 @@
                 cntcursiz = 0
                 cntcur = cnt
 
-            group.append(u"%s" % a.wikilink())
+            group.append("%s" % a.wikilink())
             cntcursiz += 1
 
         got = formatgroup(group, ch, elementopen, cntcur, cntcursiz)
@@ -168,23 +180,22 @@
 
 def processwikilive(elements):
-    errpage = wikipedia.Page(site, ERRORPAGENAME)
+    errpage = pywikibot.Page(site, ERRORPAGENAME)
     reallybadfile = open("bad", "w")
-    (compounds, badlist) = extractwiki(site, reallybadfile, CompoundGenerator(site, MAINTEMPLATENAME), elements)
+    (compounds, badlist) = extractwiki(site, reallybadfile, CompoundGenerator(site, MAINTEMPLATEPAGE), elements)
     reallybadfile.close()
 
     compfile = open("compfile", "w")
     for comp in compounds:
-        print >>compfile, repr(comp)
+        print(repr(comp), file=compfile)
         elements.feed(comp, comp.elements)
     compfile.close()
 
     if badlist:
         badlist.extend(["", "[[%s]]" % ERRORPAGECAT])
-        errpage.put(u"".join(badlist),
-                    comment=u"Robot zapisuje błędy wynikłe podczas tworzenia [[%s]]" % MAINPAGENAME, minorEdit=False, botflag=False)
-    mywiki.info(u"Pierwiastki zostały przetworzone")
+        errpage.put("".join(badlist),
summary="Robot zapisuje błędy wynikłe podczas tworzenia [[%s]]" % MAINPAGENAME, minorEdit=False, botflag=False) + mywiki.info("Pierwiastki zostały przetworzone") def processcompfile(elements, compfile=None): @@ -201,7 +212,7 @@ def runbot(): import sys - outpage = wikipedia.Page(site, MAINPAGENAME) + outpage = pywikibot.Page(site, MAINPAGENAME) if len(sys.argv) > 1: processcompfile(elements, open(sys.argv[1], "r")) else: @@ -209,11 +220,9 @@ out = formatter(elements) if out: out.extend(["", "[[%s]]" % MAINPAGECAT]) - outpage.put(u"\n".join(out), comment=u"Robot tworzy stronę na podstawie [[Specjalna:Linkujące/%s|zawartości infoboksów]]" % MAINTEMPLATENAME, - minorEdit=False, botflag=False) + outpage.put("\n".join(out), summary="Robot tworzy stronę na podstawie [[Specjalna:Linkujące/%s|zawartości infoboksów]]" % MAINTEMPLATENAME, minorEdit=False, botflag=False) -site = wikipedia.getSite('pl', 'wikipedia') -wikipedia.setSite(site) +site = pywikibot.Site('pl', 'wikipedia') elements = loadelements(site) if __name__ == '__main__': runbot() diff --git a/plwiki/chemia/element.py b/plwiki/chemia/element.py --- a/plwiki/chemia/element.py +++ b/plwiki/chemia/element.py @@ -1,7 +1,6 @@ #! /usr/bin/env python -# -*- coding: utf-8 -*- import re -import wikipedia +import pywikibot class CannotExtractNameError: pass @@ -29,9 +28,9 @@ self.article = self.article.capitalize() if self.name == self.article: - self.wikilink = u'[[%s]]' % self.name + self.wikilink = '[[%s]]' % self.name else: - self.wikilink = u'[[%s|%s]]' % (self.article, self.name) + self.wikilink = '[[%s|%s]]' % (self.article, self.name) def __cmp__(a, b): return cmp(a.name, b.name) @@ -46,8 +45,8 @@ def __init__(self, site, - page = u"Lista pierwiastków chemicznych", - pattern = u"^\| (?P\w{1,2}) \|\| (?P.*?) \|\|.*$", + page = "Lista pierwiastków chemicznych", + pattern = "^\| (?P\w{1,2}) \|\| (?P.*?) \|\|.*$", ): self.wikilink = {} @@ -57,7 +56,7 @@ self.site = site for p in re.compile(pattern, re.M).finditer( - wikipedia.Page(self.site, page).get()): + pywikibot.Page(pywikibot.Link(page)).get()): # self.wikilink[p.group("symbol")]=p.group("wikilink") @@ -90,9 +89,7 @@ input = input[:sympos] + input[npos:] def allarticles(self): - s = self.articles.keys() - s.sort() - for ch in s: + for ch in sorted(self.articles.keys(), key=lambda a: a.name): if self.articles[ch]: yield(ch, self.articles[ch]) @@ -110,8 +107,7 @@ out = [] if not ac: ac = self.articles[el] - ac.sort() - for (cnt, a) in ac: + for (cnt, a) in sorted(ac, lambda pair: (pair[0], pair[1].compound.strip())): out.append(self.printgroup(el, cnt, a)) return "".join(out) @@ -123,9 +119,9 @@ if __name__ == '__main__': - q = ElementSet(wikipedia.getSite('pl', 'wikipedia')) + q = ElementSet(pywikibot.Site('pl', 'wikipedia')) q.parse("Test", "CaCOAuOOH") q.parse("Test2", "Pu") q.parse("Test3", "CaHe") q.parse("Test4", "ZrCaHe") - print unicode(q) + print(str(q))