Związki chemiczne według pierwiastków: 2022 update, Python 3
2 files changed, 79 insertions(+), 74 deletions(-)

M plwiki/chemia/comp.py
M plwiki/chemia/element.py
M plwiki/chemia/comp.py +66 -57
@@ 1,8 1,9 @@ 
 #! /usr/bin/env python
-# -*- coding: utf-8  -*-
-import pagegenerators
 import re
-import wikipedia
+import shelve
+import pywikibot
+from pywikibot import pagegenerators
+import wikitextparser
 
 from element import ElementSet
 

          
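The compat framework's pagegenerators.ReferringPageGenerator is gone; Page.embeddedin() now yields the pages transcluding a template, and PreloadingGenerator fetches their texts in batches. A minimal standalone sketch of the pattern used in CompoundGenerator below:

    import pywikibot
    from pywikibot import pagegenerators

    site = pywikibot.Site("pl", "wikipedia")
    template = pywikibot.Page(site, "Szablon:Związek chemiczny infobox")
    # Pages that transclude the infobox, with texts preloaded 200 at a time:
    for page in pagegenerators.PreloadingGenerator(template.embeddedin(), groupsize=200):
        print(page.title())
        break  # demonstrate the first hit only
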
@@ 10,47 11,54 @@ import logging
 import logging.config
 
 logging.config.fileConfig("/home/saper/wikipedia/log/bots.conf", disable_existing_loggers=True)
-wikipedia.logger = logging.getLogger('plwiki')
+pywikibot.logger = logging.getLogger('plwiki')
 mywiki = logging.getLogger('chemia')
 
-MAINPAGENAME = u"Związki chemiczne według pierwiastków/Wykaz"
-MAINPAGECAT = u"Kategoria:Wikiprojekt Chemia"
-ERRORPAGENAME = MAINPAGENAME + u"/Błędy"
-ERRORPAGECAT = u"Kategoria:Problemy z hasłami chemicznymi"
-MAINTEMPLATENAME =u"Szablon:Związek chemiczny infobox" 
+MAINPAGENAME = "Związki chemiczne według pierwiastków/Wykaz"
+MAINPAGECAT = "Kategoria:Wikiprojekt Chemia"
+ERRORPAGENAME = MAINPAGENAME + "/Błędy"
+ERRORPAGECAT = "Kategoria:Problemy z hasłami chemicznymi"
+MAINTEMPLATENAME = "Związek chemiczny infobox"
+MAINTEMPLATEPAGE = "Szablon:" + MAINTEMPLATENAME
 
 def CompoundGenerator(site, template):
-	gen = pagegenerators.ReferringPageGenerator(
-		wikipedia.Page(site, template), 
-		onlyTemplateInclusion=True)
-	return pagegenerators.PreloadingGenerator(gen, pageNumber=200)
+	gen = pywikibot.Page(site, template).embeddedin()
+	return pagegenerators.PreloadingGenerator(gen, groupsize=200)
 
-class IgnoreThis:
+class IgnoreThis(Exception):
 	pass
 
-class CannotExtractError:
+class CannotExtractError(Exception):
 	pass
 
 debug = 0
 
 class CompoundWikiReader:
 
-	def __init__(self, page,
-		pattern = re.compile(u"[wW]zór sumaryczny\s*=\s*(?P<comp>.*)", re.M)):
-
-		if page.namespace() <> 0:
+	def __init__(self, page):
+		if page.namespace() != 0:
 			raise IgnoreThis
 			
 		s = page.get()
-		m = pattern.search(s)
-		if m:
-			if m.group("comp").find("|") == -1:
-				title = page.title()
-				templatevalue = m.group("comp")
-				self.compound = Compound(title, templatevalue)
-			else:
-				raise CannotExtractError
-		else:
+		mywiki.debug("Czytamy '%s'", page.title())
+		self.compound = None
+		for template in wikitextparser.parse(s).templates:
+			mywiki.debug(".. mamy szablon '%s'", template.name.strip())
+			if template.name.strip() == MAINTEMPLATENAME:
+
+				mywiki.debug("... jest, %d", len(template.arguments))
+				for arg in template.arguments:
+					mywiki.debug("param: %s", arg.name)
+					if arg.name.strip().lower() == "wzór sumaryczny":
+						mywiki.debug(".... mamy wzór sumaryczny: '%s'", arg.value)
+
+						# remember the article title together with the formula value
+						self.compound = Compound(page.title(), arg.value)
+						break
+				break
+
+		if not self.compound:
+		mywiki.debug("'%s' nie ma wzoru sumarycznego", page.title())
 			raise CannotExtractError
 
 class Compound:

          
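The regex lookup for the "wzór sumaryczny" parameter above gives way to walking the parse tree that wikitextparser builds, which copes with parameters in any order and with nested templates. A minimal offline sketch of the same lookup (the sample wikitext is made up):

    import wikitextparser

    SAMPLE = "{{Związek chemiczny infobox\n|wzór sumaryczny = H<sub>2</sub>O\n}}"
    for template in wikitextparser.parse(SAMPLE).templates:
        if template.name.strip() == "Związek chemiczny infobox":
            for arg in template.arguments:
                if arg.name.strip().lower() == "wzór sumaryczny":
                    print(arg.value.strip())  # -> H<sub>2</sub>O
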
@@ 78,7 86,10 @@ class Compound:
 
 	def __init__(self, arg1, arg2=None):
 		if arg2 is None:
-			(self.title, self.templatevalue) = arg1.decode("utf-8").split(u"\t")
+			try:
+				(self.title, self.templatevalue) = arg1.split("\t")
+			except ValueError as cant_unpack:
+				raise ValueError("Can't unpack \"%s\"" % arg1) from cant_unpack
 		else:
 			self.title = arg1
 			self.templatevalue = arg2

          
@@ 97,7 108,7 @@ class Compound:
 		return w
 
 	def __repr__(self):
-		return (u"\t".join([self.title, self.templatevalue])).encode("utf-8")
+		return "\t".join([self.title, self.templatevalue.strip()])
 
 	def __str__(self):
 		return self.compound.strip()

          
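The __repr__ below and the one-argument constructor above are two halves of the compfile format: one tab-separated "title<TAB>templatevalue" line per compound. A round-trip sketch, ignoring whatever further parsing the constructor does outside this hunk:

    c = Compound("Woda", "H2O")
    line = repr(c)        # "Woda\tH2O" -- the form processwikilive writes to compfile
    c2 = Compound(line)   # the single-argument form splits the line back apart
    assert (c2.title, c2.templatevalue) == ("Woda", "H2O")
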
@@ 112,24 123,25 @@ class Compound:
 
 def extractwiki(site, reallybadfile, compgen, elements):
 	compounds = []
-	badlist = [u"==Artykuły o związkach chemicznych, w których robot nie znalazł wzoru sumarycznego==\n"]
-	for comp in compgen:
-		try:
-			c = CompoundWikiReader(comp)
-			compounds.append(c.compound)
-		except IgnoreThis:
-			pass
-		except CannotExtractError:
+	badlist = ["==Artykuły o związkach chemicznych, w których robot nie znalazł wzoru sumarycznego==\n"]
+	with shelve.open("/tmp/chemia") as db:
+		for comp in compgen:
 			try:
-				badlist.append(u"\n* [[%s]]" %  unicode(comp.title()))
-			except UnicodeDecodeError:
-				print >>reallybadfile, repr(comp.title)
+				c = CompoundWikiReader(comp)
+				compounds.append(c.compound)
+			except IgnoreThis:
+				pass
+			except CannotExtractError:
+				try:
+					badlist.append("\n* [[%s]]" % comp.title())
+				except UnicodeDecodeError:
+					print(repr(comp.title()), file=reallybadfile)
 	return (compounds, badlist)
 
 def formatgroup(group, ch, elementopen, cntcur, cntcursiz):
 	if group:
 		if cntcursiz > 5:
-			group.insert(0, u"===%s<sub>%d</sub>===" % (ch.symbol, cntcur))
+			group.insert(0, "===%s<sub>%d</sub>===" % (ch.symbol, cntcur))
 		else:
 			if elementopen:
 				group.insert(0, "") # Additional newline

          
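shelve.open gives a persistent, dict-like store backed by a file, and its context-manager form (Python 3.4+) closes the store automatically; the db handle is not referenced in the hunk above, so it reads as a hook for caching page data between runs. A standalone sketch of the API, with a hypothetical path:

    import shelve

    with shelve.open("/tmp/chemia-demo") as db:   # hypothetical scratch file
        db["Woda"] = "H2O"                        # values are pickled transparently

    with shelve.open("/tmp/chemia-demo") as db:   # entries survive reopening
        print(db.get("Woda"))                     # -> H2O
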
@@ 139,11 151,11 @@ def formatgroup(group, ch, elementopen, 
 
 
 def formatter(elements):
-	out = ["__NOEDITSECTION__", u"{{Spis treści}}"]
+	out = ["__NOEDITSECTION__", "{{Spis treści}}"]
 	group = []
 	for (ch, ac) in elements.allarticles():
-		out.append(u"==%s==" % unicode(ch.wikilink))
-		ac.sort()
+		out.append("==%s==" % ch.wikilink)
+		ac.sort(key=lambda pair: (pair[0], pair[1].compound.strip()))
 		group = []
 		cntcur = 1
 		cntcursiz = 0

          
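The explicit sort key replaces a bare ac.sort(): Python 2 silently fell back to comparing the paired article objects, while Python 3 raises TypeError as soon as two counts tie. A toy illustration with a stand-in class:

    class Article:                      # stand-in for the real article objects
        def __init__(self, compound):
            self.compound = compound

    pairs = [(1, Article("CaO ")), (1, Article("CaC2"))]
    # pairs.sort() -> TypeError: '<' not supported between instances of 'Article'
    pairs.sort(key=lambda pair: (pair[0], pair[1].compound.strip()))
    print([p[1].compound.strip() for p in pairs])  # -> ['CaC2', 'CaO']
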
@@ 158,7 170,7 @@ def formatter(elements):
 				cntcursiz = 0
 				cntcur = cnt
 
-			group.append(u"%s" % a.wikilink())
+			group.append("%s" % a.wikilink())
 			cntcursiz += 1
          
 		got = formatgroup(group, ch, elementopen, cntcur, cntcursiz)

          
@@ 168,23 180,22 @@ def formatter(elements):
 
 
 def processwikilive(elements):
-	errpage = wikipedia.Page(site, ERRORPAGENAME)
+	errpage = pywikibot.Page(site, ERRORPAGENAME)
 
 	reallybadfile = open("bad", "w")
-	(compounds, badlist) = extractwiki(site, reallybadfile, CompoundGenerator(site, MAINTEMPLATENAME), elements)
+	(compounds, badlist) = extractwiki(site, reallybadfile, CompoundGenerator(site, MAINTEMPLATEPAGE), elements)
 	reallybadfile.close()
 
 	compfile = open("compfile", "w")
 	for comp in compounds:
-		print >>compfile, repr(comp)
+		print(repr(comp), file=compfile)
 		elements.feed(comp, comp.elements)
 	compfile.close()
 		
 	if badlist:
 		badlist.extend(["", "[[%s]]" % ERRORPAGECAT])
-	errpage.put(u"".join(badlist), 
-		comment=u"Robot zapisuje błędy wynikłe podczas tworzenia [[%s]]" % MAINPAGENAME, minorEdit=False, botflag=False)
-	mywiki.info(u"Pierwiastki zostały przetworzone")
+	errpage.put("".join(badlist), summary="Robot zapisuje błędy wynikłe podczas tworzenia [[%s]]" % MAINPAGENAME, minor=False, botflag=False)
+	mywiki.info("Pierwiastki zostały przetworzone")
 
 
 def processcompfile(elements, compfile=None):

          
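Page.put() in pywikibot takes summary=, minor= and botflag=; the compat-era comment= and minorEdit= keywords no longer exist, hence the keyword changes in the hunk above. A minimal sketch against a throwaway page (the title is hypothetical):

    import pywikibot

    site = pywikibot.Site("pl", "wikipedia")
    page = pywikibot.Page(site, "Wikipedysta:Przykład/brudnopis")  # hypothetical sandbox
    page.put("test\n", summary="Robot testuje zapis", minor=False, botflag=False)
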
@@ 201,7 212,7 @@ def loadelements(site):
 
 def runbot():
 	import sys
-	outpage = wikipedia.Page(site, MAINPAGENAME)
+	outpage = pywikibot.Page(site, MAINPAGENAME)
 	if len(sys.argv) > 1:
 		processcompfile(elements, open(sys.argv[1], "r"))
 	else:

          
@@ 209,11 220,9 @@ def runbot():
 	out = formatter(elements)
 	if out:
 		out.extend(["", "[[%s]]" % MAINPAGECAT])
-	outpage.put(u"\n".join(out), comment=u"Robot tworzy stronę na podstawie [[Specjalna:Linkujące/%s|zawartości infoboksów]]" % MAINTEMPLATENAME, 
-		minorEdit=False, botflag=False)
+	outpage.put("\n".join(out), summary="Robot tworzy stronę na podstawie [[Specjalna:Linkujące/%s|zawartości infoboksów]]" % MAINTEMPLATENAME, minor=False, botflag=False)
 
-site = wikipedia.getSite('pl', 'wikipedia')
-wikipedia.setSite(site)
+site = pywikibot.Site('pl', 'wikipedia')
 elements = loadelements(site)
 if __name__ == '__main__':
 	runbot()

          
M plwiki/chemia/element.py +13 -17
@@ 1,7 1,6 @@ 
 #! /usr/bin/env python
-# -*- coding: utf-8  -*-
 import re
-import wikipedia
+import pywikibot
 
-class CannotExtractNameError:
+class CannotExtractNameError(Exception):
 	pass

          
@@ 29,9 28,9 @@ class Element:
 		self.article = self.article.capitalize()
 
 		if self.name == self.article:
-			self.wikilink = u'[[%s]]' % self.name
+			self.wikilink = '[[%s]]' % self.name
 		else:
-			self.wikilink = u'[[%s|%s]]' % (self.article, self.name)
+			self.wikilink = '[[%s|%s]]' % (self.article, self.name)
 
-	def __cmp__(a, b):
-		return cmp(a.name, b.name)
+	def __lt__(self, other):
+		return self.name < other.name

          
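Python 3 dropped __cmp__ and the cmp() builtin; sorted(), list.sort(), min() and max() only need __lt__. A minimal sketch of the rich-comparison replacement:

    class E:
        def __init__(self, name):
            self.name = name
        def __lt__(self, other):
            return self.name < other.name

    print(min(E("Zn"), E("Au")).name)  # -> Au
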
@@ 46,8 45,8 @@ class ElementSet:
 	
 	def __init__(self, 
 			site,
-			page = u"Lista pierwiastków chemicznych", 
-			pattern = u"^\| (?P<symbol>\w{1,2}) \|\| (?P<wikilink>.*?) \|\|.*$",
+			page = "Lista pierwiastków chemicznych",
+			pattern = r"^\| (?P<symbol>\w{1,2}) \|\| (?P<wikilink>.*?) \|\|.*$",
 		):
 
 		self.wikilink = {}

          
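The default pattern screen-scrapes the element table on [[Lista pierwiastków chemicznych]] one "| symbol || wikilink || ..." row at a time. A quick check against a made-up row:

    import re

    PATTERN = r"^\| (?P<symbol>\w{1,2}) \|\| (?P<wikilink>.*?) \|\|.*$"
    row = "| He || [[Hel]] || 2 || gaz szlachetny"
    m = re.compile(PATTERN, re.M).search(row)
    print(m.group("symbol"), m.group("wikilink"))  # -> He [[Hel]]
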
@@ 57,7 56,7 @@ class ElementSet:
 		self.site = site
 
 		for p in re.compile(pattern, re.M).finditer(
-			wikipedia.Page(self.site, page).get()):
+			pywikibot.Page(self.site, page).get()):
 
 			# self.wikilink[p.group("symbol")]=p.group("wikilink")
 

          
@@ 90,9 89,7 @@ class ElementSet:
 				input = input[:sympos] + input[npos:]
 
 	def allarticles(self):
-		s = self.articles.keys()
-		s.sort()
-		for ch in s:
+		for ch in sorted(self.articles.keys(), key=lambda a: a.name):
 			if self.articles[ch]:
 				yield(ch, self.articles[ch])
 

          
@@ 110,8 107,7 @@ class ElementSet:
 		out = []
 		if not ac:
 			ac = self.articles[el]
-		ac.sort()
-		for (cnt, a) in ac:
+		for (cnt, a) in sorted(ac, key=lambda pair: (pair[0], pair[1].compound.strip())):
 			out.append(self.printgroup(el, cnt, a))
 		return "".join(out)
 		

          
@@ 123,9 119,9 @@ class ElementSet:
 
 
 if __name__ == '__main__':
-	q = ElementSet(wikipedia.getSite('pl', 'wikipedia'))
+	q = ElementSet(pywikibot.Site('pl', 'wikipedia'))
 	q.parse("Test", "CaCOAuOOH")
 	q.parse("Test2", "Pu")
 	q.parse("Test3", "CaHe")
 	q.parse("Test4", "ZrCaHe")
-	print unicode(q)
+	print(q)