Error handling during XML parsing
1 file changed, 72 insertions(+), 52 deletions(-)

M meta/kursy/tabelac.py
M meta/kursy/tabelac.py +72 -52
@@ -8,6 +8,7 @@ from decimal import *
 import wikipedia
 import logging
 import logging.config
+from xml.parsers.expat import ExpatError
 
 logging.config.fileConfig("/home/saper/wikipedia/log/bots.conf", disable_existing_loggers=True)
 wikipedia.logger = logging.getLogger('plwiki')

          
@@ -36,6 +37,17 @@ def get_uid(doc):
 	root = doc.documentElement
 	return root.getAttribute("uid")
 
+class CannotParseItemError(Exception):
+	def __init__(self, e, url, content):
+		self.e = e
+		self.url = url
+		self.content = content
+
+	def __str__(self):
+		return "Cannot parse item fetched from '%s' with an error: \n%s" % (
+			self.url,
+			self.e)
+
 def fetch_table(feedurl, localfile):
 	""" Parse feed, compare with cached copy and return (url, pubdate, parseddomtree) tuple """
 	p = feedparser.parse(feedurl)
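
The new exception keeps the underlying ExpatError together with the URL and the raw payload, so a failure can be logged with enough context to reproduce it. A quick illustration of what it reports, not part of the patch (the feed URL is the one used further down; minidom raises ExpatError on truncated input):

    import xml.dom.minidom as dom
    from xml.parsers.expat import ExpatError

    # CannotParseItemError as defined in the hunk above
    content = "<tabela_kursow><pozycja>"   # deliberately truncated
    try:
        dom.parseString(content)
    except ExpatError, e:
        err = CannotParseItemError(e, "http://rss.nbp.pl/kursy/TabelaC.xml", content)
        print err
    # -> Cannot parse item fetched from 'http://rss.nbp.pl/kursy/TabelaC.xml' with an error:
    #    no element found: line 1, column 24

Note that e.content is kept on the exception as well, so the offending document can be written out for post-mortem inspection.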

          
@@ -52,71 +64,79 @@ def fetch_table(feedurl, localfile):
 						old_uid = None
 					
 				content = urllib.urlopen(url).read()
-				parsed = dom.parseString(content)
+				try:
+					parsed = dom.parseString(content)
 
-				if old_uid != get_uid(parsed):
-					mylogger.info("Kursy walut: uid: %s->%s" % (old_uid, get_uid(parsed)))
-					data_publikacji = parsed.getElementsByTagName("data_publikacji")[0].firstChild.nodeValue
-					if localfile:
-						wr = open(localfile, "w")	
-						wr.write(content)
-						wr.close()
-					return (url, data_publikacji, parsed)
-				else:
-					return (None, None, None)
+					if old_uid != get_uid(parsed):
+						mylogger.info("Kursy walut: uid: %s->%s" % (old_uid, get_uid(parsed)))
+						data_publikacji = parsed.getElementsByTagName("data_publikacji")[0].firstChild.nodeValue
+						if localfile:
+							wr = open(localfile, "w")
+							wr.write(content)
+							wr.close()
+						return (url, data_publikacji, parsed)
+					else:
+						return (None, None, None)
+				except ExpatError, e:
+					raise CannotParseItemError(e, url, content)
 
 TABELA_C = ( ("kod_waluty", lambda a: a), 
-		 ("przelicznik", int), 
-		 ("kurs_kupna", lambda a: Decimal(a.replace(",", "."))), 
-		 ("kurs_sprzedazy", lambda a: Decimal(a.replace(",", "."))))
+		("przelicznik", int),
+		("kurs_kupna", lambda a: Decimal(a.replace(",", "."))),
+		("kurs_sprzedazy", lambda a: Decimal(a.replace(",", "."))))
 
 TABELA_A = ( ("kod_waluty", lambda a: a), 
-		 ("przelicznik", int), 
-		 ("kurs_sredni", lambda a: Decimal(a.replace(",", "."))))
+		("przelicznik", int), 
+		("kurs_sredni", lambda a: Decimal(a.replace(",", "."))))
 
 def extract_items(domtree, tabledef):
 	return	[ tuple([op(pozycja.getElementsByTagName(tag)[0].firstChild.nodeValue)
 			for (tag, op) in tabledef
 		]) for pozycja in domtree.getElementsByTagName("pozycja") ]
 
+def main():
 
-strony_tabeli_a = []
-strony_tabeli_c = []
+	strony_tabeli_a = []
+	strony_tabeli_c = []
+
+	feedurl, localfile = ("http://rss.nbp.pl/kursy/TabelaC.xml",
+		"/home/saper/wikipedia/src/meta/kursy/tabelac.xml")
+	(url1, pubdate, parseddomtree) = fetch_table(feedurl, localfile)
+	if url1:
+		tabelac = (url1, pubdate, extract_items(parseddomtree, TABELA_C))
 
-feedurl, localfile = ("http://rss.nbp.pl/kursy/TabelaC.xml", 
-	"/home/saper/wikipedia/src/meta/kursy/tabelac.xml")
-(url1, pubdate, parseddomtree) = fetch_table(feedurl, localfile)
-if url1:
-	tabelac = (url1, pubdate, extract_items(parseddomtree, TABELA_C))
+		# (site, pagename, lastmod, comment, table, pagetext )
+		strony_tabeli_c = [
+		(meta, u"User:KursyWalut/CurrencyTable", True, TABLEACTIONMSG, tabelac,
+			tabelakursow.tabela),
+		(wikinews, u"Szablon:Kursy walut", True, TABLEACTIONMSG, tabelac,
+			tabelakursow.tabelawikinews),
+		(meta, u"Template:PLNConvert", False, TEMPLATEACTIONMSG, tabelac,
+			currencytemplate.tabela),
+		(wikinews, u"Szablon:PLNConvert", False, TEMPLATEACTIONMSG, tabelac,
+			currencytemplate.tabela),
+		]
 
-	# (site, pagename, lastmod, comment, table, pagetext )
-	strony_tabeli_c = [
-	(meta, u"User:KursyWalut/CurrencyTable", True, TABLEACTIONMSG, tabelac,
-		tabelakursow.tabela),
-	(wikinews, u"Szablon:Kursy walut", True, TABLEACTIONMSG, tabelac,
-		tabelakursow.tabelawikinews),
-	(meta, u"Template:PLNConvert", False, TEMPLATEACTIONMSG, tabelac,
-		currencytemplate.tabela),
-	(wikinews, u"Szablon:PLNConvert", False, TEMPLATEACTIONMSG, tabelac,
-		currencytemplate.tabela),
-	]
+	feedurl, localfile = ("http://rss.nbp.pl/kursy/TabelaA.xml",
+		"/home/saper/wikipedia/src/meta/kursy/tabelaa.xml")
+	(url2, pubdate, parseddomtree) = fetch_table(feedurl, localfile)
+	if url2:
+		tabelaa = (url2, pubdate, extract_items(parseddomtree, TABELA_A))
+		strony_tabeli_a = [
+		(wikinews, u"Szablon:Średnie kursy walut", True, TABLEACTIONMSG, tabelaa,
+			tabelakursow.tabelaminiwikinews),
+		]
 
-feedurl, localfile = ("http://rss.nbp.pl/kursy/TabelaA.xml", 
-	"/home/saper/wikipedia/src/meta/kursy/tabelaa.xml")
-(url2, pubdate, parseddomtree) = fetch_table(feedurl, localfile)
-if url2:
-	tabelaa = (url2, pubdate, extract_items(parseddomtree, TABELA_A))
-	strony_tabeli_a = [
-	(wikinews, u"Szablon:Średnie kursy walut", True, TABLEACTIONMSG, tabelaa,
-		tabelakursow.tabelaminiwikinews),
-	]
+	for site, pagename, lastmod, comment, table, pagetext in strony_tabeli_c + strony_tabeli_a:
+		if lastmod:
+			text = pagetext(table[2]) + LASTMODIFIEDMSG[site.language()] % (table[0], table[1])
+		else:
+			text = pagetext(table[2])
 
-for site, pagename, lastmod, comment, table, pagetext in strony_tabeli_c + strony_tabeli_a: 
-	if lastmod:
-		text = pagetext(table[2]) + LASTMODIFIEDMSG[site.language()] % (table[0], table[1])
-	else:
-		text = pagetext(table[2])
-	
-	text = text + CATEGORY[site.language()]
-	wikipedia.Page(site, pagename).put(text,
-		comment=comment[site.language()] % (table[0],), minorEdit=False)
+		text = text + CATEGORY[site.language()]
+		wikipedia.Page(site, pagename).put(text,
+			comment=comment[site.language()] % (table[0],), minorEdit=False)
+
+
+if __name__ == '__main__':
+	main()
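
Since dom.parseString() is now wrapped, a malformed download surfaces as CannotParseItemError instead of a bare ExpatError traceback. A sketch, not part of the patch, of how the entry point could take advantage of that; logging through wikipedia.logger (configured at the top of the file) and the exit code are assumptions:

    import sys

    if __name__ == '__main__':
        try:
            main()
        except CannotParseItemError, e:
            wikipedia.logger.error(str(e))  # reports the URL and the expat message
            sys.exit(1)                     # e.content remains available for a dump

A side benefit of moving the module-level code into main() behind the __name__ guard is that the file can now be imported, e.g. by a test harness, without immediately fetching the feeds and editing wiki pages.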