873e0c1a9533 — Marcin Cieślak 6 years ago
Harvest template parameters
1 files changed, 133 insertions(+), 0 deletions(-)

A => plwiki/infobox/harvest.py
A => plwiki/infobox/harvest.py +133 -0
@@ 0,0 1,133 @@ 
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+"""
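+Usage:
+
+python harvest.py -lang:nl -cat:Sisoridae -template:"Taxobox straalvinnige" -namespace:0 orde P70 familie P71 geslacht P74
+
+All remaining positional arguments (an even number of them is required) are
+treated as names of template parameters. Each non-empty value found for one
+of them is written to output.txt as a tab-separated "page, value" line.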
+"""
+#
+# (C) 2013 Multichill, Amir
+# (C) 2013 Pywikibot team
+#
+# Distributed under the terms of MIT License.
+#
+__version__ = '$Id: 8cb29d4354089df2d1237a57248c18d410a7c3d7 $'
+#
+
+import re
+import pywikibot
+import pagegenerators
+import codecs
+
+
+class HarvestRobot:
+    """
+    A bot to extract template data
+    """
+    def __init__(self, generator, templateTitle, fields, outfile):
+        """
+        Arguments:
+            * generator     - A generator that yields Page objects.
+            * templateTitle - The template to work on
+            * fields        - A dictionary of fields that are of use to us
+
+        """
+        self.generator = generator
+        self.templateTitle = templateTitle.replace(u'_', u' ')
+        self.pregen = pagegenerators.PreloadingGenerator(generator)
+        self.fields = fields
+        self.site = pywikibot.getSite()
+        self.outfile = outfile
+
+    def setSource(self, lang):
+        """ Get the source """
+        source_values = {
+            'en':  'Q328',
+            'sv':  'Q169514',
+            'de':  'Q48183',
+            'it':  'Q11920',
+            'no':  'Q191769',
+            'fa':  'Q48952',
+            'ar':  'Q199700',
+            'es':  'Q8449',
+            'pl':  'Q1551807',
+            'ca':  'Q199693',
+            'fr':  'Q8447',
+            'nl':  'Q10000',
+            'pt':  'Q11921',
+            'ru':  'Q206855',
+            'vi':  'Q200180',
+            'be':  'Q877583',
+            'uk':  'Q199698',
+            'tr':  'Q58255',
+            'cs':  'Q191168',
+        }  # TODO: Should be moved to a central wikidata library
+
+        if lang in source_values:
+            source = ('143', source_values.get(lang))
+            return source
+        else:
+            return None
+
+    def run(self):
+        """
+        Starts the robot.
+        """
+        for page in self.pregen:
+            self.procesPage(page)
+
+    def procesPage(self, page):
+        """
+        Proces a single page
+        """
+        pywikibot.output('Processing %s' % page)
+        pagetext = page.get()
+        pagetext = pywikibot.removeDisabledParts(pagetext)
+        templates = pywikibot.extract_templates_and_params(pagetext)
+        for (template, fielddict) in templates:
+                # We found the template we were looking for
+                if template.replace(u'_', u' ') == self.templateTitle:
+                    for field, value in fielddict.items():
+                        # This field contains something useful for us
+                        if field in self.fields and value: 
+							self.outfile.write(u"%s\t%s\n" % (page, value))
+
+
+def main():
+    genFactory = pagegenerators.GeneratorFactory()
+    commandline_arguments = list()
+    templateTitle = u''
+    for arg in pywikibot.handleArgs():
+        if arg.startswith('-template'):
+            if len(arg) == 9:
+                templateTitle = pywikibot.input(
+                    u'Please enter the template to work on:')
+            else:
+                templateTitle = arg[10:]
+        elif genFactory.handleArg(arg):
+            continue
+        else:
+            commandline_arguments.append(arg)
+
+    if len(commandline_arguments) % 2 or not templateTitle:
+        raise ValueError  # or something.
+    fields = dict()
+
+    for i in xrange(0, len(commandline_arguments), 1):
+        fields[commandline_arguments[i]] = 1
+    if templateTitle:
+        gen = pagegenerators.ReferringPageGenerator(
+            pywikibot.Page(pywikibot.getSite(),
+                           "Template:%s" % templateTitle),
+            onlyTemplateInclusion=True)
+    else:
+        gen = genFactory.getCombinedGenerator()
+    if not gen:
+        # TODO: Build a transcluding generator based on templateTitle
+        return
+
+    bot = HarvestRobot(gen, templateTitle, fields, codecs.open("output.txt", "w", "utf-8"))
+    bot.run()
+
+# Run the harvester only when this file is executed as a script.
+if __name__ == "__main__":
+    main()