doc: backport most of the readme into the sphinx doc
2 files changed, 448 insertions(+), 209 deletions(-)

M doc/conf.py
M doc/index.rst
M doc/conf.py +9 -156
@@ 1,167 1,20 @@ 
-# -- Path setup --------------------------------------------------------------
-
-# If extensions (or modules to document with autodoc) are in another directory,
-# add these directories to sys.path here. If the directory is relative to the
-# documentation root, use os.path.abspath to make it absolute, like shown here.
+# Configuration file for the Sphinx documentation builder.
 #
-# import os
-# import sys
-# sys.path.insert(0, os.path.abspath('.'))
-import sys
-sys.path.insert(0, '..')
+# For the full list of built-in configuration values, see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
 
 # -- Project information -----------------------------------------------------
 
 project = 'rework'
-copyright = '2018, Aurélien Campéas'
-author = 'Aurélien Campéas'
-
-# The short X.Y version
-version = '0.8.0'
-# The full version, including alpha/beta/rc tags
-release = '0.8.0'
+copyright = '2018-2023, Pythonian'
+author = 'Pythonian'
+release = '0.16.0'
 
 
 # -- General configuration ---------------------------------------------------
-
-# If your documentation needs a minimal Sphinx version, state it here.
-#
-# needs_sphinx = '1.0'
-
-# Add any Sphinx extension module names here, as strings. They can be
-# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
-# ones.
-extensions = [
-    'sphinx.ext.autodoc'
-]
-
-# Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
+extensions = ['sphinx.ext.autodoc', 'autoapi.extension']
+autoapi_dirs = ['../rework']
 
-# The suffix(es) of source filenames.
-# You can specify multiple suffix as a list of string:
-#
-# source_suffix = ['.rst', '.md']
-source_suffix = '.rst'
-
-# The master toctree document.
-master_doc = 'index'
-
-# The language for content autogenerated by Sphinx. Refer to documentation
-# for a list of supported languages.
-#
-# This is also used if you do content translation via gettext catalogs.
-# Usually you set "language" from the command line for these cases.
-language = None
-
-# List of patterns, relative to source directory, that match files and
-# directories to ignore when looking for source files.
-# This pattern also affects html_static_path and html_extra_path.
+templates_path = ['_templates']
 exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
 
-# The name of the Pygments (syntax highlighting) style to use.
-pygments_style = None
-
-
-# -- Options for HTML output -------------------------------------------------
-
-# The theme to use for HTML and HTML Help pages.  See the documentation for
-# a list of builtin themes.
-#
-html_theme = 'alabaster'
-
-# Theme options are theme-specific and customize the look and feel of a theme
-# further.  For a list of options available for each theme, see the
-# documentation.
-#
-# html_theme_options = {}
-
-# Add any paths that contain custom static files (such as style sheets) here,
-# relative to this directory. They are copied after the builtin static files,
-# so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
-
-# Custom sidebar templates, must be a dictionary that maps document names
-# to template names.
-#
-# The default sidebars (for documents that don't match any pattern) are
-# defined by theme itself.  Builtin themes are using these templates by
-# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
-# 'searchbox.html']``.
-#
-# html_sidebars = {}
-
-
-# -- Options for HTMLHelp output ---------------------------------------------
-
-# Output file base name for HTML help builder.
-htmlhelp_basename = 'reworkdoc'
-
-
-# -- Options for LaTeX output ------------------------------------------------
-
-latex_elements = {
-    # The paper size ('letterpaper' or 'a4paper').
-    #
-    # 'papersize': 'letterpaper',
-
-    # The font size ('10pt', '11pt' or '12pt').
-    #
-    # 'pointsize': '10pt',
-
-    # Additional stuff for the LaTeX preamble.
-    #
-    # 'preamble': '',
-
-    # Latex figure (float) alignment
-    #
-    # 'figure_align': 'htbp',
-}
-
-# Grouping the document tree into LaTeX files. List of tuples
-# (source start file, target name, title,
-#  author, documentclass [howto, manual, or own class]).
-latex_documents = [
-    (master_doc, 'rework.tex', 'rework Documentation',
-     'Aurélien Campéas', 'manual'),
-]
-
-
-# -- Options for manual page output ------------------------------------------
-
-# One entry per manual page. List of tuples
-# (source start file, name, description, authors, manual section).
-man_pages = [
-    (master_doc, 'rework', 'rework Documentation',
-     [author], 1)
-]
-
-
-# -- Options for Texinfo output ----------------------------------------------
-
-# Grouping the document tree into Texinfo files. List of tuples
-# (source start file, target name, title, author,
-#  dir menu entry, description, category)
-texinfo_documents = [
-    (master_doc, 'rework', 'rework Documentation',
-     author, 'rework', 'One line description of project.',
-     'Miscellaneous'),
-]
-
-
-# -- Options for Epub output -------------------------------------------------
-
-# Bibliographic Dublin Core info.
-epub_title = project
-
-# The unique identifier of the text. This can be a ISBN number
-# or the project homepage.
-#
-# epub_identifier = ''
-
-# A unique identification for the text.
-#
-# epub_uid = ''
-
-# A list of files that should not be packed into the epub file.
-epub_exclude_files = ['search.html']

          
M doc/index.rst +439 -53
@@ 1,12 1,13 @@ 
+
 Rework
 ======
 
-`rework` is a distributed execution system for the execution of tasks
-that can belong to independant python environments and code bases,
-even hosted on different computers.
+``rework`` is a distributed execution system for tasks that can
+belong to independent python environments and code bases, even hosted
+on different computers.
 
 The only constraint is that postgres must be accessible from all nodes
-of a given `rework` installation.
+of a given ``rework`` installation.
 
 Rework might interest people who:
 

          
@@ 32,29 33,29 @@ Overview
 
 To use it properly one has to understand the following concepts:
 
-`operation` A python function decorated with the `task` decorator. The
-    function has a single `task` parameter that allows to communicate
-    with the system (for the purposes of input and output management,
-    and log capture). It is defined within a `domain` and on a
-    specific `host`.
+``operation`` A python function decorated with the ``task``
+    decorator. The function has a single ``task`` parameter that
+    allows it to communicate with the system (for the purposes of
+    input and output management, and log capture). It is defined
+    within a ``domain`` and on a specific ``host``.
 
-`task` A concrete execution of an operation. Also, name of the
-    decorator that indicates an `operation`. The task can indicate its
-    state and be aborted if needed. It can provide access to the
+``task`` A concrete execution of an operation. Also, the name of the
+    decorator that declares an ``operation``. The task can indicate
+    its state and be aborted if needed. It can provide access to the
     captured logs, input and output.
 
-`worker` A python process spawned by a `monitor`, that will execute
-    `tasks`. It is always associated with a `domain` on a specific
-    `host`.
+``worker`` A python process spawned by a ``monitor``, that will
+    execute ``tasks``. It is always associated with a ``domain`` on a
+    specific ``host``.
 
-`domain` A label associated with `operations`, `tasks` and `workers`,
-    which can be used to map operations to virtual environments or
-    just help oprganize a logical separation of operations (and the
-    associated pools of workers).
+``domain`` A label associated with ``operations``, ``tasks`` and
+    ``workers``, which can be used to map operations to virtual
+    environments or just help organize a logical separation of
+    operations (and the associated pools of workers).
 
-`monitor` A python process which is responsible for the management of
-    workers (start, stop and abort), whose precise amount is
-    configurable, within a `domain`.
+``monitor`` A python process responsible for the management of
+    workers (start, stop and abort) within a ``domain``; the number
+    of workers is configurable.
 
 They will be illustrated further in the documentation.
 

          
@@ 72,7 73,7 @@ Quick start
 
 Let's have a look at a simple example.
 
-We need to set up a database first, which we'll name `jobstore`.
+We need to set up a database first, which we'll name ``jobstore``.
 
 .. code-block:: bash
 

          
@@ 124,9 125,9 @@ This being done, we can start writing ou
 
 
 Here we have defined a dummy task that will print a bunch of
-sentences, doubler the input value and save a result back.
+sentences, double the input value and save a result back.
 
-This has to be put into a python module, e.g. `test_rework.py`
+This has to be put into a python module, e.g. ``test_rework.py``
 
 At this point, the rework system knows *nothing* of the task. We must
 register it, as follows:

          
@@ 141,7 142,7 @@ From this point, we can check it is inde
 .. code-block:: bash
 
    $ rework list-operations postgres://babar:password@localhost/jobstore
-   1 host(1) `10.211.55.3` path(my_first_task)
+   1 host(1) `10.211.55.3` path(my_first_task)
 
 Now, let's execute our script:
 

          
@@ 149,8 150,8 @@ Now, let's execute our script:
 
    $ python test_rework.py
 
-It will start and hang indefinitely on the first `join` call. Indeed
-we are missing an important step: providing `workers` that will
+It will start and hang indefinitely on the first ``join`` call. Indeed
+we are missing an important step: providing ``workers`` that will
 execute the tasks.
 
 This should be done in a separate shell, since it is a blocking

          
@@ 163,7 164,7 @@ operation:
 Then, the script will quickly terminate, as both tasks have been
 executed.
 
-Congratulations ! You juste fired your first rework tasks.
+Congratulations! You just fired your first tasks.
 We can finish this chapter with a few command line goodies.
 
 First we'll want to know about the existing tasks:

          
@@ 182,16 183,16 @@ It is possible to monitor the output of 
    stdout:INFO: 2018-11-28 16:08:27: I am running
    stdout:INFO: 2018-11-28 16:08:27: I am done
 
-The last argument `1` is the task identifier as was shown by the
-`list-tasks` command.
+The last argument ``1`` is the task identifier as was shown by the
+``list-tasks`` command.
 
 Notice how we capture the standard output (print calls) using the
-`task.capturelogs` context manager. This is completely optional of
+``task.capturelogs`` context manager. This is completely optional of
 course but quite handy. The line shown above actually captures
 *standard output*, *standard error* and *all logs*. It accepts a
-`level` parameter, like e.g. `capturelogs(level=logging.INFO)`.
+``level`` parameter, e.g. ``capturelogs(level=logging.INFO)``.
 
-Lastly, `list-workers` will show the currently running workers:
+Lastly, ``list-workers`` will show the currently running workers:
 
 .. code-block:: bash
 

          
@@ 199,10 200,10 @@ Lastly, `list-workers` will show the cur
    1 4124@10.211.55.3 43 Mb [running (idle)] [2018-11-28 16:08:27.438491+01] → [2018-11-28 15:08:27.967432+01] 
    2 4125@10.211.55.3 43 Mb [running (idle)] [2018-11-28 16:08:27.442869+01] → [2018-11-28 15:08:27.967397+01] 
 
-It is now possible to stop the `monitor` on its separate console, with
-a plain `ctrl-c`.
+It is now possible to stop the ``monitor`` on its separate console, with
+a plain ``ctrl-c``.
 
-After this, `list-workers` will provide an updated status:
+After this, ``list-workers`` will provide an updated status:
 
 .. code-block:: bash
 

          
@@ 211,29 212,414 @@ After this, `list-workers` will provide 
    2 4125@10.211.55.3 43 Mb [dead] [2018-11-28 16:08:27.442869+01] → [2018-11-28 15:08:27.967397+01] → [2018-11-28 16:11:09.668587+01] monitor exit 
 
 
-API documentation
-=================
+Specifying inputs
+.................
+
+Having a formal declaration of the task inputs can help validate
+them; also, in `reworkui <https://hg.sr.ht/~pythonian/rework_ui>`_,
+it will provide an interactive web form allowing subsequent launches
+of the task.
+
+.. code-block:: python
+
+   from rework import api, io
+
+   @api.task(inputs=(
+       io.file('myfile.txt', required=True),
+       io.string('name', required=True),
+       io.string('option', choices=('foo', 'bar')),
+       io.number('weight'),
+       io.datetime('birthdate'),
+       io.moment('horizon')
+     ))
+   def compute_things(task):
+       inp = task.input
+       assert 'name' in inp
+       ...
+
+
+... and then, later:
+
+.. code-block:: python
+
+   task = api.schedule(
+       engine, 'compute_things',
+       {'myfile.txt': b'file contents',
+        'birthdate': datetime(1973, 5, 20, 9),
+        'name': 'Babar',
+        'weight': 65,
+        'horizon': '(shifted (today) #:days 7)'
+       }
+   )
 
-.. toctree::
-   :maxdepth: 2
-   :caption: Contents:
+   assert task.input == {
+       'myfile.txt': b'file contents',
+       'birthdate': datetime(1973, 5, 20, 9),
+       'name': 'Babar',
+       'weight': 65,
+       'horizon': datetime(2021, 1, 7)
+   }
+
+
+Specifying outputs
+..................
+
+As for the inputs, and for the same reasons, we can provide a spec for
+the outputs.
+
+.. code-block:: python
+
+   from rework import api, io
+
+   @api.task(outputs=(
+       io.string('name'),
+       io.datetime('birthdate')
+   ))
+   def compute_things(task):
+       ...
+       task.save_output({
+           'name': 'Babar',
+           'birthdate': datetime(1931, 1, 1)
+       })
+
+
+And this will of course be fetched from the other side:
+
+.. code-block:: python
+
+   t = api.schedule(engine, 'compute_things')
+   assert t.output == {
+       'name': 'Babar',
+       'birthdate': datetime(1931, 1, 1)
+   }
+
+
+Scheduling
+..........
+
+While the base api provides a ``schedule`` call that schedules a task
+for immediate execution, there is also a ``prepare`` call that lets
+one define the exact moment the task ought to be executed, using a
+``crontab``-like notation.
 
-.. automodule:: rework.api
-   :members: task, schedule
+Example:
+
+.. code-block:: python
+
+   api.prepare(
+       engine,
+       'compute_things',
+       {'myfile.txt': b'file contents',
+        'birthdate': datetime(1973, 5, 20, 9),
+        'name': 'Babar',
+        'weight': 65
+       },
+       rule='0 15 8,12 * * *'
+   )
+
+
+This would schedule the task every day at 8:15 and 12:15. The extended
+crontab notation also features a field for seconds (in first
+position).
+
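+For instance, a rule using the seconds field (a sketch reusing the
+``compute_things`` task from above, with illustrative input values):
+
+.. code-block:: python
+
+   # run at second 30 of every minute
+   api.prepare(
+       engine,
+       'compute_things',
+       {'myfile.txt': b'file contents', 'name': 'Babar'},
+       rule='30 * * * * *'
+   )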
+
+Debugging
+.........
+
+If you need to debug some task, the standard advice is:
+
+* write your task content in plain functions and have them unit-tested
+  with e.g. ``pytest``
+
+.. code-block:: python
+
+   @api.task
+   def my_fancy_task(task):
+       the_body_of_my_fancy_task(task.input)
+
+* you can also use print-based logging as shown here:
+
+.. code-block:: python
+
+   @api.task
+   def my_fancy_task(task):
+       with task.capturelogs(std=True):
+           print('starting')
+           result = the_body_of_my_fancy_task(task.input)  # do stuff
+           print('done', result)
 
-.. autoclass:: rework.task.Task
-   :members: capturelogs, join, input, raw_input, save_output, abort, metadata
+* finally, it may happen that a task is "stuck" because of a deadlock,
+  and in this case, starting the monitor with ``--debug-port`` will
+  help:
+
+.. code-block:: bash
+
+   $ pip install pystuck
+   $ rework monitor postgres://babar:password@localhost:5432/jobstore --debug-port=666
+
+Then launching ``pystuck`` (possibly from another machine) is done as
+follows:
+
+.. code-block:: bash
+
+   $ pystuck -h <host> -p 666
+
+
+Organize tasks in code
+......................
+
+A common pattern is to have a ``project/tasks.py`` module.
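+
+A minimal sketch of such a module (the names are illustrative):
+
+.. code-block:: python
+
+   # project/tasks.py
+   from rework import api
+
+   @api.task
+   def my_first_task(task):
+       with task.capturelogs(std=True):
+           print('hello from tasks.py')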
 
-.. automodule:: rework.testutils
-   :members: workers
+One can manage the tasks using the ``register-operations`` and
+``unregister-operation`` commands.
+
+.. code-block:: bash
+
+   $ rework register-operations <dburi> /path/to/project/tasks.py
+
+and also
+
+.. code-block:: bash
+
+   $ rework unregister-operation <dburi> <opname>
+   delete <opname> <domain> /path/to/project/tasks.py <hostid>
+   really remove those [y/n]? [y/N]: y
+
+This pair of operations can also be used whenever a task's input or
+output specifications have changed.
+
+
+API overview
+------------
+
+The ``api`` module exposes most of what is needed. The ``task``
+module and task objects provide the rest.
 
 
-Indices and tables
-==================
+``api`` module
+..............
+
+Five functions are provided: the ``task`` decorator and the
+``freeze_operations``, ``schedule``, ``prepare`` and ``unprepare``
+functions.
+
+Defining tasks is done using the ``task`` decorator:
+
+.. code-block:: python
+
+   from rework.api import task
+
+   @task
+   def my_task(task):
+       pass
+
+It is also possible to specify a non-default ``domain``:
+
+.. code-block:: python
+
+   @task(domain='scrapers')
+   def my_scraper(task):
+       pass
+
+
+A ``timeout`` parameter is also available:
+
+.. code-block:: python
+
+   from datetime import timedelta
+
+   @task(timeout=timedelta(seconds=30))
+   def my_time_limited_task(task):
+       pass
+
+
+To make the tasks available for use, they must be recorded within the
+database referential. We use ``freeze_operations`` for this:
+
+.. code-block:: python
+
+   from sqlalchemy import create_engine
+   from rework.api import freeze_operations
+
+   engine = create_engine('postgres://babar:password@localhost:5432/jobstore')
+   freeze_operations(engine)
+
+
+Finally, one can schedule tasks as follows:
 
-* :ref:`genindex`
-* :ref:`modindex`
-* :ref:`search`
+.. code-block:: python
+
+   from sqlalchemy import create_engine
+   from rework import api
+
+   engine = create_engine('postgres://babar:password@localhost:5432/jobstore')
+
+   # immediate execution (the task will be queued)
+   task = api.schedule(engine, 'my_task', 42)
+
+   # execution every five minutes (the task will be queued at the
+   # specified moments)
+   api.prepare(engine, 'my_task', 42, rule='0 */5 * * * *')
+
+
+The ``schedule`` function wants these mandatory parameters:
+
+* ``engine``: sqlalchemy engine
+
+* ``operation``: string
+
+* ``inputdata``: any picklable python object (when no input
+  specification is provided; with a specification, the input
+  formalism covers numbers, strings, dates and files)
+
+
+It also accepts three more options:
+
+* ``domain``: a domain identifier (for cases when the same service is
+  available under several domains and you want to force one)
+
+* ``hostid``: a host identifier (e.g. '192.168.1.1')
+
+* ``metadata``: a json-serializable dictionary (e.g. {'user':
+  'Babar'})
+
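+For instance (a sketch; the option values are illustrative):
+
+.. code-block:: python
+
+   task = api.schedule(
+       engine, 'my_task', 42,
+       domain='scrapers',
+       hostid='192.168.1.1',
+       metadata={'user': 'Babar'}
+   )
+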
+The ``prepare`` function takes the same parameters as ``schedule``
+plus a ``rule`` option using ``crontab`` notation with seconds in
+first position.
+
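+The ``unprepare`` function removes a scheduling plan. A minimal
+sketch, assuming it takes the engine and the plan id (as shown by the
+``list-scheduled`` command):
+
+.. code-block:: python
+
+   # drop the scheduling plan with id 1 (the signature is an assumption)
+   api.unprepare(engine, 1)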
+
+Task objects
+............
+
+Task objects can be obtained from the ``schedule`` api call (as seen
+in the previous example) or through the ``task`` module.
+
+.. code-block:: python
+
+   from rework.task import Task
+
+   task = Task.byid(engine, 42)
 
 
+The task object provides:
 
+* ``.state`` attribute to describe the task state (amongst:
+  ``queued``, ``running``, ``aborting``, ``aborted``, ``failed``,
+  ``done``)
+
+* ``.join()`` method to wait synchronously for the task completion
+
+* ``.capturelogs(sync=True, level=logging.NOTSET, std=False)`` method
+  to record matching logs into the db (``sync`` controls whether the
+  logs are written synchronously, ``level`` specifies the capture
+  level, ``std`` permits to also record prints as logs)
+
+* ``.input`` attribute to get the task input (yields any object)
+
+* ``.save_output(<obj>)`` method to store any object
+
+* ``.abort()`` method to preemptively stop the task
+
+* ``.log(fromid=None)`` method to retrieve the task logs (all or from
+  a given log id)
+
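+Put together, a minimal synchronous usage sketch (assuming a
+registered ``my_task`` operation):
+
+.. code-block:: python
+
+   t = api.schedule(engine, 'my_task', 42)
+   t.join()                  # block until the task completes
+   assert t.state == 'done'
+   result = t.output         # whatever the task saved with save_output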
+
+Command line
+------------
+
+Operations
+..........
+
+If you read the previous chapter, you already know the ``init-db`` and
+``monitor`` commands.
+
+The ``rework`` command, if typed without subcommand, shows its usage:
+
+.. code-block:: shell
+
+   $ rework
+   Usage: rework [OPTIONS] COMMAND [ARGS]...
+
+   Options:
+     --help  Show this message and exit.
+
+   Commands:
+     abort-task            immediately abort the given task
+     export-scheduled
+     import-scheduled
+     init-db               initialize the database schema for rework in its...
+     kill-worker           ask to preemptively kill a given worker to its...
+     list-monitors
+     list-operations
+     list-scheduled        list the prepared operations with their cron rule
+     list-tasks
+     list-workers
+     log-task
+     monitor               start a monitor controlling min/max workers
+     new-worker            spawn a new worker -- this is a purely *internal*...
+     register-operations   register operations from a python module...
+     scheduled-plan        show what operation will be executed at which...
+     shutdown-worker       ask a worker to shut down as soon as it becomes idle
+     unprepare             remove a scheduling plan given its id
+     unregister-operation  unregister an operation (or several) using its...
+     vacuum                delete non-running workers or finished tasks
+
+
+Of those commands, ``new-worker`` is for purely internal purposes, and
+unless you know what you're doing, you should never use it.
+
+One can list the tasks:
+
+.. code-block:: shell
+
+   $ rework list-tasks postgres://babar:password@localhost:5432/jobstore
+   1 my_first_task done [2017-09-13 17:08:48.306970+02]
+   2 my_first_task done [2017-09-13 17:08:48.416770+02]
+
+
+It is possible to monitor the output of a given task:
+
+.. code-block:: shell
+
+   $ rework log-task postgres://babar:password@localhost:5432/jobstore 1
+   stdout:INFO: 2017-09-13 17:08:49: I am running
+   stdout:INFO: 2017-09-13 17:08:49: I am done
+
+
+The last argument ``1`` is the task identifier as was shown by the
+``list-tasks`` command.
+
+Notice how we capture the standard output (print calls) using the
+``task.capturelogs`` context manager. This is completely optional of
+course but quite handy. The line shown above actually captures
+*standard output*, *standard error* and *all logs*. It accepts a
+``level`` parameter, e.g. ``capturelogs(level=logging.INFO)``.
+
+Lastly, ``list-workers`` will show the currently running workers:
+
+.. code-block:: shell
+
+   $ rework list-workers postgres://babar:password@localhost:5432/jobstore
+   1 4889896@192.168.1.2 30 Mb [running]
+   2 4889748@192.168.1.2 30 Mb [running]
+
+
+Extensions
+----------
+
+It is possible to augment the ``rework`` command with new subcommands
+(or to modify existing ones).
+
+Any program doing so must define a new command and declare a
+setuptools entry point in the ``rework.subcommands`` group, as in
+e.g.:
+
+.. code-block:: python
+
+   entry_points={'rework.subcommands': [
+       'view=rework_ui.cli:view'
+   ]}
+
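+The subcommand itself would typically be a ``click`` command (the
+usage output above is produced by ``click``); a minimal sketch, with
+illustrative names:
+
+.. code-block:: python
+
+   # mymodule/cli.py -- the path must match the entry point declaration
+   import click
+
+   @click.command(name='hello')
+   @click.argument('db-uri')
+   def hello(db_uri):
+       # a toy subcommand: just echo the job store uri
+       print('hello from a rework subcommand on', db_uri)
+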
+For instance, the `rework_ui <https://hg.sr.ht/~pythonian/rework_ui>`_
+python package provides such a ``view`` subcommand to launch a
+monitoring webapp for a given rework job store.