rev: tip css/lib/python/cs/lex.py -rw-r--r-- 58.9 KiB View raw Log this file
eb67c8bf923aCameron Simpson cs.ogre: update imports a day ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
#!/usr/bin/python

r'''
Lexical analysis functions, tokenisers, transcribers:
an arbitrary assortment of lexical and tokenisation functions useful
for writing recursive descent parsers, of which I have several.
There are also some transcription functions for producing text
from various objects, such as `hexify` and `unctrl`.

Generally the get_* functions accept a source string and an offset
(usually optional, default `0`) and return a token and the new offset,
raising `ValueError` on failed tokenisation.
'''

# pylint: disable=too-many-lines

import binascii
from functools import partial
from json import JSONEncoder
import os
from pathlib import Path, PurePosixPath, PureWindowsPath
import re
from string import (
    ascii_letters,
    ascii_uppercase,
    digits,
    printable,
    whitespace,
    Formatter,
)
import sys
from textwrap import dedent
from threading import Lock

from typeguard import typechecked

from cs.deco import fmtdoc, decorator
from cs.gimmicks import warning
from cs.pfx import Pfx, pfx_call, pfx_method
from cs.py.func import funcname
from cs.py3 import bytes, ustr, sorted, StringTypes, joinbytes  # pylint: disable=redefined-builtin
from cs.seq import common_prefix_length, common_suffix_length

__version__ = '20210913-post'

DISTINFO = {
    'keywords': ["python2", "python3"],
    'classifiers': [
        "Programming Language :: Python",
        "Programming Language :: Python :: 2",
        "Programming Language :: Python :: 3",
    ],
    'install_requires': [
        'cs.deco',
        'cs.gimmicks',
        'cs.pfx',
        'cs.py.func',
        'cs.py3',
        'cs.seq>=20200914',
        'typeguard',
    ],
}

unhexify = binascii.unhexlify  # pylint: disable=c-extension-no-member
hexify = binascii.hexlify  # pylint: disable=c-extension-no-member
# On Python 3 binascii.hexlify returns bytes; wrap it to return str.
# BUG FIX: this previously tested `sys.hexversion >= 0x030000`, a
# malformed hexversion constant (Python 3.0.0 is 0x03000000) which is
# vacuously true on any Python; use the correct constant, consistent
# with the other sys.hexversion test later in this module.
if sys.hexversion >= 0x03000000:
  _hexify = hexify

  # pylint: disable=function-redefined
  def hexify(bs):
    ''' A flavour of `binascii.hexlify` returning a `str`.
    '''
    return _hexify(bs).decode()

# ordinal of the space character, used as the control character cutoff
ord_space = ord(' ')

# pylint: disable=too-many-branches,redefined-outer-name
def unctrl(s, tabsize=8):
  ''' Return the string `s` with `TAB`s expanded and control characters
      replaced with printable representations.

      Parameters:
      * `s`: the string to transcribe
      * `tabsize`: the tab stop interval, default `8`

      Raises `ValueError` if `tabsize` is less than `1`.
  '''
  if tabsize < 1:
    raise ValueError("tabsize(%r) < 1" % (tabsize,))
  s2 = ''  # the transcription built so far
  sofar = 0  # index of the first character of s not yet copied to s2
  for i, ch in enumerate(s):
    ch2 = None  # replacement text for ch; None means ch passes through
    if ch == '\t':
      # flush pending literal text first so that len(s2) reflects the
      # current output column when computing the next tab stop
      if sofar < i:
        s2 += s[sofar:i]
        sofar = i
      ch2 = ' ' * (tabsize - (len(s2) % tabsize))
    elif ch == '\f':
      ch2 = '\\f'
    elif ch == '\n':
      ch2 = '\\n'
    elif ch == '\r':
      ch2 = '\\r'
    elif ch == '\v':
      ch2 = '\\v'
    else:
      o = ord(ch)
      # other control characters and unprintables: \uxxxx or octal \ooo
      if o < ord_space or printable.find(ch) == -1:
        if o >= 256:
          ch2 = "\\u%04x" % o
        else:
          ch2 = "\\%03o" % o

    if ch2 is not None:
      # flush pending literal text, then append the replacement
      if sofar < i:
        s2 += s[sofar:i]
      s2 += ch2
      sofar = i + 1

  # flush any trailing literal text
  if sofar < len(s):
    s2 += s[sofar:]

  # NOTE(review): all TABs were already replaced above, so s2 should
  # contain no TABs and this expandtabs looks like a no-op - confirm
  return s2.expandtabs(tabsize)

def lc_(value):
  ''' Return `value.lower()` with `'-'` mapped to `'_'` and `' '`
      mapped to `'-'`.

      Handy for constructing lowercase filenames which contain a
      readable transcription of a title string.

      See also `titleify_lc()`, an imperfect inverse of this.
  '''
  # a simultaneous character map is equivalent to the sequential
  # replacements because neither output character is a later input
  return value.lower().translate(str.maketrans({'-': '_', ' ': '-'}))

def titleify_lc(value_lc):
  ''' Map `'-'` to `' '` and `'_'` to `'-'` in `value_lc`,
      then return the result titlecased.

      See also `lc_()`, which this reverses imperfectly.
  '''
  # a simultaneous character map is equivalent to the sequential
  # replacements because neither output character is a later input
  detitled = value_lc.translate(str.maketrans({'-': ' ', '_': '-'}))
  return detitled.title()

def tabpadding(padlen, tabsize=8, offset=0):
  ''' Compute a string of TABs and spaces to use as padding
      occupying `padlen` columns, commencing at column `offset`.
      (Docstring fix: previously read "to use a tab padding at an offfset".)

      Parameters:
      * `padlen`: the number of columns to pad
      * `tabsize`: the tab stop interval, default `8`
      * `offset`: the starting column, default `0`
  '''
  pad = ''
  # columns from the starting column to the next tab stop
  nexttab = tabsize - offset % tabsize
  # emit TABs while a TAB does not overshoot the padding width
  while nexttab <= padlen:
    pad += '\t'
    padlen -= nexttab
    nexttab = tabsize
  # make up the remainder with spaces
  if padlen > 0:
    pad += "%*s" % (padlen, ' ')
  return pad

def typed_str(o, use_cls=False, use_repr=False, max_length=None):
  ''' Return `"type(o).__name__:str(o)"` for some object `o`.
      This is available as both `typed_str` and `s`.

      Parameters:
      * `use_cls`: default `False`;
        if true, transcribe `str(type(o))` instead of `type(o).__name__`
      * `use_repr`: default `False`;
        if true, transcribe `repr(o)` instead of `str(o)`
      * `max_length`: optional maximum length for the result,
        applied via `cropped`

      A handy debugging aid:

          from cs.lex import typed_str as s
          ......
          X("foo = %s", s(foo))
  '''
  # pylint: disable=redefined-outer-name
  type_part = str(type(o)) if use_cls else type(o).__name__
  value_part = repr(o) if use_repr else str(o)
  result = type_part + ':' + value_part
  if max_length is None:
    return result
  return cropped(result, max_length)

# convenience alias
s = typed_str

def typed_repr(o, use_cls=False, max_length=None):
  ''' A flavour of `typed_str` which uses `repr` instead of `str`.
      This is available as both `typed_repr` and `r`.
  '''
  return typed_str(o, max_length=max_length, use_cls=use_cls, use_repr=True)

# convenience alias
r = typed_repr

def strlist(ary, sep=", "):
  ''' Transcribe each item of the iterable `ary` with `str`
      and join the results with `sep` (default `', '`).
  '''
  return sep.join(map(str, ary))

# pylint: disable=redefined-outer-name
def htmlify(s, nbsp=False):
  ''' Convert a string for safe transcription in HTML.

      Parameters:
      * `s`: the string
      * `nbsp`: replaces spaces with `"&nbsp;"` to prevent word folding,
        default `False`.
  '''
  # "&" must be escaped first lest later escapes be doubly escaped
  replacements = [("&", "&amp;"), ("<", "&lt;"), (">", "&gt;")]
  if nbsp:
    replacements.append((" ", "&nbsp;"))
  for raw, escaped in replacements:
    s = s.replace(raw, escaped)
  return s

def htmlquote(s):
  ''' Quote a string for use in HTML:
      escape it with `htmlify`, represent embedded double quotes
      as `&quot;` and enclose the result in double quotes.
  '''
  s = htmlify(s)
  # BUG FIX: this previously emitted "&dquot;", which is not an HTML
  # character reference and renders literally; the correct named
  # reference for a double quote is "&quot;"
  s = s.replace("\"", "&quot;")
  return "\"" + s + "\""

def jsquote(s):
  ''' Quote a string for use in JavaScript:
      backslash escape embedded backslashes and double quotes and
      enclose the result in double quotes.

      Note: this is not a complete JavaScript string encoding;
      control characters such as newlines are not escaped.
  '''
  # BUG FIX: this previously replaced '"' with "&dquot;", an HTML
  # style (and misspelled) entity with no meaning in JavaScript;
  # JavaScript string literals use backslash escapes
  s = s.replace("\\", "\\\\").replace("\"", "\\\"")
  return "\"" + s + "\""

def phpquote(s):
  ''' Quote a string for use in PHP code:
      backslash escape embedded backslashes and single quotes and
      enclose the result in single quotes.
  '''
  escaped = s.replace('\\', '\\\\')
  escaped = escaped.replace("'", "\\'")
  return "'%s'" % escaped

# characters that may appear in text sections of a texthexify result
# Notable exclusions:
#  \ - to avoid double in slosh escaped presentation
#  % - likewise, for percent escaped presentation
#  [ ] - the delimiters of course
#  { } - used for JSON data and some other markup
#  / - path separator
#
_texthexify_white_chars = ascii_letters + digits + '_-+.,'

def texthexify(bs, shiftin='[', shiftout=']', whitelist=None):
  ''' Transcribe the bytes `bs` to text using compact text runs for
      some common text values.

      This can be reversed with the `untexthexify` function.

      This is an ad hoc format devised to be compact but also to
      expose "text" embedded within to the eye. The original use
      case was transcribing a binary directory entry format, where
      the filename parts would be somewhat visible in the transcription.

      The output is a string of hexadecimal digits for the encoded
      bytes except for runs of values from the whitelist, which are
      enclosed in the shiftin and shiftout markers and transcribed
      as is. The default whitelist is values of the ASCII letters,
      the decimal digits and the punctuation characters '_-+.,'.
      The default shiftin and shiftout markers are '[' and ']'.

      String objects converted with either `hexify` and `texthexify`
      output strings may be freely concatenated and decoded with
      `untexthexify`.

      Example:

          >>> texthexify(b'&^%&^%abcdefghi)(*)(*')
          '265e25265e25[abcdefghi]29282a29282a'

      Parameters:
      * `bs`: the bytes to transcribe
      * `shiftin`: Optional. The marker string used to indicate a shift to
        direct textual transcription of the bytes, default: `'['`.
      * `shiftout`: Optional. The marker string used to indicate a
        shift from text mode back into hexadecimal transcription,
        default `']'`.
      * `whitelist`: an optional bytes or string object indicating byte
        values which may be represented directly in text;
        the default value is the ASCII letters, the decimal digits
        and the punctuation characters `'_-+.,'`.
  '''
  if whitelist is None:
    whitelist = _texthexify_white_chars
  # normalise a str whitelist into the byte values of its characters
  if isinstance(whitelist, StringTypes) and not isinstance(whitelist, bytes):
    whitelist = bytes(ord(ch) for ch in whitelist)
  # a text run must beat the cost of the enclosing markers to be worth it
  inout_len = len(shiftin) + len(shiftout)
  chunks = []
  offset = 0
  offset0 = offset  # start of the current (hex or text) span
  inwhite = False  # state flag: currently inside a whitelist span
  while offset < len(bs):
    b = bs[offset]
    if inwhite:
      if b not in whitelist:
        # end of a whitelist span: transcribe it
        inwhite = False
        if offset - offset0 > inout_len:
          # gather up whitelist span if long enough to bother
          chunk = (
              shiftin + ''.join(chr(bs[o])
                                for o in range(offset0, offset)) + shiftout
          )
        else:
          # transcribe as hex anyway - too short
          chunk = hexify(bs[offset0:offset])
        chunks.append(chunk)
        offset0 = offset
    else:
      if b in whitelist:
        # end of a hex span: transcribe it and enter text mode
        inwhite = True
        chunk = hexify(bs[offset0:offset])
        chunks.append(chunk)
        offset0 = offset
    offset += 1
  # transcribe the final span, if any
  if offset > offset0:
    if inwhite and offset - offset0 > inout_len:
      chunk = (
          shiftin + ''.join(chr(bs[o])
                            for o in range(offset0, offset)) + shiftout
      )
    else:
      chunk = hexify(bs[offset0:offset])
    chunks.append(chunk)
  return ''.join(chunks)

# pylint: disable=redefined-outer-name
def untexthexify(s, shiftin='[', shiftout=']'):
  ''' Decode a textual representation of binary data into binary data.

      This is the reverse of the `texthexify` function.

      Outside of the `shiftin`/`shiftout` markers the binary data
      are represented as hexadecimal. Within the markers the bytes
      have the values of the ordinals of the characters.

      Example:

          >>> untexthexify('265e25265e25[abcdefghi]29282a29282a')
          b'&^%&^%abcdefghi)(*)(*'

      Parameters:
      * `s`: the string containing the text representation.
      * `shiftin`: Optional. The marker string commencing a sequence
        of direct text transcription, default `'['`.
      * `shiftout`: Optional. The marker string ending a sequence
        of direct text transcription, default `']'`.

      Raises `ValueError` for an odd length hexadecimal section or
      an unterminated text section.
  '''
  chunks = []
  # consume alternating hex and [text] sections from the front of s
  while s:
    hexlen = s.find(shiftin)
    if hexlen < 0:
      # no more text sections; the remainder is pure hex
      break
    if hexlen > 0:
      # decode the hex section before the shiftin marker
      hextext = s[:hexlen]
      if hexlen % 2 != 0:
        raise ValueError("uneven hex sequence %r" % (hextext,))
      chunks.append(unhexify(s[:hexlen]))
    # advance past the hex section and the shiftin marker
    s = s[hexlen + len(shiftin):]
    textlen = s.find(shiftout)
    if textlen < 0:
      raise ValueError("missing shift out marker %r" % (shiftout,))
    # the text section bytes are the ordinals of its characters
    if sys.hexversion < 0x03000000:
      chunks.append(s[:textlen])
    else:
      chunks.append(bytes(ord(c) for c in s[:textlen]))
    s = s[textlen + len(shiftout):]
  # trailing pure hex section, if any
  if s:
    if len(s) % 2 != 0:
      raise ValueError("uneven hex sequence %r" % (s,))
    chunks.append(unhexify(s))
  return joinbytes(chunks)

# pylint: disable=redefined-outer-name
def get_chars(s, offset, gochars):
  ''' Scan the string `s` for characters in `gochars` starting at `offset`.
      Return `(match,new_offset)`.

      `gochars` may also be a callable, in which case a character
      `ch` is accepted if `gochars(ch)` is true.
  '''
  start = offset
  slen = len(s)
  # normalise gochars into a predicate function
  if callable(gochars):
    accept = gochars
  else:
    accept = lambda ch: ch in gochars
  while offset < slen and accept(s[offset]):
    offset += 1
  return s[start:offset], offset

# pylint: disable=redefined-outer-name
def get_white(s, offset=0):
  ''' Scan the string `s` for a run of `string.whitespace` characters
      starting at `offset` (default `0`).
      Return `(match,new_offset)`.
  '''
  white, end = get_chars(s, offset, whitespace)
  return white, end

# pylint: disable=redefined-outer-name
def skipwhite(s, offset=0):
  ''' Convenience routine for skipping past whitespace;
      returns the offset of the next nonwhitespace character.
  '''
  return get_white(s, offset=offset)[1]

def stripped_dedent(s):
  ''' Slightly smarter dedent which ignores a string's opening indent.

      Algorithm:
      strip the supplied string `s`, pull off the leading line,
      dedent the rest, put back the leading line.

      This supports my preferred docstring layout, where the opening
      line of text is on the same line as the opening quote:
      the first line carries no indent while the remaining lines
      share the docstring's indentation, which `textwrap.dedent`
      alone would not remove.
  '''
  stripped = s.strip()
  # split off the first line; sep is '' if there is no newline
  first_line, sep, rest = stripped.partition('\n')
  if not sep:
    # single line (or empty): nothing to dedent
    return first_line
  return first_line + '\n' + dedent(rest)

# pylint: disable=redefined-outer-name
def strip_prefix_n(s, prefix, n=None):
  ''' Strip a leading `prefix` and numeric value `n` from the start of a
      string.  Return the remaining string, or the original string if the
      prefix or numeric value do not match.

      Parameters:
      * `s`: the string to strip
      * `prefix`: the prefix string which must appear at the start of `s`
      * `n`: optional integer value;
        if omitted any value will be accepted, otherise the numeric
        part must match `n`

      Examples:

         >>> strip_prefix_n('s03e01--', 's', 3)
         'e01--'
         >>> strip_prefix_n('s03e01--', 's', 4)
         's03e01--'
         >>> strip_prefix_n('s03e01--', 's')
         'e01--'
  '''
  original = s
  if prefix:
    s = cutprefix(s, prefix)
    if s is original:
      # prefix not present: return unchanged
      return original
  else:
    s = original
  if not s or not s[0].isdigit():
    # no digits follow the prefix: return unchanged
    return original
  if n is None:
    # accept any numeric value: just drop the digits
    return s.lstrip(digits)
  # evaluate the numeric part and compare against n
  s = s.lstrip('0')  # pylint: disable=no-member
  if s and s[0].isdigit():
    end = 1
    slen = len(s)
    while end < slen and s[end].isdigit():
      end += 1
    value = int(s[:end])
  else:
    # the digits were all zeroes
    value = 0
    end = 0
  if value != n:
    # numeric value does not match: return unchanged
    return original
  return s[end:]

# pylint: disable=redefined-outer-name
def get_nonwhite(s, offset=0):
  ''' Scan the string `s` for characters not in `string.whitespace`
      starting at `offset` (default `0`).
      Return `(match,new_offset)`.
  '''
  return get_other_chars(s, stopchars=whitespace, offset=offset)

# pylint: disable=redefined-outer-name
def get_decimal(s, offset=0):
  ''' Scan the string `s` for decimal digit characters starting at
      `offset` (default `0`).
      Return `(dec_string,new_offset)`.
  '''
  digit_run, end = get_chars(s, offset, digits)
  return digit_run, end

# pylint: disable=redefined-outer-name
def get_decimal_value(s, offset=0):
  ''' Scan the string `s` for a decimal value starting at `offset` (default `0`).
      Return `(value,new_offset)`.

      Raises `ValueError` if there are no decimal digits at `offset`.
  '''
  digits_s, new_offset = get_decimal(s, offset)
  if not digits_s:
    raise ValueError("expected decimal value")
  return int(digits_s), new_offset

# pylint: disable=redefined-outer-name
def get_hexadecimal(s, offset=0):
  ''' Scan the string `s` for hexadecimal digit characters starting
      at `offset` (default `0`).
      Return `(hex_string,new_offset)`.
  '''
  hex_chars = '0123456789abcdefABCDEF'
  return get_chars(s, offset, hex_chars)

# pylint: disable=redefined-outer-name
def get_hexadecimal_value(s, offset=0):
  ''' Scan the string `s` for a hexadecimal value starting at `offset` (default `0`).
      Return `(value,new_offset)`.

      Raises `ValueError` if there are no hexadecimal digits at `offset`.
  '''
  value_s, offset = get_hexadecimal(s, offset)
  if not value_s:
    raise ValueError("expected hexadecimal value")
  # BUG FIX: this was int('0x' + value_s), which always raises
  # ValueError because int() defaults to base 10 and does not accept
  # an '0x' prefix; parse the digits explicitly as base 16 instead
  return int(value_s, 16), offset

# pylint: disable=redefined-outer-name
def get_decimal_or_float_value(s, offset=0):
  ''' Fetch a decimal or basic float (nnn.nnn) value
      from the str `s` at `offset` (default `0`).
      Return `(value,new_offset)`, an `int` unless a dot follows the
      leading digits, in which case a `float`.

      Raises `ValueError` if there are no decimal digits at `offset`.
  '''
  whole, offset = get_decimal(s, offset)
  if not whole:
    raise ValueError("expected decimal or basic float value")
  if offset < len(s) and s[offset] == '.':
    # a dot follows: gather the fractional part as well
    frac, offset = get_decimal(s, offset + 1)
    return float('.'.join((whole, frac))), offset
  return int(whole), offset

def get_identifier(
    s, offset=0, alpha=ascii_letters, number=digits, extras='_'
):
  ''' Scan the string `s` for an identifier (by default an ASCII
      letter or underscore followed by letters, digits or underscores)
      starting at `offset` (default 0).
      Return `(match,new_offset)`.

      *Note*: the empty string and an unchanged offset will be returned if
      there is no leading letter/underscore.

      Parameters:
      * `s`: the string to scan
      * `offset`: the starting offset, default `0`.
      * `alpha`: the characters considered alphabetic,
        default `string.ascii_letters`.
      * `number`: the characters considered numeric,
        default `string.digits`.
      * `extras`: extra characters considered part of an identifier,
        default `'_'`.
  '''
  if offset >= len(s):
    return '', offset
  first = s[offset]
  # the leading character may not be numeric
  if first not in alpha and first not in extras:
    return '', offset
  tail, offset = get_chars(s, offset + 1, alpha + number + extras)
  return first + tail, offset

# pylint: disable=redefined-outer-name
def is_identifier(s, offset=0, **kw):
  ''' Test if the string `s` is an identifier
      from position `offset` (default `0`) onward.

      Returns a `bool`. Keyword arguments are passed to `get_identifier`.
  '''
  matched, end_offset = get_identifier(s, offset=offset, **kw)
  # bool() so that a nonmatch returns False rather than the falsy
  # but surprising empty string, consistent with is_dotted_identifier
  return bool(matched) and end_offset == len(s)

# pylint: disable=redefined-outer-name
def get_uc_identifier(s, offset=0, number=digits, extras='_'):
  ''' Scan the string `s` for an identifier as for `get_identifier`,
      but require the letters to be uppercase.
  '''
  kw = dict(alpha=ascii_uppercase, number=number, extras=extras)
  return get_identifier(s, offset=offset, **kw)

# pylint: disable=redefined-outer-name
def get_dotted_identifier(s, offset=0, **kw):
  ''' Scan the string `s` for a dotted identifier (by default an
      ASCII letter or underscore followed by letters, digits or
      underscores) with optional trailing dot and another dotted
      identifier, starting at `offset` (default `0`).
      Return `(match,new_offset)`.

      Note: the empty string and an unchanged offset will be returned if
      there is no leading letter/underscore.

      Keyword arguments are passed to `get_identifier`
      (used for each component of the dotted identifier).
  '''
  start = offset
  part, offset = get_identifier(s, offset=offset, **kw)
  if part:
    # keep consuming ".identifier" sequences while they parse
    while offset < len(s) - 1 and s[offset] == '.':
      part, next_offset = get_identifier(s, offset=offset + 1, **kw)
      if not part:
        # a dot with no identifier after it: stop before the dot
        break
      offset = next_offset
  return s[start:offset], offset

# pylint: disable=redefined-outer-name
def is_dotted_identifier(s, offset=0, **kw):
  ''' Test if the string `s` is a dotted identifier
      from position `offset` (default `0`) onward.
  '''
  matched, end_offset = get_dotted_identifier(s, offset=offset, **kw)
  return len(matched) > 0 and end_offset == len(s)

# pylint: disable=redefined-outer-name
def get_other_chars(s, offset=0, stopchars=None):
  ''' Scan the string `s` for characters not in `stopchars` starting
      at `offset` (default `0`).
      Return `(match,new_offset)`.

      If `stopchars` is `None` (the default) the entire remainder
      of `s` is matched.
  '''
  # BUG FIX: the default stopchars=None previously crashed with
  # TypeError ("in" against None) for any nonempty remainder;
  # treat None as "no stop characters"
  if stopchars is None:
    stopchars = ''
  start = offset
  slen = len(s)
  while offset < slen and s[offset] not in stopchars:
    offset += 1
  return s[start:offset], offset

# default character map for \c notation
SLOSH_CHARMAP = {
    'a': '\a',
    'b': '\b',
    'f': '\f',
    'n': '\n',
    'r': '\r',
    't': '\t',
    'v': '\v',
}

def slosh_mapper(c, charmap=None):
  ''' Return the replacement string for the slosh escape
      backslash-`c`, or `None` if `c` is not in the map.

      The default `charmap` is `SLOSH_CHARMAP`.
  '''
  return (SLOSH_CHARMAP if charmap is None else charmap).get(c)

# pylint: disable=too-many-arguments,too-many-locals,too-many-branches
# pylint: disable=too-many-statements,too-many-arguments
def get_sloshed_text(
    s, delim, offset=0, slosh='\\', mapper=slosh_mapper, specials=None
):
  ''' Collect slosh escaped text from the string `s` from position
      `offset` (default `0`) and return the decoded unicode string and
      the offset of the completed parse.

      Parameters:
      * `delim`: end of string delimiter, such as a single or double quote;
        may be `None`, in which case the parse runs to the end of `s`.
      * `offset`: starting offset within `s`, default `0`.
      * `slosh`: escape character, default a slosh ('\\').
      * `mapper`: a mapping function which accepts a single character
        and returns a replacement string or `None`; this is used the
        replace things such as '\\t' or '\\n'. The default is the
        `slosh_mapper` function, whose default mapping is `SLOSH_CHARMAP`.
      * `specials`: a mapping of other special character sequences and parse
        functions for gathering them up. When one of the special
        character sequences is found in the string, the parse
        function is called to parse at that point.
        The parse functions accept
        `s` and the offset of the special character. They return
        the decoded string and the offset past the parse.

      The escape character `slosh` introduces an encoding of some
      replacement text whose value depends on the following character.
      If the following character is:
      * the escape character `slosh`, insert the escape character.
      * the string delimiter `delim`, insert the delimiter.
      * the character 'x', insert the character with code from the following
        2 hexadecimal digits.
      * the character 'u', insert the character with code from the following
        4 hexadecimal digits.
      * the character 'U', insert the character with code from the following
        8 hexadecimal digits.
      * a character from the keys of `mapper`

      Raises `ValueError` for a missing delimiter, an incomplete or
      unrecognised escape, or a misbehaving special parser.
  '''
  if specials is not None:
    # gather up starting character of special keys and a list of
    # keys in reverse order of length
    special_starts = set()
    special_seqs = []
    for special in specials.keys():
      if not special:
        raise ValueError(
            'empty strings may not be used as keys for specials: %r' %
            (specials,)
        )
      special_starts.add(special[0])
      special_seqs.append(special)
    special_starts = u''.join(special_starts)
    # longest first, so that longer sequences match preferentially
    special_seqs = sorted(special_seqs, key=lambda s: -len(s))
  chunks = []  # the decoded text pieces
  slen = len(s)
  while True:
    if offset >= slen:
      # end of string: an error if we were expecting a delimiter
      if delim is not None:
        raise ValueError("missing delimiter %r at offset %d" % (delim, offset))
      break
    offset0 = offset  # start of the current token, for error messages
    c = s[offset]
    offset += 1
    if delim is not None and c == delim:
      # delimiter; end text
      break
    if c == slosh:
      # \something
      if offset >= slen:
        raise ValueError('incomplete slosh escape at offset %d' % (offset0,))
      offset1 = offset  # offset of the character after the slosh
      c = s[offset]
      offset += 1
      if c == slosh or (delim is not None and c == delim):
        # escaped slosh or escaped delimiter: literal character
        chunks.append(c)
        continue
      if c == 'x':
        # \xhh
        if slen - offset < 2:
          raise ValueError(
              'short hexcode for %sxhh at offset %d' % (slosh, offset0)
          )
        hh = s[offset:offset + 2]
        offset += 2
        chunks.append(chr(int(hh, 16)))
        continue
      if c == 'u':
        # \uhhhh
        if slen - offset < 4:
          raise ValueError(
              'short hexcode for %suhhhh at offset %d' % (slosh, offset0)
          )
        hh = s[offset:offset + 4]
        offset += 4
        chunks.append(chr(int(hh, 16)))
        continue
      if c == 'U':
        # \Uhhhhhhhh
        if slen - offset < 8:
          raise ValueError(
              'short hexcode for %sUhhhhhhhh at offset %d' % (slosh, offset0)
          )
        hh = s[offset:offset + 8]
        offset += 8
        chunks.append(chr(int(hh, 16)))
        continue
      chunk = mapper(c)
      if chunk is not None:
        # supplied \X mapping
        chunks.append(chunk)
        continue
      # check for escaped special syntax
      if specials is not None and c in special_starts:
        # test sequence prefixes from longest to shortest
        chunk = None
        for seq in special_seqs:
          if s.startswith(seq, offset1):
            # special sequence: the escape yields its leading character
            chunk = c
            break
        if chunk is not None:
          chunks.append(chunk)
          continue
      raise ValueError(
          'unrecognised %s%s escape at offset %d' % (slosh, c, offset0)
      )
    if specials is not None and c in special_starts:
      # unescaped special sequence: hand off to its parse function
      # test sequence prefixes from longest to shortest
      chunk = None
      for seq in special_seqs:
        if s.startswith(seq, offset0):
          # special sequence
          chunk, offset = specials[seq](s, offset0)
          # sanity check the parser's returned offset
          if offset < offset0 + 1:
            raise ValueError(
                "special parser for %r at offset %d moved offset backwards" %
                (c, offset0)
            )
          break
      if chunk is not None:
        chunks.append(chunk)
        continue
      chunks.append(c)
      continue
    # ordinary text: consume up to the next slosh, delimiter or special
    while offset < slen:
      c = s[offset]
      if (c == slosh or (delim is not None and c == delim)
          or (specials is not None and c in special_starts)):
        break
      offset += 1
    chunks.append(s[offset0:offset])
  return u''.join(ustr(chunk) for chunk in chunks), offset

# pylint: disable=redefined-outer-name
def get_envvar(s, offset=0, environ=None, default=None, specials=None):
  ''' Parse a simple environment variable reference such as `$varname`
      or `$x` where `x` is a special character.
      Return `(value,new_offset)`.

      Parameters:
      * `s`: the string with the variable reference
      * `offset`: the starting point for the reference
      * `default`: default value for missing environment variables;
         if `None` (the default) a `ValueError` is raised
      * `environ`: the environment mapping, default `os.environ`
      * `specials`: the mapping of special single character variables
  '''
  environ = os.environ if environ is None else environ
  start = offset
  if not s.startswith('$', offset):
    raise ValueError("no leading '$' at offset %d: %r" % (offset, s))
  offset += 1
  if offset >= len(s):
    raise ValueError(
        "short string, nothing after '$' at offset %d" % (offset,)
    )
  # try a $identifier style reference first
  identifier, offset = get_identifier(s, offset)
  if identifier:
    value = environ.get(identifier, default)
    if value is None:
      raise ValueError(
          "unknown envvar name $%s, offset %d: %r" % (identifier, start, s)
      )
    return value, offset
  # otherwise a single character special variable
  c = s[offset]
  offset += 1
  if specials is not None and c in specials:
    return specials[c], offset
  raise ValueError("unsupported special variable $%s" % (c,))

# pylint: disable=too-many-arguments
def get_qstr(
    s, offset=0, q='"', environ=None, default=None, env_specials=None
):
  ''' Get quoted text with slosh escapes and optional environment substitution.

      Parameters:
      * `s`: the string containing the quoted text.
      * `offset`: the starting point, default `0`.
      * `q`: the quote character, default `'"'`. If `q` is `None`,
        do not expect the string to be delimited by quote marks.
      * `environ`: if not `None`, also parse and expand `$`*envvar* references.
      * `default`: passed to `get_envvar`
      * `env_specials`: passed to `get_envvar` as its `specials` parameter
  '''
  if environ is None and default is not None:
    raise ValueError(
        "environ is None but default is not None (%r)" % (default,)
    )
  if q is None:
    delim = None
  else:
    if offset >= len(s):
      raise ValueError("short string, no opening quote")
    delim = s[offset]
    if delim != q:
      raise ValueError("expected opening quote %r, found %r" % (
          q,
          delim,
      ))
    offset += 1
  if environ is None:
    # plain sloshed text, no envvar expansion
    return get_sloshed_text(s, delim, offset)
  # expand $envvar references via get_envvar
  getvar = partial(
      get_envvar, environ=environ, default=default, specials=env_specials
  )
  return get_sloshed_text(s, delim, offset, specials={'$': getvar})

# pylint: disable=redefined-outer-name
def get_qstr_or_identifier(s, offset):
  ''' Parse a double quoted string or an identifier from `s` at `offset`,
      returning `(token,new_offset)`.
  '''
  if not s.startswith('"', offset):
    return get_identifier(s, offset)
  return get_qstr(s, offset, q='"')

# pylint: disable=redefined-outer-name
def get_delimited(s, offset, delim):
  ''' Collect text from the string `s` from position `offset` up
      to the first occurence of the delimiter `delim`;
      return the text excluding the delimiter
      and the offset just after the delimiter.
  '''
  found = s.find(delim, offset)
  if found < offset:
    # str.find returned -1 (or a position before offset): no delimiter
    raise ValueError(
        "delimiter %r not found after offset %d" % (delim, offset)
    )
  return s[offset:found], found + len(delim)

# pylint: disable=redefined-outer-name
def get_tokens(s, offset, getters):
  ''' Parse the string `s` from position `offset` using the supplied
      tokeniser functions `getters`.
      Return the list of tokens matched and the final offset.

      Parameters:
      * `s`: the string to parse.
      * `offset`: the starting position for the parse.
      * `getters`: an iterable of tokeniser specifications.

      Each tokeniser specification `getter` is either:
      * a callable expecting `(s,offset)` and returning `(token,new_offset)`
      * a literal string, to be matched exactly
      * a `tuple` or `list` with values `(func,args,kwargs)`;
        call `func(s,offset,*args,**kwargs)`
      * an object with a `.match` method such as a regex;
        call `getter.match(s,offset)` and return a match object with
        a `.end()` method returning the offset of the end of the match
  '''
  tokens = []
  # NB: the closures below capture the loop variable `getter`, which is
  # safe because each closure is called before the next iteration
  # pylint: disable=cell-var-from-loop
  for getter in getters:
    get_args = ()
    get_kwargs = {}
    if callable(getter):
      get_func = getter
    elif isinstance(getter, StringTypes):

      # pylint: disable=redefined-outer-name
      def get_func(s, offset):
        ''' Wrapper for a literal string: require the string to be
            present at the current offset.
        '''
        if not s.startswith(getter, offset):
          raise ValueError(
              "string %r not found at offset %d" % (getter, offset)
          )
        return getter, offset + len(getter)
    elif isinstance(getter, (tuple, list)):
      get_func, get_args, get_kwargs = getter
    elif hasattr(getter, 'match'):

      # pylint: disable=redefined-outer-name
      def get_func(s, offset):
        ''' Wrapper for a getter with a .match method, such as a regular
            expression.
        '''
        m = getter.match(s, offset)
        if not m:
          raise ValueError("no match for %s at offset %d" % (getter, offset))
        return m, m.end()
    else:
      raise ValueError("unsupported getter: %r" % (getter,))
    token, offset = get_func(s, offset, *get_args, **get_kwargs)
    tokens.append(token)
  return tokens, offset

# pylint: disable=redefined-outer-name
def match_tokens(s, offset, getters):
  ''' Wrapper for `get_tokens` which catches `ValueError` exceptions
      and returns `(None,offset)` instead of raising.
  '''
  try:
    return get_tokens(s, offset, getters)
  except ValueError:
    return None, offset

def isUC_(s):
  ''' Check that a string matches the regular expression `^[A-Z][A-Z_0-9]*$`.

      This tests against the documented ASCII pattern directly;
      the previous `str.isupper`/`str.isdigit` based implementation
      also accepted non-ASCII uppercase letters and digits,
      contradicting the documented contract.
  '''
  # the re module caches compiled patterns, so this is cheap on repeat calls
  return re.fullmatch(r'[A-Z][A-Z_0-9]*', s) is not None

def parseUC_sAttr(attr):
  ''' Take an attribute name `attr` and return `(key,is_plural)`.

      Examples:
      * `'FOO'` returns `('FOO',False)`.
      * `'FOOs'` or `'FOOes'` returns `('FOO',True)`.
      Otherwise return `(None,False)`.
  '''
  if len(attr) > 1 and attr.endswith('s'):
    # try the plural forms: strip "es" if present, otherwise strip "s"
    stem = attr[:-2] if attr[-2] == 'e' else attr[:-1]
    if isUC_(stem):
      return stem, True
  if isUC_(attr):
    return attr, False
  return None, False

def as_lines(chunks, partials=None):
  ''' Generator yielding complete lines from arbitrary pieces of text from
      the iterable of `str` `chunks`.

      After completion, any remaining newline-free chunks remain
      in the partials list; they will be unavailable to the caller
      unless the list is presupplied.
  '''
  if partials is None:
    partials = []
  if any('\n' in partial for partial in partials):
    raise ValueError("newline in partials: %r" % (partials,))
  for piece in chunks:
    start = 0
    while True:
      nl = piece.find('\n', start)
      if nl < start:
        # no further newline in this piece
        break
      # complete a line up to and including the newline
      partials.append(piece[start:nl + 1])
      yield ''.join(partials)
      partials[:] = ()
      start = nl + 1
    if start < len(piece):
      # stash the trailing newline-free text
      partials.append(piece[start:])

# pylint: disable=redefined-outer-name
def cutprefix(s, prefix):
  ''' Strip a `prefix` from the front of `s`.
      Return the suffix if `s.startswith(prefix)`, else `s` unchanged.

      Example:

          >>> abc_def = 'abc.def'
          >>> cutprefix(abc_def, 'abc.')
          'def'
          >>> cutprefix(abc_def, 'zzz.')
          'abc.def'
          >>> cutprefix(abc_def, '.zzz') is abc_def
          True
  '''
  if not prefix or not s.startswith(prefix):
    return s
  return s[len(prefix):]

# pylint: disable=redefined-outer-name
def cutsuffix(s, suffix):
  ''' Strip a `suffix` from the end of `s`.
      Return the prefix if `s.endswith(suffix)`, else `s` unchanged.

      Example:

          >>> abc_def = 'abc.def'
          >>> cutsuffix(abc_def, '.def')
          'abc'
          >>> cutsuffix(abc_def, '.zzz')
          'abc.def'
          >>> cutsuffix(abc_def, '.zzz') is abc_def
          True
  '''
  if not suffix or not s.endswith(suffix):
    return s
  return s[:-len(suffix)]

def common_prefix(*strs):
  ''' Return the common prefix of the strings `strs`.

      Examples:

          >>> common_prefix('abc', 'def')
          ''
          >>> common_prefix('abc', 'abd')
          'ab'
          >>> common_prefix('abc', 'abcdef')
          'abc'
          >>> common_prefix('abc', 'abcdef', 'abz')
          'ab'
          >>> # contrast with cs.fileutils.common_path_prefix
          >>> common_prefix('abc/def', 'abc/def1', 'abc/def2')
          'abc/def'
  '''
  length = common_prefix_length(*strs)
  return strs[0][:length]

def common_suffix(*strs):
  ''' Return the common suffix of the strings `strs`.
  '''
  length = common_suffix_length(*strs)
  # a 0 length must be special cased because -0 == 0,
  # and s[-0:] would be the whole string
  return strs[0][-length:] if length else ''

# pylint: disable=redefined-outer-name,unsubscriptable-object
def cropped(
    s: str, max_length: int = 32, roffset: int = 1, ellipsis: str = '...'
):
  ''' If the length of `s` exceeds `max_length` (default `32`),
      replace enough of the tail with `ellipsis`
      and the last `roffset` (default `1`) characters of `s`
      to fit in `max_length` characters.
  '''
  if len(s) <= max_length:
    return s
  if roffset > 0:
    # keep the last roffset characters after the ellipsis
    head = s[:max_length - len(ellipsis) - roffset]
    return head + ellipsis + s[-roffset:]
  return s[:max_length - len(ellipsis)] + ellipsis

def cropped_repr(o, roffset=1, max_length=32, inner_max_length=None):
  ''' Compute a cropped `repr()` of `o`.

      Parameters:
      * `o`: the object to represent
      * `max_length`: the maximum length of the representation, default `32`
      * `inner_max_length`: the maximum length of the representations
        of members of `o`, default `max_length//2`
      * `roffset`: the number of trailing characters to preserve, default `1`
  '''
  if inner_max_length is None:
    inner_max_length = max_length // 2

  def crop_member(member):
    ''' Crop the repr of a single member of `o`. '''
    return cropped_repr(member, max_length=inner_max_length, roffset=roffset)

  if isinstance(o, (tuple, list)):
    if isinstance(o, tuple):
      open_s = '('
      # a 1-tuple needs its trailing comma
      close_s = ',)' if len(o) == 1 else ')'
    else:
      open_s = '['
      close_s = ']'
    o_repr = open_s + ','.join(crop_member(member) for member in o) + close_s
  elif isinstance(o, dict):
    o_repr = '{' + ','.join(
        crop_member(k) + ':' + crop_member(v) for k, v in o.items()
    ) + '}'
  else:
    o_repr = repr(o)
  return cropped(o_repr, max_length=max_length, roffset=roffset)

# pylint: disable=redefined-outer-name
def get_ini_clausename(s, offset=0):
  ''' Parse a `[`*clausename*`]` string from `s` at `offset` (default `0`).
      Return `(clausename,new_offset)`.
  '''
  if not s.startswith('[', offset):
    raise ValueError("missing opening '[' at position %d" % (offset,))
  # skip whitespace after the opening bracket
  offset = skipwhite(s, offset + 1)
  clausename, offset = get_qstr_or_identifier(s, offset)
  if not clausename:
    raise ValueError(
        "missing clausename identifier at position %d" % (offset,)
    )
  # skip whitespace before the closing bracket
  offset = skipwhite(s, offset)
  if s.startswith(']', offset):
    return clausename, offset + 1
  raise ValueError("missing closing ']' at position %d" % (offset,))

# pylint: disable=redefined-outer-name
def get_ini_clause_entryname(s, offset=0):
  ''' Parse a `[`*clausename*`]`*entryname* string
      from `s` at `offset` (default `0`).
      Return `(clausename,entryname,new_offset)`.
  '''
  clausename, offset = get_ini_clausename(s, offset=offset)
  entryname, offset = get_qstr_or_identifier(s, skipwhite(s, offset))
  if entryname:
    return clausename, entryname, offset
  raise ValueError("missing entryname identifier at position %d" % (offset,))

def camelcase(snakecased, first_letter_only=False):
  ''' Convert a snake cased string `snakecased` into camel case.

      Parameters:
      * `snakecased`: the snake case string to convert
      * `first_letter_only`: optional flag (default `False`);
        if true then just ensure that the first character of a word
        is uppercased, otherwise use `str.title`

      Example:

          >>> camelcase('abc_def')
          'abcDef'
          >>> camelcase('ABc_def')
          'abcDef'
          >>> camelcase('abc_dEf')
          'abcDef'
          >>> camelcase('abc_dEf', first_letter_only=True)
          'abcDEf'
  '''
  transformed = []
  for index, word in enumerate(snakecased.split('_')):
    if word:
      if first_letter_only:
        word = word[0].upper() + word[1:]
      else:
        word = word.title()
      if index == 0:
        # the leading word starts with lowercase
        word = word[0].lower() + word[1:]
    transformed.append(word)
  return ''.join(transformed)

def snakecase(camelcased):
  ''' Convert a camel cased string `camelcased` into snake case.

      An underscore is inserted at each lowercase-to-uppercase
      boundary and all letters are lowercased.

      Parameters:
      * `camelcased`: the camel case string to convert

      Example:

          >>> snakecase('abcDef')
          'abc_def'
          >>> snakecase('abcDEf')
          'abc_def'
          >>> snakecase('AbcDef')
          'abc_def'
  '''
  strs = []
  was_lower = False
  # iterate directly over the characters; the previous implementation
  # used enumerate() but never used the index
  for c in camelcased:
    if c.isupper():
      c = c.lower()
      if was_lower:
        # lower-to-upper boundary: start a new word
        was_lower = False
        strs.append('_')
    else:
      was_lower = True
    strs.append(c)
  return ''.join(strs)

# pylint: disable=redefined-outer-name
def format_escape(s):
  ''' Escape `{}` characters in a string to protect them from `str.format`.
  '''
  # double every brace so that str.format treats them as literals
  return ''.join(c * 2 if c in '{}' else c for c in s)

class FormatAsError(LookupError):
  ''' Subclass of `LookupError` for use by `format_as`.
  '''

  # separator used between the parts of the multipart message
  DEFAULT_SEPARATOR = '; '

  def __init__(self, key, format_s, format_mapping, error_sep=None):
    LookupError.__init__(self, key)
    # stash everything needed to render the message in .args
    self.args = (
        key,
        format_s,
        format_mapping,
        self.DEFAULT_SEPARATOR if error_sep is None else error_sep,
    )

  def __str__(self):
    key, format_s, format_mapping, error_sep = self.args
    parts = [
        "format fails, missing key: %s" % (key,),
        "format string was: %r" % (format_s,),
        "available keys: %s" % (' '.join(sorted(format_mapping.keys()))),
    ]
    return error_sep.join(parts)

@decorator
def format_recover(method):
  ''' Decorator for `__format__` methods which replaces failed formats
      with `{self:format_spec}`.
  '''

  def format_recovered(self, format_spec):
    ''' Call `method`, falling back to the literal
        `{self:format_spec}` text if it raises `ValueError`.
    '''
    try:
      return method(self, format_spec)
    except ValueError as exc:
      warning(
          "@format_recover: %s.%s(%r): %s, falling back via %r",
          type(self).__name__, funcname(method), format_spec, exc,
          "f'{{{self}:{format_spec}}}'"
      )
      return f'{{{self}:{format_spec}}}'

  return format_recovered

@typechecked
@fmtdoc
def format_as(
    format_s: str,
    format_mapping,
    formatter=None,
    error_sep=None,
    strict=None,
):
  ''' Format the string `format_s` using `Formatter.vformat`,
      return the formatted result.
      This is a wrapper for `str.format_map`
      which raises a more informative `FormatAsError` exception on failure.

      Parameters:
      * `format_s`: the format string to use as the template
      * `format_mapping`: the mapping of available replacement fields
      * `formatter`: an optional `string.Formatter`-like instance
        with a `.vformat(format_string,args,kwargs)` method,
        usually a subclass of `string.Formatter`;
        if not specified then `FormatableFormatter` is used
      * `error_sep`: optional separator for the multipart error message,
        default from `FormatAsError.DEFAULT_SEPARATOR`:
        `'{FormatAsError.DEFAULT_SEPARATOR}'`
      * `strict`: optional flag (default `False`)
        indicating that an unresolveable field should raise a
        `KeyError` instead of inserting a placeholder
  '''
  if formatter is None:
    formatter = FormatableFormatter(format_mapping)
  if strict is None:
    # inherit the ambient strictness
    strict = formatter.format_mode.strict
  with formatter.format_mode(strict=strict):
    try:
      return formatter.vformat(format_s, (), format_mapping)
    except KeyError as e:
      # pylint: disable=raise-missing-from
      raise FormatAsError(
          e.args[0], format_s, format_mapping, error_sep=error_sep
      )

_format_as = format_as  # for reuse in the format_as method below

def format_attribute(method):
  ''' Mark a method as available as a format method.
      Requires the enclosing class to be decorated with `@has_format_attributes`.

      For example,
      the `FormatableMixin.json` method is defined like this:

          @format_attribute
          def json(self):
              return self.FORMAT_JSON_ENCODER.encode(self)

      which allows a `FormatableMixin` subclass instance
      to be used in a format string like this:

          {instance:json}

      to insert a JSON transcription of the instance.

      It is recommended that methods marked with `@format_attribute`
      have no side effects and do not modify state,
      as they are intended for use in ad hoc format strings
      supplied by an end user.
  '''
  # tag the method; @has_format_attributes collects tagged methods
  setattr(method, 'is_format_attribute', True)
  return method

def has_format_attributes(cls):
  ''' Class decorator to walk this class for direct methods
      marked as for use in format strings
      and to include them in `cls.format_attributes()`.
  '''
  attributes = cls.get_format_attributes()
  for name in dir(cls):
    try:
      value = getattr(cls, name)
    except AttributeError:
      # some names from dir() may not be retrievable; skip them
      continue
    if getattr(value, 'is_format_attribute', False):
      # marked via @format_attribute
      attributes[name] = value
  return cls

class FormatableFormatter(Formatter):
  ''' A `string.Formatter` subclass interacting with objects
      which inherit from `FormatableMixin`.
  '''

  # literal text: runs of characters other than "{", or a doubled "{{"
  FORMAT_RE_LITERAL_TEXT = re.compile(r'([^{]+|{{)*')
  # regexp source fragments composing the grammar of a replacement field
  FORMAT_RE_IDENTIFIER_s = r'[a-z_][a-z_0-9]*'
  # an arg name: an identifier, or digits (optionally dotted)
  # followed by an alphabetic suffix
  FORMAT_RE_ARG_NAME_s = rf'({FORMAT_RE_IDENTIFIER_s}|\d+(\.\d+)?[a-z]+)'
  FORMAT_RE_ATTRIBUTE_NAME_s = rf'\.{FORMAT_RE_IDENTIFIER_s}'
  # an element index: anything up to the closing "]"
  FORMAT_RE_ELEMENT_INDEX_s = r'[^]]*'
  # a field expression: an arg name followed by any number of
  # ".attribute" or "[index]" parts
  FORMAT_RE_FIELD_EXPR_s = (
      rf'{FORMAT_RE_ARG_NAME_s}'
      rf'({FORMAT_RE_ATTRIBUTE_NAME_s}|\[{FORMAT_RE_ELEMENT_INDEX_s}\]'
      rf')*'
  )
  FORMAT_RE_FIELD_EXPR = re.compile(FORMAT_RE_FIELD_EXPR_s, re.I)
  # a complete "{arg_name!conversion:format_spec}" replacement field
  FORMAT_RE_FIELD = re.compile(
      (
          r'{' + rf'(?P<arg_name>{FORMAT_RE_FIELD_EXPR_s})?' +
          r'(!(?P<conversion>[^:}]*))?' + r'(:(?P<format_spec>[^}]*))?' + r'}'
      ), re.I
  )

  @property
  def format_mode(self):
    ''' Thread local state object.

        Attributes:
        * `strict`: initially `False`; raise a `KeyError` for
          unresolveable field names
    '''
    # lazily create a per-instance lock guarding the lazy creation
    # of the thread local state object below
    try:
      lock = self.__dict__['_lock']
    except KeyError:
      lock = self.__dict__['_lock'] = Lock()
    with lock:
      try:
        mode = self.__dict__['format_mode']
      except KeyError:
        # deferred import to avoid a circular import at module load
        # pylint: disable=import-outside-toplevel
        from cs.threads import State as ThreadState
        mode = self.__dict__['format_mode'] = ThreadState(strict=False)
    return mode

  # NOTE(review): deliberately disabled alternative parse implementation,
  # apparently kept for reference
  if False:  # pylint: disable=using-constant-test

    @classmethod
    @typechecked
    def parse(cls, format_string: str):
      ''' Parse a format string after the fashion of `Formatter.parse`,
          yielding `(literal,arg_name,format_spec,conversion)` tuples.

          Unlike `Formatter.parse`,
          this does not validate the `conversion` part preemptively,
          supporting extended values for use with the `convert_field` method.
      '''
      offset = 0
      while offset < len(format_string):
        m_literal = cls.FORMAT_RE_LITERAL_TEXT.match(format_string, offset)
        literal = m_literal.group()
        offset = m_literal.end()
        if offset == len(format_string):
          # nothing after the literal text
          if literal:
            yield literal, None, None, None
          return
        m_field = cls.FORMAT_RE_FIELD.match(format_string, offset)
        if not m_field:
          raise ValueError(
              "expected a field at offset %d: found %r" %
              (offset, format_string[offset:])
          )
        yield (
            literal,
            m_field.group('arg_name'),
            m_field.group('format_spec') or '',
            m_field.group('conversion'),
        )
        offset = m_field.end()

  @staticmethod
  def get_arg_name(field_name):
    ''' Default initial arg_name is an identifier.

        Returns `(prefix,offset)`, and `('',0)` if there is no arg_name.
    '''
    return get_identifier(field_name)

  # pylint: disable=arguments-differ
  @pfx_method
  def get_field(self, field_name, a, kw):
    ''' Get the object referenced by the field text `field_name`.
        Raises `KeyError` for an unknown `field_name`.
    '''
    # positional arguments are not supported here
    assert not a
    with Pfx("field_name=%r: kw=%r", field_name, kw):
      arg_name, offset = self.get_arg_name(field_name)
      arg_value, _ = self.get_value(arg_name, a, kw)
      # resolve the rest of the field
      subfield = self.get_subfield(arg_value, field_name[offset:])
      return subfield, field_name

  @staticmethod
  def get_subfield(value, subfield_text: str):
    ''' Resolve `value` against `subfield_text`,
        the remaining field text after the term which resolved to `value`.

        For example, a format `{name.blah[0]}`
        has the field text `name.blah[0]`.
        A `get_field` implementation might initially
        resolve `name` to some value,
        leaving `.blah[0]` as the `subfield_text`.
        This method supports taking that value
        and resolving it against the remaining text `.blah[0]`.

        For generality, if `subfield_text` is the empty string
        `value` is returned unchanged.
    '''
    if subfield_text == '':
      return value
    if subfield_text[0] in '.[':
      # attribute or index access: delegate to str.format_map
      subfield_fmt = f'{{value{subfield_text}}}'
      subfield_map = {'value': value}
      with Pfx("%r.format_map(%r)", subfield_fmt, subfield_map):
        value = subfield_fmt.format_map(subfield_map)
    else:
      # use the subfield_text after the colon
      fmt = f'{{value:{subfield_text}}}'
      value = fmt.format(value=value)
    return value

  # pylint: disable=arguments-differ
  @pfx_method
  def get_value(self, arg_name, a, kw):
    ''' Get the object with index `arg_name`.

        This default implementation returns `(kw[arg_name],arg_name)`.
    '''
    # positional arguments are not supported here
    assert not a
    return kw[arg_name], arg_name

  @classmethod
  def get_format_subspecs(cls, format_spec):
    ''' Parse a `format_spec` as a sequence of colon separated components,
        return a list of the components.
    '''
    subspecs = []
    offset = 0
    while offset < len(format_spec):
      if format_spec.startswith(':', offset):
        # an empty component
        subspec = ''
        offset += 1
      else:
        m_subspec = cls.FORMAT_RE_FIELD_EXPR.match(format_spec, offset)
        if m_subspec:
          subspec = m_subspec.group()
        else:
          # fall back: take everything up to the next colon
          warning(
              "unrecognised subspec at %d: %r, falling back to split", offset,
              format_spec[offset:]
          )
          subspec, *_ = format_spec[offset:].split(':', 1)
        offset += len(subspec)
      subspecs.append(subspec)
    return subspecs

  @classmethod
  @pfx_method
  @typechecked
  def format_field(cls, value, format_spec: str):
    ''' Format a value using `value.format_format_field`,
        returning an `FStr`
        (a `str` subclass with additional `format_spec` features).

        We actually recognise colon separated chains of formats
        and apply each format to the previously converted value.
        The final result is promoted to an `FStr` before return.
    '''
    # parse the format_spec into multiple subspecs
    format_subspecs = cls.get_format_subspecs(format_spec) or []
    while format_subspecs:
      format_subspec = format_subspecs.pop(0)
      with Pfx("subspec %r", format_subspec):
        assert isinstance(format_subspec, str)
        assert len(format_subspec) > 0
        with Pfx("value=%r, format_subspec=%r", value, format_subspec):
          # promote bare str to FStr
          if type(value) is str:  # pylint: disable=unidiomatic-typecheck
            value = FStr(value)
          if format_subspec[0].isalpha():
            # a leading identifier: apply a method/attribute conversion
            try:
              value.convert_via_method_or_attr
            except AttributeError:
              # promote to something with convert_via_method_or_attr
              if isinstance(value, str):
                value = FStr(value)
              else:
                value = pfx_call(format, value, format_subspec)
            value, offset = value.convert_via_method_or_attr(
                value, format_subspec
            )
            if offset < len(format_subspec):
              # resolve any trailing .attr/[index] text
              subspec_tail = format_subspec[offset:]
              value = cls.get_subfield(value, subspec_tail)
          else:
            # an ordinary format spec, e.g. ">20"
            value = format(value, format_subspec)
    return FStr(value)

@has_format_attributes
class FormatableMixin(FormatableFormatter):  # pylint: disable=too-few-public-methods
  ''' A subclass of `FormatableFormatter` which  provides 2 features:
      - a `__format__` method which parses the `format_spec` string
        into multiple colon separated terms whose results chain
      - a `format_as` method which formats a format string using `str.format_map`
        with a suitable mapping derived from the instance
        via its `format_kwargs` method
        (whose default is to return the instance itself)

      The `format_as` method is like an inside out `str.format` or
      `object.__format__` method.

      The `str.format` method is designed for formatting a string
      from a variety of other objects supplied in the keyword arguments.

      The `object.__format__` method is for filling out a single `str.format`
      replacement field from a single object.

      By contrast, `format_as` is designed to fill out an entire format
      string from the current object.

      For example, the `cs.tagset.TagSetMixin` class
      uses `FormatableMixin` to provide a `format_as` method
      whose replacement fields are derived from the tags in the tag set.

      Subclasses wanting to provide additional `format_spec` terms
      should:
      - override `FormatableFormatter.format_field1` to implement
        terms with no colons, letting `format_field` do the split into terms
      - override `FormatableFormatter.get_format_subspecs` to implement
        the parse of `format_spec` into a sequence of terms.
        This might recognise a special additional syntax
        and quietly fall back to `super().get_format_subspecs`
        if that is not present.
  '''

  # encoder used by the json() format method below
  FORMAT_JSON_ENCODER = JSONEncoder(separators=(',', ':'))

  # pylint: disable=invalid-format-returned
  def __format__(self, format_spec):
    ''' Format `self` according to `format_spec`.

        This implementation calls `self.format_field`.
        As such, a `format_spec` is considered
        a sequence of colon separated terms.
    '''
    return self.format_field(self, format_spec)

  @classmethod
  def get_format_attributes(cls):
    ''' Return the mapping of format attributes.
    '''
    # consult cls.__dict__ directly so that a subclass does not
    # inherit (and mutate) its superclass's mapping
    try:
      attributes = cls.__dict__['_format_attributes']
    except KeyError:
      cls._format_attributes = attributes = {}
    return attributes

  def get_format_attribute(self, attr):
    ''' Return the attribute named `attr` if it is in the mapping
        of permitted format methods/attributes,
        otherwise raise `AttributeError`.
        This whitelist of allowed `:`*name* method formats
        prevents scenarios like little Bobby Tables calling `delete()`.
    '''
    # this shuffle is because cls.__dict__ is a proxy, not a dict
    cls = type(self)
    attributes = cls.get_format_attributes()
    if attr in attributes:
      return getattr(self, attr)
    raise AttributeError(
        "disallowed attribute %r: not in %s._format_attributes" %
        (attr, cls.__name__)
    )

  ##@staticmethod
  def convert_field(self, value, conversion):
    ''' The default converter for fields calls `Formatter.convert_field`.
    '''
    if conversion == '':
      # Formatter.convert_field expects None, not the empty string
      warning(
          "%s.convert_field(%s, conversion=%r): turned conversion into None",
          type(self).__name__, typed_str(value, use_repr=True), conversion
      )
      conversion = None
    return super().convert_field(value, conversion)

  @pfx_method
  def convert_via_method_or_attr(self, value, format_spec):
    ''' Apply a method or attribute name based conversion to `value`
        where `format_spec` starts with a method name
        applicable to `value`.
        Return `(converted,offset)`
        being the converted value and the offset after the method name.

        Note that if there is not a leading identifier on `format_spec`
        then `value` is returned unchanged with `offset=0`.

        The methods/attributes are looked up in the mapping
        returned by `.format_attributes()` which represents allowed methods
        (broadly, one should not allow methods which modify any state).

        If this returns a callable, it is called to obtain the converted value
        otherwise it is used as is.

        As a final tweak,
        if `value.get_format_attribute()` raises an `AttributeError`
        (the attribute is not an allowed attribute)
        or calling the attribute raises a `TypeError`
        (the `value` isn't suitable)
        and the `value` is not an instance of `FStr`,
        convert it to an `FStr` and try again.
        This provides the common utility methods on other types.

        The motivating example was a `PurePosixPath`,
        which does not JSON transcribe;
        this tweak supports both
        `posixpath:basename` via the pathlib stuff
        and `posixpath:json` via `FStr`
        even though a `PurePosixPath` does not subclass `FStr`.
    '''
    try:
      attr, offset = get_identifier(format_spec)
      if not attr:
        # no leading method/attribute name, return unchanged
        return value, 0
      try:
        attribute = value.get_format_attribute(attr)
      except AttributeError as e:
        raise TypeError(
            "convert_via_method_or_attr(%s,%r): %s" %
            (typed_repr(value), format_spec, e)
        ) from e
      if callable(attribute):
        converted = attribute()
      else:
        converted = attribute
      return converted, offset
    except TypeError:
      if not isinstance(value, FStr):
        # bug fix: supply value for the %s placeholder, previously
        # the Pfx message was left with an unfilled %s
        with Pfx("fall back to FStr(value=%s).convert_via_method_or_attr",
                 value):
          return self.convert_via_method_or_attr(FStr(value), format_spec)
      raise

  def format_as(self, format_s, error_sep=None, strict=None, **control_kw):
    ''' Return the string `format_s` formatted using the mapping
        returned by `self.format_kwargs(**control_kw)`.

        If a class using the mixin has no `format_kwargs(**control_kw)` method
        to provide a mapping for `str.format_map`
        then the instance itself is used as the mapping.
    '''
    get_format_mapping = getattr(self, 'format_kwargs', None)
    if get_format_mapping is None:
      if control_kw:
        # pylint: disable=raise-missing-from
        raise ValueError(
            "no .format_kwargs() method, but control_kw=%r" % (control_kw,)
        )
      format_mapping = self
    else:
      format_mapping = get_format_mapping(**control_kw)  # pylint:disable=not-callable
    if strict is None:
      strict = self.format_mode.strict
    with self.format_mode(strict=strict):
      return _format_as(
          format_s,
          format_mapping,
          formatter=self,
          error_sep=error_sep,
      )

  # Utility methods for formats.
  @format_attribute
  def json(self):
    ''' The value transcribed as compact JSON.
    '''
    return self.FORMAT_JSON_ENCODER.encode(self)

@has_format_attributes
class FStr(FormatableMixin, str):
  ''' A `str` subclass with the `FormatableMixin` methods,
      particularly its `__format__` method
      which uses `str` method names as valid formats.

      It also has a bunch of utility methods which are available
      as `:`*method* in format strings.
  '''

  # str is immutable: prefill the attribute map with every
  # public class attribute of str
  _format_attributes = {
      name: getattr(str, name)
      for name in dir(str)
      if name[0].isalpha()
  }

  @format_attribute
  def basename(self):
    ''' Treat `self` as a filesystem path and return its basename.
    '''
    return Path(self).name

  @format_attribute
  def dirname(self):
    ''' Treat `self` as a filesystem path and return its dirname.
    '''
    return Path(self).parent

  @format_attribute
  def f(self):
    ''' Parse `self` as a `float`.
    '''
    return float(self)

  @format_attribute
  def i(self, base=10):
    ''' Parse `self` as an `int`, base 10 by default.
    '''
    return int(self, base=base)

  @format_attribute
  def lc(self):
    ''' Lowercase `self` using `lc_()`.
    '''
    return lc_(self)

  @format_attribute
  def path(self):
    ''' Convert `self` to a native filesystem `pathlib.Path`.
    '''
    return Path(self)

  @format_attribute
  def posix_path(self):
    ''' Convert `self` to a Posix filesystem `pathlib.Path`.
    '''
    return PurePosixPath(self)

  @format_attribute
  def windows_path(self):
    ''' Convert `self` to a Windows filesystem `pathlib.Path`.
    '''
    return PureWindowsPath(self)

if __name__ == '__main__':
  # self-test entry point: run the cs.lex test suite
  import cs.lex_tests
  cs.lex_tests.selftest(sys.argv)