@@ 1,16 1,21 @@
#!/usr/bin/python3
# xgrep.py -- search for elements in XML files, using XPath 1.0 expressions
-# Andreas Nolda 2022-09-05
+# Andreas Nolda 2023-11-04
import sys
import argparse
import re
+
from blessings import Terminal
+
from lxml import etree
+
version = "2.11"
+
parser = argparse.ArgumentParser()
+
parser.add_argument("expr",
help="XPath 1.0 expression")
parser.add_argument("files", metavar="file", nargs="+",
@@ 46,77 51,92 @@ parser.add_argument("-r", "--regex", met
parser.add_argument("-s", "--spaces", action="store_true",
help="normalize whitespace to spaces")
parser.add_argument("-v", "--version", action="version",
- version="{0} {1}".format(parser.prog, version))
+ version=f"{parser.prog} {version}")
+
args = parser.parse_args()
+
term = Terminal(force_styling=args.force_color)
+
def insert_default_ns(expr, nsmap):
if nsmap:
- list = re.split("("
- "/|" # /
- "[a-z-]+::|" # axis::
- "[A-Za-z_][A-Za-z0-9._-]*:[A-Za-z_][A-Za-z0-9._-]*|" # namespace:element
- "\[|" # [
- "@[A-Za-z_][A-Za-z0-9._-]*|" # @attribute
- "=|" # =
- "'[^']*'|" # 'value'
- '"[^"]*"|' # "value"
- "[a-z-]+\(|" # function(
- ",|" # ,
- "\)|" # )
- "\s+and\s+|" # and
- "\s+or\s+|" # or
- "\]" # ]
- ")",
- expr)
- nslist = ["default:" + item if re.fullmatch("[A-Za-z_][A-Za-z0-9._-]*", # element without namespace
- item)
- else item
- for item in list]
- string = "".join(nslist)
+ subexprs = re.split(r"("
+ r"/|" # /
+ r"[a-z-]+::|" # axis::
+ r"[A-Za-z_][A-Za-z0-9._-]*:[A-Za-z_][A-Za-z0-9._-]*|" # namespace:element
+ r"\[|" # [
+ r"@[A-Za-z_][A-Za-z0-9._-]*|" # @attribute
+ r"=|" # =
+ r"'[^']*'|" # 'value'
+ r'"[^"]*"|' # "value"
+ r"[a-z-]+\(|" # function(
+ r",|" # ,
+ r"\)|" # )
+ r"\s+and\s+|" # and
+ r"\s+or\s+|" # or
+ r"\]" # ]
+ r")",
+ expr)
+ nssubexprs = ["default:" + subexpr
+ if re.fullmatch("[A-Za-z_][A-Za-z0-9._-]*", subexpr) # element without namespace
+ else subexpr
+ for subexpr in subexprs]
+ nsexpr = "".join(nssubexprs)
else:
- string = expr
- return string
+ nsexpr = expr
+ return nsexpr
+
def serialize_match(match):
if isinstance(match, str):
string = match
else:
string = etree.tostring(match, with_tail=False,
- encoding="utf-8",
- pretty_print=args.indent).decode()
+ encoding="unicode",
+ pretty_print=args.indent)
+
if not args.indent:
- string = re.sub("\n\s+", "\n", string)
+ string = re.sub(r"\n\s+", r"\n", string)
+
if args.abbreviate:
lines = string.splitlines()
string = lines[0]
if len(lines) > 1:
string += " " + term.bright_black("...")
+
if args.spaces:
- string = re.sub("\s+", " ", string)
+ string = re.sub(r"\s+", " ", string)
+
if string.endswith("\n"):
string = string[:-1]
return string
+
def remove_ns(match):
- string = re.sub('\s+xmlns(:[A-Za-z_][A-Za-z0-9._-]*)?="[^"]+"', '', match)
+ string = re.sub(r'\s+xmlns(:[A-Za-z_][A-Za-z0-9._-]*)?="[^"]+"', '', match)
return string
+
def print_filename(file, end):
print(term.bold(file), end=end)
+
def print_total(matches, end):
print(len(matches), end=end)
+
def print_expr(expr, end):
print(term.bold(expr), end=end)
-def print_index(int, end):
- print(term.bold("[{0}]".format(int + 1)), end=end)
+
+def print_index(i, end):
+ print(term.bold(f"[{i + 1}]"), end=end)
+
def print_line_number(match, end):
- print(term.bright_black("{0}".format(match.sourceline)), end=end)
+ print(term.bright_black(str(match.sourceline)), end=end)
+
def print_match(match, end):
if args.declare_ns:
@@ 124,6 144,7 @@ def print_match(match, end):
else:
print(remove_ns(serialize_match(match)), end=end)
+
def print_matches(matches, file):
if matches:
if args.files_with_matches:
@@ 167,6 188,7 @@ def print_matches(matches, file):
print_line_number(match, ":")
print_match(match, "\n")
+
def print_nonmatches(matches, file):
if not matches:
if args.count:
@@ 175,9 197,11 @@ def print_nonmatches(matches, file):
else:
print_filename(file, "\n")
+
def main():
n = 0
e = False
+
for file in args.files:
try:
xml_parser = etree.XMLParser(remove_blank_text=args.indent,
@@ 187,12 211,13 @@ def main():
root = tree.getroot()
# cf. https://stackoverflow.com/q/4210730:
nsmap = {key if key is not None
- else "default":value
- for key,value in root.nsmap.items()}
+ else "default": value
+ for key, value in root.nsmap.items()}
nsexpr = insert_default_ns(args.expr, nsmap)
# add regexp namespace *after* inserting default namespace
if args.regex:
nsmap[args.regex] = "http://exslt.org/regular-expressions"
+
matches = tree.xpath(nsexpr, namespaces=nsmap)
if not args.quiet:
if args.files_without_match:
@@ 214,6 239,7 @@ def main():
if not args.quiet:
print(term.bold_red(file) + ": XPath expression cannot be evaluated", file=sys.stderr)
e = True
+
if e:
exit = 2
elif n == 0:
@@ 222,5 248,6 @@ def main():
exit = 0
return exit
+
if __name__ == "__main__":
sys.exit(main())