[utils][filecheck-lint] Add filecheck-lint, a linter for FileCheck directives.

FileCheck is a versatile tool for testing compiler output regardless of syntax.
FileCheck achieves this versatility by processing only input lines in which it
detects a valid CHECK directive, and ignoring everything else.

This comes at a price: if a directive is mistyped, it is not processed by
FileCheck, and the test code it precedes is effectively disabled. This creates
the illusion that the code is tested, while the check may never actually have
run.
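
For example, in the hypothetical test sketch below, the second line is meant
as a negative check, but FileCheck silently ignores it because "CHEK-NOT" is
not a valid directive:

  ; RUN: opt < %s | FileCheck %s
  ; CHEK-NOT: unwanted instruction

The test passes whether or not the unwanted output appears.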

This scenario is not hypothetical; see, e.g., the fixes introduced in
48cacf049f, https://reviews.llvm.org/D139698,
https://reviews.llvm.org/D139636, and
https://github.com/iree-org/iree/pull/11693.

The findings corrected in the above changes originate from filecheck-lint.
For a given test file, filecheck-lint detects likely misspelled directives by
computing the edit distance between anything that looks like a FileCheck
directive and the set of directives that are valid locally.
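
As a rough sketch of the idea (helper name hypothetical; difflib's similarity
ratio stands in here for the Levenshtein distance the tool actually uses):

  import difflib

  valid_directives = {'CHECK', 'CHECK-NOT', 'NEW', 'NEW-NOT', 'OLD', 'OLD-NOT'}

  def near_miss(candidate: str) -> list:
    # A valid directive is not a typo; anything merely close to one may be.
    if candidate in valid_directives:
      return []
    return difflib.get_close_matches(candidate, valid_directives, n=1)

  print(near_miss('NEWW'))     # ['NEW']     -> likely typo
  print(near_miss('ODL-NOT'))  # ['OLD-NOT'] -> likely typo
  print(near_miss('NEW'))      # []          -> valid directive, ignored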

The tool is not yet feature complete---e.g. it does not parse custom comment
prefixes to exclude them from reporting---but it already yields useful results,
as demonstrated above.

Differential Revision: https://reviews.llvm.org/D141508
commit f702c75995 (parent a74bc43670)
Author: Benjamin Chetioui
Date:   2023-01-12 13:19:42 +00:00

2 changed files with 329 additions and 0 deletions

llvm/utils/filecheck_lint/filecheck_lint.py

@@ -0,0 +1,251 @@
# ===----------------------------------------------------------------------===##
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# ===----------------------------------------------------------------------===##
"""A linter that detects potential typos in FileCheck directive names.
Consider a broken test foo.cpp:
// RUN: clang -cc1 -ast-dump %s | FileCheck %s --check-prefix=NEW
// RUN: clang -cc1 -ast-dump %s -std=c++98 | FileCheck %s --check-prefix=OLD
auto x = 42;
// NEWW: auto is a c++11 extension
// ODL-NOT: auto is a c++11 extension
We first detect the locally valid FileCheck directive prefixes by parsing the
--check-prefix flags. Here we get {CHECK, NEW, OLD}, so our directive names are
{CHECK, NEW, OLD, CHECK-NOT, NEW-NOT, ...}.
Then we look for lines that look like directives. These are of the form 'FOO:',
usually at the beginning of a line or a comment. If any of these are a
"near-miss" for a directive name, then we suspect this is a typo and report it.
Usage: filecheck_lint path/to/test/file/1 ... path/to/test/file/n
"""
import itertools
import logging
import pathlib
import re
import sys
from typing import Generator, Sequence, Tuple

_distance_threshold = 3
_prefixes = {'CHECK'}
_suffixes = {'-DAG', '-COUNT', '-EMPTY', '-LABEL', '-NEXT', '-NOT', '-SAME'}
# 'NOTE' and 'TODO' are not directives, but are likely to be false positives
# if encountered and to generate noise as a result. We filter them out also to
# avoid this.
_lit_directives = {
    'RUN',
    'REQUIRES',
    'UNSUPPORTED',
    'XFAIL',
    'DEFINE',
    'REDEFINE',
}
# 'COM' and 'RUN' are default comment prefixes for FileCheck.
_comment_prefixes = {'COM', 'RUN'}
_ignore = _lit_directives.union(_comment_prefixes).union({'NOTE', 'TODO'})


def levenshtein(s1: str, s2: str) -> int:  # pylint: disable=g-doc-args
  """Computes the edit distance between two strings.

  Additions, deletions, and substitutions all count as a single operation.
  """
  if not s1:
    return len(s2)
  if not s2:
    return len(s1)

  distances = range(len(s2) + 1)
  for i in range(len(s1)):
    new_distances = [i + 1]
    for j in range(len(s2)):
      cost = min(distances[j] + int(s1[i] != s2[j]), distances[j + 1] + 1,
                 new_distances[-1] + 1)
      new_distances.append(cost)
    distances = new_distances
  return distances[-1]


class FileRange:
  """Stores the coordinates of a span on a single line within a file.

  Attributes:
    line: the line number
    start_column: the (inclusive) column where the span starts
    end_column: the (inclusive) column where the span ends
  """
  line: int
  start_column: int
  end_column: int

  def __init__(self, content: str, start_byte: int, end_byte: int):  # pylint: disable=g-doc-args
    """Derives a span's coordinates based on a string and start/end bytes.

    `start_byte` and `end_byte` are assumed to be on the same line.
    """
    content_before_span = content[:start_byte]
    self.line = content_before_span.count('\n') + 1
    self.start_column = start_byte - content_before_span.rfind('\n')
    self.end_column = self.start_column + (end_byte - start_byte - 1)

  def __str__(self) -> str:
    return f'{self.line}:{self.start_column}-{self.end_column}'


class Diagnostic:
  """Stores information about one typo and a suggested fix.

  Attributes:
    filepath: the path to the file in which the typo was found
    filerange: the position at which the typo was found in the file
    typo: the typo
    fix: a suggested fix
  """
  filepath: pathlib.Path
  filerange: FileRange
  typo: str
  fix: str

  def __init__(
      self,
      filepath: pathlib.Path,
      filerange: FileRange,
      typo: str,
      fix: str  # pylint: disable=redefined-outer-name
  ):
    self.filepath = filepath
    self.filerange = filerange
    self.typo = typo
    self.fix = fix

  def __str__(self) -> str:
    return f'{self.filepath}:' + str(self.filerange) + f': {self.summary()}'

  def summary(self) -> str:
    return (
        f'Found potentially misspelled directive "{self.typo}". Did you mean '
        f'"{self.fix}"?')


def find_potential_directives(
    content: str,) -> Generator[Tuple[FileRange, str], None, None]:
  """Extracts all the potential FileCheck directives from a string.

  What constitutes a potential directive is loosely defined---we err on the
  side of capturing more strings than is necessary, rather than missing any.

  Args:
    content: the string in which to look for directives

  Yields:
    Tuples (p, d) where p is the span where the potential directive occurs
    within the string and d is the potential directive.
  """
  directive_pattern = re.compile(
      r'(?:^|//|;|#)[^\d\w\-_]*([\d\w\-_][\s\d\w\-_]*):', re.MULTILINE)
  for match in re.finditer(directive_pattern, content):
    potential_directive, span = match.group(1), match.span(1)
    yield (FileRange(content, span[0], span[1]), potential_directive)


# TODO(bchetioui): also parse comment prefixes to ignore.
def parse_custom_prefixes(content: str) -> Generator[str, None, None]:  # pylint: disable=g-doc-args
  """Parses custom prefixes defined in the string provided.

  For example, given the following file content:
    RUN: something | FileCheck %s -check-prefixes CHECK1,CHECK2
    RUN: something_else | FileCheck %s -check-prefix 'CHECK3'
  the custom prefixes are CHECK1, CHECK2, and CHECK3.
  """
  param_re = r'|'.join([r"'[^']*'", r'"[^"]*"', r'[^\'"\s]+'])
  for m in re.finditer(r'-check-prefix(?:es)?(?:\s+|=)({})'.format(param_re),
                       content):
    prefixes = m.group(1)
    if prefixes.startswith('\'') or prefixes.startswith('"'):
      prefixes = prefixes[1:-1]
    for prefix in prefixes.split(','):
      yield prefix


def find_directive_typos(
    content: str,
    filepath: pathlib.Path,
    threshold: int = 3,
) -> Generator[Diagnostic, None, None]:
  """Detects potential typos in FileCheck directives.

  Args:
    content: the content of the file
    filepath: the path to the file to check for typos in directives
    threshold: the (inclusive) maximum edit distance between a potential
      directive and an actual directive, such that the potential directive is
      classified as a typo

  Yields:
    Diagnostics, in order from the top of the file.
  """
  all_prefixes = _prefixes.union(set(parse_custom_prefixes(content)))
  all_directives = ([
      f'{prefix}{suffix}'
      for prefix, suffix in itertools.product(all_prefixes, _suffixes)
  ] + list(_ignore) + list(all_prefixes))

  def find_best_match(typo):
    return min(
        [(threshold + 1, typo)] + [(levenshtein(typo, d), d)
                                   for d in all_directives
                                   if abs(len(d) - len(typo)) <= threshold],
        key=lambda tup: tup[0],
    )

  potential_directives = find_potential_directives(content)

  for filerange, potential_directive in potential_directives:
    # TODO(bchetioui): match count directives more finely. We skip directives
    # starting with 'CHECK-COUNT-' for the moment as they require more complex
    # logic to be handled correctly.
    if any(
        potential_directive.startswith(f'{prefix}-COUNT-')
        for prefix in all_prefixes):
      continue

    # Ignoring potential typos that will not be matched later due to a too low
    # threshold, in order to avoid potentially long computation times.
    if len(potential_directive) > max(map(len, all_directives)) + threshold:
      continue

    score, best_match = find_best_match(potential_directive)
    if score == 0:  # This is an actual directive, ignore.
      continue
    elif score <= threshold and best_match not in _ignore:
      yield Diagnostic(filepath, filerange, potential_directive, best_match)


def main(argv: Sequence[str]):
  if len(argv) < 2:
    print(f'Usage: {argv[0]} path/to/file/1 ... path/to/file/n')
    exit(1)

  for filepath in argv[1:]:
    logging.info('Checking %s', filepath)
    with open(filepath, 'rt') as f:
      content = f.read()
      for diagnostic in find_directive_typos(
          content,
          pathlib.Path(filepath),
          threshold=_distance_threshold,
      ):
        print(diagnostic)


if __name__ == '__main__':
  main(sys.argv)
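
As a usage sketch: running the linter over the docstring's foo.cpp example
(saved to disk) would print diagnostics in the format produced by
Diagnostic.__str__ above, e.g.

  $ python3 filecheck_lint.py foo.cpp
  foo.cpp:4:4-7: Found potentially misspelled directive "NEWW". Did you mean "NEW"?
  foo.cpp:5:4-10: Found potentially misspelled directive "ODL-NOT". Did you mean "OLD-NOT"?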

llvm/utils/filecheck_lint/filecheck_lint_test.py

@@ -0,0 +1,78 @@
# ===----------------------------------------------------------------------===##
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# ===----------------------------------------------------------------------===##
import unittest

import filecheck_lint as fcl


class TestParser(unittest.TestCase):

  def test_parse_all_additional_prefixes(self):

    def run(content, expected_prefixes):
      prefixes = set(fcl.parse_custom_prefixes(content))
      for prefix in expected_prefixes:
        self.assertIn(prefix, prefixes)

    for content, expected_prefixes in [
        ('-check-prefix=PREFIX', {'PREFIX'}),
        ('-check-prefix=\'PREFIX\'', {'PREFIX'}),
        ('-check-prefix="PREFIX"', {'PREFIX'}),
        ('-check-prefix PREFIX', {'PREFIX'}),
        ('-check-prefix   PREFIX', {'PREFIX'}),
        ('-check-prefixes=PREFIX1,PREFIX2', {'PREFIX1', 'PREFIX2'}),
        ('-check-prefixes PREFIX1,PREFIX2', {'PREFIX1', 'PREFIX2'}),
        (
            """-check-prefix=PREFIX1 -check-prefix PREFIX2
               -check-prefixes=PREFIX3,PREFIX4 -check-prefix=PREFIX5
               -check-prefixes PREFIX6,PREFIX7 -check-prefixes=PREFIX8',
            """,  # pylint: disable=bad-continuation
            {f'PREFIX{i}' for i in range(1, 9)}),
    ]:
      run(content, expected_prefixes)

  def test_additional_prefixes_uniquely(self):
    lines = ['--check-prefix=SOME-PREFIX', '--check-prefix=SOME-PREFIX']
    prefixes = set(fcl.parse_custom_prefixes('\n'.join(lines)))
    assert len(prefixes) == 1


class TestTypoDetection(unittest.TestCase):

  def test_find_potential_directives_comment_prefix(self):
    lines = ['junk; CHCK1:', 'junk// CHCK2:', 'SOME CHCK3:']
    content = '\n'.join(lines)

    results = list(fcl.find_potential_directives(content))
    assert len(results) == 3

    pos, match = results[0]
    assert (pos.line == 1 and
            pos.start_column == len('junk; ') + 1 and
            pos.end_column == len(lines[0]) - 1)
    assert match == 'CHCK1'

    pos, match = results[1]
    assert (pos.line == 2 and
            pos.start_column == len('junk// ') + 1 and
            pos.end_column == len(lines[1]) - 1)
    assert match == 'CHCK2'

    pos, match = results[2]
    assert (pos.line == 3 and
            pos.start_column == 1 and
            pos.end_column == len(lines[2]) - 1)
    assert match == 'SOME CHCK3'

  def test_levenshtein(self):
    for s1, s2, distance in [
        ('Levenshtein', 'Levenstin', 2),  # 2 insertions
        ('Levenshtein', 'Levenstherin', 3),  # 1 insertion, 2 deletions
        ('Levenshtein', 'Lenvinshtein', 2),  # 1 deletion, 1 substitution
        ('Levenshtein', 'Levenshtein', 0),  # identical strings
    ]:
      assert fcl.levenshtein(s1, s2) == distance
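
The tests need nothing beyond the standard library; assuming both files sit in
the same directory, they can be run with the stock unittest runner, e.g.

  $ python3 -m unittest filecheck_lint_test -v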