src/pqc_lint/scanner.py
3.8 KB · 118 lines · python Raw
1 """Filesystem scanner."""
2
3 from __future__ import annotations
4
5 import fnmatch
6 import os
7 import time
8 from dataclasses import dataclass, field
9 from datetime import datetime, timezone
10 from typing import Iterable
11
12 from pqc_lint.findings import Finding, ScanReport
13 from pqc_lint.patterns import ALL_MATCHERS, MATCHERS_BY_LANGUAGE, PatternMatcher
14 from pqc_lint.rules import RULE_BY_ID
15
# Glob patterns for paths that are skipped by default: VCS metadata,
# dependency trees, virtualenvs, build output, tool caches and minified
# JavaScript bundles.
DEFAULT_EXCLUDES = (
    *(
        f"**/{dirname}/**"
        for dirname in (
            ".git",
            "node_modules",
            "__pycache__",
            ".venv",
            "venv",
            "dist",
            "build",
            ".pytest_cache",
            ".ruff_cache",
        )
    ),
    "**/*.min.js",
)

# Upper bound on the size of a file that will be read, so that huge binaries
# are never loaded into memory.
MAX_FILE_SIZE_BYTES = 2 * 1024**2  # 2 MiB
31
32
33 def _matches_any(path: str, globs: Iterable[str]) -> bool:
34 normalized = path.replace(os.sep, "/")
35 return any(fnmatch.fnmatch(normalized, g) for g in globs)
36
37
@dataclass
class Scanner:
    """Walk a file or directory tree and run pattern matchers on each file.

    Configuration fields:

    * ``excludes`` -- glob patterns for paths to skip; they are tested with
      ``_matches_any`` against the walked path using ``/`` separators.
    * ``languages`` -- keys used to select matchers from
      ``MATCHERS_BY_LANGUAGE``. Empty means "use ``ALL_MATCHERS``"; keys
      missing from the mapping are ignored.
    * ``matchers`` -- explicit matcher list. When non-empty it is used as
      given and ``languages`` is not consulted.
    * ``max_file_size`` -- files larger than this many bytes are not read.
    """

    excludes: tuple[str, ...] = DEFAULT_EXCLUDES
    languages: tuple[str, ...] = ()  # empty = all
    matchers: list[PatternMatcher] = field(default_factory=list)
    max_file_size: int = MAX_FILE_SIZE_BYTES

    def __post_init__(self) -> None:
        """Fill in ``matchers`` from ``languages`` when none were supplied."""
        if self.matchers:
            return
        if self.languages:
            self.matchers = [
                MATCHERS_BY_LANGUAGE[lang]
                for lang in self.languages
                if lang in MATCHERS_BY_LANGUAGE
            ]
        else:
            self.matchers = list(ALL_MATCHERS)

    def _pick_matcher(self, path: str) -> PatternMatcher | None:
        """Return the first configured matcher that accepts *path*, or None."""
        return next((m for m in self.matchers if m.matches_file(path)), None)

    def _is_excluded_dir(self, dir_path: str) -> bool:
        """Return True when *dir_path* should be pruned from the walk.

        The path is tested both as given and with a trailing ``/``. Patterns
        shaped like ``**/node_modules/**`` require a ``/`` after the directory
        name, so the bare directory path alone would never match them.
        """
        return _matches_any(dir_path, self.excludes) or _matches_any(
            dir_path + "/", self.excludes
        )

    def scan_file(self, file_path: str, root: str | None = None) -> list[Finding]:
        """Scan a single file and return its findings.

        An empty list is returned when no matcher handles the file, when the
        file is larger than ``max_file_size``, or when it cannot be stat'ed or
        opened. Content is decoded as UTF-8 with undecodable bytes replaced.

        The path handed to the matcher is *file_path* relative to *root* (or
        *file_path* unchanged when *root* is falsy), with ``/`` separators.
        """
        matcher = self._pick_matcher(file_path)
        if matcher is None:
            return []

        try:
            if os.path.getsize(file_path) > self.max_file_size:
                return []
            # errors="replace" means decoding itself never raises, so only
            # filesystem errors need handling here.
            with open(file_path, "r", encoding="utf-8", errors="replace") as f:
                content = f.read()
        except OSError:
            return []

        rel = os.path.relpath(file_path, root) if root else file_path
        rel = rel.replace(os.sep, "/")
        return list(matcher.scan(rel, content, RULE_BY_ID))

    def scan_path(self, path: str) -> ScanReport:
        """Scan *path* (a file or a directory tree) and return a report.

        A regular file is scanned on its own, with its parent directory as
        the root for relative paths. Anything else is walked with
        ``os.walk``: excluded directories are pruned, and each remaining file
        is either scanned or counted in ``files_skipped`` (excluded, or no
        matcher accepts it). Files beneath a pruned directory are never
        visited and therefore are not counted at all.

        ``duration_ms`` is measured with a monotonic clock.
        """
        # NOTE(review): excludes are matched against the walked path, which
        # includes *path* itself. A scan root such as "/home/u/build/proj"
        # therefore matches "**/build/**" and excludes every file. Consider
        # matching relative to the scan root.
        started = time.perf_counter()
        report = ScanReport(
            scan_root=path,
            started_at=datetime.now(timezone.utc).isoformat(),
        )

        if os.path.isfile(path):
            # Count an unsupported file as skipped, like the directory branch.
            if self._pick_matcher(path) is None:
                report.files_skipped += 1
            else:
                root = os.path.dirname(path) or "."
                report.findings.extend(self.scan_file(path, root=root))
                report.files_scanned += 1
        else:
            for dirpath, dirnames, filenames in os.walk(path):
                # Assign in place so os.walk does not descend into excluded
                # directories.
                dirnames[:] = [
                    d
                    for d in dirnames
                    if not self._is_excluded_dir(os.path.join(dirpath, d))
                ]

                for fn in filenames:
                    fp = os.path.join(dirpath, fn)
                    if (
                        _matches_any(fp, self.excludes)
                        or self._pick_matcher(fp) is None
                    ):
                        report.files_skipped += 1
                        continue
                    report.findings.extend(self.scan_file(fp, root=path))
                    report.files_scanned += 1

        report.duration_ms = int((time.perf_counter() - started) * 1000)
        return report
118