+full refactor

+feat: configuration, progress bar, OSV
This commit is contained in:
2026-01-18 13:54:14 +03:00
parent b8c25b2529
commit a5714116ac
730 changed files with 246974 additions and 150 deletions
+354
View File
@@ -0,0 +1,354 @@
from __future__ import annotations
import json
import re
import urllib.request
import urllib.error
from collections import defaultdict
from typing import List, Dict, Any, Tuple, Optional, Iterable
from .config import OSV_QUERYBATCH_URL, OSV_ECOSYSTEM_MAP
from .deps_pipeline import dedupe_effective
from .progress import progress
# URL template for fetching a single OSV vulnerability record; formatted with the vulnerability id.
OSV_VULN_URL_TEMPLATE = "https://api.osv.dev/v1/vulns/{id}"
# Module-level cache: (OSV ecosystem, package name, version) -> unique vulnerability ids from querybatch.
_OSV_RESULT_CACHE: Dict[Tuple[str, str, str], List[str]] = {}
# Module-level cache: vulnerability id -> JSON record fetched by hydrate_vulns().
_OSV_VULN_CACHE: Dict[str, Dict[str, Any]] = {}
# Numeric rank for each severity label; a higher number means more severe.
_SEVERITY_ORDER = {"UNKNOWN": 0, "LOW": 1, "MEDIUM": 2, "HIGH": 3, "CRITICAL": 4}
def _sev_rank(level: str) -> int:
    """Return the numeric rank of a severity label (case-insensitive).

    Labels that are not present in ``_SEVERITY_ORDER`` rank as 0.
    """
    label = level.upper()
    if label in _SEVERITY_ORDER:
        return _SEVERITY_ORDER[label]
    return 0
def _score_to_severity(score: float) -> str:
    """Translate a numeric score into a severity label.

    Bands: >= 9.0 CRITICAL, >= 7.0 HIGH, >= 4.0 MEDIUM, > 0.0 LOW,
    anything else UNKNOWN.
    """
    # Floors are listed from highest to lowest; the first one reached wins.
    bands = ((9.0, "CRITICAL"), (7.0, "HIGH"), (4.0, "MEDIUM"))
    for floor, label in bands:
        if score >= floor:
            return label
    return "LOW" if score > 0.0 else "UNKNOWN"
def _normalize_db_specific_severity(s: str) -> str:
    """Normalise a free-form severity label to LOW/MEDIUM/HIGH/CRITICAL/UNKNOWN.

    Surrounding whitespace and letter case are ignored; ``MODERATE`` is
    treated as ``MEDIUM``. Any other label yields ``UNKNOWN``.
    """
    label = s.strip().upper()
    if label == "MODERATE":
        return "MEDIUM"
    return label if label in ("LOW", "MEDIUM", "HIGH", "CRITICAL") else "UNKNOWN"
def _compute_vuln_severity(v: Dict[str, Any]) -> str:
    """Derive a severity label for one OSV vulnerability record.

    Resolution order:
      1. ``database_specific.severity`` when it normalises to a known label.
      2. The first entry of the ``severity`` list whose ``score`` parses as a
         number, mapped through ``_score_to_severity``.
      3. ``"UNKNOWN"``.

    Args:
        v: Vulnerability record as a dict (as stored in ``_OSV_VULN_CACHE``).

    Returns:
        One of ``CRITICAL``, ``HIGH``, ``MEDIUM``, ``LOW`` or ``UNKNOWN``.
    """
    dbs = v.get("database_specific") or {}
    if isinstance(dbs, dict) and dbs.get("severity"):
        label = _normalize_db_specific_severity(str(dbs.get("severity")))
        # An unrecognised label must not short-circuit: fall through so a
        # numeric score in the record can still classify the vulnerability.
        if label != "UNKNOWN":
            return label

    sev = v.get("severity") or []
    if not isinstance(sev, list):
        return "UNKNOWN"

    for item in sev:
        if not isinstance(item, dict):
            continue
        score = item.get("score")
        if score is None:
            continue
        try:
            numeric = float(str(score))
        except ValueError:
            # NOTE(review): the OSV schema describes `score` for CVSS_* types
            # as a vector string ("CVSS:3.1/AV:N/..."); such values are not
            # numeric and are skipped here - confirm whether vector parsing
            # is wanted.
            continue
        return _score_to_severity(numeric)
    return "UNKNOWN"
def _http_get_json(url: str, timeout: int = 30) -> Dict[str, Any]:
    """Issue a GET request to ``url`` and return the body parsed as JSON.

    Args:
        url: Address to fetch.
        timeout: Value passed to ``urlopen`` as its timeout.

    Returns:
        Whatever ``json.loads`` produces for the response body.
    """
    request = urllib.request.Request(
        url=url,
        headers={"Accept": "application/json"},
        method="GET",
    )
    with urllib.request.urlopen(request, timeout=timeout) as response:
        raw = response.read()
    # Bytes that are not valid UTF-8 are dropped instead of raising.
    text = raw.decode("utf-8", errors="ignore")
    return json.loads(text)
def hydrate_vulns(ids: Iterable[str], *, progress_enabled: bool = True, max_ids: int = 300) -> None:
    """Fetch full OSV records for ``ids`` and store them in ``_OSV_VULN_CACHE``.

    Ids are de-duplicated (first occurrence wins) and ids already present in
    the cache are skipped. At most ``max_ids`` records are requested per call.
    Fetching is best-effort: an id whose request fails is skipped.

    Args:
        ids: Vulnerability ids; each is converted with ``str()``.
        progress_enabled: Wrap the download loop in ``progress(...)`` when true.
        max_ids: Upper bound on the number of records to download. Zero or a
            negative value downloads nothing.
    """
    # Previously the limit was only checked after appending, so max_ids=0
    # still downloaded one record.
    if max_ids <= 0:
        return

    pending: List[str] = []
    seen = set()
    for vid in ids:
        vid = str(vid)
        if vid in seen:
            continue
        seen.add(vid)
        if vid in _OSV_VULN_CACHE:
            continue
        pending.append(vid)
        if len(pending) >= max_ids:
            break

    it = progress(pending, total=len(pending), desc="OSV: загрузка деталей") if progress_enabled else iter(pending)
    for vid in it:
        try:
            record = _http_get_json(OSV_VULN_URL_TEMPLATE.format(id=vid))
        except Exception:
            # Deliberate best-effort: one failing id must not abort the rest.
            continue
        if isinstance(record, dict):
            _OSV_VULN_CACHE[vid] = record
# Plain version: optional leading "v", 1-4 numeric components, optional
# "-pre" / "+build" suffix. Shared by the npm and maven/gradle/nuget/cargo rules.
_PLAIN_VERSION_RE = re.compile(r"^v?\d+(\.\d+){0,3}([\-\+].+)?$")


def _is_exact_version_for_osv(internal_eco: str, spec: Optional[str], scope: Optional[str]) -> Optional[str]:
    """Return the exact version pinned by ``spec``, or ``None`` if it is a range.

    Rules, in order:
      * scope ``lock``: the spec with leading ``=`` characters removed.
      * ``go`` with scope ``require``: the spec unchanged.
      * ``pypi``: only ``==X`` / ``===X`` pins; wildcard (``==1.0.*``) and
        compound (``==1.0,<2``) specifiers are not exact. An environment
        marker after ``;`` is ignored.
      * ``npm``: a plain version with no range operator; a leading ``v`` is dropped.
      * ``maven`` / ``gradle`` / ``nuget`` / ``cargo``: a plain version with no
        property placeholder, bracket range or range operator.
      * anything else: ``None``.

    Args:
        internal_eco: Ecosystem name (case-insensitive).
        spec: Raw version specifier; falsy values yield ``None``.
        scope: Dependency scope (case-insensitive), may be ``None``.
    """
    if not spec:
        return None
    s = str(spec).strip()
    sc = (scope or "").lower()
    eco = internal_eco.lower()

    if sc == "lock":
        return s.lstrip("=")

    if eco == "go" and sc == "require":
        return s

    if eco == "pypi":
        # "===" must be tested first, otherwise "===1.0" would yield "=1.0".
        for op in ("===", "=="):
            if s.startswith(op):
                ver = s[len(op):].split(";", 1)[0].strip()
                break
        else:
            return None
        # "==1.0.*" is a prefix match and "==1.0,<2" a compound clause:
        # neither is a single concrete version that can be queried.
        if not ver or "*" in ver or "," in ver:
            return None
        return ver

    if eco == "npm":
        if s.startswith(("^", "~", ">", "<", "*")):
            return None
        return s.lstrip("v") if _PLAIN_VERSION_RE.match(s) else None

    if eco in {"maven", "gradle", "nuget", "cargo"}:
        # NOTE(review): in Cargo's own semantics a bare "1.2.3" is a caret
        # range; it is still treated as exact here, as before - confirm intended.
        if "${" in s or s.startswith(("(", "[", "{")) or any(op in s for op in (">", "<", "*", ",")):
            return None
        return s.lstrip("v") if _PLAIN_VERSION_RE.match(s) else None

    return None
def _http_post_json(url: str, payload: Dict[str, Any], timeout: int = 30) -> Dict[str, Any]:
    """POST ``payload`` as JSON to ``url`` and return the body parsed as JSON.

    Args:
        url: Address to post to.
        payload: Object serialised with ``json.dumps`` and sent as the body.
        timeout: Value passed to ``urlopen`` as its timeout.

    Returns:
        Whatever ``json.loads`` produces for the response body.
    """
    headers = {"Content-Type": "application/json", "Accept": "application/json"}
    encoded = json.dumps(payload).encode("utf-8")
    request = urllib.request.Request(url=url, data=encoded, headers=headers, method="POST")
    with urllib.request.urlopen(request, timeout=timeout) as response:
        raw = response.read()
    # Bytes that are not valid UTF-8 are dropped instead of raising.
    text = raw.decode("utf-8", errors="ignore")
    return json.loads(text)
def _merge_querybatch_page(
    acc: List[Dict[str, Any]],
    data: Any,
    index_map: List[int],
) -> Dict[int, str]:
    """Fold one querybatch response into ``acc``; return continuation tokens keyed by original query index."""
    next_tokens: Dict[int, str] = {}
    page_results = data.get("results", []) if isinstance(data, dict) else None
    if not isinstance(page_results, list):
        return next_tokens
    # zip() stops at the shorter side, so a response carrying more entries
    # than queries sent can never index past the accumulator.
    for orig_idx, result in zip(index_map, page_results):
        if not isinstance(result, dict):
            continue
        vulns = result.get("vulns") or []
        if isinstance(vulns, list):
            acc[orig_idx]["vulns"].extend(vulns)
        token = result.get("next_page_token")
        if token:
            next_tokens[orig_idx] = token
    return next_tokens


def querybatch_paginated(queries: List[Dict[str, Any]], max_pages: int = 5) -> List[Dict[str, Any]]:
    """Run an OSV querybatch request and follow per-query pagination tokens.

    The first request is always sent. Follow-up requests contain only the
    queries that returned a ``next_page_token`` and are repeated until no
    tokens remain or ``max_pages`` requests have been made in total.

    Args:
        queries: Query objects posted under the ``"queries"`` key.
        max_pages: Maximum number of requests, counting the first one.

    Returns:
        One dict per input query, in input order, each with a ``"vulns"`` list
        accumulated over all pages. Entries that still had a continuation
        token when paging stopped also carry ``"truncated": True``. A response
        that is not shaped as expected contributes no vulnerabilities.

    Raises:
        Whatever ``_http_post_json`` raises for transport or JSON errors.
    """
    if not queries:
        return []

    acc: List[Dict[str, Any]] = [{"vulns": []} for _ in queries]

    data = _http_post_json(OSV_QUERYBATCH_URL, {"queries": queries})
    next_tokens = _merge_querybatch_page(acc, data, list(range(len(queries))))

    pages = 1
    while next_tokens and pages < max_pages:
        index_map = list(next_tokens)
        # Re-send only the unfinished queries, each with its own token.
        page_queries = [{**queries[i], "page_token": next_tokens[i]} for i in index_map]
        data = _http_post_json(OSV_QUERYBATCH_URL, {"queries": page_queries})
        next_tokens = _merge_querybatch_page(acc, data, index_map)
        pages += 1

    for idx in next_tokens:
        acc[idx]["truncated"] = True
    return acc
def _osv_pinned_deps(container: Dict[str, Any]) -> List[Dict[str, str]]:
    """Return the container's exactly-pinned dependencies as unique OSV package descriptors."""
    pinned: List[Dict[str, str]] = []
    seen = set()
    for dep in dedupe_effective(container.get("dependencies") or []):
        internal_eco = (dep.get("ecosystem") or "").lower()
        name = dep.get("name")
        if not internal_eco or not name:
            continue
        osv_eco = OSV_ECOSYSTEM_MAP.get(internal_eco)
        if not osv_eco:
            continue
        ver = _is_exact_version_for_osv(internal_eco, dep.get("spec"), dep.get("scope"))
        if not ver:
            continue
        key = (osv_eco, str(name), str(ver))
        if key in seen:
            continue
        seen.add(key)
        pinned.append({"ecosystem": key[0], "name": key[1], "version": key[2]})
    return pinned


def _osv_reset_fields(container: Dict[str, Any]) -> None:
    """Write the "no findings" values for every OSV summary field of a container."""
    container["osv_vuln_count"] = 0
    container["osv_vuln_counts_by_severity"] = {}
    container["osv_affected_deps"] = []
    container["osv_vulns_by_dep"] = {}


def _osv_query_missing(to_query: List[Tuple[str, str, str]], chunk_size: int, progress_enabled: bool) -> None:
    """Query OSV for each (ecosystem, name, version) key and cache its unique vulnerability ids."""
    total_chunks = (len(to_query) + chunk_size - 1) // chunk_size
    starts = range(0, len(to_query), chunk_size)
    chunk_it = progress(starts, total=total_chunks, desc="OSV: querybatch") if progress_enabled else iter(starts)
    for start in chunk_it:
        batch = to_query[start:start + chunk_size]
        queries = [
            {"package": {"ecosystem": eco, "name": name}, "version": ver}
            for (eco, name, ver) in batch
        ]
        results = querybatch_paginated(queries)
        for key, result in zip(batch, results):
            vulns = (result or {}).get("vulns", []) or []
            ids = [str(v.get("id")) for v in vulns if isinstance(v, dict) and v.get("id")]
            # dict.fromkeys de-duplicates while keeping first-seen order.
            _OSV_RESULT_CACHE[key] = list(dict.fromkeys(ids))


def _osv_summarize_container(
    container: Dict[str, Any],
    threshold: int,
    hydrate_details: bool,
    include_unknown: bool,
) -> None:
    """Compute and store the OSV summary fields of one container from the module caches."""
    vulns_by_dep: Dict[str, List[str]] = {}
    affected: List[Tuple[str, str, str, int, str]] = []
    counted_ids = set()
    counts: Dict[str, int] = defaultdict(int)

    for p in container.get("osv_pinned_deps") or []:
        eco, name, ver = p["ecosystem"], p["name"], p["version"]
        ids = _OSV_RESULT_CACHE.get((eco, name, ver), [])
        if not ids:
            continue
        kept: List[str] = []
        max_dep_sev = "UNKNOWN"
        for vid in ids:
            sev = "UNKNOWN"
            if hydrate_details and vid in _OSV_VULN_CACHE:
                sev = _compute_vuln_severity(_OSV_VULN_CACHE[vid])
            if _sev_rank(sev) >= threshold or (sev == "UNKNOWN" and include_unknown):
                kept.append(vid)
                # Count each vulnerability once per container so the
                # per-severity counts add up to osv_vuln_count even when the
                # same id affects several dependencies.
                if vid not in counted_ids:
                    counted_ids.add(vid)
                    counts[sev] += 1
                if _sev_rank(sev) > _sev_rank(max_dep_sev):
                    max_dep_sev = sev
        if kept:
            vulns_by_dep[f"{eco}:{name}@{ver}"] = kept
            affected.append((eco, name, ver, len(kept), max_dep_sev))

    # Most findings first; ties broken by the worst severity.
    affected.sort(key=lambda x: (x[3], _sev_rank(x[4])), reverse=True)
    container["osv_vuln_count"] = len(counted_ids)
    container["osv_vuln_counts_by_severity"] = dict(counts)
    container["osv_affected_deps"] = affected
    container["osv_vulns_by_dep"] = vulns_by_dep


def annotate_containers_with_osv(
    containers: List[Dict[str, Any]],
    *,
    chunk_size: int = 250,
    hydrate_details: bool = True,
    max_hydrate_ids: int = 300,
    min_severity: str = "MEDIUM",
    include_unknown: bool = True,
    progress_enabled: bool = True,
) -> None:
    """Annotate each container dict in place with OSV vulnerability data.

    Keys written on every container:
      * ``osv_pinned_deps``: unique exactly-pinned dependencies as
        ``{"ecosystem", "name", "version"}`` dicts.
      * ``osv_vuln_count``: number of distinct vulnerability ids kept.
      * ``osv_vuln_counts_by_severity``: severity label -> count of distinct ids.
      * ``osv_affected_deps``: ``(ecosystem, name, version, kept_count,
        max_severity)`` tuples, sorted by count then severity, descending.
      * ``osv_vulns_by_dep``: ``"eco:name@version"`` -> list of kept ids.
      * ``osv_errors``: only when the OSV query failed; the other summary
        fields are then reset to empty values.

    Args:
        containers: Container dicts; each may hold a ``"dependencies"`` list.
        chunk_size: Number of queries per querybatch request (minimum 1).
        hydrate_details: Fetch full records so severities can be computed;
            when false every vulnerability is treated as ``UNKNOWN``.
        max_hydrate_ids: Cap forwarded to ``hydrate_vulns``.
        min_severity: Lowest severity label to keep.
        include_unknown: Keep vulnerabilities whose severity is ``UNKNOWN``.
        progress_enabled: Show progress via ``progress(...)``.
    """
    if not containers:
        return

    to_query: List[Tuple[str, str, str]] = []
    queued = set()  # set mirror of to_query for O(1) membership tests
    for c in containers:
        pinned = _osv_pinned_deps(c)
        c["osv_pinned_deps"] = pinned
        for p in pinned:
            key = (p["ecosystem"], p["name"], p["version"])
            if key not in _OSV_RESULT_CACHE and key not in queued:
                queued.add(key)
                to_query.append(key)

    # When everything is already cached there is nothing to query, but the
    # cached findings must still be reported - do NOT return early here.
    if to_query:
        try:
            _osv_query_missing(to_query, max(1, chunk_size), progress_enabled)
        except (OSError, ValueError) as e:
            # OSError covers URLError/HTTPError/TimeoutError as well as raw
            # socket errors raised while reading the body; ValueError covers
            # JSON decoding failures.
            for c in containers:
                c["osv_errors"] = f"OSV query failed: {type(e).__name__}: {e}"
                _osv_reset_fields(c)
            return

    if hydrate_details:
        # Hydrate only ids relevant to these containers so unrelated cache
        # entries do not consume the max_hydrate_ids budget.
        wanted: Dict[str, None] = {}
        for c in containers:
            for p in c["osv_pinned_deps"]:
                for vid in _OSV_RESULT_CACHE.get((p["ecosystem"], p["name"], p["version"]), []):
                    wanted.setdefault(vid, None)
        if wanted:
            hydrate_vulns(list(wanted), progress_enabled=progress_enabled, max_ids=max_hydrate_ids)

    threshold = _sev_rank(min_severity)
    for c in containers:
        _osv_summarize_container(c, threshold, hydrate_details, include_unknown)