+full refactor

+feat: configuration, progress bar, OSV
2026-01-18 13:54:14 +03:00
parent b8c25b2529
commit a5714116ac
730 changed files with 246974 additions and 150 deletions
@@ -0,0 +1,354 @@
+from __future__ import annotations
+
+import json
+import re
+import urllib.request
+import urllib.error
+from collections import defaultdict
+from typing import List, Dict, Any, Tuple, Optional, Iterable
+
+from .config import OSV_QUERYBATCH_URL, OSV_ECOSYSTEM_MAP
+from .deps_pipeline import dedupe_effective
+from .progress import progress
+
+
+# URL template for fetching a single vulnerability record by its OSV ID.
+OSV_VULN_URL_TEMPLATE = "https://api.osv.dev/v1/vulns/{id}"
+
+# Process-wide cache: (ecosystem, name, version) -> vulnerability IDs returned by querybatch.
+_OSV_RESULT_CACHE: Dict[Tuple[str, str, str], List[str]] = {}
+
+# Process-wide cache: vulnerability ID -> JSON record fetched for that ID.
+_OSV_VULN_CACHE: Dict[str, Dict[str, Any]] = {}
+
+# Severity labels mapped to a numeric rank; a higher rank is more severe.
+_SEVERITY_ORDER = {
+    "UNKNOWN": 0,
+    "LOW": 1,
+    "MEDIUM": 2,
+    "HIGH": 3,
+    "CRITICAL": 4,
+}
+
+def _sev_rank(level: str) -> int:
+    """Return the numeric rank of a severity label (case-insensitive).
+
+    Labels that are not keys of ``_SEVERITY_ORDER`` rank 0, the same
+    rank as "UNKNOWN".
+    """
+    label = level.upper()
+    if label in _SEVERITY_ORDER:
+        return _SEVERITY_ORDER[label]
+    return 0
+
+
+def _score_to_severity(score: float) -> str:
+    """Bucket a numeric score into a severity label.
+
+    Lower bounds are inclusive: >= 9.0 is "CRITICAL", >= 7.0 is "HIGH",
+    >= 4.0 is "MEDIUM". Any other score above 0.0 is "LOW"; everything
+    else (zero, negatives, NaN) is "UNKNOWN".
+    """
+    bands = ((9.0, "CRITICAL"), (7.0, "HIGH"), (4.0, "MEDIUM"))
+    for floor, label in bands:
+        if score >= floor:
+            return label
+    return "LOW" if score > 0.0 else "UNKNOWN"
+
+
+def _normalize_db_specific_severity(s: str) -> str:
+    """Normalise a free-form severity string to one of the module's labels.
+
+    Surrounding whitespace and letter case are ignored. "MODERATE" is
+    treated as "MEDIUM"; anything other than LOW/MEDIUM/HIGH/CRITICAL
+    becomes "UNKNOWN".
+    """
+    label = s.strip().upper()
+    if label == "MODERATE":
+        return "MEDIUM"
+    return label if label in ("LOW", "MEDIUM", "HIGH", "CRITICAL") else "UNKNOWN"
+
+
+def _compute_vuln_severity(v: Dict[str, Any]) -> str:
+    """Derive a severity label for one vulnerability record.
+
+    A truthy ``database_specific["severity"]`` wins and is normalised via
+    ``_normalize_db_specific_severity``. Otherwise the first entry of the
+    ``severity`` list whose ``score`` parses as a float is bucketed with
+    ``_score_to_severity``. Returns "UNKNOWN" when neither source yields
+    a label.
+    """
+    db_specific = v.get("database_specific") or {}
+    if isinstance(db_specific, dict):
+        declared = db_specific.get("severity")
+        if declared:
+            return _normalize_db_specific_severity(str(declared))
+
+    entries = v.get("severity") or []
+    if not isinstance(entries, list):
+        return "UNKNOWN"
+
+    for entry in entries:
+        if not isinstance(entry, dict):
+            continue
+        raw_score = entry.get("score")
+        if raw_score is None:
+            continue
+        try:
+            numeric = float(str(raw_score))
+        except ValueError:
+            # NOTE(review): the OSV schema appears to define `score` as a
+            # CVSS vector string; such values are skipped here because they
+            # are not plain numbers - confirm whether vector parsing is wanted.
+            continue
+        return _score_to_severity(numeric)
+
+    return "UNKNOWN"
+
+
+def _http_get_json(url: str, timeout: int = 30) -> Dict[str, Any]:
+    """GET ``url`` and return the response body parsed as JSON.
+
+    The body is decoded as UTF-8 with undecodable bytes dropped. Errors
+    from ``urllib`` and from JSON parsing propagate to the caller.
+    """
+    request = urllib.request.Request(
+        url=url,
+        headers={"Accept": "application/json"},
+        method="GET",
+    )
+    with urllib.request.urlopen(request, timeout=timeout) as response:
+        raw = response.read()
+    text = raw.decode("utf-8", errors="ignore")
+    return json.loads(text)
+
+
+def hydrate_vulns(ids: Iterable[str], *, progress_enabled: bool = True, max_ids: int = 300) -> None:
+    """Fetch vulnerability records for ``ids`` into ``_OSV_VULN_CACHE``.
+
+    IDs are stringified and de-duplicated; IDs already cached are skipped.
+    Collection of IDs to fetch stops once ``max_ids`` have been gathered.
+    Fetching is best-effort: an ID whose request fails is skipped and the
+    loop moves on to the next one.
+    """
+    pending: List[str] = []
+    visited = set()
+    for raw_id in ids:
+        key = str(raw_id)
+        if key in visited:
+            continue
+        visited.add(key)
+        if key not in _OSV_VULN_CACHE:
+            pending.append(key)
+            if len(pending) >= max_ids:
+                break
+
+    if progress_enabled:
+        iterator = progress(pending, total=len(pending), desc="OSV: загрузка деталей")
+    else:
+        iterator = iter(pending)
+
+    for key in iterator:
+        try:
+            record = _http_get_json(OSV_VULN_URL_TEMPLATE.format(id=key))
+        except Exception:
+            # Deliberately best-effort: a failed lookup leaves this ID uncached.
+            continue
+        if isinstance(record, dict):
+            _OSV_VULN_CACHE[key] = record
+
+
+# A bare version: optional leading "v", one to four numeric components,
+# optional "-..." / "+..." suffix.
+_PLAIN_VERSION_RE = re.compile(r"^v?\d+(\.\d+){0,3}([\-\+].+)?$")
+
+# Characters that may appear in a single PyPI version; rules out wildcards
+# ("*"), compound specifiers (","), environment markers (";") and whitespace.
+_PYPI_VERSION_CHARS_RE = re.compile(r"^[0-9A-Za-z][0-9A-Za-z.!+_-]*$")
+
+
+def _is_exact_version_for_osv(internal_eco: str, spec: Optional[str], scope: Optional[str]) -> Optional[str]:
+    """Return the exact version pinned by ``spec``, or ``None`` if it is not a pin.
+
+    Args:
+        internal_eco: Internal ecosystem name (compared case-insensitively).
+        spec: Version specifier as written in the manifest or lock file.
+        scope: Dependency scope; "lock" and, for Go, "require" are taken
+            as already exact.
+
+    Returns:
+        The version string to query, or ``None`` when ``spec`` is empty,
+        is a range/wildcard/placeholder, or the ecosystem is not handled.
+    """
+    if not spec:
+        return None
+
+    s = str(spec).strip()
+    sc = (scope or "").lower()
+    eco = internal_eco.lower()
+
+    # Lock-file entries are exact; only a leading "=" prefix is dropped.
+    if sc == "lock":
+        return s.lstrip("=") or None
+
+    if eco == "go" and sc == "require":
+        return s or None
+
+    if eco == "pypi":
+        if not s.startswith("=="):
+            return None
+        # "==" and "===" both pin one version. What follows must be a single
+        # version token: "==1.0.*" and "==1.0,!=1.0.1" are not exact pins.
+        ver = s.lstrip("=").strip()
+        return ver if _PYPI_VERSION_CHARS_RE.match(ver) else None
+
+    if eco == "npm":
+        if s.startswith(("^", "~", ">", "<", "*")):
+            return None
+        return s.lstrip("v") if _PLAIN_VERSION_RE.match(s) else None
+
+    if eco in {"maven", "gradle", "nuget", "cargo"}:
+        # Property placeholders, interval notation and range operators are not pins.
+        if "${" in s or s.startswith(("(", "[", "{")) or any(op in s for op in (">", "<", "*", ",")):
+            return None
+        return s.lstrip("v") if _PLAIN_VERSION_RE.match(s) else None
+
+    return None
+
+
+def _http_post_json(url: str, payload: Dict[str, Any], timeout: int = 30) -> Dict[str, Any]:
+    """POST ``payload`` as JSON to ``url`` and return the parsed JSON reply.
+
+    The reply is decoded as UTF-8 with undecodable bytes dropped. Errors
+    from ``urllib`` and from JSON parsing propagate to the caller.
+    """
+    json_headers = {"Content-Type": "application/json", "Accept": "application/json"}
+    encoded = json.dumps(payload).encode("utf-8")
+    request = urllib.request.Request(url=url, data=encoded, headers=json_headers, method="POST")
+    with urllib.request.urlopen(request, timeout=timeout) as response:
+        raw = response.read()
+    text = raw.decode("utf-8", errors="ignore")
+    return json.loads(text)
+
+
+def _merge_osv_page(
+    page_results: Any,
+    index_map: Iterable[int],
+    acc: List[Dict[str, Any]],
+) -> Dict[int, str]:
+    """Fold one querybatch response page into ``acc``.
+
+    ``index_map`` gives, for each result position, the index of the
+    original query it answers. Returns the continuation tokens found,
+    keyed by original query index.
+    """
+    tokens: Dict[int, str] = {}
+    if not isinstance(page_results, list):
+        return tokens
+    # zip() bounds the walk, so surplus results cannot index past the queries sent.
+    for orig_idx, result in zip(index_map, page_results):
+        if not isinstance(result, dict):
+            continue
+        vulns = result.get("vulns") or []
+        if isinstance(vulns, list):
+            acc[orig_idx]["vulns"].extend(vulns)
+        tok = result.get("next_page_token")
+        if tok:
+            tokens[orig_idx] = tok
+    return tokens
+
+
+def querybatch_paginated(queries: List[Dict[str, Any]], max_pages: int = 5) -> List[Dict[str, Any]]:
+    """Run an OSV querybatch request and follow per-query pagination.
+
+    Args:
+        queries: Query objects, sent as the ``queries`` field of the request.
+        max_pages: Maximum number of requests issued, including the first.
+
+    Returns:
+        One ``{"vulns": [...]}`` dict per input query, in input order. An
+        entry also carries ``"truncated": True`` when further pages were
+        still pending for it when paging stopped, or when a follow-up
+        response did not include a result for it.
+
+    Raises:
+        Whatever ``_http_post_json`` raises (network and JSON errors).
+    """
+    if not queries:
+        return []
+
+    acc: List[Dict[str, Any]] = [{"vulns": []} for _ in queries]
+
+    data = _http_post_json(OSV_QUERYBATCH_URL, {"queries": queries})
+    results = data.get("results", []) if isinstance(data, dict) else None
+    next_tokens = _merge_osv_page(results, range(len(queries)), acc)
+
+    pages = 1
+    while next_tokens and pages < max_pages:
+        # Re-send only the queries that still have a continuation token.
+        index_map = list(next_tokens)
+        page_queries = [
+            {**queries[orig_idx], "page_token": next_tokens[orig_idx]}
+            for orig_idx in index_map
+        ]
+
+        data = _http_post_json(OSV_QUERYBATCH_URL, {"queries": page_queries})
+        page_results = data.get("results", []) if isinstance(data, dict) else None
+
+        # A pending query with no result in this response is incomplete, not finished.
+        answered = len(page_results) if isinstance(page_results, list) else 0
+        for orig_idx in index_map[answered:]:
+            acc[orig_idx]["truncated"] = True
+
+        next_tokens = _merge_osv_page(page_results, index_map, acc)
+        pages += 1
+
+    # Tokens left over mean the page budget ran out first.
+    for orig_idx in next_tokens:
+        acc[orig_idx]["truncated"] = True
+
+    return acc
+
+
+def _pinned_osv_deps(deps: List[Dict[str, Any]]) -> List[Dict[str, str]]:
+    """Return the unique exactly-pinned dependencies in OSV query form.
+
+    Entries without an ecosystem or name, with an ecosystem missing from
+    ``OSV_ECOSYSTEM_MAP``, or without an exact version are skipped.
+    First-seen order is kept.
+    """
+    unique: Dict[Tuple[str, str, str], Dict[str, str]] = {}
+    for d in dedupe_effective(deps):
+        internal_eco = (d.get("ecosystem") or "").lower()
+        name = d.get("name")
+        if not internal_eco or not name:
+            continue
+
+        osv_eco = OSV_ECOSYSTEM_MAP.get(internal_eco)
+        if not osv_eco:
+            continue
+
+        ver = _is_exact_version_for_osv(internal_eco, d.get("spec"), d.get("scope"))
+        if not ver:
+            continue
+
+        key = (osv_eco, str(name), str(ver))
+        unique.setdefault(key, {"ecosystem": key[0], "name": key[1], "version": key[2]})
+    return list(unique.values())
+
+
+def _fetch_osv_ids(to_query: List[Tuple[str, str, str]], chunk_size: int, progress_enabled: bool) -> None:
+    """Query OSV for every key in ``to_query`` and store the IDs in ``_OSV_RESULT_CACHE``.
+
+    Keys are sent in chunks of ``chunk_size``; each key is cached with its
+    vulnerability IDs, de-duplicated in first-seen order.
+    """
+    total_chunks = (len(to_query) + chunk_size - 1) // chunk_size
+    chunk_starts = range(0, len(to_query), chunk_size)
+    if progress_enabled:
+        chunk_it = progress(chunk_starts, total=total_chunks, desc="OSV: querybatch")
+    else:
+        chunk_it = iter(chunk_starts)
+
+    for start in chunk_it:
+        batch = to_query[start:start + chunk_size]
+        queries = [
+            {"package": {"ecosystem": eco, "name": name}, "version": ver}
+            for (eco, name, ver) in batch
+        ]
+
+        for key, r in zip(batch, querybatch_paginated(queries)):
+            ids: List[str] = []
+            for v in (r or {}).get("vulns", []) or []:
+                vid = (v or {}).get("id")
+                if vid:
+                    ids.append(str(vid))
+            # dict.fromkeys drops duplicate IDs while keeping their order.
+            _OSV_RESULT_CACHE[key] = list(dict.fromkeys(ids))
+
+
+def annotate_containers_with_osv(
+    containers: List[Dict[str, Any]],
+    *,
+    chunk_size: int = 250,
+    hydrate_details: bool = True,
+    max_hydrate_ids: int = 300,
+    min_severity: str = "MEDIUM",
+    include_unknown: bool = True,
+    progress_enabled: bool = True,
+) -> None:
+    """Annotate each container dict in place with OSV vulnerability data.
+
+    Every container receives:
+      * ``osv_pinned_deps``: unique exactly-pinned dependencies queried;
+      * ``osv_vuln_count``: number of distinct vulnerability IDs kept;
+      * ``osv_vuln_counts_by_severity``: severity label -> count;
+      * ``osv_affected_deps``: ``(ecosystem, name, version, kept_count,
+        max_severity)`` tuples, most vulnerabilities first;
+      * ``osv_vulns_by_dep``: ``"eco:name@version"`` -> kept IDs.
+    If the OSV query fails, ``osv_errors`` holds the message and the
+    result fields are reset to empty values.
+
+    Args:
+        containers: Dicts with an optional ``dependencies`` list.
+        chunk_size: Number of packages per querybatch request.
+        hydrate_details: Fetch vulnerability records so severities can be
+            computed; when False every vulnerability is "UNKNOWN".
+        max_hydrate_ids: Cap passed to ``hydrate_vulns``.
+        min_severity: Lowest severity label to keep.
+        include_unknown: Also keep vulnerabilities of unknown severity.
+        progress_enabled: Wrap the long-running loops with ``progress``.
+    """
+    to_query: List[Tuple[str, str, str]] = []
+    queued = set()  # set mirror of to_query for O(1) membership tests
+    for c in containers:
+        pinned = _pinned_osv_deps(c.get("dependencies") or [])
+        c["osv_pinned_deps"] = pinned
+        for p in pinned:
+            k = (p["ecosystem"], p["name"], p["version"])
+            if k not in _OSV_RESULT_CACHE and k not in queued:
+                queued.add(k)
+                to_query.append(k)
+
+    # Only the network step is skipped when every key is already cached;
+    # aggregation below must still run so cached findings are reported.
+    if to_query:
+        try:
+            _fetch_osv_ids(to_query, chunk_size, progress_enabled)
+        except (urllib.error.URLError, urllib.error.HTTPError, TimeoutError, ValueError) as e:
+            for c in containers:
+                c["osv_errors"] = f"OSV query failed: {type(e).__name__}: {e}"
+                c["osv_vuln_count"] = 0
+                c["osv_vuln_counts_by_severity"] = {}
+                c["osv_affected_deps"] = []
+                c["osv_vulns_by_dep"] = {}
+            return
+
+    if hydrate_details:
+        # Hydrate only IDs relevant to these containers, so entries cached by
+        # earlier calls cannot use up the max_hydrate_ids budget.
+        wanted: List[str] = []
+        for c in containers:
+            for p in c["osv_pinned_deps"]:
+                wanted.extend(_OSV_RESULT_CACHE.get((p["ecosystem"], p["name"], p["version"]), []))
+        if wanted:
+            hydrate_vulns(wanted, progress_enabled=progress_enabled, max_ids=max_hydrate_ids)
+
+    threshold = _sev_rank(min_severity)
+
+    for c in containers:
+        vulns_by_dep: Dict[str, List[str]] = {}
+        affected: List[Tuple[str, str, str, int, str]] = []
+        unique_ids = set()
+        counts: Dict[str, int] = defaultdict(int)
+
+        for p in c.get("osv_pinned_deps") or []:
+            eco, name, ver = p["ecosystem"], p["name"], p["version"]
+
+            kept: List[str] = []
+            max_dep_sev = "UNKNOWN"
+
+            for vid in _OSV_RESULT_CACHE.get((eco, name, ver), []):
+                sev = "UNKNOWN"
+                if hydrate_details and vid in _OSV_VULN_CACHE:
+                    sev = _compute_vuln_severity(_OSV_VULN_CACHE[vid])
+                if _sev_rank(sev) >= threshold or (sev == "UNKNOWN" and include_unknown):
+                    kept.append(vid)
+                    unique_ids.add(vid)
+                    # Counted once per (dependency, vulnerability) pair, so these
+                    # totals can exceed osv_vuln_count, which counts distinct IDs.
+                    counts[sev] += 1
+                    if _sev_rank(sev) > _sev_rank(max_dep_sev):
+                        max_dep_sev = sev
+
+            if kept:
+                vulns_by_dep[f"{eco}:{name}@{ver}"] = kept
+                affected.append((eco, name, ver, len(kept), max_dep_sev))
+
+        # Most vulnerabilities first; ties broken by highest severity.
+        affected.sort(key=lambda x: (x[3], _sev_rank(x[4])), reverse=True)
+        c["osv_vuln_count"] = len(unique_ids)
+        c["osv_vuln_counts_by_severity"] = dict(counts)
+        c["osv_affected_deps"] = affected
+        c["osv_vulns_by_dep"] = vulns_by_dep