#!/usr/bin/env python3
# tools/stackusage.py
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.  The
# ASF licenses this file to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance with the
# License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
# License for the specific language governing permissions and limitations
# under the License.
#

"""Static stack usage analyzer using DWARF debug info and disassembly.

Extracts per-function stack frame sizes from DWARF .debug_frame CFA offsets
(via readelf -wF) and builds call graphs from objdump -d disassembly.
Reports worst-case total stack depth through the call chain, flagging
uncertain cases such as recursion and indirect calls (function pointers).

Works with both GNU and LLVM toolchains (readelf/objdump/addr2line).
No Python package dependencies required.
"""

import argparse
import os
import re
import subprocess
import sys

# Call mnemonics across all supported architectures.
# Direct vs indirect is determined by whether objdump annotates
# the target with <function_name>.
CALL_MNEMONICS = {
    "bl",
    "blx",
    "blr",  # ARM / AArch64
    "call",
    "callq",  # x86 / x86_64 / SPARC
    "jal",
    "jalr",  # MIPS / RISC-V
    "bctrl",  # PowerPC
    "jmpl",  # SPARC
    "bsr",
    "jsr",  # SuperH
    "call0",
    "call4",
    "call8",
    "call12",  # Xtensa
    "callx0",
    "callx4",
    "callx8",
    "callx12",  # Xtensa indirect
    "calli",  # TriCore
}

# Function names that indicate dynamic stack allocation.
ALLOCA_NAMES = frozenset(
    {
        "alloca",
        "__alloca",
        "__builtin_alloca",
        "__builtin_alloca_with_align",
    }
)


def build_symbol_map(prefix, elf):
    """Build address<->name maps by parsing ``readelf -sW`` output.

    Returns (addr_to_name, name_to_addr, func_sizes, is_thumb, machine).
    """

    out = subprocess.run(
        [prefix + "readelf", "-hsW", elf],
        capture_output=True,
        text=True,
    )
    if out.returncode != 0:
        return {}, {}, {}, False, ""

    addr_to_name = {}
    name_to_addr = {}
    func_sizes = {}
    is_thumb = False
    machine = ""

    for line in out.stdout.splitlines():
        # Detect machine type from readelf header
        if "Machine:" in line:
            machine = line.split(":", 1)[1].strip()
            if "ARM" in machine and "AArch64" not in machine:
                is_thumb = True
            continue

        # Symbol table line format (readelf -sW):
        #   Num:    Value  Size Type    Bind   Vis      Ndx Name
        #     1: 08000100    64 FUNC    GLOBAL DEFAULT    1 main
        parts = line.split()
        if len(parts) < 8:
            continue
        if parts[3] != "FUNC":
            continue
        if parts[6] == "UND":
            continue

        try:
            addr = int(parts[1], 16)
            size = int(parts[2])
        except ValueError:
            continue

        name = parts[7]
        if name.startswith("$"):
            continue

        if is_thumb:
            addr = addr & ~1

        if addr in addr_to_name and len(addr_to_name[addr]) >= len(name):
            continue

        addr_to_name[addr] = name
        name_to_addr[name] = addr
        func_sizes[addr] = size

    return addr_to_name, name_to_addr, func_sizes, is_thumb, machine


def parse_stack_sizes(prefix, elf, addr_to_name, name_to_addr, func_sizes):
    """Parse ``readelf -wF`` output for per-function CFA stack sizes.

    Returns funcs dict: name -> {file, line, self, code_size, qualifier,
    reasons}.
    """

    out = subprocess.run(
        [prefix + "readelf", "-wF", elf],
        capture_output=True,
        text=True,
    )

    funcs = {}
    current_addr = None
    max_offset = 0

    for line in out.stdout.splitlines():
        # FDE header: look for pc=ADDR..ADDR
        m = re.search(r"pc=([0-9a-f]+)\.\.", line, re.IGNORECASE)
        if m:
            # Flush previous FDE
            if current_addr is not None:
                name = addr_to_name.get(current_addr)
                if name and name not in funcs:
                    code_size = func_sizes.get(current_addr, 0)
                    if not code_size:
                        sym_addr = name_to_addr.get(name)
                        if sym_addr is not None:
                            code_size = func_sizes.get(sym_addr, 0)
                    funcs[name] = {
                        "file": "?",
                        "line": "?",
                        "self": max_offset,
                        "code_size": code_size,
                        "qualifier": "static",
                        "reasons": [],
                    }
            current_addr = int(m.group(1), 16)
            # Also try with Thumb bit cleared
            if current_addr not in addr_to_name:
                alt = current_addr & ~1
                if alt in addr_to_name:
                    current_addr = alt
            max_offset = 0
            continue

        # CFA row: extract offset from e.g. "sp+16" or "r13+24"
        m = re.search(r"[a-zA-Z]\w*\+(\d+)", line)
        if m and current_addr is not None:
            val = int(m.group(1))
            if val > max_offset:
                max_offset = val

    # Flush last FDE
    if current_addr is not None:
        name = addr_to_name.get(current_addr)
        if name and name not in funcs:
            code_size = func_sizes.get(current_addr, 0)
            if not code_size:
                sym_addr = name_to_addr.get(name)
                if sym_addr is not None:
                    code_size = func_sizes.get(sym_addr, 0)
            funcs[name] = {
                "file": "?",
                "line": "?",
                "self": max_offset,
                "code_size": code_size,
                "qualifier": "static",
                "reasons": [],
            }

    return funcs


def resolve_sources(prefix, elf, funcs, name_to_addr):
    """Batch-resolve source file:line via ``addr2line -fe``."""

    addrs = []
    names = []
    for name in funcs:
        addr = name_to_addr.get(name)
        if addr is not None:
            addrs.append("%x" % addr)
            names.append(name)

    if not addrs:
        return

    out = subprocess.run(
        [prefix + "addr2line", "-fe", elf],
        capture_output=True,
        text=True,
        input="\n".join(addrs) + "\n",
    )

    lines = out.stdout.splitlines()
    # addr2line outputs two lines per address: function_name, file:line
    for i in range(0, min(len(lines), len(names) * 2), 2):
        idx = i // 2
        if idx >= len(names):
            break
        if i + 1 < len(lines):
            loc = lines[i + 1]
            if loc and loc != "??:0" and loc != "??:?":
                parts = loc.rsplit(":", 1)
                if len(parts) == 2:
                    funcs[names[idx]]["file"] = parts[0]
                    funcs[names[idx]]["line"] = parts[1]


def _sp_pattern(machine):
    """Return a compiled regex matching the stack pointer register.

    Architecture-specific SP register names:
      ARM:      sp, r13
      AArch64:  sp
      x86:      %esp, %rsp
      MIPS:     $sp, $29
      RISC-V:   sp, x2
      Xtensa:   a1
      PowerPC:  r1
      SPARC:    %sp, %o6
      SuperH:   r15
      TriCore:  a10, sp
    """

    m = machine.upper()
    if "ARM" in m and "AARCH64" not in m:
        return re.compile(r"\bsp\b|\br13\b", re.IGNORECASE)
    if "AARCH64" in m:
        return re.compile(r"\bsp\b", re.IGNORECASE)
    if "80386" in m or "X86" in m or "AMD" in m:
        return re.compile(r"%[er]sp\b", re.IGNORECASE)
    if "MIPS" in m:
        return re.compile(r"\$sp\b|\$29\b", re.IGNORECASE)
    if "RISC-V" in m:
        return re.compile(r"\bsp\b|\bx2\b", re.IGNORECASE)
    if "XTENSA" in m or "TENSILICA" in m:
        return re.compile(r"\ba1\b", re.IGNORECASE)
    if "POWER" in m or "PPC" in m:
        return re.compile(r"\br1\b", re.IGNORECASE)
    if "SPARC" in m:
        return re.compile(r"\bsp\b|%sp\b|%o6\b", re.IGNORECASE)
    if "SH" in m or "SUPER" in m or "RENESAS" in m:
        return re.compile(r"\br15\b", re.IGNORECASE)
    if "TRICORE" in m:
        return re.compile(r"\ba10\b|\bsp\b", re.IGNORECASE)
    # Fallback: common SP names
    return re.compile(r"\bsp\b", re.IGNORECASE)


def _is_dynamic_sp_sub(mnemonic, operand_str, sp_re):
    """Return True if instruction dynamically adjusts the stack pointer.

    Detects VLA / alloca patterns where a sub instruction adjusts SP
    by a register operand (not an immediate constant).
    """

    if not mnemonic.startswith("sub"):
        return False
    ops = operand_str.lower()
    if not sp_re.search(ops):
        return False
    # ARM/AArch64: immediates use # prefix
    if "#" in ops:
        return False
    # x86 AT&T syntax: $ means immediate
    if "%" in ops and "$" in ops:
        return False
    return True


def parse_call_graph(prefix, elf, machine):
    """Build call graph by parsing ``objdump -d`` output.

    Returns (graph, dynamic_funcs) where:
      graph: dict mapping caller -> set of (callee_name_or_None, is_indirect).
      dynamic_funcs: set of function names with dynamic stack (alloca/VLA).
    """

    sp_re = _sp_pattern(machine)
    graph = {}
    dynamic_funcs = set()
    current_func = None

    proc = subprocess.Popen(
        [prefix + "objdump", "-d", elf],
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,
        text=True,
    )

    for line in proc.stdout:
        # Function boundary: "0000abcd <func_name>:"
        m = re.match(r"^[0-9a-f]+ <(.+)>:\s*$", line)
        if m:
            current_func = m.group(1)
            graph.setdefault(current_func, set())
            continue

        if current_func is None:
            continue

        # Instruction line: "   addr:  bytes  mnemonic  operands"
        parts = line.split("\t")
        if len(parts) < 3:
            continue

        mnemonic = parts[2].strip().split()[0].lower()

        # Extract operand string (used by both call and dynamic-SP checks)
        operands = parts[2].strip().split(None, 1)
        operand_str = operands[1] if len(operands) > 1 else ""
        if len(parts) > 3:
            operand_str += " " + parts[3]

        # Check for call instructions
        if mnemonic in CALL_MNEMONICS:
            target_m = re.search(r"<([^>+]+)>", operand_str)
            if target_m:
                callee = target_m.group(1)
                if callee in ALLOCA_NAMES:
                    dynamic_funcs.add(current_func)
                else:
                    graph[current_func].add((callee, False))
            else:
                graph[current_func].add((None, True))
            continue

        # Check for dynamic stack pointer adjustment (VLA / inlined alloca)
        if _is_dynamic_sp_sub(mnemonic, operand_str, sp_re):
            dynamic_funcs.add(current_func)

    proc.wait()
    return graph, dynamic_funcs


def compute_worst_stack(funcs, graph, recursion_depth):
    """Compute worst-case total stack for every function via memoized DFS.

    Args:
        funcs: dict from parse_stack_sizes
        graph: dict from parse_call_graph
        recursion_depth: how many times a recursive cycle body is counted
                         (0 = back-edges contribute nothing)

    Returns dict mapping function name to result dict with keys:
        self, total, uncertain, reasons, stack, self_reasons
    """

    cache = {}

    def dfs(func, path):
        if func in cache:
            return cache[func]

        info = funcs.get(func)
        self_size = info["self"] if info else 0
        self_reasons = set(info["reasons"]) if info else {"no DWARF data"}
        reasons = set(self_reasons)
        uncertain = bool(reasons)

        callees = graph.get(func, set())
        worst_callee = 0
        best_substack = []

        for callee, is_indirect in callees:
            if is_indirect:
                uncertain = True
                reasons.add("indirect call (function pointer)")
                self_reasons.add("indirect call (function pointer)")
                continue

            cur_path = path + [func]
            if callee in cur_path:
                # Recursion detected
                uncertain = True
                idx = cur_path.index(callee)
                cycle = cur_path[idx:] + [callee]
                reason = "recursion: %s" % "->".join(cycle)
                reasons.add(reason)
                self_reasons.add(reason)
                if recursion_depth > 0:
                    cycle_nodes = cycle[:-1]
                    cycle_cost = sum(
                        funcs[c]["self"] if c in funcs else 0 for c in cycle_nodes
                    )
                    cycle_total = cycle_cost * recursion_depth
                    if cycle_total > worst_callee:
                        worst_callee = cycle_total
                        est_frames = []
                        for _ in range(recursion_depth):
                            for node in cycle_nodes:
                                est_frames.append((node, "recursive estimate"))
                        best_substack = est_frames
                continue

            callee_total, callee_unc, callee_reasons, callee_stack, _ = dfs(
                callee, cur_path
            )
            if callee_unc:
                uncertain = True
                reasons.update(callee_reasons)
            if callee_total > worst_callee:
                worst_callee = callee_total
                best_substack = callee_stack

        total = self_size + worst_callee
        result = (
            total,
            uncertain,
            reasons,
            [(func, None)] + best_substack,
            self_reasons,
        )
        cache[func] = result
        return result

    results = {}
    all_funcs = set(funcs.keys()) | set(graph.keys())
    for func in all_funcs:
        total, uncertain, reasons, stack, self_reasons = dfs(func, [])
        # Skip symbols with no DWARF data and zero stack
        if func not in funcs and total == 0:
            continue
        results[func] = {
            "self": funcs[func]["self"] if func in funcs else 0,
            "total": total,
            "uncertain": uncertain,
            "reasons": list(reasons),
            "stack": stack,
            "self_reasons": list(self_reasons),
        }

    return results


def reason_marker(reasons):
    """Return a single-char marker for uncertainty.

    Marker priority (highest first):
      ? = no DWARF data
      * = dynamic stack (alloca/VLA)
      @ = recursion
      ^ = indirect call (function pointer)
    """

    for reason in reasons:
        if reason == "no DWARF data":
            return "?"
    for reason in reasons:
        if reason.startswith("dynamic stack"):
            return "*"
    for reason in reasons:
        if reason.startswith("recursion:"):
            return "@"
    for reason in reasons:
        if reason == "indirect call (function pointer)":
            return "^"
    return ""


def frame_info(funcs, results, frame):
    """Return (size_str, func_name, file:line) for one stack frame.

    size_str is prefixed with a marker indicating the function's own
    uncertainty type (see reason_marker).  func_name includes the code
    size in parentheses when available, e.g. ``main(128)``.
    """

    func, _note = frame
    info = funcs.get(func)
    res = results.get(func)
    if info:
        self_reasons = res["self_reasons"] if res else info["reasons"]
        prefix = reason_marker(self_reasons)
        code_size = info.get("code_size", 0)
        name = "%s(%d)" % (func, code_size) if code_size else func
        return (
            "%s%d" % (prefix, info["self"]),
            name,
            "%s:%s" % (info["file"], info["line"]),
        )
    return ("?", func, "?:?")


def format_text(results, funcs, rank):
    """Format results as aligned table with deepest call stack frames."""

    # Compute common path prefix to shorten File:Line column
    paths = [
        f["file"]
        for f in funcs.values()
        if f["file"] not in ("?", "") and f["file"].startswith("/")
    ]
    if paths:
        prefix = os.path.commonpath(paths)
        if not prefix.endswith("/"):
            prefix += "/"
    else:
        prefix = ""

    maxname = 40
    items = sorted(results.items(), key=lambda x: x[1]["total"], reverse=True)
    if rank > 0:
        items = items[:rank]

    hdr = ("Total", "Self", "Backtrace", "File:Line")
    w = list(map(len, hdr))

    rows = []
    for func, info in items:
        frames = [frame_info(funcs, results, f) for f in info["stack"]]
        # Strip common prefix from file paths
        if prefix:
            plen = len(prefix)
            frames = [
                (
                    sz,
                    fn,
                    loc[plen:] if loc.startswith(prefix) else loc,
                )
                for sz, fn, loc in frames
            ]
        # Derive Total marker from the displayed stack frames' own reasons,
        # not from all propagated reasons across every branch.
        stack_reasons = []
        for f, _note in info["stack"]:
            res = results.get(f)
            if res:
                stack_reasons.extend(res["self_reasons"])
            elif f not in funcs:
                stack_reasons.append("no DWARF data")
        marker = reason_marker(stack_reasons) if info["uncertain"] else ""
        total_str = "%s%d" % (marker, info["total"])
        w[0] = max(w[0], len(total_str))
        for sz, fn, loc in frames:
            w[1] = max(w[1], len(sz))
            w[2] = max(w[2], min(len(fn), maxname))
            w[3] = max(w[3], len(loc))
        rows.append((total_str, frames))

    def trunc(s):
        return s[: maxname - 3] + "..." if len(s) > maxname else s

    fmt = "%%%ds  %%%ds  %%-%ds  %%s" % (w[0], w[1], w[2])
    lines = [fmt % hdr]
    lines.append(fmt % ("-" * w[0], "-" * w[1], "-" * w[2], "-" * w[3]))

    for total_str, frames in rows:
        for i, (sz, fn, loc) in enumerate(frames):
            if i == 0:
                lines.append(fmt % (total_str, sz, trunc(fn + "~"), loc))
            else:
                lines.append(fmt % ("", sz, trunc(fn), loc))
        if not frames:
            lines.append(fmt % (total_str, "", "", ""))

    return "\n".join(lines)


def main():
    parser = argparse.ArgumentParser(
        description="Analyze per-function stack usage from DWARF debug info, "
        "combined with disassembly-based call graph analysis to compute "
        "worst-case total (self + callees) stack depth.",
    )
    parser.add_argument(
        "elf",
        help="path to ELF file with DWARF debug info",
    )
    parser.add_argument(
        "-p",
        "--prefix",
        default="",
        help="toolchain prefix (e.g. arm-none-eabi- or llvm-)",
    )
    parser.add_argument(
        "-n",
        "--rank",
        type=int,
        default=0,
        help="show top N functions (default: 0 = all)",
    )
    parser.add_argument(
        "-r",
        "--recursion-depth",
        type=int,
        default=0,
        help="assumed recursion depth for stack estimation (default: 0). "
        "When > 0, recursive cycle cost is multiplied by this value.",
    )
    args = parser.parse_args()

    if not os.path.isfile(args.elf):
        print("ELF file not found: %s" % args.elf, file=sys.stderr)
        sys.exit(1)

    # Phase 1: Symbol table
    addr_to_name, name_to_addr, func_sizes, is_thumb, machine = build_symbol_map(
        args.prefix,
        args.elf,
    )
    if not addr_to_name:
        print("No function symbols found in ELF", file=sys.stderr)
        sys.exit(1)

    # Phase 2: Stack sizes from DWARF .debug_frame
    funcs = parse_stack_sizes(
        args.prefix,
        args.elf,
        addr_to_name,
        name_to_addr,
        func_sizes,
    )
    if not funcs:
        print(
            "No stack size data found in DWARF .debug_frame",
            file=sys.stderr,
        )
        sys.exit(1)

    # Phase 3: Source locations via addr2line
    resolve_sources(args.prefix, args.elf, funcs, name_to_addr)

    # Phase 4: Call graph via objdump
    graph, dynamic_funcs = parse_call_graph(args.prefix, args.elf, machine)

    # Mark functions with dynamic stack allocation (alloca/VLA)
    for name in dynamic_funcs:
        if name in funcs:
            funcs[name]["reasons"].append("dynamic stack (alloca/VLA)")

    results = compute_worst_stack(funcs, graph, args.recursion_depth)

    print(format_text(results, funcs, args.rank))


if __name__ == "__main__":
    main()