PyTorch-Plugin-FL/
├── include/ # Public headers
│   ├── flagos.h # Unified runtime API (memory, stream, device)
│   └── macros.h # Common macros
├── accelerator/ # Hardware abstraction layer
│   ├── csrc/cuda/ # CUDA runtime implementation
│   ├── csrc/maca/ # MACA cudart shim (symbol version compatibility)
│   └── csrc/ascend/ # Ascend runtime (ACL-based memory, stream, device)
├── csrc/
│   ├── aten/ # ATen operator layer
│   │   ├── common.{h,cc} # Backend config loading, FlagosDevice enum
│   │   ├── dispatch_stub.h # Lightweight dispatch stub (replaces PyTorch DispatchStub)
│   │   ├── device_boxing.h # Zero-copy flagos↔CUDA tensor metadata conversion
│   │   ├── register.cc # PrivateUse1 dispatch key registration
│   │   ├── {op}.{h,cc} # Per-operator stub definitions (add, mm, silu, etc.)
│   │   ├── factory_ops/ # Basic operators (empty, copy, contiguous, set, fallback)
│   │   ├── functional_ops/ # Compute operators (mm, bmm, cat, embedding, softmax, etc.)
│   │   └── backends/ # Backend-specific kernel implementations
│   │       ├── cuda/ # CUDA kernels (cuBLAS, modified PyTorch kernels)
│   │       ├── flagos/ # FlagGems C++ native API wrappers
│   │       └── ascend/ # Ascend kernels (ACL NN API)
│   └── runtime/ # Device runtime
│       ├── device_allocator # Device memory allocator
│       ├── host_allocator # Pinned memory allocator
│       ├── guard # DeviceGuard implementation
│       ├── generator # RNG generator
│       ├── hooks # Runtime hooks
│       └── accelerator/ # Hardware abstraction layer
│           ├── cuda/ # CUDA runtime implementation
│           ├── maca/ # MACA cudart shim (symbol version compatibility)
│           └── ascend/ # Ascend runtime (ACL-based memory, stream, device)
├── torch_fl/
│   ├── __init__.py # Plugin entry point: register device, load FlagGems operators
│   ├── flagos/ # Python device module (stream, event, RNG, AMP)
│   ├── accelerator/ # Python accelerator module (MACA shim loader)
│   ├── backends.conf # Default backend routing config (CUDA/FlagGems)
│   ├── backends_ascend.conf # Ascend backend routing config (all ops → ascend)
│   ├── distributed.py # Distributed training support (DDP patch)
│   ├── integration.py # FlagGems operator registration logic
│   ├── csrc/ # C extension (module.cc, stub.c)
│   └── lib/ # Compiled shared libraries (libtorch_fl.so, libflagos.so)
├── tests/
│   ├── integration/ # Automated integration tests
│   │   ├── ops/ # Per-operator dispatch tests
│   │   ├── test_qwen3_*.py # End-to-end model tests
│   │   └── conftest.py # Pytest configuration
│   ├── manual/ # Manual test scripts
│   └── common/ # Test utilities
├── debug/ # Development notes and debug scripts
├── cmake/ # CMake modules
├── setup.py # CMake build entry point
└── pyproject.toml