From d70e7ea0d1e556037ef5b6874734cf51396b033b Mon Sep 17 00:00:00 2001
From: Jon Chesterfield
Date: Wed, 22 Mar 2023 01:23:19 +0000
Subject: [PATCH] [amdgpu][nfc] Extract more functions in LowerModuleLDS, mark
 more methods static

---
 .../AMDGPU/AMDGPULowerModuleLDSPass.cpp | 348 ++++++++++--------
 1 file changed, 193 insertions(+), 155 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index 455d76b0cecd..274597cb5607 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -631,7 +631,7 @@ public:
     return tableKernelIndexCache[F];
   }
 
-  std::vector<Function *> assignLDSKernelIDToEachKernel(
+  static std::vector<Function *> assignLDSKernelIDToEachKernel(
       Module *M, DenseSet<Function *> const &KernelsThatAllocateTableLDS) {
     // Associate kernels in the set with an arbirary but reproducible order and
     // annotate them with that order in metadata. This metadata is recognised by
@@ -680,168 +680,157 @@ public:
     return OrderedKernels;
   }
 
-  bool runOnModule(Module &M) override {
-    LLVMContext &Ctx = M.getContext();
-    CallGraph CG = CallGraph(M);
-    bool Changed = superAlignLDSGlobals(M);
+  static void partitionVariablesIntoIndirectStrategies(
+      Module &M, LDSUsesInfoTy const &LDSUsesInfo,
+      VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly,
+      DenseSet<GlobalVariable *> &ModuleScopeVariables,
+      DenseSet<GlobalVariable *> &TableLookupVariables,
+      DenseSet<GlobalVariable *> &KernelAccessVariables) {
 
-    Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M);
+    GlobalVariable *HybridModuleRoot =
+        LoweringKindLoc != LoweringKind::hybrid
+            ? nullptr
+            : chooseBestVariableForModuleStrategy(
+                  M.getDataLayout(), LDSToKernelsThatNeedToAccessItIndirectly);
 
-    Changed = true; // todo: narrow this down
+    DenseSet<Function *> const EmptySet;
+    DenseSet<Function *> const &HybridModuleRootKernels =
+        HybridModuleRoot
+            ? LDSToKernelsThatNeedToAccessItIndirectly[HybridModuleRoot]
+            : EmptySet;
 
-    // For each kernel, what variables does it access directly or through
-    // callees
-    LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M);
+    for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
+      // Each iteration of this loop assigns exactly one global variable to
+      // exactly one of the implementation strategies.
 
-    // For each variable accessed through callees, which kernels access it
-    VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly;
-    for (auto &K : LDSUsesInfo.indirect_access) {
-      Function *F = K.first;
-      assert(isKernelLDS(F));
-      for (GlobalVariable *GV : K.second) {
-        LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F);
-      }
-    }
+      GlobalVariable *GV = K.first;
+      assert(AMDGPU::isLDSVariableToLower(*GV));
+      assert(K.second.size() != 0);
 
-    // Partition variables accessed indirectly into the different strategies
-    DenseSet<GlobalVariable *> ModuleScopeVariables;
-    DenseSet<GlobalVariable *> TableLookupVariables;
-    DenseSet<GlobalVariable *> KernelAccessVariables;
+      switch (LoweringKindLoc) {
+      case LoweringKind::module:
+        ModuleScopeVariables.insert(GV);
+        break;
 
-    {
-      GlobalVariable *HybridModuleRoot =
-          LoweringKindLoc != LoweringKind::hybrid
-              ? nullptr
-              : chooseBestVariableForModuleStrategy(
-                    M.getDataLayout(),
-                    LDSToKernelsThatNeedToAccessItIndirectly);
-
-      DenseSet<Function *> const EmptySet;
-      DenseSet<Function *> const &HybridModuleRootKernels =
-          HybridModuleRoot
-              ? LDSToKernelsThatNeedToAccessItIndirectly[HybridModuleRoot]
-              : EmptySet;
-
-      for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
-        // Each iteration of this loop assigns exactly one global variable to
-        // exactly one of the implementation strategies.
-
-        GlobalVariable *GV = K.first;
-        assert(AMDGPU::isLDSVariableToLower(*GV));
-        assert(K.second.size() != 0);
-
-        switch (LoweringKindLoc) {
-        case LoweringKind::module:
-          ModuleScopeVariables.insert(GV);
-          break;
-
-        case LoweringKind::table:
-          TableLookupVariables.insert(GV);
-          break;
-
-        case LoweringKind::kernel:
-          if (K.second.size() == 1) {
-            KernelAccessVariables.insert(GV);
-          } else {
-            report_fatal_error(
-                "cannot lower LDS '" + GV->getName() +
-                "' to kernel access as it is reachable from multiple kernels");
-          }
-          break;
-
-        case LoweringKind::hybrid: {
-          if (GV == HybridModuleRoot) {
-            assert(K.second.size() != 1);
-            ModuleScopeVariables.insert(GV);
-          } else if (K.second.size() == 1) {
-            KernelAccessVariables.insert(GV);
-          } else if (set_is_subset(K.second, HybridModuleRootKernels)) {
-            ModuleScopeVariables.insert(GV);
-          } else {
-            TableLookupVariables.insert(GV);
-          }
-          break;
-        }
-        }
-      }
-
-      // All LDS variables accessed indirectly have now been partitioned into
-      // the distinct lowering strategies.
-      assert(ModuleScopeVariables.size() + TableLookupVariables.size() +
-                 KernelAccessVariables.size() ==
-             LDSToKernelsThatNeedToAccessItIndirectly.size());
-    }
-
-    // If the kernel accesses a variable that is going to be stored in the
-    // module instance through a call then that kernel needs to allocate the
-    // module instance
-    DenseSet<Function *> KernelsThatAllocateModuleLDS =
-        kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
-                                                        ModuleScopeVariables);
-    DenseSet<Function *> KernelsThatAllocateTableLDS =
-        kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
-                                                        TableLookupVariables);
-
-    GlobalVariable *MaybeModuleScopeStruct = nullptr;
-    if (!ModuleScopeVariables.empty()) {
-      LDSVariableReplacement ModuleScopeReplacement =
-          createLDSVariableReplacement(M, "llvm.amdgcn.module.lds",
-                                       ModuleScopeVariables);
-      MaybeModuleScopeStruct = ModuleScopeReplacement.SGV;
-      appendToCompilerUsed(M,
-                           {static_cast<GlobalValue *>(
-                               ConstantExpr::getPointerBitCastOrAddrSpaceCast(
-                                   cast<Constant>(ModuleScopeReplacement.SGV),
-                                   Type::getInt8PtrTy(Ctx)))});
-
-      // module.lds will be allocated at zero in any kernel that allocates it
-      recordLDSAbsoluteAddress(&M, ModuleScopeReplacement.SGV, 0);
-
-      // historic
-      removeLocalVarsFromUsedLists(M, ModuleScopeVariables);
-
-      // Replace all uses of module scope variable from non-kernel functions
-      replaceLDSVariablesWithStruct(
-          M, ModuleScopeVariables, ModuleScopeReplacement, [&](Use &U) {
-            Instruction *I = dyn_cast<Instruction>(U.getUser());
-            if (!I) {
-              return false;
-            }
-            Function *F = I->getFunction();
-            return !isKernelLDS(F);
-          });
-
-      // Replace uses of module scope variable from kernel functions that
-      // allocate the module scope variable, otherwise leave them unchanged
-      // Record on each kernel whether the module scope global is used by it
-
-      LLVMContext &Ctx = M.getContext();
-      IRBuilder<> Builder(Ctx);
-
-      for (Function &Func : M.functions()) {
-        if (Func.isDeclaration() || !isKernelLDS(&Func))
-          continue;
-
-        if (KernelsThatAllocateModuleLDS.contains(&Func)) {
-          replaceLDSVariablesWithStruct(
-              M, ModuleScopeVariables, ModuleScopeReplacement, [&](Use &U) {
-                Instruction *I = dyn_cast<Instruction>(U.getUser());
-                if (!I) {
-                  return false;
-                }
-                Function *F = I->getFunction();
-                return F == &Func;
-              });
-
-          markUsedByKernel(Builder, &Func, ModuleScopeReplacement.SGV);
+      case LoweringKind::table:
+        TableLookupVariables.insert(GV);
+        break;
 
+      case LoweringKind::kernel:
+        if (K.second.size() == 1) {
+          KernelAccessVariables.insert(GV);
         } else {
-          Func.addFnAttr("amdgpu-elide-module-lds");
+          report_fatal_error(
+              "cannot lower LDS '" + GV->getName() +
+              "' to kernel access as it is reachable from multiple kernels");
         }
+        break;
+
+      case LoweringKind::hybrid: {
+        if (GV == HybridModuleRoot) {
+          assert(K.second.size() != 1);
+          ModuleScopeVariables.insert(GV);
+        } else if (K.second.size() == 1) {
+          KernelAccessVariables.insert(GV);
+        } else if (set_is_subset(K.second, HybridModuleRootKernels)) {
+          ModuleScopeVariables.insert(GV);
+        } else {
+          TableLookupVariables.insert(GV);
+        }
+        break;
+      }
       }
     }
 
-    // Create a struct for each kernel for the non-module-scope variables
+    // All LDS variables accessed indirectly have now been partitioned into
+    // the distinct lowering strategies.
+    assert(ModuleScopeVariables.size() + TableLookupVariables.size() +
+               KernelAccessVariables.size() ==
+           LDSToKernelsThatNeedToAccessItIndirectly.size());
+  }
+
+  static GlobalVariable *lowerModuleScopeStructVariables(
+      Module &M, DenseSet<GlobalVariable *> const &ModuleScopeVariables,
+      DenseSet<Function *> const &KernelsThatAllocateModuleLDS) {
+    // Create a struct to hold the ModuleScopeVariables
+    // Replace all uses of those variables from non-kernel functions with the
+    // new struct instance
+    // Replace only the uses from kernel functions that will
+    // allocate this instance. That is a space optimisation - kernels that use a
+    // subset of the module scope struct and do not need to allocate it for
+    // indirect calls will only allocate the subset they use (they do so as part
+    // of the per-kernel lowering).
+    if (ModuleScopeVariables.empty()) {
+      return nullptr;
+    }
+
+    LLVMContext &Ctx = M.getContext();
+
+    LDSVariableReplacement ModuleScopeReplacement =
+        createLDSVariableReplacement(M, "llvm.amdgcn.module.lds",
+                                     ModuleScopeVariables);
+
+    appendToCompilerUsed(M, {static_cast<GlobalValue *>(
+                                ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+                                    cast<Constant>(ModuleScopeReplacement.SGV),
+                                    Type::getInt8PtrTy(Ctx)))});
+
+    // module.lds will be allocated at zero in any kernel that allocates it
+    recordLDSAbsoluteAddress(&M, ModuleScopeReplacement.SGV, 0);
+
+    // historic
+    removeLocalVarsFromUsedLists(M, ModuleScopeVariables);
+
+    // Replace all uses of module scope variable from non-kernel functions
+    replaceLDSVariablesWithStruct(
+        M, ModuleScopeVariables, ModuleScopeReplacement, [&](Use &U) {
+          Instruction *I = dyn_cast<Instruction>(U.getUser());
+          if (!I) {
+            return false;
+          }
+          Function *F = I->getFunction();
+          return !isKernelLDS(F);
+        });
+
+    // Replace uses of module scope variable from kernel functions that
+    // allocate the module scope variable, otherwise leave them unchanged
+    // Record on each kernel whether the module scope global is used by it
+
+    IRBuilder<> Builder(Ctx);
+
+    for (Function &Func : M.functions()) {
+      if (Func.isDeclaration() || !isKernelLDS(&Func))
+        continue;
+
+      if (KernelsThatAllocateModuleLDS.contains(&Func)) {
+        replaceLDSVariablesWithStruct(
+            M, ModuleScopeVariables, ModuleScopeReplacement, [&](Use &U) {
+              Instruction *I = dyn_cast<Instruction>(U.getUser());
+              if (!I) {
+                return false;
+              }
+              Function *F = I->getFunction();
+              return F == &Func;
+            });
+
+        markUsedByKernel(Builder, &Func, ModuleScopeReplacement.SGV);
+
+      } else {
+        Func.addFnAttr("amdgpu-elide-module-lds");
+      }
+    }
+
+    return ModuleScopeReplacement.SGV;
+  }
+
+  static DenseMap<Function *, LDSVariableReplacement>
+  lowerKernelScopeStructVariables(
+      Module &M, LDSUsesInfoTy &LDSUsesInfo,
+      DenseSet<GlobalVariable *> const &ModuleScopeVariables,
+      DenseSet<Function *> const &KernelsThatAllocateModuleLDS,
+      GlobalVariable *MaybeModuleScopeStruct) {
+
+    // Create a struct for each kernel for the non-module-scope variables.
+    DenseMap<Function *, LDSVariableReplacement> KernelToReplacement;
     for (Function &Func : M.functions()) {
       if (Func.isDeclaration() || !isKernelLDS(&Func))
         continue;
@@ -927,6 +916,55 @@ public:
             return I && I->getFunction() == &Func;
           });
     }
+    return KernelToReplacement;
+  }
+
+  bool runOnModule(Module &M) override {
+    CallGraph CG = CallGraph(M);
+    bool Changed = superAlignLDSGlobals(M);
+
+    Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M);
+
+    Changed = true; // todo: narrow this down
+
+    // For each kernel, what variables does it access directly or through
+    // callees
+    LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M);
+
+    // For each variable accessed through callees, which kernels access it
+    VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly;
+    for (auto &K : LDSUsesInfo.indirect_access) {
+      Function *F = K.first;
+      assert(isKernelLDS(F));
+      for (GlobalVariable *GV : K.second) {
+        LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F);
+      }
+    }
+
+    DenseSet<GlobalVariable *> ModuleScopeVariables;
+    DenseSet<GlobalVariable *> TableLookupVariables;
+    DenseSet<GlobalVariable *> KernelAccessVariables;
+    partitionVariablesIntoIndirectStrategies(
+        M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly,
+        ModuleScopeVariables, TableLookupVariables, KernelAccessVariables);
+
+    // If the kernel accesses a variable that is going to be stored in the
+    // module instance through a call then that kernel needs to allocate the
+    // module instance
+    DenseSet<Function *> KernelsThatAllocateModuleLDS =
+        kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
+                                                        ModuleScopeVariables);
+    DenseSet<Function *> KernelsThatAllocateTableLDS =
+        kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
+                                                        TableLookupVariables);
+
+    GlobalVariable *MaybeModuleScopeStruct = lowerModuleScopeStructVariables(
+        M, ModuleScopeVariables, KernelsThatAllocateModuleLDS);
+
+    DenseMap<Function *, LDSVariableReplacement> KernelToReplacement =
+        lowerKernelScopeStructVariables(M, LDSUsesInfo, ModuleScopeVariables,
+                                        KernelsThatAllocateModuleLDS,
+                                        MaybeModuleScopeStruct);
 
     // Lower zero cost accesses to the kernel instances just created
     for (auto &GV : KernelAccessVariables) {
@@ -1132,7 +1170,7 @@ private:
   }
 
   template <typename PredicateTy>
-  void replaceLDSVariablesWithStruct(
+  static void replaceLDSVariablesWithStruct(
       Module &M, DenseSet<GlobalVariable *> const &LDSVarsToTransformArg,
       LDSVariableReplacement Replacement, PredicateTy Predicate) {
     LLVMContext &Ctx = M.getContext();
@@ -1190,9 +1228,9 @@ private:
     }
   }
 
-  void refineUsesAlignmentAndAA(Value *Ptr, Align A, const DataLayout &DL,
-                                MDNode *AliasScope, MDNode *NoAlias,
-                                unsigned MaxDepth = 5) {
+  static void refineUsesAlignmentAndAA(Value *Ptr, Align A,
                                       const DataLayout &DL, MDNode *AliasScope,
+                                       MDNode *NoAlias, unsigned MaxDepth = 5) {
    if (!MaxDepth || (A == 1 && !AliasScope))
      return;
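
Reviewer note (not part of the patch): the decision that partitionVariablesIntoIndirectStrategies implements can be read independently of the LLVM plumbing. Below is a minimal, self-contained C++ sketch of that policy, assuming plain STL containers in place of DenseSet/DenseMap, string names in place of GlobalVariable* and Function*, and a deliberately crude chooseHybridRoot stand-in (most-reachable variable wins) for chooseBestVariableForModuleStrategy. Names in the sketch are illustrative only.

#include <algorithm>
#include <cassert>
#include <iostream>
#include <map>
#include <set>
#include <string>

enum class LoweringKind { module, table, kernel, hybrid };

// One entry per LDS variable: the set of kernels that can reach it through a
// function call (the "indirect access" information the pass computes).
using AccessMap = std::map<std::string, std::set<std::string>>;

struct Partition {
  std::set<std::string> ModuleScope;  // lowered into the module-scope struct
  std::set<std::string> TableLookup;  // reached via the per-kernel lookup table
  std::set<std::string> KernelAccess; // only one kernel needs it indirectly
};

// Crude stand-in for chooseBestVariableForModuleStrategy: the variable
// reachable from the most kernels becomes the hybrid "module root".
static std::string chooseHybridRoot(const AccessMap &VarToKernels) {
  std::string Best;
  size_t BestCount = 0;
  for (const auto &[Var, Kernels] : VarToKernels) {
    if (Kernels.size() > BestCount) {
      Best = Var;
      BestCount = Kernels.size();
    }
  }
  return Best;
}

// Mirrors the switch in partitionVariablesIntoIndirectStrategies: each
// indirectly accessed variable is assigned to exactly one strategy.
static Partition partitionVariables(const AccessMap &VarToKernels,
                                    LoweringKind Kind) {
  Partition P;
  std::string HybridRoot =
      Kind == LoweringKind::hybrid ? chooseHybridRoot(VarToKernels) : "";
  std::set<std::string> RootKernels;
  if (!HybridRoot.empty())
    RootKernels = VarToKernels.at(HybridRoot);

  for (const auto &[Var, Kernels] : VarToKernels) {
    assert(!Kernels.empty());
    switch (Kind) {
    case LoweringKind::module:
      P.ModuleScope.insert(Var);
      break;
    case LoweringKind::table:
      P.TableLookup.insert(Var);
      break;
    case LoweringKind::kernel:
      // The pass raises a fatal error when more than one kernel can reach
      // the variable; the sketch only asserts.
      assert(Kernels.size() == 1);
      P.KernelAccess.insert(Var);
      break;
    case LoweringKind::hybrid:
      if (Var == HybridRoot)
        P.ModuleScope.insert(Var); // root lives in the module-scope struct
      else if (Kernels.size() == 1)
        P.KernelAccess.insert(Var); // single user, no indirection needed
      else if (std::includes(RootKernels.begin(), RootKernels.end(),
                             Kernels.begin(), Kernels.end()))
        P.ModuleScope.insert(Var); // only kernels that already pay for the root
      else
        P.TableLookup.insert(Var); // everything else goes through the table
      break;
    }
  }
  return P;
}

int main() {
  AccessMap VarToKernels = {
      {"lds.a", {"k0", "k1", "k2"}}, // reachable from every kernel
      {"lds.b", {"k0", "k1"}},       // subset of lds.a's kernels
      {"lds.c", {"k2"}},             // single kernel
  };
  Partition P = partitionVariables(VarToKernels, LoweringKind::hybrid);
  for (const auto &V : P.ModuleScope)
    std::cout << V << " -> module\n";
  for (const auto &V : P.TableLookup)
    std::cout << V << " -> table\n";
  for (const auto &V : P.KernelAccess)
    std::cout << V << " -> kernel\n";
}

With this sample access map the hybrid strategy places lds.a (the root) and lds.b (reachable only from the root's kernels) in the module-scope struct and leaves lds.c to the single kernel that reaches it, which is the behaviour the extracted function's switch encodes.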