[amdgpu][nfc] Extract more functions in LowerModuleLDS, mark more methods static

Jon Chesterfield 2023-03-22 01:23:19 +00:00
parent b9c4b95b11
commit d70e7ea0d1


@@ -631,7 +631,7 @@ public:
     return tableKernelIndexCache[F];
   }
 
-  std::vector<Function *> assignLDSKernelIDToEachKernel(
+  static std::vector<Function *> assignLDSKernelIDToEachKernel(
       Module *M, DenseSet<Function *> const &KernelsThatAllocateTableLDS) {
     // Associate kernels in the set with an arbitrary but reproducible order and
     // annotate them with that order in metadata. This metadata is recognised by
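Aside on the hunk above: assignLDSKernelIDToEachKernel orders the kernels reproducibly and records each kernel's position as integer function metadata, which the backend later materialises as the kernel ID. Below is a standalone sketch of that ordering-and-annotation idea, not the pass itself; the function name and the "example.kernel.id" metadata kind are made up for the sketch.

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include <vector>

using namespace llvm;

// Hypothetical helper: give each kernel a stable integer ID via metadata.
static std::vector<Function *>
annotateKernelsWithStableIDs(Module &M, std::vector<Function *> Kernels) {
  // Sort by name so the IDs do not depend on set iteration order.
  llvm::sort(Kernels, [](Function *L, Function *R) {
    return L->getName() < R->getName();
  });
  LLVMContext &Ctx = M.getContext();
  Type *I32 = Type::getInt32Ty(Ctx);
  for (size_t I = 0; I < Kernels.size(); ++I) {
    Metadata *ID = ConstantAsMetadata::get(ConstantInt::get(I32, I));
    Kernels[I]->setMetadata("example.kernel.id", MDNode::get(Ctx, ID));
  }
  return Kernels;
}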
@@ -680,168 +680,157 @@ public:
     return OrderedKernels;
   }
 
-  bool runOnModule(Module &M) override {
-    LLVMContext &Ctx = M.getContext();
-    CallGraph CG = CallGraph(M);
-    bool Changed = superAlignLDSGlobals(M);
-
-    Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M);
-
-    Changed = true; // todo: narrow this down
-
-    // For each kernel, what variables does it access directly or through
-    // callees
-    LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M);
-
-    // For each variable accessed through callees, which kernels access it
-    VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly;
-    for (auto &K : LDSUsesInfo.indirect_access) {
-      Function *F = K.first;
-      assert(isKernelLDS(F));
-      for (GlobalVariable *GV : K.second) {
-        LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F);
-      }
-    }
-
-    // Partition variables accessed indirectly into the different strategies
-    DenseSet<GlobalVariable *> ModuleScopeVariables;
-    DenseSet<GlobalVariable *> TableLookupVariables;
-    DenseSet<GlobalVariable *> KernelAccessVariables;
-    {
-      GlobalVariable *HybridModuleRoot =
-          LoweringKindLoc != LoweringKind::hybrid
-              ? nullptr
-              : chooseBestVariableForModuleStrategy(
-                    M.getDataLayout(),
-                    LDSToKernelsThatNeedToAccessItIndirectly);
-
-      DenseSet<Function *> const EmptySet;
-      DenseSet<Function *> const &HybridModuleRootKernels =
-          HybridModuleRoot
-              ? LDSToKernelsThatNeedToAccessItIndirectly[HybridModuleRoot]
-              : EmptySet;
-
-      for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
-        // Each iteration of this loop assigns exactly one global variable to
-        // exactly one of the implementation strategies.
-
-        GlobalVariable *GV = K.first;
-        assert(AMDGPU::isLDSVariableToLower(*GV));
-        assert(K.second.size() != 0);
-
-        switch (LoweringKindLoc) {
-        case LoweringKind::module:
-          ModuleScopeVariables.insert(GV);
-          break;
-
-        case LoweringKind::table:
-          TableLookupVariables.insert(GV);
-          break;
-
-        case LoweringKind::kernel:
-          if (K.second.size() == 1) {
-            KernelAccessVariables.insert(GV);
-          } else {
-            report_fatal_error(
-                "cannot lower LDS '" + GV->getName() +
-                "' to kernel access as it is reachable from multiple kernels");
-          }
-          break;
-
-        case LoweringKind::hybrid: {
-          if (GV == HybridModuleRoot) {
-            assert(K.second.size() != 1);
-            ModuleScopeVariables.insert(GV);
-          } else if (K.second.size() == 1) {
-            KernelAccessVariables.insert(GV);
-          } else if (set_is_subset(K.second, HybridModuleRootKernels)) {
-            ModuleScopeVariables.insert(GV);
-          } else {
-            TableLookupVariables.insert(GV);
-          }
-          break;
-        }
-        }
-      }
-
-      // All LDS variables accessed indirectly have now been partitioned into
-      // the distinct lowering strategies.
-      assert(ModuleScopeVariables.size() + TableLookupVariables.size() +
-                 KernelAccessVariables.size() ==
-             LDSToKernelsThatNeedToAccessItIndirectly.size());
-    }
-
-    // If the kernel accesses a variable that is going to be stored in the
-    // module instance through a call then that kernel needs to allocate the
-    // module instance
-    DenseSet<Function *> KernelsThatAllocateModuleLDS =
-        kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
-                                                        ModuleScopeVariables);
-    DenseSet<Function *> KernelsThatAllocateTableLDS =
-        kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
-                                                        TableLookupVariables);
-
-    GlobalVariable *MaybeModuleScopeStruct = nullptr;
-    if (!ModuleScopeVariables.empty()) {
-      LDSVariableReplacement ModuleScopeReplacement =
-          createLDSVariableReplacement(M, "llvm.amdgcn.module.lds",
-                                       ModuleScopeVariables);
-      MaybeModuleScopeStruct = ModuleScopeReplacement.SGV;
-      appendToCompilerUsed(M,
-                           {static_cast<GlobalValue *>(
-                               ConstantExpr::getPointerBitCastOrAddrSpaceCast(
-                                   cast<Constant>(ModuleScopeReplacement.SGV),
-                                   Type::getInt8PtrTy(Ctx)))});
-
-      // module.lds will be allocated at zero in any kernel that allocates it
-      recordLDSAbsoluteAddress(&M, ModuleScopeReplacement.SGV, 0);
-
-      // historic
-      removeLocalVarsFromUsedLists(M, ModuleScopeVariables);
-
-      // Replace all uses of module scope variable from non-kernel functions
-      replaceLDSVariablesWithStruct(
-          M, ModuleScopeVariables, ModuleScopeReplacement, [&](Use &U) {
-            Instruction *I = dyn_cast<Instruction>(U.getUser());
-            if (!I) {
-              return false;
-            }
-            Function *F = I->getFunction();
-            return !isKernelLDS(F);
-          });
-
-      // Replace uses of module scope variable from kernel functions that
-      // allocate the module scope variable, otherwise leave them unchanged
-      // Record on each kernel whether the module scope global is used by it
-      IRBuilder<> Builder(Ctx);
-      for (Function &Func : M.functions()) {
-        if (Func.isDeclaration() || !isKernelLDS(&Func))
-          continue;
-
-        if (KernelsThatAllocateModuleLDS.contains(&Func)) {
-          replaceLDSVariablesWithStruct(
-              M, ModuleScopeVariables, ModuleScopeReplacement, [&](Use &U) {
-                Instruction *I = dyn_cast<Instruction>(U.getUser());
-                if (!I) {
-                  return false;
-                }
-                Function *F = I->getFunction();
-                return F == &Func;
-              });
-
-          markUsedByKernel(Builder, &Func, ModuleScopeReplacement.SGV);
-
-        } else {
-          Func.addFnAttr("amdgpu-elide-module-lds");
-        }
-      }
-    }
-
-    // Create a struct for each kernel for the non-module-scope variables
+  static void partitionVariablesIntoIndirectStrategies(
+      Module &M, LDSUsesInfoTy const &LDSUsesInfo,
+      VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly,
+      DenseSet<GlobalVariable *> &ModuleScopeVariables,
+      DenseSet<GlobalVariable *> &TableLookupVariables,
+      DenseSet<GlobalVariable *> &KernelAccessVariables) {
+    GlobalVariable *HybridModuleRoot =
+        LoweringKindLoc != LoweringKind::hybrid
+            ? nullptr
+            : chooseBestVariableForModuleStrategy(
+                  M.getDataLayout(), LDSToKernelsThatNeedToAccessItIndirectly);
+
+    DenseSet<Function *> const EmptySet;
+    DenseSet<Function *> const &HybridModuleRootKernels =
+        HybridModuleRoot
+            ? LDSToKernelsThatNeedToAccessItIndirectly[HybridModuleRoot]
+            : EmptySet;
+
+    for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
+      // Each iteration of this loop assigns exactly one global variable to
+      // exactly one of the implementation strategies.
+
+      GlobalVariable *GV = K.first;
+      assert(AMDGPU::isLDSVariableToLower(*GV));
+      assert(K.second.size() != 0);
+
+      switch (LoweringKindLoc) {
+      case LoweringKind::module:
+        ModuleScopeVariables.insert(GV);
+        break;
+
+      case LoweringKind::table:
+        TableLookupVariables.insert(GV);
+        break;
+
+      case LoweringKind::kernel:
+        if (K.second.size() == 1) {
+          KernelAccessVariables.insert(GV);
+        } else {
+          report_fatal_error(
+              "cannot lower LDS '" + GV->getName() +
+              "' to kernel access as it is reachable from multiple kernels");
+        }
+        break;
+
+      case LoweringKind::hybrid: {
+        if (GV == HybridModuleRoot) {
+          assert(K.second.size() != 1);
+          ModuleScopeVariables.insert(GV);
+        } else if (K.second.size() == 1) {
+          KernelAccessVariables.insert(GV);
+        } else if (set_is_subset(K.second, HybridModuleRootKernels)) {
+          ModuleScopeVariables.insert(GV);
+        } else {
+          TableLookupVariables.insert(GV);
+        }
+        break;
+      }
+      }
+    }
+
+    // All LDS variables accessed indirectly have now been partitioned into
+    // the distinct lowering strategies.
+    assert(ModuleScopeVariables.size() + TableLookupVariables.size() +
+               KernelAccessVariables.size() ==
+           LDSToKernelsThatNeedToAccessItIndirectly.size());
+  }
+
+  static GlobalVariable *lowerModuleScopeStructVariables(
+      Module &M, DenseSet<GlobalVariable *> const &ModuleScopeVariables,
+      DenseSet<Function *> const &KernelsThatAllocateModuleLDS) {
+    // Create a struct to hold the ModuleScopeVariables
+    // Replace all uses of those variables from non-kernel functions with the
+    // new struct instance. Replace only the uses from kernel functions that
+    // will allocate this instance. That is a space optimisation - kernels
+    // that use a subset of the module scope struct and do not need to
+    // allocate it for indirect calls will only allocate the subset they use
+    // (they do so as part of the per-kernel lowering).
+    if (ModuleScopeVariables.empty()) {
+      return nullptr;
+    }
+
+    LLVMContext &Ctx = M.getContext();
+
+    LDSVariableReplacement ModuleScopeReplacement =
+        createLDSVariableReplacement(M, "llvm.amdgcn.module.lds",
+                                     ModuleScopeVariables);
+
+    appendToCompilerUsed(M, {static_cast<GlobalValue *>(
+                                ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+                                    cast<Constant>(ModuleScopeReplacement.SGV),
+                                    Type::getInt8PtrTy(Ctx)))});
+
+    // module.lds will be allocated at zero in any kernel that allocates it
+    recordLDSAbsoluteAddress(&M, ModuleScopeReplacement.SGV, 0);
+
+    // historic
+    removeLocalVarsFromUsedLists(M, ModuleScopeVariables);
+
+    // Replace all uses of module scope variable from non-kernel functions
+    replaceLDSVariablesWithStruct(
+        M, ModuleScopeVariables, ModuleScopeReplacement, [&](Use &U) {
+          Instruction *I = dyn_cast<Instruction>(U.getUser());
+          if (!I) {
+            return false;
+          }
+          Function *F = I->getFunction();
+          return !isKernelLDS(F);
+        });
+
+    // Replace uses of module scope variable from kernel functions that
+    // allocate the module scope variable, otherwise leave them unchanged
+    // Record on each kernel whether the module scope global is used by it
+    IRBuilder<> Builder(Ctx);
+    for (Function &Func : M.functions()) {
+      if (Func.isDeclaration() || !isKernelLDS(&Func))
+        continue;
+
+      if (KernelsThatAllocateModuleLDS.contains(&Func)) {
+        replaceLDSVariablesWithStruct(
+            M, ModuleScopeVariables, ModuleScopeReplacement, [&](Use &U) {
+              Instruction *I = dyn_cast<Instruction>(U.getUser());
+              if (!I) {
+                return false;
+              }
+              Function *F = I->getFunction();
+              return F == &Func;
+            });
+
+        markUsedByKernel(Builder, &Func, ModuleScopeReplacement.SGV);
+
+      } else {
+        Func.addFnAttr("amdgpu-elide-module-lds");
+      }
+    }
+
+    return ModuleScopeReplacement.SGV;
+  }
+
+  static DenseMap<Function *, LDSVariableReplacement>
+  lowerKernelScopeStructVariables(
+      Module &M, LDSUsesInfoTy &LDSUsesInfo,
+      DenseSet<GlobalVariable *> const &ModuleScopeVariables,
+      DenseSet<Function *> const &KernelsThatAllocateModuleLDS,
+      GlobalVariable *MaybeModuleScopeStruct) {
+    // Create a struct for each kernel for the non-module-scope variables.
     DenseMap<Function *, LDSVariableReplacement> KernelToReplacement;
     for (Function &Func : M.functions()) {
       if (Func.isDeclaration() || !isKernelLDS(&Func))
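The hybrid case in this hunk carries the real policy: the variable picked as HybridModuleRoot anchors the module struct, variables reachable from only one kernel are lowered per-kernel, variables whose kernel set is contained in the root's kernel set ride along in the module struct for free, and everything else pays for the lookup table. A self-contained sketch of that decision, with plain std containers standing in for the LLVM types and all names hypothetical:

#include <algorithm>
#include <set>
#include <string>

enum class Strategy { Module, Table, Kernel };

// Mirrors the LoweringKind::hybrid branch above, assuming Root is the
// variable chosen by chooseBestVariableForModuleStrategy.
static Strategy chooseHybrid(const std::string &GV, const std::string &Root,
                             const std::set<std::string> &KernelsNeedingGV,
                             const std::set<std::string> &RootKernels) {
  if (GV == Root)
    return Strategy::Module; // the root always lives in the module struct
  if (KernelsNeedingGV.size() == 1)
    return Strategy::Kernel; // only one kernel can reach it
  // std::includes on sorted ranges plays the role of set_is_subset.
  if (std::includes(RootKernels.begin(), RootKernels.end(),
                    KernelsNeedingGV.begin(), KernelsNeedingGV.end()))
    return Strategy::Module; // those kernels allocate the struct anyway
  return Strategy::Table; // otherwise fall back to the lookup table
}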
@@ -927,6 +916,55 @@ public:
             return I && I->getFunction() == &Func;
           });
     }
+
+    return KernelToReplacement;
+  }
+
+  bool runOnModule(Module &M) override {
+    CallGraph CG = CallGraph(M);
+    bool Changed = superAlignLDSGlobals(M);
+
+    Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M);
+
+    Changed = true; // todo: narrow this down
+
+    // For each kernel, what variables does it access directly or through
+    // callees
+    LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M);
+
+    // For each variable accessed through callees, which kernels access it
+    VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly;
+    for (auto &K : LDSUsesInfo.indirect_access) {
+      Function *F = K.first;
+      assert(isKernelLDS(F));
+      for (GlobalVariable *GV : K.second) {
+        LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F);
+      }
+    }
+
+    DenseSet<GlobalVariable *> ModuleScopeVariables;
+    DenseSet<GlobalVariable *> TableLookupVariables;
+    DenseSet<GlobalVariable *> KernelAccessVariables;
+    partitionVariablesIntoIndirectStrategies(
+        M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly,
+        ModuleScopeVariables, TableLookupVariables, KernelAccessVariables);
+
+    // If the kernel accesses a variable that is going to be stored in the
+    // module instance through a call then that kernel needs to allocate the
+    // module instance
+    DenseSet<Function *> KernelsThatAllocateModuleLDS =
+        kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
+                                                        ModuleScopeVariables);
+    DenseSet<Function *> KernelsThatAllocateTableLDS =
+        kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
+                                                        TableLookupVariables);
+
+    GlobalVariable *MaybeModuleScopeStruct = lowerModuleScopeStructVariables(
+        M, ModuleScopeVariables, KernelsThatAllocateModuleLDS);
+
+    DenseMap<Function *, LDSVariableReplacement> KernelToReplacement =
+        lowerKernelScopeStructVariables(M, LDSUsesInfo, ModuleScopeVariables,
+                                        KernelsThatAllocateModuleLDS,
+                                        MaybeModuleScopeStruct);
+
     // Lower zero cost accesses to the kernel instances just created
     for (auto &GV : KernelAccessVariables) {
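With the extractions applied, runOnModule reads as a pipeline: compute transitive LDS uses, invert the kernel-to-variables map into a variable-to-kernels map, partition, then lower each partition. The inversion is the step that is easy to get backwards; a minimal sketch with std::map/std::set standing in for DenseMap/DenseSet (names hypothetical):

#include <map>
#include <set>
#include <string>

using KernelToVars = std::map<std::string, std::set<std::string>>;
using VarToKernels = std::map<std::string, std::set<std::string>>;

// Turn kernel -> {variables it reaches indirectly} into
// variable -> {kernels that reach it}, as runOnModule does above.
static VarToKernels invertIndirectAccess(const KernelToVars &IndirectAccess) {
  VarToKernels VarsToKernels;
  for (const auto &KV : IndirectAccess)     // KV.first is the kernel
    for (const std::string &GV : KV.second) // each variable it reaches
      VarsToKernels[GV].insert(KV.first);   // record the kernel against it
  return VarsToKernels;
}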
@@ -1132,7 +1170,7 @@ private:
   }
 
   template <typename PredicateTy>
-  void replaceLDSVariablesWithStruct(
+  static void replaceLDSVariablesWithStruct(
       Module &M, DenseSet<GlobalVariable *> const &LDSVarsToTransformArg,
       LDSVariableReplacement Replacement, PredicateTy Predicate) {
     LLVMContext &Ctx = M.getContext();
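replaceLDSVariablesWithStruct is parameterised on a predicate over each Use, which is how one helper serves both call sites seen earlier: rewrite uses in non-kernel functions, or rewrite uses in one specific kernel. The stock Value::replaceUsesWithIf API expresses the same filtering pattern; a sketch, with the wrapper name being hypothetical:

#include "llvm/IR/Instruction.h"
#include "llvm/IR/Value.h"

using namespace llvm;

// Rewrite only those uses of From that occur inside function F, leaving
// every other use untouched.
static void replaceUsesInFunction(Value *From, Value *To, Function *F) {
  From->replaceUsesWithIf(To, [F](Use &U) {
    auto *I = dyn_cast<Instruction>(U.getUser());
    return I && I->getFunction() == F;
  });
}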
@@ -1190,9 +1228,9 @@ private:
     }
   }
 
-  void refineUsesAlignmentAndAA(Value *Ptr, Align A, const DataLayout &DL,
-                                MDNode *AliasScope, MDNode *NoAlias,
-                                unsigned MaxDepth = 5) {
+  static void refineUsesAlignmentAndAA(Value *Ptr, Align A,
+                                       const DataLayout &DL, MDNode *AliasScope,
+                                       MDNode *NoAlias, unsigned MaxDepth = 5) {
     if (!MaxDepth || (A == 1 && !AliasScope))
       return;
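For context, refineUsesAlignmentAndAA recurses through the users of the lowered pointer, and the guard shown here stops the walk once the depth budget runs out or when nothing useful is left to propagate (alignment 1 and no alias scope). A stripped-down sketch of that depth-bounded traversal pattern, with the per-user work abstracted into a placeholder callback:

#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include <functional>

using namespace llvm;

// Visit the transitive users of Ptr, at most MaxDepth levels deep. The real
// function refines alignment and AA metadata at each step; here that work
// is left to Visit.
static void walkUsers(Value *Ptr, unsigned MaxDepth,
                      const std::function<void(User *)> &Visit) {
  if (!MaxDepth) // depth budget exhausted: stop rather than recurse forever
    return;
  for (User *U : Ptr->users()) {
    Visit(U);
    walkUsers(U, MaxDepth - 1, Visit); // a User is a Value, so recurse
  }
}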