diff --git a/openmp/libomptarget/include/device.h b/openmp/libomptarget/include/device.h index f1fbc287d017..695f864b5ec6 100644 --- a/openmp/libomptarget/include/device.h +++ b/openmp/libomptarget/include/device.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -26,6 +27,7 @@ #include "ExclusiveAccess.h" #include "omptarget.h" #include "rtl.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" // Forward declarations. @@ -43,6 +45,22 @@ enum kmp_target_offload_kind { }; typedef enum kmp_target_offload_kind kmp_target_offload_kind_t; +/// Information about shadow pointers. +struct ShadowPtrInfoTy { + void **HstPtrAddr = nullptr; + void *HstPtrVal = nullptr; + void **TgtPtrAddr = nullptr; + void *TgtPtrVal = nullptr; + + bool operator==(const ShadowPtrInfoTy &Other) const { + return HstPtrAddr == Other.HstPtrAddr; + } +}; + +inline bool operator<(const ShadowPtrInfoTy &lhs, const ShadowPtrInfoTy &rhs) { + return lhs.HstPtrAddr < rhs.HstPtrAddr; +} + /// Map between host data and target data. struct HostDataToTargetTy { const uintptr_t HstPtrBase; // host info. @@ -60,8 +78,7 @@ private: struct StatesTy { StatesTy(uint64_t DRC, uint64_t HRC) - : DynRefCount(DRC), HoldRefCount(HRC), - MayContainAttachedPointers(false) {} + : DynRefCount(DRC), HoldRefCount(HRC) {} /// The dynamic reference count is the standard reference count as of OpenMP /// 4.5. The hold reference count is an OpenMP extension for the sake of /// OpenACC support. @@ -80,17 +97,10 @@ private: uint64_t DynRefCount; uint64_t HoldRefCount; - /// Boolean flag to remember if any subpart of the mapped region might be - /// an attached pointer. - bool MayContainAttachedPointers; + /// A map of shadow pointers associated with this entry, the keys are host + /// pointer addresses to identify stale entries. + llvm::SmallSet ShadowPtrInfos; - /// This mutex will be locked when data movement is issued. For targets that - /// doesn't support async data movement, this mutex can guarantee that after - /// it is released, memory region on the target is update to date. For - /// targets that support async data movement, this can guarantee that data - /// movement has been issued. This mutex *must* be locked right before - /// releasing the mapping table lock. - std::mutex UpdateMtx; /// Pointer to the event corresponding to the data update of this map. /// Note: At present this event is created when the first data transfer from /// host to device is issued, and only being used for H2D. It is not used @@ -222,16 +232,41 @@ public: return ThisRefCount == 1; } - void setMayContainAttachedPointers() const { - States->MayContainAttachedPointers = true; - } - bool getMayContainAttachedPointers() const { - return States->MayContainAttachedPointers; + /// Add the shadow pointer info \p ShadowPtrInfo to this entry but only if the + /// the target ptr value was not already present in the existing set of shadow + /// pointers. Return true if something was added. + bool addShadowPointer(const ShadowPtrInfoTy &ShadowPtrInfo) const { + auto Pair = States->ShadowPtrInfos.insert(ShadowPtrInfo); + if (Pair.second) + return true; + // Check for a stale entry, if found, replace the old one. + if ((*Pair.first).TgtPtrVal == ShadowPtrInfo.TgtPtrVal) + return false; + States->ShadowPtrInfos.erase(ShadowPtrInfo); + return addShadowPointer(ShadowPtrInfo); } - void lock() const { States->UpdateMtx.lock(); } + /// Apply \p CB to all shadow pointers of this entry. Returns OFFLOAD_FAIL if + /// \p CB returned OFFLOAD_FAIL for any of them, otherwise this returns + /// OFFLOAD_SUCCESS. The entry is locked for this operation. + template int foreachShadowPointerInfo(CBTy CB) const { + for (auto &It : States->ShadowPtrInfos) + if (CB(It) == OFFLOAD_FAIL) + return OFFLOAD_FAIL; + return OFFLOAD_SUCCESS; + } - void unlock() const { States->UpdateMtx.unlock(); } + /// Lock this entry for exclusive access. Ensure to get exclusive access to + /// HDTTMap first! + void lock() const { Mtx.lock(); } + + /// Unlock this entry to allow other threads inspecting it. + void unlock() const { Mtx.unlock(); } + +private: + // Mutex that needs to be held before the entry is inspected or modified. The + // HDTTMap mutex needs to be held before trying to lock any HDTT Entry. + mutable std::mutex Mtx; }; /// Wrapper around the HostDataToTargetTy to be used in the HDTT map. In @@ -243,6 +278,7 @@ struct HostDataToTargetMapKeyTy { uintptr_t KeyValue; HostDataToTargetMapKeyTy(void *Key) : KeyValue(uintptr_t(Key)) {} + HostDataToTargetMapKeyTy(uintptr_t Key) : KeyValue(Key) {} HostDataToTargetMapKeyTy(HostDataToTargetTy *HDTT) : KeyValue(HDTT->HstPtrBegin), HDTT(HDTT) {} HostDataToTargetTy *HDTT; @@ -260,6 +296,78 @@ inline bool operator<(const HostDataToTargetMapKeyTy &LHS, return LHS.KeyValue < RHS.KeyValue; } +/// This struct will be returned by \p DeviceTy::getTargetPointer which provides +/// more data than just a target pointer. A TargetPointerResultTy that has a non +/// null Entry owns the entry. As long as the TargetPointerResultTy (TPR) exists +/// the entry is locked. To give up ownership without destroying the TPR use the +/// reset() function. +struct TargetPointerResultTy { + struct FlagTy { + /// If the map table entry is just created + unsigned IsNewEntry : 1; + /// If the pointer is actually a host pointer (when unified memory enabled) + unsigned IsHostPointer : 1; + /// If the pointer is present in the mapping table. + unsigned IsPresent : 1; + /// Flag indicating that this was the last user of the entry and the ref + /// count is now 0. + unsigned IsLast : 1; + } Flags = {0, 0, 0, 0}; + + TargetPointerResultTy(const TargetPointerResultTy &) = delete; + TargetPointerResultTy &operator=(const TargetPointerResultTy &TPR) = delete; + TargetPointerResultTy() {} + + TargetPointerResultTy(FlagTy Flags, HostDataToTargetTy *Entry, + void *TargetPointer) + : Flags(Flags), TargetPointer(TargetPointer), Entry(Entry) { + if (Entry) + Entry->lock(); + } + + TargetPointerResultTy(TargetPointerResultTy &&TPR) + : Flags(TPR.Flags), TargetPointer(TPR.TargetPointer), Entry(TPR.Entry) { + TPR.Entry = nullptr; + } + + TargetPointerResultTy &operator=(TargetPointerResultTy &&TPR) { + if (&TPR != this) { + std::swap(Flags, TPR.Flags); + std::swap(Entry, TPR.Entry); + std::swap(TargetPointer, TPR.TargetPointer); + } + return *this; + } + + ~TargetPointerResultTy() { + if (Entry) + Entry->unlock(); + } + + bool isPresent() const { return Flags.IsPresent; } + + bool isHostPointer() const { return Flags.IsHostPointer; } + + /// The corresponding target pointer + void *TargetPointer = nullptr; + + HostDataToTargetTy *getEntry() const { return Entry; } + void setEntry(HostDataToTargetTy *HDTTT, + HostDataToTargetTy *OwnedTPR = nullptr) { + if (Entry) + Entry->unlock(); + Entry = HDTTT; + if (Entry && Entry != OwnedTPR) + Entry->lock(); + } + + void reset() { *this = TargetPointerResultTy(); } + +private: + /// The corresponding map table entry which is stable. + HostDataToTargetTy *Entry = nullptr; +}; + struct LookupResult { struct { unsigned IsContained : 1; @@ -267,43 +375,11 @@ struct LookupResult { unsigned ExtendsAfter : 1; } Flags; - /// The corresponding map table entry which is stable. - HostDataToTargetTy *Entry = nullptr; + LookupResult() : Flags({0, 0, 0}), TPR() {} - LookupResult() : Flags({0, 0, 0}), Entry() {} + TargetPointerResultTy TPR; }; -/// This struct will be returned by \p DeviceTy::getTargetPointer which provides -/// more data than just a target pointer. -struct TargetPointerResultTy { - struct { - /// If the map table entry is just created - unsigned IsNewEntry : 1; - /// If the pointer is actually a host pointer (when unified memory enabled) - unsigned IsHostPointer : 1; - /// If the pointer is present in the mapping table. - unsigned IsPresent : 1; - } Flags = {0, 0, 0}; - - bool isPresent() const { return Flags.IsPresent; } - - bool isHostPointer() const { return Flags.IsHostPointer; } - - /// The corresponding map table entry which is stable. - HostDataToTargetTy *Entry = nullptr; - - /// The corresponding target pointer - void *TargetPointer = nullptr; -}; - -/// Map for shadow pointers -struct ShadowPtrValTy { - void *HstPtrVal; - void *TgtPtrAddr; - void *TgtPtrVal; -}; -typedef std::map ShadowPtrListTy; - /// struct PendingCtorDtorListsTy { std::list PendingCtors; @@ -336,9 +412,7 @@ struct DeviceTy { PendingCtorsDtorsPerLibrary PendingCtorsDtors; - ShadowPtrListTy ShadowPtrMap; - - std::mutex PendingGlobalsMtx, ShadowMtx; + std::mutex PendingGlobalsMtx; DeviceTy(RTLInfoTy *RTL); // DeviceTy is not copyable @@ -353,7 +427,8 @@ struct DeviceTy { /// Lookup the mapping of \p HstPtrBegin in \p HDTTMap. The accessor ensures /// exclusive access to the HDTT map. LookupResult lookupMapping(HDTTMapAccessorTy &HDTTMap, void *HstPtrBegin, - int64_t Size); + int64_t Size, + HostDataToTargetTy *OwnedTPR = nullptr); /// Get the target pointer based on host pointer begin and base. If the /// mapping already exists, the target pointer will be returned directly. In @@ -365,12 +440,13 @@ struct DeviceTy { /// - Data allocation failed; /// - The user tried to do an illegal mapping; /// - Data transfer issue fails. - TargetPointerResultTy - getTargetPointer(void *HstPtrBegin, void *HstPtrBase, int64_t Size, - map_var_info_t HstPtrName, bool HasFlagTo, - bool HasFlagAlways, bool IsImplicit, bool UpdateRefCount, - bool HasCloseModifier, bool HasPresentModifier, - bool HasHoldModifier, AsyncInfoTy &AsyncInfo); + TargetPointerResultTy getTargetPointer( + HDTTMapAccessorTy &HDTTMap, void *HstPtrBegin, void *HstPtrBase, + int64_t Size, map_var_info_t HstPtrName, bool HasFlagTo, + bool HasFlagAlways, bool IsImplicit, bool UpdateRefCount, + bool HasCloseModifier, bool HasPresentModifier, bool HasHoldModifier, + AsyncInfoTy &AsyncInfo, HostDataToTargetTy *OwnedTPR = nullptr, + bool ReleaseHDTTMap = true); /// Return the target pointer for \p HstPtrBegin in \p HDTTMap. The accessor /// ensures exclusive access to the HDTT map. @@ -388,10 +464,9 @@ struct DeviceTy { /// - \p FromDataEnd tracks the number of threads referencing the entry at /// targetDataEnd for delayed deletion purpose. [[nodiscard]] TargetPointerResultTy - getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast, - bool UpdateRefCount, bool UseHoldRefCount, bool &IsHostPtr, - bool MustContain = false, bool ForceDelete = false, - bool FromDataEnd = false); + getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool UpdateRefCount, + bool UseHoldRefCount, bool MustContain = false, + bool ForceDelete = false, bool FromDataEnd = false); /// Remove the \p Entry from the data map. Expect the entry's total reference /// count to be zero and the caller thread to be the last one using it. \p @@ -436,10 +511,12 @@ struct DeviceTy { // synchronous. // Copy data from host to device int32_t submitData(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size, - AsyncInfoTy &AsyncInfo); + AsyncInfoTy &AsyncInfo, + HostDataToTargetTy *Entry = nullptr); // Copy data from device back to host int32_t retrieveData(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size, - AsyncInfoTy &AsyncInfo); + AsyncInfoTy &AsyncInfo, + HostDataToTargetTy *Entry = nullptr); // Copy data from current device to destination device directly int32_t dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr, int64_t Size, AsyncInfoTy &AsyncInfo); diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp index 47c9d5e2f1ef..2dfe075b8e74 100644 --- a/openmp/libomptarget/src/api.cpp +++ b/openmp/libomptarget/src/api.cpp @@ -116,16 +116,13 @@ EXTERN int omp_target_is_present(const void *Ptr, int DeviceNum) { } DeviceTy &Device = *PM->Devices[DeviceNum]; - bool IsLast; // not used - bool IsHostPtr; // omp_target_is_present tests whether a host pointer refers to storage that // is mapped to a given device. However, due to the lack of the storage size, // only check 1 byte. Cannot set size 0 which checks whether the pointer (zero // lengh array) is mapped instead of the referred storage. - TargetPointerResultTy TPR = - Device.getTgtPtrBegin(const_cast(Ptr), 1, IsLast, - /*UpdateRefCount=*/false, - /*UseHoldRefCount=*/false, IsHostPtr); + TargetPointerResultTy TPR = Device.getTgtPtrBegin(const_cast(Ptr), 1, + /*UpdateRefCount=*/false, + /*UseHoldRefCount=*/false); int Rc = TPR.isPresent(); DP("Call to omp_target_is_present returns %d\n", Rc); return Rc; @@ -360,13 +357,10 @@ EXTERN void *omp_get_mapped_ptr(const void *Ptr, int DeviceNum) { return nullptr; } - bool IsLast = false; - bool IsHostPtr = false; auto &Device = *PM->Devices[DeviceNum]; - TargetPointerResultTy TPR = - Device.getTgtPtrBegin(const_cast(Ptr), 1, IsLast, - /*UpdateRefCount=*/false, - /*UseHoldRefCount=*/false, IsHostPtr); + TargetPointerResultTy TPR = Device.getTgtPtrBegin(const_cast(Ptr), 1, + /*UpdateRefCount=*/false, + /*UseHoldRefCount=*/false); if (!TPR.isPresent()) { DP("Ptr " DPxMOD "is not present on device %d, returning nullptr.\n", DPxPTR(Ptr), DeviceNum); diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp index 1f5d5a23371d..09c8e808db46 100644 --- a/openmp/libomptarget/src/device.cpp +++ b/openmp/libomptarget/src/device.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -51,8 +52,7 @@ int HostDataToTargetTy::addEventIfNecessary(DeviceTy &Device, DeviceTy::DeviceTy(RTLInfoTy *RTL) : DeviceID(-1), RTL(RTL), RTLDeviceID(-1), IsInit(false), InitFlag(), - HasPendingGlobals(false), PendingCtorsDtors(), ShadowPtrMap(), - PendingGlobalsMtx(), ShadowMtx() {} + HasPendingGlobals(false), PendingCtorsDtors(), PendingGlobalsMtx() {} DeviceTy::~DeviceTy() { if (DeviceID == -1 || !(getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)) @@ -69,6 +69,7 @@ int DeviceTy::associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size) { auto It = HDTTMap->find(HstPtrBegin); if (It != HDTTMap->end()) { HostDataToTargetTy &HDTT = *It->HDTT; + std::lock_guard LG(HDTT); // Mapping already exists bool IsValid = HDTT.HstPtrEnd == (uintptr_t)HstPtrBegin + Size && HDTT.TgtPtrBegin == (uintptr_t)TgtPtrBegin; @@ -109,39 +110,41 @@ int DeviceTy::disassociatePtr(void *HstPtrBegin) { HDTTMapAccessorTy HDTTMap = HostDataToTargetMap.getExclusiveAccessor(); auto It = HDTTMap->find(HstPtrBegin); - if (It != HDTTMap->end()) { - HostDataToTargetTy &HDTT = *It->HDTT; - // Mapping exists - if (HDTT.getHoldRefCount()) { - // This is based on OpenACC 3.1, sec 3.2.33 "acc_unmap_data", L3656-3657: - // "It is an error to call acc_unmap_data if the structured reference - // count for the pointer is not zero." - REPORT("Trying to disassociate a pointer with a non-zero hold reference " - "count\n"); - } else if (HDTT.isDynRefCountInf()) { - DP("Association found, removing it\n"); - void *Event = HDTT.getEvent(); - delete &HDTT; - if (Event) - destroyEvent(Event); - HDTTMap->erase(It); - - // Notify the plugin about the unmapped memory. - return notifyDataUnmapped(HstPtrBegin); - } else { - REPORT("Trying to disassociate a pointer which was not mapped via " - "omp_target_associate_ptr\n"); - } - } else { + if (It == HDTTMap->end()) { REPORT("Association not found\n"); + return OFFLOAD_FAIL; + } + // Mapping exists + HostDataToTargetTy &HDTT = *It->HDTT; + std::lock_guard LG(HDTT); + + if (HDTT.getHoldRefCount()) { + // This is based on OpenACC 3.1, sec 3.2.33 "acc_unmap_data", L3656-3657: + // "It is an error to call acc_unmap_data if the structured reference + // count for the pointer is not zero." + REPORT("Trying to disassociate a pointer with a non-zero hold reference " + "count\n"); + return OFFLOAD_FAIL; } - // Mapping not found + if (HDTT.isDynRefCountInf()) { + DP("Association found, removing it\n"); + void *Event = HDTT.getEvent(); + delete &HDTT; + if (Event) + destroyEvent(Event); + HDTTMap->erase(It); + return OFFLOAD_SUCCESS; + } + + REPORT("Trying to disassociate a pointer which was not mapped via " + "omp_target_associate_ptr\n"); return OFFLOAD_FAIL; } LookupResult DeviceTy::lookupMapping(HDTTMapAccessorTy &HDTTMap, - void *HstPtrBegin, int64_t Size) { + void *HstPtrBegin, int64_t Size, + HostDataToTargetTy *OwnedTPR) { uintptr_t HP = (uintptr_t)HstPtrBegin; LookupResult LR; @@ -159,42 +162,43 @@ LookupResult DeviceTy::lookupMapping(HDTTMapAccessorTy &HDTTMap, // upper_bound satisfies // std::prev(upper)->HDTT.HstPtrBegin <= hp < upper->HDTT.HstPtrBegin if (Upper != HDTTMap->begin()) { - LR.Entry = std::prev(Upper)->HDTT; - auto &HT = *LR.Entry; + LR.TPR.setEntry(std::prev(Upper)->HDTT, OwnedTPR); // the left side of extended address range is satisified. - // hp >= HT.HstPtrBegin || hp >= HT.HstPtrBase - LR.Flags.IsContained = HP < HT.HstPtrEnd || HP < HT.HstPtrBase; + // hp >= LR.TPR.getEntry()->HstPtrBegin || hp >= + // LR.TPR.getEntry()->HstPtrBase + LR.Flags.IsContained = HP < LR.TPR.getEntry()->HstPtrEnd || + HP < LR.TPR.getEntry()->HstPtrBase; } if (!LR.Flags.IsContained && Upper != HDTTMap->end()) { - LR.Entry = Upper->HDTT; - auto &HT = *LR.Entry; + LR.TPR.setEntry(Upper->HDTT, OwnedTPR); // the right side of extended address range is satisified. - // hp < HT.HstPtrEnd || hp < HT.HstPtrBase - LR.Flags.IsContained = HP >= HT.HstPtrBase; + // hp < LR.TPR.getEntry()->HstPtrEnd || hp < LR.TPR.getEntry()->HstPtrBase + LR.Flags.IsContained = HP >= LR.TPR.getEntry()->HstPtrBase; } } else { // check the left bin if (Upper != HDTTMap->begin()) { - LR.Entry = std::prev(Upper)->HDTT; - auto &HT = *LR.Entry; + LR.TPR.setEntry(std::prev(Upper)->HDTT, OwnedTPR); // Is it contained? - LR.Flags.IsContained = HP >= HT.HstPtrBegin && HP < HT.HstPtrEnd && - (HP + Size) <= HT.HstPtrEnd; + LR.Flags.IsContained = HP >= LR.TPR.getEntry()->HstPtrBegin && + HP < LR.TPR.getEntry()->HstPtrEnd && + (HP + Size) <= LR.TPR.getEntry()->HstPtrEnd; // Does it extend beyond the mapped region? - LR.Flags.ExtendsAfter = HP < HT.HstPtrEnd && (HP + Size) > HT.HstPtrEnd; + LR.Flags.ExtendsAfter = HP < LR.TPR.getEntry()->HstPtrEnd && + (HP + Size) > LR.TPR.getEntry()->HstPtrEnd; } // check the right bin if (!(LR.Flags.IsContained || LR.Flags.ExtendsAfter) && Upper != HDTTMap->end()) { - LR.Entry = Upper->HDTT; - auto &HT = *LR.Entry; + LR.TPR.setEntry(Upper->HDTT, OwnedTPR); // Does it extend into an already mapped region? - LR.Flags.ExtendsBefore = - HP < HT.HstPtrBegin && (HP + Size) > HT.HstPtrBegin; + LR.Flags.ExtendsBefore = HP < LR.TPR.getEntry()->HstPtrBegin && + (HP + Size) > LR.TPR.getEntry()->HstPtrBegin; // Does it extend beyond the mapped region? - LR.Flags.ExtendsAfter = HP < HT.HstPtrEnd && (HP + Size) > HT.HstPtrEnd; + LR.Flags.ExtendsAfter = HP < LR.TPR.getEntry()->HstPtrEnd && + (HP + Size) > LR.TPR.getEntry()->HstPtrEnd; } if (LR.Flags.ExtendsBefore) { @@ -211,19 +215,19 @@ LookupResult DeviceTy::lookupMapping(HDTTMapAccessorTy &HDTTMap, } TargetPointerResultTy DeviceTy::getTargetPointer( - void *HstPtrBegin, void *HstPtrBase, int64_t Size, - map_var_info_t HstPtrName, bool HasFlagTo, bool HasFlagAlways, + HDTTMapAccessorTy &HDTTMap, void *HstPtrBegin, void *HstPtrBase, + int64_t Size, map_var_info_t HstPtrName, bool HasFlagTo, bool HasFlagAlways, bool IsImplicit, bool UpdateRefCount, bool HasCloseModifier, - bool HasPresentModifier, bool HasHoldModifier, AsyncInfoTy &AsyncInfo) { - HDTTMapAccessorTy HDTTMap = HostDataToTargetMap.getExclusiveAccessor(); + bool HasPresentModifier, bool HasHoldModifier, AsyncInfoTy &AsyncInfo, + HostDataToTargetTy *OwnedTPR, bool ReleaseHDTTMap) { - void *TargetPointer = nullptr; - bool IsHostPtr = false; - bool IsPresent = true; - bool IsNew = false; + LookupResult LR = lookupMapping(HDTTMap, HstPtrBegin, Size, OwnedTPR); + LR.TPR.Flags.IsPresent = true; - LookupResult LR = lookupMapping(HDTTMap, HstPtrBegin, Size); - auto *Entry = LR.Entry; + // Release the mapping table lock only after the entry is locked by + // attaching it to TPR. Once TPR is destroyed it will release the lock + // on entry. If it is returned the lock will move to the returned object. + // If LR.Entry is already owned/locked we avoid trying to lock it again. // Check if the pointer is contained. // If a variable is mapped to the device manually by the user - which would @@ -231,38 +235,38 @@ TargetPointerResultTy DeviceTy::getTargetPointer( // device address is returned even under unified memory conditions. if (LR.Flags.IsContained || ((LR.Flags.ExtendsBefore || LR.Flags.ExtendsAfter) && IsImplicit)) { - auto &HT = *LR.Entry; const char *RefCountAction; if (UpdateRefCount) { // After this, reference count >= 1. If the reference count was 0 but the // entry was still there we can reuse the data on the device and avoid a // new submission. - HT.incRefCount(HasHoldModifier); + LR.TPR.getEntry()->incRefCount(HasHoldModifier); RefCountAction = " (incremented)"; } else { // It might have been allocated with the parent, but it's still new. - IsNew = HT.getTotalRefCount() == 1; + LR.TPR.Flags.IsNewEntry = LR.TPR.getEntry()->getTotalRefCount() == 1; RefCountAction = " (update suppressed)"; } const char *DynRefCountAction = HasHoldModifier ? "" : RefCountAction; const char *HoldRefCountAction = HasHoldModifier ? RefCountAction : ""; - uintptr_t Ptr = HT.TgtPtrBegin + ((uintptr_t)HstPtrBegin - HT.HstPtrBegin); + uintptr_t Ptr = LR.TPR.getEntry()->TgtPtrBegin + + ((uintptr_t)HstPtrBegin - LR.TPR.getEntry()->HstPtrBegin); INFO(OMP_INFOTYPE_MAPPING_EXISTS, DeviceID, "Mapping exists%s with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD ", Size=%" PRId64 ", DynRefCount=%s%s, HoldRefCount=%s%s, Name=%s\n", (IsImplicit ? " (implicit)" : ""), DPxPTR(HstPtrBegin), DPxPTR(Ptr), - Size, HT.dynRefCountToStr().c_str(), DynRefCountAction, - HT.holdRefCountToStr().c_str(), HoldRefCountAction, + Size, LR.TPR.getEntry()->dynRefCountToStr().c_str(), DynRefCountAction, + LR.TPR.getEntry()->holdRefCountToStr().c_str(), HoldRefCountAction, (HstPtrName) ? getNameFromMapping(HstPtrName).c_str() : "unknown"); - TargetPointer = (void *)Ptr; + LR.TPR.TargetPointer = (void *)Ptr; } else if ((LR.Flags.ExtendsBefore || LR.Flags.ExtendsAfter) && !IsImplicit) { // Explicit extension of mapped data - not allowed. MESSAGE("explicit extension not allowed: host address specified is " DPxMOD " (%" PRId64 " bytes), but device allocation maps to host at " DPxMOD " (%" PRId64 " bytes)", - DPxPTR(HstPtrBegin), Size, DPxPTR(Entry->HstPtrBegin), - Entry->HstPtrEnd - Entry->HstPtrBegin); + DPxPTR(HstPtrBegin), Size, DPxPTR(LR.TPR.getEntry()->HstPtrBegin), + LR.TPR.getEntry()->HstPtrEnd - LR.TPR.getEntry()->HstPtrBegin); if (HasPresentModifier) MESSAGE("device mapping required by 'present' map type modifier does not " "exist for host address " DPxMOD " (%" PRId64 " bytes)", @@ -279,9 +283,9 @@ TargetPointerResultTy DeviceTy::getTargetPointer( DP("Return HstPtrBegin " DPxMOD " Size=%" PRId64 " for unified shared " "memory\n", DPxPTR((uintptr_t)HstPtrBegin), Size); - IsPresent = false; - IsHostPtr = true; - TargetPointer = HstPtrBegin; + LR.TPR.Flags.IsPresent = false; + LR.TPR.Flags.IsHostPointer = true; + LR.TPR.TargetPointer = HstPtrBegin; } } else if (HasPresentModifier) { DP("Mapping required by 'present' map type modifier does not exist for " @@ -292,22 +296,25 @@ TargetPointerResultTy DeviceTy::getTargetPointer( DPxPTR(HstPtrBegin), Size); } else if (Size) { // If it is not contained and Size > 0, we should create a new entry for it. - IsNew = true; + LR.TPR.Flags.IsNewEntry = true; uintptr_t Ptr = (uintptr_t)allocData(Size, HstPtrBegin); - Entry = HDTTMap - ->emplace(new HostDataToTargetTy( - (uintptr_t)HstPtrBase, (uintptr_t)HstPtrBegin, - (uintptr_t)HstPtrBegin + Size, Ptr, HasHoldModifier, - HstPtrName)) - .first->HDTT; + // Release the mapping table lock only after the entry is locked by + // attaching it to TPR. + LR.TPR.setEntry(HDTTMap + ->emplace(new HostDataToTargetTy( + (uintptr_t)HstPtrBase, (uintptr_t)HstPtrBegin, + (uintptr_t)HstPtrBegin + Size, Ptr, HasHoldModifier, + HstPtrName)) + .first->HDTT); INFO(OMP_INFOTYPE_MAPPING_CHANGED, DeviceID, "Creating new map entry with HstPtrBase=" DPxMOD ", HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD ", Size=%ld, " "DynRefCount=%s, HoldRefCount=%s, Name=%s\n", DPxPTR(HstPtrBase), DPxPTR(HstPtrBegin), DPxPTR(Ptr), Size, - Entry->dynRefCountToStr().c_str(), Entry->holdRefCountToStr().c_str(), + LR.TPR.getEntry()->dynRefCountToStr().c_str(), + LR.TPR.getEntry()->holdRefCountToStr().c_str(), (HstPtrName) ? getNameFromMapping(HstPtrName).c_str() : "unknown"); - TargetPointer = (void *)Ptr; + LR.TPR.TargetPointer = (void *)Ptr; // Notify the plugin about the new mapping. if (notifyDataMapped(HstPtrBegin, Size)) @@ -316,42 +323,41 @@ TargetPointerResultTy DeviceTy::getTargetPointer( nullptr /* TargetPointer */}; } else { // This entry is not present and we did not create a new entry for it. - IsPresent = false; + LR.TPR.Flags.IsPresent = false; } + // All mapping table modifications have been made. If the user requested it we + // give up the lock. + if (ReleaseHDTTMap) + HDTTMap.destroy(); + // If the target pointer is valid, and we need to transfer data, issue the // data transfer. - if (TargetPointer && !IsHostPtr && HasFlagTo && (IsNew || HasFlagAlways) && - Size != 0) { - // Lock the entry before releasing the mapping table lock such that another - // thread that could issue data movement will get the right result. - std::lock_guard LG(*Entry); - // Release the mapping table lock right after the entry is locked. - HDTTMap.destroy(); - + if (LR.TPR.TargetPointer && !LR.TPR.Flags.IsHostPointer && HasFlagTo && + (LR.TPR.Flags.IsNewEntry || HasFlagAlways) && Size != 0) { DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", Size, - DPxPTR(HstPtrBegin), DPxPTR(TargetPointer)); + DPxPTR(HstPtrBegin), DPxPTR(LR.TPR.TargetPointer)); - int Ret = submitData(TargetPointer, HstPtrBegin, Size, AsyncInfo); + int Ret = submitData(LR.TPR.TargetPointer, HstPtrBegin, Size, AsyncInfo, + LR.TPR.getEntry()); if (Ret != OFFLOAD_SUCCESS) { REPORT("Copying data to device failed.\n"); // We will also return nullptr if the data movement fails because that // pointer points to a corrupted memory region so it doesn't make any // sense to continue to use it. - TargetPointer = nullptr; - } else if (Entry->addEventIfNecessary(*this, AsyncInfo) != OFFLOAD_SUCCESS) + LR.TPR.TargetPointer = nullptr; + } else if (LR.TPR.getEntry()->addEventIfNecessary(*this, AsyncInfo) != + OFFLOAD_SUCCESS) return {{false /* IsNewEntry */, false /* IsHostPointer */}, nullptr /* Entry */, nullptr /* TargetPointer */}; } else { - // Release the mapping table lock directly. - HDTTMap.destroy(); // If not a host pointer and no present modifier, we need to wait for the // event if it exists. // Note: Entry might be nullptr because of zero length array section. - if (Entry && !IsHostPtr && !HasPresentModifier) { - std::lock_guard LG(*Entry); - void *Event = Entry->getEvent(); + if (LR.TPR.getEntry() && !LR.TPR.Flags.IsHostPointer && + !HasPresentModifier) { + void *Event = LR.TPR.getEntry()->getEvent(); if (Event) { int Ret = waitEvent(Event, AsyncInfo); if (Ret != OFFLOAD_SUCCESS) { @@ -366,31 +372,28 @@ TargetPointerResultTy DeviceTy::getTargetPointer( } } - return {{IsNew, IsHostPtr, IsPresent}, Entry, TargetPointer}; + return std::move(LR.TPR); } TargetPointerResultTy -DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast, - bool UpdateRefCount, bool UseHoldRefCount, - bool &IsHostPtr, bool MustContain, bool ForceDelete, - bool FromDataEnd) { +DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool UpdateRefCount, + bool UseHoldRefCount, bool MustContain, + bool ForceDelete, bool FromDataEnd) { HDTTMapAccessorTy HDTTMap = HostDataToTargetMap.getExclusiveAccessor(); - void *TargetPointer = NULL; - bool IsNew = false; - bool IsPresent = true; - IsHostPtr = false; - IsLast = false; LookupResult LR = lookupMapping(HDTTMap, HstPtrBegin, Size); + LR.TPR.Flags.IsPresent = true; + if (LR.Flags.IsContained || (!MustContain && (LR.Flags.ExtendsBefore || LR.Flags.ExtendsAfter))) { - auto &HT = *LR.Entry; - IsLast = HT.decShouldRemove(UseHoldRefCount, ForceDelete); + LR.TPR.Flags.IsLast = + LR.TPR.getEntry()->decShouldRemove(UseHoldRefCount, ForceDelete); if (ForceDelete) { - HT.resetRefCount(UseHoldRefCount); - assert(IsLast == HT.decShouldRemove(UseHoldRefCount) && + LR.TPR.getEntry()->resetRefCount(UseHoldRefCount); + assert(LR.TPR.Flags.IsLast == + LR.TPR.getEntry()->decShouldRemove(UseHoldRefCount) && "expected correct IsLast prediction for reset"); } @@ -400,32 +403,34 @@ DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast, // for it. Thus, we must track every query on targetDataEnds to ensure only // the last thread that holds a reference to an entry actually deletes it. if (FromDataEnd) - HT.incDataEndThreadCount(); + LR.TPR.getEntry()->incDataEndThreadCount(); const char *RefCountAction; if (!UpdateRefCount) { RefCountAction = " (update suppressed)"; - } else if (IsLast) { - HT.decRefCount(UseHoldRefCount); - assert(HT.getTotalRefCount() == 0 && + } else if (LR.TPR.Flags.IsLast) { + LR.TPR.getEntry()->decRefCount(UseHoldRefCount); + assert(LR.TPR.getEntry()->getTotalRefCount() == 0 && "Expected zero reference count when deletion is scheduled"); if (ForceDelete) RefCountAction = " (reset, delayed deletion)"; else RefCountAction = " (decremented, delayed deletion)"; } else { - HT.decRefCount(UseHoldRefCount); + LR.TPR.getEntry()->decRefCount(UseHoldRefCount); RefCountAction = " (decremented)"; } const char *DynRefCountAction = UseHoldRefCount ? "" : RefCountAction; const char *HoldRefCountAction = UseHoldRefCount ? RefCountAction : ""; - uintptr_t TP = HT.TgtPtrBegin + ((uintptr_t)HstPtrBegin - HT.HstPtrBegin); + uintptr_t TP = LR.TPR.getEntry()->TgtPtrBegin + + ((uintptr_t)HstPtrBegin - LR.TPR.getEntry()->HstPtrBegin); INFO(OMP_INFOTYPE_MAPPING_EXISTS, DeviceID, "Mapping exists with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD ", " "Size=%" PRId64 ", DynRefCount=%s%s, HoldRefCount=%s%s\n", - DPxPTR(HstPtrBegin), DPxPTR(TP), Size, HT.dynRefCountToStr().c_str(), - DynRefCountAction, HT.holdRefCountToStr().c_str(), HoldRefCountAction); - TargetPointer = (void *)TP; + DPxPTR(HstPtrBegin), DPxPTR(TP), Size, + LR.TPR.getEntry()->dynRefCountToStr().c_str(), DynRefCountAction, + LR.TPR.getEntry()->holdRefCountToStr().c_str(), HoldRefCountAction); + LR.TPR.TargetPointer = (void *)TP; } else if (PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) { // If the value isn't found in the mapping and unified shared memory // is on then it means we have stumbled upon a value which we need to @@ -433,18 +438,18 @@ DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast, DP("Get HstPtrBegin " DPxMOD " Size=%" PRId64 " for unified shared " "memory\n", DPxPTR((uintptr_t)HstPtrBegin), Size); - IsPresent = false; - IsHostPtr = true; - TargetPointer = HstPtrBegin; + LR.TPR.Flags.IsPresent = false; + LR.TPR.Flags.IsHostPointer = true; + LR.TPR.TargetPointer = HstPtrBegin; } else { // OpenMP Specification v5.2: if a matching list item is not found, the // pointer retains its original value as per firstprivate semantics. - IsPresent = false; - IsHostPtr = false; - TargetPointer = HstPtrBegin; + LR.TPR.Flags.IsPresent = false; + LR.TPR.Flags.IsHostPointer = false; + LR.TPR.TargetPointer = HstPtrBegin; } - return {{IsNew, IsHostPtr, IsPresent}, LR.Entry, TargetPointer}; + return std::move(LR.TPR); } // Return the target pointer begin (where the data will be moved). @@ -453,8 +458,8 @@ void *DeviceTy::getTgtPtrBegin(HDTTMapAccessorTy &HDTTMap, void *HstPtrBegin, uintptr_t HP = (uintptr_t)HstPtrBegin; LookupResult LR = lookupMapping(HDTTMap, HstPtrBegin, Size); if (LR.Flags.IsContained || LR.Flags.ExtendsBefore || LR.Flags.ExtendsAfter) { - auto &HT = *LR.Entry; - uintptr_t TP = HT.TgtPtrBegin + (HP - HT.HstPtrBegin); + uintptr_t TP = + LR.TPR.getEntry()->TgtPtrBegin + (HP - LR.TPR.getEntry()->HstPtrBegin); return (void *)TP; } @@ -550,20 +555,32 @@ int32_t DeviceTy::deleteData(void *TgtPtrBegin, int32_t Kind) { return RTL->data_delete(RTLDeviceID, TgtPtrBegin, Kind); } +static void printCopyInfo(int DeviceId, bool H2D, void *SrcPtrBegin, + void *DstPtrBegin, int64_t Size, + HostDataToTargetTy *HT) { + + INFO(OMP_INFOTYPE_DATA_TRANSFER, DeviceId, + "Copying data from %s to %s, %sPtr=" DPxMOD ", %sPtr=" DPxMOD + ", Size=%" PRId64 ", Name=%s\n", + H2D ? "host" : "device", H2D ? "device" : "host", H2D ? "Hst" : "Tgt", + DPxPTR(SrcPtrBegin), H2D ? "Tgt" : "Hst", DPxPTR(DstPtrBegin), Size, + (HT && HT->HstPtrName) ? getNameFromMapping(HT->HstPtrName).c_str() + : "unknown"); +} + // Submit data to device int32_t DeviceTy::submitData(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size, - AsyncInfoTy &AsyncInfo) { + AsyncInfoTy &AsyncInfo, + HostDataToTargetTy *Entry) { if (getInfoLevel() & OMP_INFOTYPE_DATA_TRANSFER) { - HDTTMapAccessorTy HDTTMap = HostDataToTargetMap.getExclusiveAccessor(); - LookupResult LR = lookupMapping(HDTTMap, HstPtrBegin, Size); - auto *HT = &*LR.Entry; - - INFO(OMP_INFOTYPE_DATA_TRANSFER, DeviceID, - "Copying data from host to device, HstPtr=" DPxMOD ", TgtPtr=" DPxMOD - ", Size=%" PRId64 ", Name=%s\n", - DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin), Size, - (HT && HT->HstPtrName) ? getNameFromMapping(HT->HstPtrName).c_str() - : "unknown"); + HDTTMapAccessorTy HDTTMap = HostDataToTargetMap.getExclusiveAccessor(Entry); + LookupResult LR; + if (!Entry) { + LR = lookupMapping(HDTTMap, HstPtrBegin, Size); + Entry = LR.TPR.getEntry(); + } + printCopyInfo(DeviceID, /* H2D */ true, HstPtrBegin, TgtPtrBegin, Size, + Entry); } if (!AsyncInfo || !RTL->data_submit_async || !RTL->synchronize) @@ -574,17 +591,17 @@ int32_t DeviceTy::submitData(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size, // Retrieve data from device int32_t DeviceTy::retrieveData(void *HstPtrBegin, void *TgtPtrBegin, - int64_t Size, AsyncInfoTy &AsyncInfo) { + int64_t Size, AsyncInfoTy &AsyncInfo, + HostDataToTargetTy *Entry) { if (getInfoLevel() & OMP_INFOTYPE_DATA_TRANSFER) { - HDTTMapAccessorTy HDTTMap = HostDataToTargetMap.getExclusiveAccessor(); - LookupResult LR = lookupMapping(HDTTMap, HstPtrBegin, Size); - auto *HT = &*LR.Entry; - INFO(OMP_INFOTYPE_DATA_TRANSFER, DeviceID, - "Copying data from device to host, TgtPtr=" DPxMOD ", HstPtr=" DPxMOD - ", Size=%" PRId64 ", Name=%s\n", - DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin), Size, - (HT && HT->HstPtrName) ? getNameFromMapping(HT->HstPtrName).c_str() - : "unknown"); + HDTTMapAccessorTy HDTTMap = HostDataToTargetMap.getExclusiveAccessor(Entry); + LookupResult LR; + if (!Entry) { + LR = lookupMapping(HDTTMap, HstPtrBegin, Size); + Entry = LR.TPR.getEntry(); + } + printCopyInfo(DeviceID, /* H2D */ false, TgtPtrBegin, HstPtrBegin, Size, + Entry); } if (!RTL->data_retrieve_async || !RTL->synchronize) diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp index 2158b948bc9e..69af6ac24a41 100644 --- a/openmp/libomptarget/src/omptarget.cpp +++ b/openmp/libomptarget/src/omptarget.cpp @@ -633,6 +633,9 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, // may be considered a hack, we could revise the scheme in the future. bool UpdateRef = !(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) && !(FromMapper && I == 0); + + DeviceTy::HDTTMapAccessorTy HDTTMap = + Device.HostDataToTargetMap.getExclusiveAccessor(); if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) { DP("Has a pointer entry: \n"); // Base is address of pointer. @@ -649,9 +652,11 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, // PTR_AND_OBJ entry is handled below, and so the allocation might fail // when HasPresentModifier. PointerTpr = Device.getTargetPointer( - HstPtrBase, HstPtrBase, sizeof(void *), /*HstPtrName=*/nullptr, + HDTTMap, HstPtrBase, HstPtrBase, sizeof(void *), + /*HstPtrName=*/nullptr, /*HasFlagTo=*/false, /*HasFlagAlways=*/false, IsImplicit, UpdateRef, - HasCloseModifier, HasPresentModifier, HasHoldModifier, AsyncInfo); + HasCloseModifier, HasPresentModifier, HasHoldModifier, AsyncInfo, + /* OwnedTPR */ nullptr, /* ReleaseHDTTMap */ false); PointerTgtPtrBegin = PointerTpr.TargetPointer; IsHostPtr = PointerTpr.Flags.IsHostPointer; if (!PointerTgtPtrBegin) { @@ -675,10 +680,11 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, const bool HasFlagTo = ArgTypes[I] & OMP_TGT_MAPTYPE_TO; const bool HasFlagAlways = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS; + // Note that HDTTMap will be released in getTargetPointer. auto TPR = Device.getTargetPointer( - HstPtrBegin, HstPtrBase, DataSize, HstPtrName, HasFlagTo, HasFlagAlways, - IsImplicit, UpdateRef, HasCloseModifier, HasPresentModifier, - HasHoldModifier, AsyncInfo); + HDTTMap, HstPtrBegin, HstPtrBase, DataSize, HstPtrName, HasFlagTo, + HasFlagAlways, IsImplicit, UpdateRef, HasCloseModifier, + HasPresentModifier, HasHoldModifier, AsyncInfo, PointerTpr.getEntry()); void *TgtPtrBegin = TPR.TargetPointer; IsHostPtr = TPR.Flags.IsHostPointer; // If data_size==0, then the argument could be a zero-length pointer to @@ -701,55 +707,30 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, } if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) { - // Check whether we need to update the pointer on the device - bool UpdateDevPtr = false; uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; void *ExpectedTgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta); - Device.ShadowMtx.lock(); - auto Entry = Device.ShadowPtrMap.find(PointerHstPtrBegin); - // If this pointer is not in the map we need to insert it. If the map - // contains a stale entry, we need to update it (e.g. if the pointee was - // deallocated and later on is reallocated at another device address). The - // latter scenario is the subject of LIT test env/base_ptr_ref_count.c. An - // entry is removed from ShadowPtrMap only when the PTR of a PTR_AND_OBJ - // pair is deallocated, not when the OBJ is deallocated. In - // env/base_ptr_ref_count.c the PTR is a global "declare target" pointer, - // so it stays in the map for the lifetime of the application. When the - // OBJ is deallocated and later on allocated again (at a different device - // address), ShadowPtrMap still contains an entry for Pointer_HstPtrBegin - // which is stale, pointing to the old ExpectedTgtPtrBase of the OBJ. - if (Entry == Device.ShadowPtrMap.end() || - Entry->second.TgtPtrVal != ExpectedTgtPtrBase) { - // create or update shadow pointers for this entry - Device.ShadowPtrMap[PointerHstPtrBegin] = { - HstPtrBase, PointerTgtPtrBegin, ExpectedTgtPtrBase}; - PointerTpr.Entry->setMayContainAttachedPointers(); - UpdateDevPtr = true; - } - - if (UpdateDevPtr) { - std::lock_guard LG(*PointerTpr.Entry); - Device.ShadowMtx.unlock(); - + if (PointerTpr.getEntry()->addShadowPointer(ShadowPtrInfoTy{ + (void **)PointerHstPtrBegin, HstPtrBase, + (void **)PointerTgtPtrBegin, ExpectedTgtPtrBase})) { DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n", DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin)); void *&TgtPtrBase = AsyncInfo.getVoidPtrLocation(); TgtPtrBase = ExpectedTgtPtrBase; - int Ret = Device.submitData(PointerTgtPtrBegin, &TgtPtrBase, - sizeof(void *), AsyncInfo); + int Ret = + Device.submitData(PointerTgtPtrBegin, &TgtPtrBase, sizeof(void *), + AsyncInfo, PointerTpr.getEntry()); if (Ret != OFFLOAD_SUCCESS) { REPORT("Copying data to device failed.\n"); return OFFLOAD_FAIL; } - if (PointerTpr.Entry->addEventIfNecessary(Device, AsyncInfo) != + if (PointerTpr.getEntry()->addEventIfNecessary(Device, AsyncInfo) != OFFLOAD_SUCCESS) return OFFLOAD_FAIL; - } else - Device.ShadowMtx.unlock(); + } } } @@ -777,51 +758,11 @@ struct PostProcessingInfo { TargetPointerResultTy TPR; PostProcessingInfo(void *HstPtr, int64_t Size, int64_t ArgType, - int32_t ArgIndex, TargetPointerResultTy TPR) + int32_t ArgIndex, TargetPointerResultTy &&TPR) : HstPtrBegin(HstPtr), DataSize(Size), ArgType(ArgType), - ArgIndex(ArgIndex), TPR(TPR) {} + ArgIndex(ArgIndex), TPR(std::move(TPR)) {} }; -/// Apply \p CB to the shadow map pointer entries in the range \p Begin, to -/// \p Begin + \p Size. \p CB is called with a locked shadow pointer map and the -/// passed iterator can be updated. If the callback returns OFFLOAD_FAIL the -/// rest of the map is not checked anymore. -template -static void applyToShadowMapEntries(DeviceTy &Device, CBTy CB, void *Begin, - uintptr_t Size, - const TargetPointerResultTy &TPR) { - // If we have an object that is too small to hold a pointer subobject, no need - // to do any checking. - if (Size < sizeof(void *)) - return; - - // If the map entry for the object was never marked as containing attached - // pointers, no need to do any checking. - if (!TPR.Entry || !TPR.Entry->getMayContainAttachedPointers()) - return; - - uintptr_t LB = (uintptr_t)Begin; - uintptr_t UB = LB + Size; - // Now we are looking into the shadow map so we need to lock it. - std::lock_guard LG(Device.ShadowMtx); - for (ShadowPtrListTy::iterator Itr = Device.ShadowPtrMap.begin(); - Itr != Device.ShadowPtrMap.end();) { - uintptr_t ShadowHstPtrAddr = (uintptr_t)Itr->first; - - // An STL map is sorted on its keys; use this property - // to quickly determine when to break out of the loop. - if (ShadowHstPtrAddr < LB) { - ++Itr; - continue; - } - if (ShadowHstPtrAddr >= UB) - break; - - if (CB(Itr) == OFFLOAD_FAIL) - break; - } -} - } // namespace /// Applies the necessary post-processing procedures to entries listed in \p @@ -831,7 +772,7 @@ static void applyToShadowMapEntries(DeviceTy &Device, CBTy CB, void *Begin, /// according to the successfulness of the operations. [[nodiscard]] static int postProcessingTargetDataEnd(DeviceTy *Device, - SmallVector EntriesInfo, + SmallVector &EntriesInfo, bool FromMapper) { int Ret = OFFLOAD_SUCCESS; void *FromMapperBase = nullptr; @@ -859,10 +800,16 @@ postProcessingTargetDataEnd(DeviceTy *Device, // not request (exclusive) access to the HDTT map if DelEntry is // not set. DeviceTy::HDTTMapAccessorTy HDTTMap = - Device->HostDataToTargetMap.getExclusiveAccessor(!DelEntry); + Device->HostDataToTargetMap.getExclusiveAccessor(); - const bool IsNotLastUser = TPR.Entry->decDataEndThreadCount() != 0; - if (DelEntry && (TPR.Entry->getTotalRefCount() != 0 || IsNotLastUser)) { + // We cannot use a lock guard because we may end up delete the mutex. + // We also explicitly unlocked the entry after it was put in the EntriesInfo + // so it can be reused. + TPR.getEntry()->lock(); + auto *Entry = TPR.getEntry(); + + const bool IsNotLastUser = Entry->decDataEndThreadCount() != 0; + if (DelEntry && (Entry->getTotalRefCount() != 0 || IsNotLastUser)) { // The thread is not in charge of deletion anymore. Give up access // to the HDTT map and unset the deletion flag. HDTTMap.destroy(); @@ -874,44 +821,35 @@ postProcessingTargetDataEnd(DeviceTy *Device, // shadow copies. If the struct is going to be deallocated, remove any // remaining shadow pointer entries for this struct. const bool HasFrom = ArgType & OMP_TGT_MAPTYPE_FROM; - auto CB = [&](ShadowPtrListTy::iterator &Itr) { - // If we copied the struct to the host, we need to restore the - // pointer. - if (HasFrom) { - void **ShadowHstPtrAddr = (void **)Itr->first; - *ShadowHstPtrAddr = Itr->second.HstPtrVal; - DP("Restoring original host pointer value " DPxMOD " for host " - "pointer " DPxMOD "\n", - DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr)); - } - // If the struct is to be deallocated, remove the shadow entry. - if (DelEntry) { - DP("Removing shadow pointer " DPxMOD "\n", DPxPTR((void **)Itr->first)); - auto OldItr = Itr; - Itr++; - Device->ShadowPtrMap.erase(OldItr); - } else { - ++Itr; - } - return OFFLOAD_SUCCESS; - }; - applyToShadowMapEntries(*Device, CB, HstPtrBegin, DataSize, TPR); + if (HasFrom) { + Entry->foreachShadowPointerInfo( + [&](const ShadowPtrInfoTy &ShadowPtr) { + *ShadowPtr.HstPtrAddr = ShadowPtr.HstPtrVal; + DP("Restoring original host pointer value " DPxMOD " for host " + "pointer " DPxMOD "\n", + DPxPTR(ShadowPtr.HstPtrVal), DPxPTR(ShadowPtr.HstPtrAddr)); + return OFFLOAD_SUCCESS; + }); + } + + // Give up the lock as we either don't need it anymore (e.g., done with + // TPR), or erase TPR. + TPR.setEntry(nullptr); if (!DelEntry || (FromMapperBase && FromMapperBase == HstPtrBegin)) continue; - // If we are deleting the entry the DataMapMtx is locked and we own - // the entry. - Ret = Device->eraseMapEntry(HDTTMap, TPR.Entry, DataSize); + Ret = Device->eraseMapEntry(HDTTMap, Entry, DataSize); // Entry is already remove from the map, we can unlock it now. HDTTMap.destroy(); - Ret |= Device->deallocTgtPtrAndEntry(TPR.Entry, DataSize); + Ret |= Device->deallocTgtPtrAndEntry(Entry, DataSize); if (Ret != OFFLOAD_SUCCESS) { REPORT("Deallocating data from device failed.\n"); break; } } + delete &EntriesInfo; return Ret; } @@ -921,7 +859,7 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, int64_t *ArgTypes, map_var_info_t *ArgNames, void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) { int Ret = OFFLOAD_SUCCESS; - SmallVector PostProcessingPtrs; + auto *PostProcessingPtrs = new SmallVector(); void *FromMapperBase = nullptr; // process each input. for (int32_t I = ArgNum - 1; I >= 0; --I) { @@ -972,7 +910,6 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, } } - bool IsLast, IsHostPtr; bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT; bool UpdateRef = (!(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) || (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) && @@ -982,9 +919,9 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD; // If PTR_AND_OBJ, HstPtrBegin is address of pointee - TargetPointerResultTy TPR = Device.getTgtPtrBegin( - HstPtrBegin, DataSize, IsLast, UpdateRef, HasHoldModifier, IsHostPtr, - !IsImplicit, ForceDelete, /*FromDataEnd=*/true); + TargetPointerResultTy TPR = + Device.getTgtPtrBegin(HstPtrBegin, DataSize, UpdateRef, HasHoldModifier, + !IsImplicit, ForceDelete, /*FromDataEnd=*/true); void *TgtPtrBegin = TPR.TargetPointer; if (!TPR.isPresent() && !TPR.isHostPointer() && (DataSize || HasPresentModifier)) { @@ -1014,7 +951,7 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, } else { DP("There are %" PRId64 " bytes allocated at target address " DPxMOD " - is%s last\n", - DataSize, DPxPTR(TgtPtrBegin), (IsLast ? "" : " not")); + DataSize, DPxPTR(TgtPtrBegin), (TPR.Flags.IsLast ? "" : " not")); } // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 351 L14-16: @@ -1028,20 +965,21 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, // Move data back to the host const bool HasAlways = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS; const bool HasFrom = ArgTypes[I] & OMP_TGT_MAPTYPE_FROM; - if (HasFrom && (HasAlways || IsLast) && !IsHostPtr && DataSize != 0) { + if (HasFrom && (HasAlways || TPR.Flags.IsLast) && + !TPR.Flags.IsHostPointer && DataSize != 0) { DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); - std::lock_guard LG(*TPR.Entry); // Wait for any previous transfer if an event is present. - if (void *Event = TPR.Entry->getEvent()) { + if (void *Event = TPR.getEntry()->getEvent()) { if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) { REPORT("Failed to wait for event " DPxMOD ".\n", DPxPTR(Event)); return OFFLOAD_FAIL; } } - Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize, AsyncInfo); + Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize, AsyncInfo, + TPR.getEntry()); if (Ret != OFFLOAD_SUCCESS) { REPORT("Copying data from device failed.\n"); return OFFLOAD_FAIL; @@ -1052,26 +990,26 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, // as the entry can be reused and the reuse might happen after the // copy-back was issued but before it completed. Since the reuse might // also copy-back a value we would race. - if (IsLast) { - if (TPR.Entry->addEventIfNecessary(Device, AsyncInfo) != + if (TPR.Flags.IsLast) { + if (TPR.getEntry()->addEventIfNecessary(Device, AsyncInfo) != OFFLOAD_SUCCESS) return OFFLOAD_FAIL; } } // Add pointer to the buffer for post-synchronize processing. - PostProcessingPtrs.emplace_back(HstPtrBegin, DataSize, ArgTypes[I], I, TPR); + PostProcessingPtrs->emplace_back(HstPtrBegin, DataSize, ArgTypes[I], I, + std::move(TPR)); + PostProcessingPtrs->back().TPR.getEntry()->unlock(); } // Add post-processing functions // TODO: We might want to remove `mutable` in the future by not changing the // captured variables somehow. - AsyncInfo.addPostProcessingFunction( - [=, Device = &Device, - PostProcessingPtrs = std::move(PostProcessingPtrs)]() mutable -> int { - return postProcessingTargetDataEnd(Device, PostProcessingPtrs, - FromMapperBase); - }); + AsyncInfo.addPostProcessingFunction([=, Device = &Device]() mutable -> int { + return postProcessingTargetDataEnd(Device, *PostProcessingPtrs, + FromMapperBase); + }); return Ret; } @@ -1080,10 +1018,9 @@ static int targetDataContiguous(ident_t *Loc, DeviceTy &Device, void *ArgsBase, void *HstPtrBegin, int64_t ArgSize, int64_t ArgType, AsyncInfoTy &AsyncInfo) { TIMESCOPE_WITH_IDENT(Loc); - bool IsLast, IsHostPtr; - TargetPointerResultTy TPR = Device.getTgtPtrBegin( - HstPtrBegin, ArgSize, IsLast, /*UpdateRefCount=*/false, - /*UseHoldRefCount=*/false, IsHostPtr, /*MustContain=*/true); + TargetPointerResultTy TPR = + Device.getTgtPtrBegin(HstPtrBegin, ArgSize, /*UpdateRefCount=*/false, + /*UseHoldRefCount=*/false, /*MustContain=*/true); void *TgtPtrBegin = TPR.TargetPointer; if (!TPR.isPresent()) { DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin)); @@ -1096,16 +1033,48 @@ static int targetDataContiguous(ident_t *Loc, DeviceTy &Device, void *ArgsBase, return OFFLOAD_SUCCESS; } - if (IsHostPtr) { + if (TPR.Flags.IsHostPointer) { DP("hst data:" DPxMOD " unified and shared, becomes a noop\n", DPxPTR(HstPtrBegin)); return OFFLOAD_SUCCESS; } + if (ArgType & OMP_TGT_MAPTYPE_TO) { + DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", + ArgSize, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); + int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, AsyncInfo, + TPR.getEntry()); + if (Ret != OFFLOAD_SUCCESS) { + REPORT("Copying data to device failed.\n"); + return OFFLOAD_FAIL; + } + if (TPR.getEntry()) { + int Ret = TPR.getEntry()->foreachShadowPointerInfo( + [&](const ShadowPtrInfoTy &ShadowPtr) { + DP("Restoring original target pointer value " DPxMOD " for target " + "pointer " DPxMOD "\n", + DPxPTR(ShadowPtr.TgtPtrVal), DPxPTR(ShadowPtr.TgtPtrAddr)); + Ret = Device.submitData(ShadowPtr.TgtPtrAddr, + (void *)&ShadowPtr.TgtPtrVal, + sizeof(void *), AsyncInfo); + if (Ret != OFFLOAD_SUCCESS) { + REPORT("Copying data to device failed.\n"); + return OFFLOAD_FAIL; + } + return OFFLOAD_SUCCESS; + }); + if (Ret != OFFLOAD_SUCCESS) { + DP("Updating shadow map failed\n"); + return Ret; + } + } + } + if (ArgType & OMP_TGT_MAPTYPE_FROM) { DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", ArgSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); - int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, AsyncInfo); + int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, AsyncInfo, + TPR.getEntry()); if (Ret != OFFLOAD_SUCCESS) { REPORT("Copying data from device failed.\n"); return OFFLOAD_FAIL; @@ -1113,44 +1082,26 @@ static int targetDataContiguous(ident_t *Loc, DeviceTy &Device, void *ArgsBase, // Wait for device-to-host memcopies for whole struct to complete, // before restoring the correct host pointer. - AsyncInfo.addPostProcessingFunction([=, Device = &Device]() -> int { - auto CB = [&](ShadowPtrListTy::iterator &Itr) { - void **ShadowHstPtrAddr = (void **)Itr->first; - *ShadowHstPtrAddr = Itr->second.HstPtrVal; - DP("Restoring original host pointer value " DPxMOD - " for host pointer " DPxMOD "\n", - DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr)); - ++Itr; + if (auto *Entry = TPR.getEntry()) { + AsyncInfo.addPostProcessingFunction([=]() -> int { + int Ret = Entry->foreachShadowPointerInfo( + [&](const ShadowPtrInfoTy &ShadowPtr) { + *ShadowPtr.HstPtrAddr = ShadowPtr.HstPtrVal; + DP("Restoring original host pointer value " DPxMOD + " for host pointer " DPxMOD "\n", + DPxPTR(ShadowPtr.HstPtrVal), DPxPTR(ShadowPtr.HstPtrAddr)); + return OFFLOAD_SUCCESS; + }); + Entry->unlock(); + if (Ret != OFFLOAD_SUCCESS) { + DP("Updating shadow map failed\n"); + return Ret; + } return OFFLOAD_SUCCESS; - }; - applyToShadowMapEntries(*Device, CB, HstPtrBegin, ArgSize, TPR); - - return OFFLOAD_SUCCESS; - }); - } - - if (ArgType & OMP_TGT_MAPTYPE_TO) { - DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", - ArgSize, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); - int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, AsyncInfo); - if (Ret != OFFLOAD_SUCCESS) { - REPORT("Copying data to device failed.\n"); - return OFFLOAD_FAIL; + }); } - - auto CB = [&](ShadowPtrListTy::iterator &Itr) { - DP("Restoring original target pointer value " DPxMOD " for target " - "pointer " DPxMOD "\n", - DPxPTR(Itr->second.TgtPtrVal), DPxPTR(Itr->second.TgtPtrAddr)); - Ret = Device.submitData(Itr->second.TgtPtrAddr, &Itr->second.TgtPtrVal, - sizeof(void *), AsyncInfo); - if (Ret != OFFLOAD_SUCCESS) - REPORT("Copying data to device failed.\n"); - ++Itr; - return Ret; - }; - applyToShadowMapEntries(Device, CB, HstPtrBegin, ArgSize, TPR); } + return OFFLOAD_SUCCESS; } @@ -1538,7 +1489,6 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr, void *HstPtrVal = Args[I]; void *HstPtrBegin = ArgBases[I]; void *HstPtrBase = Args[Idx]; - bool IsLast, IsHostPtr; // IsLast is unused. void *TgtPtrBase = (void *)((intptr_t)TgtArgs[TgtIdx] + TgtOffsets[TgtIdx]); DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase)); @@ -1546,15 +1496,15 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr, void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta); void *&PointerTgtPtrBegin = AsyncInfo.getVoidPtrLocation(); TargetPointerResultTy TPR = Device.getTgtPtrBegin( - HstPtrVal, ArgSizes[I], IsLast, /*UpdateRefCount=*/false, - /*UseHoldRefCount=*/false, IsHostPtr); + HstPtrVal, ArgSizes[I], /*UpdateRefCount=*/false, + /*UseHoldRefCount=*/false); PointerTgtPtrBegin = TPR.TargetPointer; if (!TPR.isPresent()) { DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n", DPxPTR(HstPtrVal)); continue; } - if (IsHostPtr) { + if (TPR.Flags.IsHostPointer) { DP("Unified memory is active, no need to map lambda captured" "variable (" DPxMOD ")\n", DPxPTR(HstPtrVal)); @@ -1563,7 +1513,7 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr, DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n", DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin)); Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin, - sizeof(void *), AsyncInfo); + sizeof(void *), AsyncInfo, TPR.getEntry()); if (Ret != OFFLOAD_SUCCESS) { REPORT("Copying data to device failed.\n"); return OFFLOAD_FAIL; @@ -1576,7 +1526,6 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr, void *TgtPtrBegin; map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I]; ptrdiff_t TgtBaseOffset; - bool IsLast, IsHostPtr; // unused. TargetPointerResultTy TPR; if (ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) { DP("Forwarding first-private value " DPxMOD " to the target construct\n", @@ -1603,9 +1552,9 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr, } else { if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) HstPtrBase = *reinterpret_cast(HstPtrBase); - TPR = Device.getTgtPtrBegin(HstPtrBegin, ArgSizes[I], IsLast, + TPR = Device.getTgtPtrBegin(HstPtrBegin, ArgSizes[I], /*UpdateRefCount=*/false, - /*UseHoldRefCount=*/false, IsHostPtr); + /*UseHoldRefCount=*/false); TgtPtrBegin = TPR.TargetPointer; TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; #ifdef OMPTARGET_DEBUG