Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 10 additions & 7 deletions cuda_core/cuda/core/_device.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -79,9 +79,12 @@ cdef class DeviceProperties:

cdef inline int _get_cached_attribute(self, attr, default=0) except? -2:
"""Retrieve the attribute value, using cache if applicable."""
if attr not in self._cache:
self._cache[attr] = self._get_attribute(attr, default)
return self._cache[attr]
cached = self._cache.get(attr)
if cached is not None:
return cached
cdef int value = self._get_attribute(attr, default)
self._cache[attr] = value # setdefault not needed for ints
return value

@property
def max_threads_per_block(self) -> int:
Expand Down Expand Up @@ -1125,11 +1128,11 @@ class Device:
def compute_capability(self) -> ComputeCapability:
"""Return a named tuple with 2 fields: major and minor."""
cdef DeviceProperties prop = self.properties
if "compute_capability" in prop._cache:
return prop._cache["compute_capability"]
cached = prop._cache.get("compute_capability")
if cached is not None:
return cached
cc = ComputeCapability(prop.compute_capability_major, prop.compute_capability_minor)
prop._cache["compute_capability"] = cc
return cc
return prop._cache.setdefault("compute_capability", cc)

@property
def arch(self) -> str:
Expand Down
3 changes: 3 additions & 0 deletions cuda_core/cuda/core/_device_resources.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
#
# SPDX-License-Identifier: Apache-2.0

cimport cython

from cuda.bindings cimport cydriver
from cuda.core._resource_handles cimport ContextHandle, GreenCtxHandle

Expand All @@ -15,6 +17,7 @@ cdef class SMResource:
unsigned int _flags
bint _is_usable
object __weakref__
cython.pymutex _split_mutex

@staticmethod
cdef SMResource _from_dev_resource(cydriver.CUdevResource res, int device_id)
Expand Down
9 changes: 5 additions & 4 deletions cuda_core/cuda/core/_device_resources.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -493,10 +493,11 @@ cdef class SMResource:
)
_resolve_group_count(opts)
_check_green_ctx_support()
if _can_use_structured_sm_split():
return _split_with_general_api(self, opts, dry_run)
# SplitByCount requires the same 12.4+ as green ctx support (already checked above)
return _split_with_count_api(self, opts, dry_run)
with self._split_mutex:
if _can_use_structured_sm_split():
return _split_with_general_api(self, opts, dry_run)
# SplitByCount requires the same 12.4+ as green ctx support (already checked above)
return _split_with_count_api(self, opts, dry_run)
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pointing this out inline, because this is probably the thing I am least sure about. Should SMResource.split() be thread-safe if called on the same resource from multiple threads?



cdef class WorkqueueResource:
Expand Down
16 changes: 9 additions & 7 deletions cuda_core/cuda/core/_memory/_buffer.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
# SPDX-License-Identifier: Apache-2.0

from libc.stdint cimport uintptr_t
from libcpp cimport bool as cpp_bool
from libcpp.atomic cimport atomic as std_atomic, memory_order_acquire, memory_order_release

from cuda.bindings cimport cydriver
from cuda.core._resource_handles cimport DevicePtrHandle
Expand All @@ -18,13 +20,13 @@ cdef struct _MemAttrs:

cdef class Buffer:
cdef:
DevicePtrHandle _h_ptr
MemoryResource _memory_resource
object _ipc_data
object _owner
_MemAttrs _mem_attrs
bint _mem_attrs_inited
object __weakref__
DevicePtrHandle _h_ptr
MemoryResource _memory_resource
object _ipc_data
object _owner
_MemAttrs _mem_attrs
std_atomic[cpp_bool] _mem_attrs_inited
object __weakref__
cdef public:
# Python code in _memory/_virtual_memory_resource.py needs to update
# this value, though it is technically private.
Expand Down
11 changes: 6 additions & 5 deletions cuda_core/cuda/core/_memory/_buffer.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ cdef class Buffer:
self._memory_resource = None
self._ipc_data = None
self._owner = None
self._mem_attrs_inited = False
self._mem_attrs_inited.store(False)

def __init__(self, *args, **kwargs):
raise RuntimeError("Buffer objects cannot be instantiated directly. "
Expand Down Expand Up @@ -123,7 +123,7 @@ cdef class Buffer:
self._memory_resource = mr
self._ipc_data = IPCDataForBuffer(ipc_descriptor, True) if ipc_descriptor is not None else None
self._owner = owner
self._mem_attrs_inited = False
self._mem_attrs_inited.store(False)
return self

@staticmethod
Expand Down Expand Up @@ -188,6 +188,7 @@ cdef class Buffer:
return _ipc.Buffer_from_ipc_descriptor(cls, mr, ipc_descriptor, stream)

@property
@cython.critical_section
def ipc_descriptor(self) -> IPCBufferDescriptor:
"""Descriptor for sharing this buffer with other processes."""
if self._ipc_data is None:
Expand Down Expand Up @@ -444,9 +445,9 @@ cdef class Buffer:
# ------------------------------
cdef inline void _init_mem_attrs(Buffer self):
"""Initialize memory attributes by querying the pointer."""
if not self._mem_attrs_inited:
if not self._mem_attrs_inited.load(memory_order_acquire):
_query_memory_attrs(self._mem_attrs, as_cu(self._h_ptr))
self._mem_attrs_inited = True
self._mem_attrs_inited.store(True, memory_order_release)


cdef inline int _query_memory_attrs(
Expand Down Expand Up @@ -588,7 +589,7 @@ cdef Buffer Buffer_from_deviceptr_handle(
buf._memory_resource = mr
buf._ipc_data = IPCDataForBuffer(ipc_descriptor, True) if ipc_descriptor is not None else None
buf._owner = None
buf._mem_attrs_inited = False
buf._mem_attrs_inited.store(False)
return buf


Expand Down
14 changes: 11 additions & 3 deletions cuda_core/cuda/core/_memory/_graph_memory_resource.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ from cuda.core._resource_handles cimport (
from cuda.core._stream cimport Stream_accept, Stream
from cuda.core._utils.cuda_utils cimport HANDLE_RETURN

from functools import cache

__all__ = ['GraphMemoryResource']

Expand Down Expand Up @@ -149,6 +148,8 @@ cdef class cyGraphMemoryResource(MemoryResource):
return False


cdef dict _mem_resource_cache = {}

class GraphMemoryResource(cyGraphMemoryResource):
"""
A memory resource for memory related to graphs.
Expand All @@ -173,9 +174,16 @@ class GraphMemoryResource(cyGraphMemoryResource):
return cls._create(c_device_id)

@classmethod
@cache
def _create(cls, int device_id):
return cyGraphMemoryResource.__new__(cls, device_id)
# We use a plain dict for now because functools.cache is currently less
# thread-safe; see also: https://github.com/python/cpython/issues/150708
res = _mem_resource_cache.get(device_id)
if res is not None:
return res

# Create a new instance; if another thread won the race, setdefault returns its cached instance instead:
new = cyGraphMemoryResource.__new__(cls, device_id)
return _mem_resource_cache.setdefault(device_id, new)


# Raise an exception if the given stream is capturing.
Expand Down
2 changes: 2 additions & 0 deletions cuda_core/cuda/core/_memory/_memory_pool.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from __future__ import annotations

cimport cython
from libc.limits cimport ULLONG_MAX
from libc.stdint cimport uintptr_t
from libc.string cimport memset
Expand Down Expand Up @@ -164,6 +165,7 @@ cdef class _MemPool(MemoryResource):
_MP_deallocate(self, <uintptr_t>ptr, size, s)

@property
@cython.critical_section
def attributes(self) -> _MemPoolAttributes:
"""Memory pool attributes."""
if self._attributes is None:
Expand Down
6 changes: 5 additions & 1 deletion cuda_core/cuda/core/_memoryview.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from __future__ import annotations

cimport cython
from ._dlpack cimport *
from ._dlpack import classify_dl_device
from libc.stdint cimport intptr_t
Expand Down Expand Up @@ -78,7 +79,7 @@ cdef inline bint _is_torch_tensor(object obj):
cdef str mod = tp.__module__ or ""
cdef bint result = mod.startswith("torch") and hasattr(obj, "data_ptr") \
and _torch_version_check()
_torch_type_cache[tp] = result
_torch_type_cache[tp] = result # setdefault not needed for bools
return result


Expand Down Expand Up @@ -533,6 +534,7 @@ cdef class StridedMemoryView:
+ f" readonly={self.readonly},\n"
+ f" exporting_obj={get_simple_repr(self.exporting_obj)})")

@cython.critical_section
cdef inline _StridedLayout get_layout(self):
if self._layout is None:
if self.dl_tensor:
Expand All @@ -543,6 +545,7 @@ cdef class StridedMemoryView:
raise ValueError("Cannot infer layout from the exporting object")
return self._layout

@cython.critical_section
cdef inline object get_buffer(self):
"""
Returns Buffer instance with the underlying data.
Expand All @@ -556,6 +559,7 @@ cdef class StridedMemoryView:
self._buffer = Buffer.from_handle(self.ptr, 0, owner=self.exporting_obj)
return self._buffer

@cython.critical_section
cdef inline object get_dtype(self):
if self._dtype is None:
if self.dl_tensor != NULL:
Expand Down
6 changes: 5 additions & 1 deletion cuda_core/cuda/core/_module.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from __future__ import annotations

cimport cython
from libc.stddef cimport size_t

from collections import namedtuple
Expand Down Expand Up @@ -82,7 +83,7 @@ cdef class KernelAttributes:
cdef int result
with nogil:
HANDLE_RETURN(cydriver.cuKernelGetAttribute(&result, attribute, as_cu(self._h_kernel), device_id))
self._cache[cache_key] = result
self._cache[cache_key] = result # setdefault not needed for ints
return result

def __getitem__(self, device) -> KernelAttributes:
Expand Down Expand Up @@ -451,6 +452,7 @@ cdef class Kernel:
return ker

@property
@cython.critical_section
def attributes(self) -> KernelAttributes:
"""Get the read-only attributes of this kernel."""
if self._attributes is None:
Expand Down Expand Up @@ -498,6 +500,7 @@ cdef class Kernel:
return param_info

@property
@cython.critical_section
def occupancy(self) -> KernelOccupancy:
"""Get the occupancy information for launching this kernel."""
if self._occupancy is None:
Expand Down Expand Up @@ -729,6 +732,7 @@ cdef class ObjectCode:

# TODO: do we want to unload in a finalizer? Probably not.

@cython.critical_section
cdef int _lazy_load_module(self) except -1:
if self._h_library:
return 0
Expand Down
8 changes: 4 additions & 4 deletions cuda_core/cuda/core/graph/_graph_node.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,7 @@ _node_registry = weakref.WeakValueDictionary()


cdef inline GraphNode _registered(GraphNode n):
_node_registry[<uintptr_t>n._h_node.get()] = n
return n
return _node_registry.setdefault(<uintptr_t>n._h_node.get(), n)


cdef class GraphNode:
Expand Down Expand Up @@ -156,10 +155,11 @@ cdef class GraphNode:
cdef cydriver.CUgraphNode node = as_cu(self._h_node)
if node == NULL:
return
with nogil:
HANDLE_RETURN(cydriver.cuGraphDestroyNode(node))

_node_registry.pop(<uintptr_t>self._h_node.get(), None)
invalidate_graph_node(self._h_node)
with nogil:
HANDLE_RETURN(cydriver.cuGraphDestroyNode(node))

@property
def pred(self):
Expand Down
Loading