Jonas Plewinski / pystencils · Commits · a57a164a

Commit a57a164a, authored Sep 22, 2019 by Stephan Seitz

llvm: Mark CUDA kernels and load/call resulting ptx with pycuda

Parent: 2e6f3efe
Changes: 3 files
pystencils/llvm/llvm.py
@@ -13,6 +13,24 @@ from pystencils.data_types import (
from pystencils.llvm.control_flow import Loop


# From Numba
def set_cuda_kernel(lfunc):
    from llvmlite.llvmpy.core import MetaData, MetaDataString, Constant, Type
    m = lfunc.module
    ops = lfunc, MetaDataString.get(m, "kernel"), Constant.int(Type.int(), 1)
    md = MetaData.get(m, ops)
    nmd = m.get_or_insert_named_metadata('nvvm.annotations')
    nmd.add(md)

    # set nvvm ir version
    i32 = ir.IntType(32)
    md_ver = m.add_metadata([i32(1), i32(2), i32(2), i32(0)])
    m.add_named_metadata('nvvmir.version', md_ver)


# From Numba
def _call_sreg(builder, name):
    module = builder.module
@@ -191,6 +209,9 @@ class LLVMPrinter(Printer):
        self._print(func.body)
        self.builder.ret_void()
        self.fn = fn
+       if self.target == 'gpu':
+           set_cuda_kernel(fn)
        return fn

    def _print_Block(self, block):
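The set_cuda_kernel helper above is adapted from Numba and uses the legacy llvmlite.llvmpy compatibility layer. As a rough, hedged illustration of what that marking amounts to, the standalone sketch below does the same thing with the plain llvmlite.ir API; the example module, the empty my_kernel function and the triple are made up for the sketch and are not part of this commit.

# Sketch (assumption: llvmlite.ir instead of the llvmlite.llvmpy layer used above).
# Attaching a function to the nvvm.annotations named metadata is what makes
# NVVM/llc emit it as a PTX entry point (.entry) instead of a plain device function.
from llvmlite import ir

module = ir.Module(name='example')
module.triple = 'nvptx64-nvidia-cuda'

# A made-up, empty kernel just to have something to annotate.
fnty = ir.FunctionType(ir.VoidType(), [])
kernel_fn = ir.Function(module, fnty, name='my_kernel')
builder = ir.IRBuilder(kernel_fn.append_basic_block('entry'))
builder.ret_void()

# Equivalent of set_cuda_kernel(kernel_fn), producing roughly
#   !nvvm.annotations = !{!0}   with   !0 = !{@my_kernel, !"kernel", i32 1}
md = module.add_metadata([kernel_fn, ir.MetaDataString(module, 'kernel'), ir.IntType(32)(1)])
module.add_named_metadata('nvvm.annotations', md)

# Equivalent of the nvvmir.version metadata written above (same constants: 1, 2, 2, 0).
i32 = ir.IntType(32)
module.add_named_metadata('nvvmir.version', module.add_metadata([i32(1), i32(2), i32(2), i32(0)]))

print(module)  # the printed IR now contains both named metadata nodes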
pystencils/llvm/llvmjit.py
import ctypes as ct
import subprocess
from functools import partial
from itertools import chain
from os.path import exists, join

import llvmlite.binding as llvm
@@ -103,9 +105,9 @@ def generate_and_jit(ast):
    target = 'gpu' if ast._backend == 'llvm_gpu' else 'cpu'
    gen = generate_llvm(ast, target=target)
    if isinstance(gen, ir.Module):
-       return compile_llvm(gen, target)
+       return compile_llvm(gen, target, ast)
    else:
-       return compile_llvm(gen.module, target)
+       return compile_llvm(gen.module, target, ast)


def make_python_function(ast, argument_dict={}, func=None):
@@ -120,8 +122,8 @@ def make_python_function(ast, argument_dict={}, func=None):
        return lambda: func(*args)


-def compile_llvm(module, target='cpu'):
-   jit = CudaJit() if target == "gpu" else Jit()
+def compile_llvm(module, target='cpu', ast=None):
+   jit = CudaJit(ast) if target == "gpu" else Jit()
    jit.parse(module)
    jit.optimize()
    jit.compile()
@@ -243,12 +245,13 @@ class CudaJit(Jit):
    default_data_layout = data_layout[MACHINE_BITS]

-   def __init__(self):
+   def __init__(self, ast):
        # super().__init__()
        # self.target = llvm.Target.from_triple(self.CUDA_TRIPLE[self.MACHINE_BITS])
        self._data_layout = self.default_data_layout[self.MACHINE_BITS]
        # self._target_data = llvm.create_target_data(self._data_layout)
+       self.indexing = ast.indexing

    def optimize(self):
        pmb = llvm.create_pass_manager_builder()
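With compile_llvm now taking the ast and CudaJit storing ast.indexing, the launch configuration can later be derived automatically in __call__. Below is a hedged end-to-end sketch of the intended flow; the create_kernel import path for the LLVM backend is an assumption (it is not shown in these hunks), and pycuda plus a CUDA-capable device are required.

# Assumed usage after this commit; mirrors the updated test further below.
import numpy as np
import pycuda.autoinit  # noqa: F401 -- creates a CUDA context
from pycuda.gpuarray import to_gpu

from pystencils import Assignment, Field
from pystencils.llvm import create_kernel            # assumed import path, not shown in this diff
from pystencils.llvm.llvmjit import generate_and_jit

src = np.random.rand(30, 20)
dst = np.zeros((30, 20))
f = Field.create_from_numpy_array("f", src)
d = Field.create_from_numpy_array("d", dst)
jacobi = Assignment(d[0, 0], (f[1, 0] + f[-1, 0] + f[0, 1] + f[0, -1]) / 4)

ast = create_kernel([jacobi], target='gpu')          # the AST carries the .indexing object
jit = generate_and_jit(ast)                          # -> compile_llvm(gen, target, ast) -> CudaJit(ast)
jit('kernel', to_gpu(dst), to_gpu(src))              # CudaJit.__call__ launches the loaded PTX kernel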
@@ -278,7 +281,6 @@ class CudaJit(Jit):
        llvmmod.verify()
        llvmmod.name = 'module'
        self.module = str(llvmmod)
        self._llvmmod = llvm.parse_assembly(str(llvmmod))

    def compile(self):
@@ -287,48 +289,30 @@ class CudaJit(Jit):
        compiler_cache = get_cache_config()['object_cache']
        ir_file = join(compiler_cache, hashlib.md5(str(self._llvmmod).encode()).hexdigest() + '.ll')
        ptx_file = ir_file.replace('.ll', '.ptx')
        try:
            from pycuda.driver import Context
            arch = "sm_%d%d" % Context.get_device().compute_capability()
        except Exception:
            arch = "sm_35"
        if not exists(ptx_file):
            self.write_ll(ir_file)
            try:
                from pycuda.driver import Context
                arch = "sm_%d%d" % Context.get_device().compute_capability()
            except Exception:
                arch = "sm_35"
            subprocess.check_call(['llc-10', '-mcpu=' + arch, ir_file, '-o', ptx_file])

        # TODO: make loading of ptx work
        # import pycuda.autoinit
        # def handler(compile_success_bool, info_str, error_str):
        #     if not compile_success_bool:
        #         print(info_str)
        #         print(error_str)
        # cubin_file = ir_file.replace('.ll', '.cubin')
        # if not exists(cubin_file):
        #     subprocess.check_call(['ptxas', '--gpu-name', arch, ptx_file, '-o', cubin_file])
        import pycuda.driver
        # # with open(ptx_file, 'rb') as f:
        # #     ptx_code = f.read()
        # # from pycuda.driver import jit_input_type
        # # self.linker.add_data(ptx_code, jit_input_type.PTX, 'foo')
        # from pycuda.compiler import DynamicModule
        # from pycuda.driver import jit_input_type
        # module = DynamicModule().add_file(ptx_file, jit_input_type.PTX)
        # module.link()
        # # cuda_module = pycuda.driver.module_from_buffer(ptx_code, message_handler=handler)
        # # print(dir(cuda_module))
        # self.fptr = dict()
        # module.get_function('kernel')
        cuda_module = pycuda.driver.module_from_file(ptx_file)  # also works: cubin_file
        self.cuda_module = cuda_module

    def __call__(self, func, *args, **kwargs):
-       fptr = {}
-       for func in self.module.functions:
-           if not func.is_declaration:
-               return_type = None
-               if func.ftype.return_type != ir.VoidType():
-                   return_type = to_ctypes(create_composite_type_from_string(str(func.ftype.return_type)))
-               args = [ctypes_from_llvm(arg) for arg in func.ftype.args]
-               function_address = self.ee.get_function_address(func.name)
-               fptr[func.name] = ct.CFUNCTYPE(return_type, *args)(function_address)
-       self.fptr = fptr
+       shape = [a.shape for a in chain(args, kwargs.values()) if hasattr(a, 'shape')][0]
+       block_and_thread_numbers = self.indexing.call_parameters(shape)
+       block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
+       block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])

+       self.cuda_module.get_function(func)(*args, **kwargs, **block_and_thread_numbers)

    def get_function_ptr(self, name):
        return partial(self._call__, name)
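Stripped of the commented-out experiments, the active path above is: write the LLVM IR, run llc-10 to produce PTX, load that PTX with pycuda and launch the kernel by name with the block/grid sizes from indexing.call_parameters(). A rough standalone sketch of the loading and launching step follows; kernel.ptx, the entry name kernel, the launch sizes and the array shapes are placeholders, and an initialized CUDA context (pycuda.autoinit) is assumed.

# Sketch of loading and calling a PTX file with pycuda (placeholder names).
import numpy as np
import pycuda.autoinit  # noqa: F401 -- creates a CUDA context
import pycuda.driver as cuda
from pycuda.gpuarray import to_gpu

cuda_module = cuda.module_from_file('kernel.ptx')   # e.g. the file produced by llc-10 above
kernel = cuda_module.get_function('kernel')

dst = to_gpu(np.zeros((30, 20)))
src = to_gpu(np.random.rand(30, 20))

# block and grid are tuples of plain Python ints, which is presumably why
# CudaJit.__call__ converts the values from indexing.call_parameters() with int().
kernel(dst, src, block=(16, 16, 1), grid=(2, 2))    # argument list must match the kernel signature
print(dst.get())                                    # copy the result back to the host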
pystencils_tests/test_jacobi_llvm.py
@@ -33,13 +33,19 @@ def test_jacobi_fixed_field_size():
def test_jacobi_fixed_field_size_gpu():
    size = (30, 20)
+   import pycuda.autoinit  # noqa
+   from pycuda.gpuarray import to_gpu
    src_field_llvm = np.random.rand(*size)
    src_field_py = np.copy(src_field_llvm)
    dst_field_llvm = np.zeros(size)
    dst_field_py = np.zeros(size)

-   f = Field.create_from_numpy_array("f", src_field_llvm)
-   d = Field.create_from_numpy_array("d", dst_field_llvm)
+   f = Field.create_from_numpy_array("f", src_field_py)
+   d = Field.create_from_numpy_array("d", dst_field_py)

+   src_field_llvm = to_gpu(src_field_llvm)
+   dst_field_llvm = to_gpu(dst_field_llvm)

    jacobi = Assignment(d[0, 0], (f[1, 0] + f[-1, 0] + f[0, 1] + f[0, -1]) / 4)
    ast = create_kernel([jacobi], target='gpu')
@@ -52,7 +58,7 @@ def test_jacobi_fixed_field_size_gpu():
    jit = generate_and_jit(ast)
    jit('kernel', dst_field_llvm, src_field_llvm)
-   error = np.sum(np.abs(dst_field_py - dst_field_llvm))
+   error = np.sum(np.abs(dst_field_py - dst_field_llvm.get()))
    np.testing.assert_almost_equal(error, 0.0)
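The change to the assertion is a consequence of moving the destination array to the GPU: a pycuda GPUArray lives on the device, so it has to be copied back with .get() before it can be compared against the host result. A minimal round-trip sketch (the shape is arbitrary; pycuda and a CUDA device are assumed):

import numpy as np
import pycuda.autoinit  # noqa: F401
from pycuda.gpuarray import to_gpu

host = np.random.rand(30, 20)
device = to_gpu(host)        # host -> device copy
roundtrip = device.get()     # device -> host copy, a plain numpy array again
np.testing.assert_almost_equal(np.sum(np.abs(host - roundtrip)), 0.0)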