Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
pycodegen
pystencils
Commits
e91f23f7
Commit
e91f23f7
authored
Nov 11, 2020
by
Julian Hammer
Committed by
Markus Holzer
Nov 11, 2020
Browse files
improved kc coupling
parent
f3e81539
Changes
3
Hide whitespace changes
Inline
Side-by-side
pystencils/kerncraft_coupling/kerncraft_interface.py
View file @
e91f23f7
...
...
@@ -2,21 +2,23 @@ import warnings
import
fcntl
from
collections
import
defaultdict
from
tempfile
import
TemporaryDirectory
from
typing
import
Optional
from
jinja2
import
Environment
,
PackageLoader
,
StrictUndefined
import
textwrap
from
jinja2
import
Environment
,
PackageLoader
,
StrictUndefined
,
Template
import
sympy
as
sp
from
kerncraft.kerncraft
import
KernelCode
from
kerncraft.machinemodel
import
MachineModel
from
pystencils.astnodes
import
(
KernelFunction
,
LoopOverCoordinate
,
ResolvedFieldAccess
,
SympyAssignment
)
from
pystencils.astnodes
import
\
KernelFunction
,
LoopOverCoordinate
,
ResolvedFieldAccess
,
SympyAssignment
from
pystencils.backends.cbackend
import
generate_c
,
get_headers
from
pystencils.field
import
get_layout_from_strides
from
pystencils.sympyextensions
import
count_operations_in_ast
from
pystencils.transformations
import
filtered_tree_iteration
from
pystencils.utils
import
DotDict
from
pystencils.backends.cbackend
import
generate_c
,
get_headers
from
pystencils.cpu.kernelcreation
import
add_openmp
from
pystencils.data_types
import
get_base_type
from
pystencils.sympyextensions
import
prod
class
PyStencilsKerncraftKernel
(
KernelCode
):
...
...
@@ -26,7 +28,7 @@ class PyStencilsKerncraftKernel(KernelCode):
"""
LIKWID_BASE
=
'/usr/local/likwid'
def
__init__
(
self
,
ast
:
KernelFunction
,
machine
:
Optional
[
MachineModel
]
=
None
,
def
__init__
(
self
,
ast
:
KernelFunction
,
machine
:
MachineModel
,
assumed_layout
=
'SoA'
,
debug_print
=
False
,
filename
=
None
):
"""Create a kerncraft kernel using a pystencils AST
...
...
@@ -44,6 +46,7 @@ class PyStencilsKerncraftKernel(KernelCode):
# Initialize state
self
.
asm_block
=
None
self
.
_filename
=
filename
self
.
_keep_intermediates
=
False
self
.
kernel_ast
=
ast
self
.
temporary_dir
=
TemporaryDirectory
()
...
...
@@ -96,14 +99,14 @@ class PyStencilsKerncraftKernel(KernelCode):
target_dict
[
fa
.
field
.
name
].
append
(
permuted_coord
)
# Variables (arrays)
fields_accessed
=
ast
.
fields_accessed
fields_accessed
=
self
.
kernel_
ast
.
fields_accessed
for
field
in
fields_accessed
:
layout
=
get_layout_tuple
(
field
)
permuted_shape
=
list
(
field
.
shape
[
i
]
for
i
in
layout
)
self
.
set_variable
(
field
.
name
,
tuple
([
str
(
field
.
dtype
)
]
),
tuple
(
permuted_shape
))
self
.
set_variable
(
field
.
name
,
(
str
(
field
.
dtype
)
,
),
tuple
(
permuted_shape
))
# Scalars may be safely ignored
# for param in ast.get_parameters():
# for param in
self.kernel_
ast.get_parameters():
# if not param.is_field_parameter:
# # self.set_variable(param.symbol.name, str(param.symbol.dtype), None)
# self.sources[param.symbol.name] = [None]
...
...
@@ -138,7 +141,10 @@ class PyStencilsKerncraftKernel(KernelCode):
file_path
=
self
.
get_intermediate_location
(
file_name
,
machine_and_compiler_dependent
=
False
)
lock_mode
,
lock_fp
=
self
.
lock_intermediate
(
file_path
)
if
lock_mode
==
fcntl
.
LOCK_EX
:
if
lock_mode
==
fcntl
.
LOCK_SH
:
# use cache
pass
else
:
# lock_mode == fcntl.LOCK_EX:
function_signature
=
generate_c
(
self
.
kernel_ast
,
dialect
=
'c'
,
signature_only
=
True
)
jinja_context
=
{
...
...
@@ -150,13 +156,12 @@ class PyStencilsKerncraftKernel(KernelCode):
with
open
(
file_path
,
'w'
)
as
f
:
f
.
write
(
file_header
)
fcntl
.
flock
(
lock_fp
,
fcntl
.
LOCK_SH
)
# degrade to shared lock
self
.
release_exclusive_lock
(
lock_fp
)
# degrade to shared lock
return
file_path
,
lock_fp
def
get_kernel_code
(
self
,
openmp
=
False
,
name
=
'pystencils_kernl'
):
"""
Generate and return compilable source code.
Generate and return compilable source code
from AST
.
Args:
openmp: if true, openmp code will be generated
...
...
@@ -169,7 +174,11 @@ class PyStencilsKerncraftKernel(KernelCode):
file_path
=
self
.
get_intermediate_location
(
filename
,
machine_and_compiler_dependent
=
False
)
lock_mode
,
lock_fp
=
self
.
lock_intermediate
(
file_path
)
if
lock_mode
==
fcntl
.
LOCK_EX
:
if
lock_mode
==
fcntl
.
LOCK_SH
:
# use cache
with
open
(
file_path
)
as
f
:
code
=
f
.
read
()
else
:
# lock_mode == fcntl.LOCK_EX:
header_list
=
get_headers
(
self
.
kernel_ast
)
includes
=
"
\n
"
.
join
([
"#include %s"
%
(
include_file
,)
for
include_file
in
header_list
])
...
...
@@ -184,11 +193,136 @@ class PyStencilsKerncraftKernel(KernelCode):
}
env
=
Environment
(
loader
=
PackageLoader
(
'pystencils.kerncraft_coupling'
),
undefined
=
StrictUndefined
)
file_hea
de
r
=
env
.
get_template
(
'kernel.c'
).
render
(
**
jinja_context
)
co
de
=
env
.
get_template
(
'kernel.c'
).
render
(
**
jinja_context
)
with
open
(
file_path
,
'w'
)
as
f
:
f
.
write
(
file_header
)
f
.
write
(
code
)
self
.
release_exclusive_lock
(
lock_fp
)
# degrade to shared lock
return
file_path
,
lock_fp
CODE_TEMPLATE
=
Template
(
textwrap
.
dedent
(
"""
#include <likwid.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <math.h>
#include "kerncraft.h"
#include "kernel.h"
#define RESTRICT __restrict__
#define FUNC_PREFIX
void dummy(void *);
extern int var_false;
int main(int argc, char **argv) {
{%- for constantName, dataType in constants %}
// Constant {{constantName}}
{{dataType}} {{constantName}};
{{constantName}} = 0.23;
{%- endfor %}
// Declaring arrays
{%- for field_name, dataType, size in fields %}
// Initialization {{field_name}}
double * {{field_name}} = (double *) aligned_malloc(sizeof({{dataType}}) * {{size}}, 64);
// TODO initialize in parallel context in same order as they are touched
for (unsigned long long i = 0; i < {{size}}; ++i)
{{field_name}}[i] = 0.23;
{%- endfor %}
likwid_markerInit();
#pragma omp parallel
{
likwid_markerRegisterRegion("loop");
#pragma omp barrier
// Initializing arrays in same order as touched in kernel loop nest
//INIT_ARRAYS;
// Dummy call
{%- for field_name, dataType, size in fields %}
if(var_false) dummy({{field_name}});
{%- endfor %}
{%- for constantName, dataType in constants %}
if(var_false) dummy(&{{constantName}});
{%- endfor %}
for(int warmup = 1; warmup >= 0; --warmup) {
int repeat = 2;
if(warmup == 0) {
repeat = atoi(argv[1]);
likwid_markerStartRegion("loop");
}
for(; repeat > 0; --repeat) {
{{kernelName}}({{call_argument_list}});
{%- for field_name, dataType, size in fields %}
if(var_false) dummy({{field_name}});
{%- endfor %}
{%- for constantName, dataType in constants %}
if(var_false) dummy(&{{constantName}});
{%- endfor %}
}
}
likwid_markerStopRegion("loop");
}
likwid_markerClose();
return 0;
}
"""
))
fcntl
.
flock
(
lock_fp
,
fcntl
.
LOCK_SH
)
# degrade to shared lock
def
get_main_code
(
self
,
kernel_function_name
=
'kernel'
):
"""
Generate and return compilable source code from AST.
:return: tuple of filename and shared lock file pointer
"""
# TODO produce nicer code, including help text and other "comfort features".
assert
self
.
kernel_ast
is
not
None
,
"AST does not exist, this could be due to running "
\
"based on a kernel description rather than code."
file_path
=
self
.
get_intermediate_location
(
'main.c'
,
machine_and_compiler_dependent
=
False
)
lock_mode
,
lock_fp
=
self
.
lock_intermediate
(
file_path
)
if
lock_mode
==
fcntl
.
LOCK_SH
:
# use cache
with
open
(
file_path
)
as
f
:
code
=
f
.
read
()
else
:
# lock_mode == fcntl.LOCK_EX
# needs update
accessed_fields
=
{
f
.
name
:
f
for
f
in
self
.
kernel_ast
.
fields_accessed
}
constants
=
[]
fields
=
[]
call_parameters
=
[]
for
p
in
self
.
kernel_ast
.
get_parameters
():
if
not
p
.
is_field_parameter
:
constants
.
append
((
p
.
symbol
.
name
,
str
(
p
.
symbol
.
dtype
)))
call_parameters
.
append
(
p
.
symbol
.
name
)
else
:
assert
p
.
is_field_pointer
,
"Benchmark implemented only for kernels with fixed loop size"
field
=
accessed_fields
[
p
.
field_name
]
dtype
=
str
(
get_base_type
(
p
.
symbol
.
dtype
))
fields
.
append
((
p
.
field_name
,
dtype
,
prod
(
field
.
shape
)))
call_parameters
.
append
(
p
.
field_name
)
header_list
=
get_headers
(
self
.
kernel_ast
)
includes
=
"
\n
"
.
join
([
"#include %s"
%
(
include_file
,)
for
include_file
in
header_list
])
# Generate code
code
=
self
.
CODE_TEMPLATE
.
render
(
kernelName
=
self
.
kernel_ast
.
function_name
,
fields
=
fields
,
constants
=
constants
,
call_agument_list
=
','
.
join
(
call_parameters
),
includes
=
includes
)
# Store to file
with
open
(
file_path
,
'w'
)
as
f
:
f
.
write
(
code
)
self
.
release_exclusive_lock
(
lock_fp
)
# degrade to shared lock
return
file_path
,
lock_fp
...
...
pystencils_tests/kerncraft_inputs/Example_SandyBridgeEP_E5-2680.yml
View file @
e91f23f7
kerncraft version
:
0.8.
3
.dev0
kerncraft version
:
0.8.
6
.dev0
model name
:
Intel(R) Xeon(R) CPU E5-2680 0 @ 2.70GHz
model type
:
Intel Xeon SandyBridge EN/EP processor
clock
:
2.7 GHz
...
...
@@ -8,6 +8,7 @@ cores per socket: 8
threads per core
:
2
NUMA domains per socket
:
1
cores per NUMA domain
:
8
transparent hugepage
:
always
in-core model
:
!!omap
-
IACA
:
SNB
...
...
@@ -20,17 +21,22 @@ FLOPs per cycle:
DP
:
{
total
:
8
,
ADD
:
4
,
MUL
:
4
}
compiler
:
!!omap
-
icc
:
-O3 -xAVX -fno-alias -qopenmp
-
clang
:
-O3 -march=corei7-avx -mtune=corei7-avx -D_POSIX_C_SOURCE=200809L -fopenmp
-
gcc
:
-O3 -march=corei7-avx -D_POSIX_C_SOURCE=200809L -fopenmp -lm
-
icc
:
-O3 -xAVX -fno-alias -qopenmp
-ffreestanding -nolib-inline
-
clang
:
-O3 -march=corei7-avx -mtune=corei7-avx -D_POSIX_C_SOURCE=200809L -fopenmp
-ffreestanding
-
gcc
:
-O3 -march=corei7-avx -D_POSIX_C_SOURCE=200809L -fopenmp -lm
-ffreestanding
overlapping model
:
ports
:
IACA
:
[
'
0'
,
0DV
,
'
1'
,
'
2'
,
'
3'
,
'
4'
,
'
5'
]
OSACA
:
[
'
0'
,
0DV
,
'
1'
,
'
2'
,
'
3'
,
'
4'
,
'
5'
]
LLVM-MCA
:
[
SBDivider
,
SBFPDivider
,
SBPort0
,
SBPort1
,
SBPort23
,
SBPort4
,
SBPort5
]
performance counter metric
:
Max(UOPS_DISPATCHED_PORT_PORT_0:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_1:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_4:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_5:PMC[0-3])
non-overlapping model
:
ports
:
[
2D
,
3D
]
ports
:
IACA
:
[
2D
,
3D
]
OSACA
:
[
2D
,
3D
]
LLVM-MCA
:
[
SBPort23
]
performance counter metric
:
T_nOL + T_L1L2 + T_L2L3 + T_L3MEM
overlapping model
:
ports
:
[
'
0'
,
0DV
,
'
1'
,
'
2'
,
'
3'
,
'
4'
,
'
5'
]
performance counter metric
:
Max(UOPS_DISPATCHED_PORT_PORT_0:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_1:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_4:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_5:PMC[0-3])
cacheline size
:
64 B
memory hierarchy
:
...
...
pystencils_tests/test_kerncraft_coupling.py
View file @
e91f23f7
...
...
@@ -42,9 +42,7 @@ def test_compilation():
@
pytest
.
mark
.
kerncraft
def
analysis
(
kernel
,
model
=
'ecmdata'
):
machine_file_path
=
INPUT_FOLDER
/
"Example_SandyBridgeEP_E5-2680.yml"
machine
=
MachineModel
(
path_to_yaml
=
machine_file_path
)
def
analysis
(
kernel
,
machine
,
model
=
'ecmdata'
):
if
model
==
'ecmdata'
:
model
=
ECMData
(
kernel
,
machine
,
KerncraftParameters
())
elif
model
==
'ecm'
:
...
...
@@ -71,7 +69,7 @@ def test_3d_7pt_osaca():
reference_kernel
.
set_constant
(
'M'
,
size
[
0
])
reference_kernel
.
set_constant
(
'N'
,
size
[
1
])
assert
size
[
1
]
==
size
[
2
]
analysis
(
reference_kernel
,
model
=
'ecm'
)
analysis
(
reference_kernel
,
machine_model
,
model
=
'ecm'
)
arr
=
np
.
zeros
(
size
)
a
=
Field
.
create_from_numpy_array
(
'a'
,
arr
,
index_dimensions
=
0
)
...
...
@@ -82,18 +80,22 @@ def test_3d_7pt_osaca():
update_rule
=
Assignment
(
b
[
0
,
0
,
0
],
s
*
rhs
)
ast
=
create_kernel
([
update_rule
])
k
=
PyStencilsKerncraftKernel
(
ast
,
machine
=
machine_model
)
analysis
(
k
,
model
=
'ecm'
)
analysis
(
k
,
machine_model
,
model
=
'ecm'
)
assert
reference_kernel
.
_flops
==
k
.
_flops
# assert reference.results['cl throughput'] == analysis.results['cl throughput']
@
pytest
.
mark
.
kerncraft
def
test_2d_5pt
():
machine_file_path
=
INPUT_FOLDER
/
"Example_SandyBridgeEP_E5-2680.yml"
machine
=
MachineModel
(
path_to_yaml
=
machine_file_path
)
size
=
[
30
,
50
,
3
]
kernel_file_path
=
INPUT_FOLDER
/
"2d-5pt.c"
with
open
(
kernel_file_path
)
as
kernel_file
:
reference_kernel
=
KernelCode
(
kernel_file
.
read
(),
machine
=
None
,
filename
=
kernel_file_path
)
reference
=
analysis
(
reference_kernel
)
reference_kernel
=
KernelCode
(
kernel_file
.
read
(),
machine
=
machine
,
filename
=
kernel_file_path
)
reference
=
analysis
(
reference_kernel
,
machine
)
arr
=
np
.
zeros
(
size
)
a
=
Field
.
create_from_numpy_array
(
'a'
,
arr
,
index_dimensions
=
1
)
...
...
@@ -102,8 +104,8 @@ def test_2d_5pt():
rhs
=
a
[
0
,
-
1
](
0
)
+
a
[
0
,
1
]
+
a
[
-
1
,
0
]
+
a
[
1
,
0
]
update_rule
=
Assignment
(
b
[
0
,
0
],
s
*
rhs
)
ast
=
create_kernel
([
update_rule
])
k
=
PyStencilsKerncraftKernel
(
ast
)
result
=
analysis
(
k
)
k
=
PyStencilsKerncraftKernel
(
ast
,
machine
)
result
=
analysis
(
k
,
machine
)
for
e1
,
e2
in
zip
(
reference
.
results
[
'cycles'
],
result
.
results
[
'cycles'
]):
assert
e1
==
e2
...
...
@@ -111,14 +113,18 @@ def test_2d_5pt():
@
pytest
.
mark
.
kerncraft
def
test_3d_7pt
():
machine_file_path
=
INPUT_FOLDER
/
"Example_SandyBridgeEP_E5-2680.yml"
machine
=
MachineModel
(
path_to_yaml
=
machine_file_path
)
size
=
[
30
,
50
,
50
]
kernel_file_path
=
INPUT_FOLDER
/
"3d-7pt.c"
with
open
(
kernel_file_path
)
as
kernel_file
:
reference_kernel
=
KernelCode
(
kernel_file
.
read
(),
machine
=
None
,
filename
=
kernel_file_path
)
reference_kernel
=
KernelCode
(
kernel_file
.
read
(),
machine
=
machine
,
filename
=
kernel_file_path
)
reference_kernel
.
set_constant
(
'M'
,
size
[
0
])
reference_kernel
.
set_constant
(
'N'
,
size
[
1
])
assert
size
[
1
]
==
size
[
2
]
reference
=
analysis
(
reference_kernel
)
reference
=
analysis
(
reference_kernel
,
machine
)
arr
=
np
.
zeros
(
size
)
a
=
Field
.
create_from_numpy_array
(
'a'
,
arr
,
index_dimensions
=
0
)
...
...
@@ -128,8 +134,8 @@ def test_3d_7pt():
update_rule
=
Assignment
(
b
[
0
,
0
,
0
],
s
*
rhs
)
ast
=
create_kernel
([
update_rule
])
k
=
PyStencilsKerncraftKernel
(
ast
)
result
=
analysis
(
k
)
k
=
PyStencilsKerncraftKernel
(
ast
,
machine
)
result
=
analysis
(
k
,
machine
)
for
e1
,
e2
in
zip
(
reference
.
results
[
'cycles'
],
result
.
results
[
'cycles'
]):
assert
e1
==
e2
...
...
@@ -163,6 +169,9 @@ def test_benchmark():
@
pytest
.
mark
.
kerncraft
def
test_kerncraft_generic_field
():
machine_file_path
=
INPUT_FOLDER
/
"Example_SandyBridgeEP_E5-2680.yml"
machine
=
MachineModel
(
path_to_yaml
=
machine_file_path
)
a
=
fields
(
'a: double[3D]'
)
b
=
fields
(
'b: double[3D]'
)
s
=
sp
.
Symbol
(
"s"
)
...
...
@@ -170,4 +179,4 @@ def test_kerncraft_generic_field():
update_rule
=
Assignment
(
b
[
0
,
0
,
0
],
s
*
rhs
)
ast
=
create_kernel
([
update_rule
])
k
=
PyStencilsKerncraftKernel
(
ast
,
debug_print
=
True
)
k
=
PyStencilsKerncraftKernel
(
ast
,
machine
,
debug_print
=
True
)
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment