/* Run a stand-alone AMD GCN kernel.
Copyright 2017 Mentor Graphics Corporation
Copyright (C) 2018-2020 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see . */
/* This program will run a compiled stand-alone GCN kernel on a GPU.
The kernel entry point's signature must use a standard main signature:
int main(int argc, char **argv)
*/
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
/* These probably won't be in elf.h for a while. */
#ifndef R_AMDGPU_NONE
#define R_AMDGPU_NONE 0
#define R_AMDGPU_ABS32_LO 1 /* (S + A) & 0xFFFFFFFF */
#define R_AMDGPU_ABS32_HI 2 /* (S + A) >> 32 */
#define R_AMDGPU_ABS64 3 /* S + A */
#define R_AMDGPU_REL32 4 /* S + A - P */
#define R_AMDGPU_REL64 5 /* S + A - P */
#define R_AMDGPU_ABS32 6 /* S + A */
#define R_AMDGPU_GOTPCREL 7 /* G + GOT + A - P */
#define R_AMDGPU_GOTPCREL32_LO 8 /* (G + GOT + A - P) & 0xFFFFFFFF */
#define R_AMDGPU_GOTPCREL32_HI 9 /* (G + GOT + A - P) >> 32 */
#define R_AMDGPU_REL32_LO 10 /* (S + A - P) & 0xFFFFFFFF */
#define R_AMDGPU_REL32_HI 11 /* (S + A - P) >> 32 */
#define reserved 12
#define R_AMDGPU_RELATIVE64 13 /* B + A */
#endif
#include "hsa.h"
#ifndef HSA_RUNTIME_LIB
#define HSA_RUNTIME_LIB "libhsa-runtime64.so"
#endif
#ifndef VERSION_STRING
#define VERSION_STRING "(version unknown)"
#endif
bool debug = false;
hsa_agent_t device = { 0 };
hsa_queue_t *queue = NULL;
uint64_t init_array_kernel = 0;
uint64_t fini_array_kernel = 0;
uint64_t main_kernel = 0;
hsa_executable_t executable = { 0 };
hsa_region_t kernargs_region = { 0 };
hsa_region_t heap_region = { 0 };
uint32_t kernarg_segment_size = 0;
uint32_t group_segment_size = 0;
uint32_t private_segment_size = 0;
static void
usage (const char *progname)
{
printf ("Usage: %s [options] kernel [kernel-args]\n\n"
"Options:\n"
" --help\n"
" --version\n"
" --debug\n", progname);
}
static void
version (const char *progname)
{
printf ("%s " VERSION_STRING "\n", progname);
}
/* As an HSA runtime is dlopened, following structure defines the necessary
function pointers.
Code adapted from libgomp. */
struct hsa_runtime_fn_info
{
/* HSA runtime. */
hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
const char **status_string);
hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
hsa_agent_info_t attribute,
void *value);
hsa_status_t (*hsa_init_fn) (void);
hsa_status_t (*hsa_iterate_agents_fn)
(hsa_status_t (*callback) (hsa_agent_t agent, void *data), void *data);
hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
hsa_region_info_t attribute,
void *value);
hsa_status_t (*hsa_queue_create_fn)
(hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
void (*callback) (hsa_status_t status, hsa_queue_t *source, void *data),
void *data, uint32_t private_segment_size,
uint32_t group_segment_size, hsa_queue_t **queue);
hsa_status_t (*hsa_agent_iterate_regions_fn)
(hsa_agent_t agent,
hsa_status_t (*callback) (hsa_region_t region, void *data), void *data);
hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
hsa_status_t (*hsa_executable_create_fn)
(hsa_profile_t profile, hsa_executable_state_t executable_state,
const char *options, hsa_executable_t *executable);
hsa_status_t (*hsa_executable_global_variable_define_fn)
(hsa_executable_t executable, const char *variable_name, void *address);
hsa_status_t (*hsa_executable_load_code_object_fn)
(hsa_executable_t executable, hsa_agent_t agent,
hsa_code_object_t code_object, const char *options);
hsa_status_t (*hsa_executable_freeze_fn) (hsa_executable_t executable,
const char *options);
hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
uint32_t num_consumers,
const hsa_agent_t *consumers,
hsa_signal_t *signal);
hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
void **ptr);
hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
hsa_access_permission_t access);
hsa_status_t (*hsa_memory_copy_fn) (void *dst, const void *src,
size_t size);
hsa_status_t (*hsa_memory_free_fn) (void *ptr);
hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
hsa_status_t (*hsa_executable_get_symbol_fn)
(hsa_executable_t executable, const char *module_name,
const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
hsa_executable_symbol_t *symbol);
hsa_status_t (*hsa_executable_symbol_get_info_fn)
(hsa_executable_symbol_t executable_symbol,
hsa_executable_symbol_info_t attribute, void *value);
void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
hsa_signal_value_t value);
hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
(hsa_signal_t signal, hsa_signal_condition_t condition,
hsa_signal_value_t compare_value, uint64_t timeout_hint,
hsa_wait_state_t wait_state_hint);
hsa_signal_value_t (*hsa_signal_wait_relaxed_fn)
(hsa_signal_t signal, hsa_signal_condition_t condition,
hsa_signal_value_t compare_value, uint64_t timeout_hint,
hsa_wait_state_t wait_state_hint);
hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);
hsa_status_t (*hsa_code_object_deserialize_fn)
(void *serialized_code_object, size_t serialized_code_object_size,
const char *options, hsa_code_object_t *code_object);
uint64_t (*hsa_queue_load_write_index_relaxed_fn)
(const hsa_queue_t *queue);
void (*hsa_queue_store_write_index_relaxed_fn)
(const hsa_queue_t *queue, uint64_t value);
hsa_status_t (*hsa_shut_down_fn) ();
};
/* HSA runtime functions that are initialized in init_hsa_context.
Code adapted from libgomp. */
static struct hsa_runtime_fn_info hsa_fns;
#define DLSYM_FN(function) \
*(void**)(&hsa_fns.function##_fn) = dlsym (handle, #function); \
if (hsa_fns.function##_fn == NULL) \
goto fail;
static void
init_hsa_runtime_functions (void)
{
void *handle = dlopen (HSA_RUNTIME_LIB, RTLD_LAZY);
if (handle == NULL)
{
fprintf (stderr,
"The HSA runtime is required to run GCN kernels on hardware.\n"
"%s: File not found or could not be opened\n",
HSA_RUNTIME_LIB);
exit (1);
}
DLSYM_FN (hsa_status_string)
DLSYM_FN (hsa_agent_get_info)
DLSYM_FN (hsa_init)
DLSYM_FN (hsa_iterate_agents)
DLSYM_FN (hsa_region_get_info)
DLSYM_FN (hsa_queue_create)
DLSYM_FN (hsa_agent_iterate_regions)
DLSYM_FN (hsa_executable_destroy)
DLSYM_FN (hsa_executable_create)
DLSYM_FN (hsa_executable_global_variable_define)
DLSYM_FN (hsa_executable_load_code_object)
DLSYM_FN (hsa_executable_freeze)
DLSYM_FN (hsa_signal_create)
DLSYM_FN (hsa_memory_allocate)
DLSYM_FN (hsa_memory_assign_agent)
DLSYM_FN (hsa_memory_copy)
DLSYM_FN (hsa_memory_free)
DLSYM_FN (hsa_signal_destroy)
DLSYM_FN (hsa_executable_get_symbol)
DLSYM_FN (hsa_executable_symbol_get_info)
DLSYM_FN (hsa_signal_wait_acquire)
DLSYM_FN (hsa_signal_wait_relaxed)
DLSYM_FN (hsa_signal_store_relaxed)
DLSYM_FN (hsa_queue_destroy)
DLSYM_FN (hsa_code_object_deserialize)
DLSYM_FN (hsa_queue_load_write_index_relaxed)
DLSYM_FN (hsa_queue_store_write_index_relaxed)
DLSYM_FN (hsa_shut_down)
return;
fail:
fprintf (stderr, "Failed to find HSA functions in " HSA_RUNTIME_LIB "\n");
exit (1);
}
#undef DLSYM_FN
/* Report a fatal error STR together with the HSA error corresponding to
STATUS and terminate execution of the current process. */
static void
hsa_fatal (const char *str, hsa_status_t status)
{
const char *hsa_error_msg;
hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
fprintf (stderr, "%s: FAILED\nHSA Runtime message: %s\n", str,
hsa_error_msg);
exit (1);
}
/* Helper macros to ensure we check the return values from the HSA Runtime.
These just keep the rest of the code a bit cleaner. */
#define XHSA_CMP(FN, CMP, MSG) \
do { \
hsa_status_t status = (FN); \
if (!(CMP)) \
hsa_fatal ((MSG), status); \
else if (debug) \
fprintf (stderr, "%s: OK\n", (MSG)); \
} while (0)
#define XHSA(FN, MSG) XHSA_CMP(FN, status == HSA_STATUS_SUCCESS, MSG)
/* Callback of hsa_iterate_agents.
Called once for each available device, and returns "break" when a
suitable one has been found. */
static hsa_status_t
get_gpu_agent (hsa_agent_t agent, void *data __attribute__ ((unused)))
{
hsa_device_type_t device_type;
XHSA (hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
&device_type),
"Get agent type");
/* Select only GPU devices. */
/* TODO: support selecting from multiple GPUs. */
if (HSA_DEVICE_TYPE_GPU == device_type)
{
device = agent;
return HSA_STATUS_INFO_BREAK;
}
/* The device was not suitable. */
return HSA_STATUS_SUCCESS;
}
/* Callback of hsa_iterate_regions.
Called once for each available memory region, and returns "break" when a
suitable one has been found. */
static hsa_status_t
get_memory_region (hsa_region_t region, hsa_region_t *retval,
hsa_region_global_flag_t kind)
{
/* Reject non-global regions. */
hsa_region_segment_t segment;
hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT, &segment);
if (HSA_REGION_SEGMENT_GLOBAL != segment)
return HSA_STATUS_SUCCESS;
/* Find a region with the KERNARG flag set. */
hsa_region_global_flag_t flags;
hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
&flags);
if (flags & kind)
{
*retval = region;
return HSA_STATUS_INFO_BREAK;
}
/* The region was not suitable. */
return HSA_STATUS_SUCCESS;
}
static hsa_status_t
get_kernarg_region (hsa_region_t region, void *data __attribute__((unused)))
{
return get_memory_region (region, &kernargs_region,
HSA_REGION_GLOBAL_FLAG_KERNARG);
}
static hsa_status_t
get_heap_region (hsa_region_t region, void *data __attribute__((unused)))
{
return get_memory_region (region, &heap_region,
HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
}
/* Initialize the HSA Runtime library and GPU device. */
static void
init_device ()
{
/* Load the shared library and find the API functions. */
init_hsa_runtime_functions ();
/* Initialize the HSA Runtime. */
XHSA (hsa_fns.hsa_init_fn (),
"Initialize run-time");
/* Select a suitable device.
The call-back function, get_gpu_agent, does the selection. */
XHSA_CMP (hsa_fns.hsa_iterate_agents_fn (get_gpu_agent, NULL),
status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK,
"Find a device");
/* Initialize the queue used for launching kernels. */
uint32_t queue_size = 0;
XHSA (hsa_fns.hsa_agent_get_info_fn (device, HSA_AGENT_INFO_QUEUE_MAX_SIZE,
&queue_size),
"Find max queue size");
XHSA (hsa_fns.hsa_queue_create_fn (device, queue_size,
HSA_QUEUE_TYPE_SINGLE, NULL,
NULL, UINT32_MAX, UINT32_MAX, &queue),
"Set up a device queue");
/* Select a memory region for the kernel arguments.
The call-back function, get_kernarg_region, does the selection. */
XHSA_CMP (hsa_fns.hsa_agent_iterate_regions_fn (device, get_kernarg_region,
NULL),
status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK,
"Locate kernargs memory");
/* Select a memory region for the kernel heap.
The call-back function, get_heap_region, does the selection. */
XHSA_CMP (hsa_fns.hsa_agent_iterate_regions_fn (device, get_heap_region,
NULL),
status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK,
"Locate device memory");
}
/* Read a whole input file.
Code copied from mkoffload. */
static char *
read_file (const char *filename, size_t *plen)
{
size_t alloc = 16384;
size_t base = 0;
char *buffer;
FILE *stream = fopen (filename, "rb");
if (!stream)
{
perror (filename);
exit (1);
}
if (!fseek (stream, 0, SEEK_END))
{
/* Get the file size. */
long s = ftell (stream);
if (s >= 0)
alloc = s + 100;
fseek (stream, 0, SEEK_SET);
}
buffer = malloc (alloc);
for (;;)
{
size_t n = fread (buffer + base, 1, alloc - base - 1, stream);
if (!n)
break;
base += n;
if (base + 1 == alloc)
{
alloc *= 2;
buffer = realloc (buffer, alloc);
}
}
buffer[base] = 0;
*plen = base;
fclose (stream);
return buffer;
}
/* Read a HSA Code Object (HSACO) from file, and load it into the device. */
static void
load_image (const char *filename)
{
size_t image_size;
Elf64_Ehdr *image = (void *) read_file (filename, &image_size);
/* An "executable" consists of one or more code objects. */
XHSA (hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
HSA_EXECUTABLE_STATE_UNFROZEN, "",
&executable),
"Initialize GCN executable");
/* Hide relocations from the HSA runtime loader.
Keep a copy of the unmodified section headers to use later. */
Elf64_Shdr *image_sections =
(Elf64_Shdr *) ((char *) image + image->e_shoff);
Elf64_Shdr *sections = malloc (sizeof (Elf64_Shdr) * image->e_shnum);
memcpy (sections, image_sections, sizeof (Elf64_Shdr) * image->e_shnum);
for (int i = image->e_shnum - 1; i >= 0; i--)
{
if (image_sections[i].sh_type == SHT_RELA
|| image_sections[i].sh_type == SHT_REL)
/* Change section type to something harmless. */
image_sections[i].sh_type = SHT_NOTE;
}
/* Add the HSACO to the executable. */
hsa_code_object_t co = { 0 };
XHSA (hsa_fns.hsa_code_object_deserialize_fn (image, image_size, NULL, &co),
"Deserialize GCN code object");
XHSA (hsa_fns.hsa_executable_load_code_object_fn (executable, device, co,
""),
"Load GCN code object");
/* We're done modifying he executable. */
XHSA (hsa_fns.hsa_executable_freeze_fn (executable, ""),
"Freeze GCN executable");
/* Locate the "_init_array" function, and read the kernel's properties. */
hsa_executable_symbol_t symbol;
XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "_init_array",
device, 0, &symbol),
"Find '_init_array' function");
XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &init_array_kernel),
"Extract '_init_array' kernel object kernel object");
/* Locate the "_fini_array" function, and read the kernel's properties. */
XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "_fini_array",
device, 0, &symbol),
"Find '_fini_array' function");
XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &fini_array_kernel),
"Extract '_fini_array' kernel object kernel object");
/* Locate the "main" function, and read the kernel's properties. */
XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "main",
device, 0, &symbol),
"Find 'main' function");
XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &main_kernel),
"Extract 'main' kernel object");
XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
&kernarg_segment_size),
"Extract kernarg segment size");
XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
&group_segment_size),
"Extract group segment size");
XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
&private_segment_size),
"Extract private segment size");
/* Find main function in ELF, and calculate actual load offset. */
Elf64_Addr load_offset;
XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
(symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
&load_offset),
"Extract 'main' symbol address");
for (int i = 0; i < image->e_shnum; i++)
if (sections[i].sh_type == SHT_SYMTAB)
{
Elf64_Shdr *strtab = §ions[sections[i].sh_link];
char *strings = (char *) image + strtab->sh_offset;
for (size_t offset = 0;
offset < sections[i].sh_size;
offset += sections[i].sh_entsize)
{
Elf64_Sym *sym = (Elf64_Sym *) ((char *) image
+ sections[i].sh_offset + offset);
if (strcmp ("main", strings + sym->st_name) == 0)
{
load_offset -= sym->st_value;
goto found_main;
}
}
}
/* We only get here when main was not found.
This should never happen. */
fprintf (stderr, "Error: main function not found.\n");
abort ();
found_main:;
/* Find dynamic symbol table. */
Elf64_Shdr *dynsym = NULL;
for (int i = 0; i < image->e_shnum; i++)
if (sections[i].sh_type == SHT_DYNSYM)
{
dynsym = §ions[i];
break;
}
/* Fix up relocations. */
for (int i = 0; i < image->e_shnum; i++)
{
if (sections[i].sh_type == SHT_RELA)
for (size_t offset = 0;
offset < sections[i].sh_size;
offset += sections[i].sh_entsize)
{
Elf64_Rela *reloc = (Elf64_Rela *) ((char *) image
+ sections[i].sh_offset
+ offset);
Elf64_Sym *sym =
(dynsym
? (Elf64_Sym *) ((char *) image
+ dynsym->sh_offset
+ (dynsym->sh_entsize
* ELF64_R_SYM (reloc->r_info))) : NULL);
int64_t S = (sym ? sym->st_value : 0);
int64_t P = reloc->r_offset + load_offset;
int64_t A = reloc->r_addend;
int64_t B = load_offset;
int64_t V, size;
switch (ELF64_R_TYPE (reloc->r_info))
{
case R_AMDGPU_ABS32_LO:
V = (S + A) & 0xFFFFFFFF;
size = 4;
break;
case R_AMDGPU_ABS32_HI:
V = (S + A) >> 32;
size = 4;
break;
case R_AMDGPU_ABS64:
V = S + A;
size = 8;
break;
case R_AMDGPU_REL32:
V = S + A - P;
size = 4;
break;
case R_AMDGPU_REL64:
/* FIXME
LLD seems to emit REL64 where the assembler has ABS64.
This is clearly wrong because it's not what the compiler
is expecting. Let's assume, for now, that it's a bug.
In any case, GCN kernels are always self contained and
therefore relative relocations will have been resolved
already, so this should be a safe workaround. */
V = S + A /* - P */ ;
size = 8;
break;
case R_AMDGPU_ABS32:
V = S + A;
size = 4;
break;
/* TODO R_AMDGPU_GOTPCREL */
/* TODO R_AMDGPU_GOTPCREL32_LO */
/* TODO R_AMDGPU_GOTPCREL32_HI */
case R_AMDGPU_REL32_LO:
V = (S + A - P) & 0xFFFFFFFF;
size = 4;
break;
case R_AMDGPU_REL32_HI:
V = (S + A - P) >> 32;
size = 4;
break;
case R_AMDGPU_RELATIVE64:
V = B + A;
size = 8;
break;
default:
fprintf (stderr, "Error: unsupported relocation type.\n");
exit (1);
}
XHSA (hsa_fns.hsa_memory_copy_fn ((void *) P, &V, size),
"Fix up relocation");
}
}
}
/* Allocate some device memory from the kernargs region.
The returned address will be 32-bit (with excess zeroed on 64-bit host),
and accessible via the same address on both host and target (via
__flat_scalar GCN address space). */
static void *
device_malloc (size_t size, hsa_region_t region)
{
void *result;
XHSA (hsa_fns.hsa_memory_allocate_fn (region, size, &result),
"Allocate device memory");
return result;
}
/* These are the device pointers that will be transferred to the target.
The HSA Runtime points the kernargs register here.
They correspond to function signature:
int main (int argc, char *argv[], int *return_value)
The compiler expects this, for kernel functions, and will
automatically assign the exit value to *return_value. */
struct kernargs
{
/* Kernargs. */
int32_t argc;
int64_t argv;
int64_t out_ptr;
int64_t heap_ptr;
/* Output data. */
struct output
{
int return_value;
unsigned int next_output;
struct printf_data
{
int written;
char msg[128];
int type;
union
{
int64_t ivalue;
double dvalue;
char text[128];
};
} queue[1024];
unsigned int consumed;
} output_data;
};
struct heap
{
int64_t size;
char data[0];
} heap;
/* Print any console output from the kernel.
We print all entries from "consumed" to the next entry without a "written"
flag, or "next_output" is reached. The buffer is circular, but the
indices are absolute. It is assumed the kernel will stop writing data
if "next_output" wraps (becomes smaller than "consumed"). */
void
gomp_print_output (struct kernargs *kernargs, bool final)
{
unsigned int limit = (sizeof (kernargs->output_data.queue)
/ sizeof (kernargs->output_data.queue[0]));
unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
__ATOMIC_ACQUIRE);
unsigned int to = kernargs->output_data.next_output;
if (from > to)
{
/* Overflow. */
if (final)
printf ("GCN print buffer overflowed.\n");
return;
}
unsigned int i;
for (i = from; i < to; i++)
{
struct printf_data *data = &kernargs->output_data.queue[i%limit];
if (!data->written && !final)
break;
switch (data->type)
{
case 0:
printf ("%.128s%ld\n", data->msg, data->ivalue);
break;
case 1:
printf ("%.128s%f\n", data->msg, data->dvalue);
break;
case 2:
printf ("%.128s%.128s\n", data->msg, data->text);
break;
case 3:
printf ("%.128s%.128s", data->msg, data->text);
break;
default:
printf ("GCN print buffer error!\n");
break;
}
data->written = 0;
__atomic_store_n (&kernargs->output_data.consumed, i+1,
__ATOMIC_RELEASE);
}
fflush (stdout);
}
/* Execute an already-loaded kernel on the device. */
static void
run (uint64_t kernel, void *kernargs)
{
/* A "signal" is used to launch and monitor the kernel. */
hsa_signal_t signal;
XHSA (hsa_fns.hsa_signal_create_fn (1, 0, NULL, &signal),
"Create signal");
/* Configure for a single-worker kernel. */
uint64_t index = hsa_fns.hsa_queue_load_write_index_relaxed_fn (queue);
const uint32_t queueMask = queue->size - 1;
hsa_kernel_dispatch_packet_t *dispatch_packet =
&(((hsa_kernel_dispatch_packet_t *) (queue->base_address))[index &
queueMask]);
dispatch_packet->setup |= 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
dispatch_packet->workgroup_size_x = (uint16_t) 1;
dispatch_packet->workgroup_size_y = (uint16_t) 64;
dispatch_packet->workgroup_size_z = (uint16_t) 1;
dispatch_packet->grid_size_x = 1;
dispatch_packet->grid_size_y = 64;
dispatch_packet->grid_size_z = 1;
dispatch_packet->completion_signal = signal;
dispatch_packet->kernel_object = kernel;
dispatch_packet->kernarg_address = (void *) kernargs;
dispatch_packet->private_segment_size = private_segment_size;
dispatch_packet->group_segment_size = group_segment_size;
uint16_t header = 0;
header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
header |= HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
__atomic_store_n ((uint32_t *) dispatch_packet,
header | (dispatch_packet->setup << 16),
__ATOMIC_RELEASE);
if (debug)
fprintf (stderr, "Launch kernel\n");
hsa_fns.hsa_queue_store_write_index_relaxed_fn (queue, index + 1);
hsa_fns.hsa_signal_store_relaxed_fn (queue->doorbell_signal, index);
/* Kernel running ...... */
while (hsa_fns.hsa_signal_wait_relaxed_fn (signal, HSA_SIGNAL_CONDITION_LT,
1, 1000000,
HSA_WAIT_STATE_ACTIVE) != 0)
{
usleep (10000);
gomp_print_output (kernargs, false);
}
gomp_print_output (kernargs, true);
if (debug)
fprintf (stderr, "Kernel exited\n");
XHSA (hsa_fns.hsa_signal_destroy_fn (signal),
"Clean up signal");
}
int
main (int argc, char *argv[])
{
int kernel_arg = 0;
for (int i = 1; i < argc; i++)
{
if (!strcmp (argv[i], "--help"))
{
usage (argv[0]);
return 0;
}
else if (!strcmp (argv[i], "--version"))
{
version (argv[0]);
return 0;
}
else if (!strcmp (argv[i], "--debug"))
debug = true;
else if (argv[i][0] == '-')
{
usage (argv[0]);
return 1;
}
else
{
kernel_arg = i;
break;
}
}
if (!kernel_arg)
{
/* No kernel arguments were found. */
usage (argv[0]);
return 1;
}
/* The remaining arguments are for the GCN kernel. */
int kernel_argc = argc - kernel_arg;
char **kernel_argv = &argv[kernel_arg];
init_device ();
load_image (kernel_argv[0]);
/* Calculate size of function parameters + argv data. */
size_t args_size = 0;
for (int i = 0; i < kernel_argc; i++)
args_size += strlen (kernel_argv[i]) + 1;
/* Allocate device memory for both function parameters and the argv
data. */
struct kernargs *kernargs = device_malloc (sizeof (*kernargs),
kernargs_region);
struct argdata
{
int64_t argv_data[kernel_argc];
char strings[args_size];
} *args = device_malloc (sizeof (struct argdata), kernargs_region);
size_t heap_size = 10 * 1024 * 1024; /* 10MB. */
struct heap *heap = device_malloc (heap_size, heap_region);
XHSA (hsa_fns.hsa_memory_assign_agent_fn (heap, device,
HSA_ACCESS_PERMISSION_RW),
"Assign heap to device agent");
/* Write the data to the target. */
kernargs->argc = kernel_argc;
kernargs->argv = (int64_t) args->argv_data;
kernargs->out_ptr = (int64_t) &kernargs->output_data;
kernargs->output_data.return_value = 0xcafe0000; /* Default return value. */
kernargs->output_data.next_output = 0;
for (unsigned i = 0; i < (sizeof (kernargs->output_data.queue)
/ sizeof (kernargs->output_data.queue[0])); i++)
kernargs->output_data.queue[i].written = 0;
kernargs->output_data.consumed = 0;
int offset = 0;
for (int i = 0; i < kernel_argc; i++)
{
size_t arg_len = strlen (kernel_argv[i]) + 1;
args->argv_data[i] = (int64_t) &args->strings[offset];
memcpy (&args->strings[offset], kernel_argv[i], arg_len + 1);
offset += arg_len;
}
kernargs->heap_ptr = (int64_t) heap;
hsa_fns.hsa_memory_copy_fn (&heap->size, &heap_size, sizeof (heap_size));
/* Run constructors on the GPU. */
run (init_array_kernel, kernargs);
/* Run the kernel on the GPU. */
run (main_kernel, kernargs);
unsigned int return_value =
(unsigned int) kernargs->output_data.return_value;
/* Run destructors on the GPU. */
run (fini_array_kernel, kernargs);
unsigned int upper = (return_value & ~0xffff) >> 16;
if (upper == 0xcafe)
{
printf ("Kernel exit value was never set\n");
return_value = 0xff;
}
else if (upper == 0xffff)
; /* Set by exit. */
else if (upper == 0)
; /* Set by return from main. */
else
printf ("Possible kernel exit value corruption, 2 most significant bytes "
"aren't 0xffff, 0xcafe, or 0: 0x%x\n", return_value);
if (upper == 0xffff)
{
unsigned int signal = (return_value >> 8) & 0xff;
if (signal == SIGABRT)
printf ("Kernel aborted\n");
else if (signal != 0)
printf ("Kernel received unkown signal\n");
}
if (debug)
printf ("Kernel exit value: %d\n", return_value & 0xff);
/* Clean shut down. */
XHSA (hsa_fns.hsa_memory_free_fn (kernargs),
"Clean up device memory");
XHSA (hsa_fns.hsa_executable_destroy_fn (executable),
"Clean up GCN executable");
XHSA (hsa_fns.hsa_queue_destroy_fn (queue),
"Clean up device queue");
XHSA (hsa_fns.hsa_shut_down_fn (),
"Shut down run-time");
return return_value & 0xff;
}