|
本帖最后由 bigbing 于 2023-6-22 17:18 编辑
aerial中包含了整个5G物理层的GPU代码,但我们只需要其中涉及LDPC的部分。LDPC编码部分在 aerial/cuPHY/examples/ldpc_encode下。最终目的是在OAI中运行,因此需要重新编写该部分,这样才能在OAI中正常运行。
新建的文件(你自己建立)需要放在aerial/cuPHY/examples/error_correction,文件的名称后缀名需要以.cu结尾,否则不能运行CUDA编写的函数。以下是程序具体内容:
```c++
#include <stdio.h>
#include <limits>
#include "ldpc_decode_test_vec_gen.hpp"
#include "cuphy.h"
#include <stdlib.h>
#include <string>
#include <iostream>
#include <vector>
#include <getopt.h>
#include <bitset>
#include "cuphy.hpp"
#include "cuphy_hdf5.hpp"
#include "hdf5hpp.hpp"
#include <immintrin.h>
// Reverses the bit order within a single byte: b7 b6 ... b0  ->  b0 b1 ... b7.
// Plain macro, so it is usable from both host and device code.
// NOTE: `dat` is evaluated 8 times — pass a simple variable, not an expression
// with side effects.
#define REVERSE_BIT(dat) \
(((dat & 0x01) << 7) | ((dat & 0x02) << 5) | ((dat & 0x04) << 3) | ((dat & 0x08) << 1) | \
((dat & 0x10) >> 1) | ((dat & 0x20) >> 3) | ((dat & 0x40) >> 5) | ((dat & 0x80) >> 7))
// Copies `input` to `output`, reversing the bit order inside every byte.
// Launch layout: 1-D grid, one thread per byte; `size` is the byte count.
// Out-of-range threads return immediately.
__global__ void reverse_bit(unsigned char *output, uint8_t* input, int size)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= size) {
        return;
    }
    output[idx] = REVERSE_BIT(input[idx]);
}
// In-place variant of reverse_bit: reverses the bit order inside every byte
// of `input`. Launch layout: 1-D grid, one thread per byte; `size` is the
// byte count. Out-of-range threads return immediately.
__global__ void reverse_bit_8block(unsigned char *input, int size)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= size) {
        return;
    }
    const unsigned char byte = input[idx];
    input[idx] = REVERSE_BIT(byte);
}
// Unpacks each byte of `input` into 8 bytes of `output`, one bit per byte
// (value 0 or 1), least-significant bit first:
//   output[idx*8 + i] = bit i of input[idx], for i = 0..7.
// Launch layout: 1-D grid, one thread per *input* byte; `size` is the input
// byte count, so `output` must hold size*8 bytes.
__global__ void convertToBinary_8block(unsigned char *input, unsigned char *output, int size)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= size) {
        return;
    }
    const unsigned char packed = input[idx];
    #pragma unroll
    for (int bit = 0; bit < 8; ++bit) {
        output[idx * 8 + bit] = (packed >> bit) & 1;
    }
}
// Encodes up to 8 LDPC code blocks (CBs) on the GPU using cuPHY's LDPC encoder.
//
// Parameters:
//   input      - host array of C pointers; input[j] holds the packed (8 bits per
//                byte) systematic bits of one CB.
//   output     - host array of pointers; output[i+macro_segment] receives N
//                unpacked bytes (one bit value per byte) of the encoded CB.
//   Zc         - LDPC lifting size.
//   Kb         - number of information-bit columns of the base graph.
//                NOTE(review): Kb is never read in this body — confirm whether
//                it can be dropped or is needed for a future code path.
//   K          - CB length in bits.
//   BG         - base graph index (1 or 2).
//   macro_num  - index of the current batch of 8 CBs; together with n_segments
//                it selects the segment range [8*macro_num, min(8*(macro_num+1), n_segments)).
//   n_segments - total number of CBs across all batches.
//
// NOTE(review): cudaMalloc/cudaMemcpy/cudaFree calls below are not wrapped in
// CUDA_CHECK, and gpuOutput/gpuInput leak if one of the cuphy exceptions is
// thrown before the cudaFree calls at the end — consider RAII wrappers.
void gpu_LDPCEncode_8block(unsigned char** input, unsigned char** output, int Zc, int Kb, int K, int BG, int macro_num, int n_segments)
{
// Segment window handled by this call: at most 8 CBs per batch (OAI convention).
unsigned int macro_segment, macro_segment_end;
macro_segment = 8*macro_num;
macro_segment_end = (n_segments > 8*(macro_num+1)) ? 8*(macro_num+1) : n_segments;
// C = number of CBs actually encoded in this batch; nothing to do when 0.
int C = macro_segment_end - macro_segment;
if (C==0) {
return;
}
int ncwnodes;
unsigned char* gpuOutput, *gpuInput;
// Codeword length depends on the base graph: N = Zc * (variable nodes + 2
// punctured columns).
if(BG == 1)
{
ncwnodes = CUPHY_LDPC_MAX_BG1_UNPUNCTURED_VAR_NODES;
}
else
{
ncwnodes = CUPHY_LDPC_MAX_BG2_UNPUNCTURED_VAR_NODES;
}
const int N = Zc * (ncwnodes + 2);
// Pad K and N up to the next multiple of 32 bits (0 when already aligned).
int K_padding = (K % 32 != 0) * (32 - (K % 32));
int N_padding = (N % 32 != 0) * (32 - (N % 32));
// Scratch buffers: gpuInput holds packed input bytes; gpuOutput holds the
// unpacked (1 byte per bit) encoder output.
// NOTE(review): gpuOutput is sized in *bits* ((N+N_padding)*C bytes) to match
// convertToBinary_8block's 8x expansion; gpuInput is also sized (K+K_padding)*C
// bytes although only (K+K_padding)/8*C bytes are ever copied in — confirm the
// extra space is intentional.
cudaMalloc((void**)&gpuOutput, (N+N_padding) * sizeof(unsigned char)*C);
cudaMalloc((void**)&gpuInput, (K+K_padding) * sizeof(unsigned char)*C);
// cuPHY bit tensors (K x C in, N x C out) addressed directly by the encoder.
cuphy::tensor_device d_in_tensor(CUPHY_BIT, K, C);
cuphy::tensor_device d_out_tensor(CUPHY_BIT, N, C);
// Flatten the per-CB host buffers into one contiguous device buffer,
// (K+K_padding)/8 packed bytes per CB.
for(int j = 0; j < C; j++)
{
CUDA_CHECK(cudaMemcpy(&gpuInput[j*(K+K_padding)/8], input[j], (K+K_padding)/8, cudaMemcpyHostToDevice));
}
// Reverse the bit order inside every byte (OAI packs MSB-first, cuPHY expects
// LSB-first) while copying into the input tensor.
// NOTE(review): size_h counts (K+K_padding)*C, i.e. *bits*, but reverse_bit
// indexes bytes — compare with `size = (N+N_padding)/8*C` used for the output
// kernel below. This looks 8x too large and would write past the end of
// d_in_tensor; verify against the tensor's allocated byte size.
int block = 1024;
int size_h = (K+K_padding)*C;
int grid = (size_h + block - 1) / block;
reverse_bit<<<grid, block>>>((uint8_t*)d_in_tensor.addr(), gpuInput, size_h);
// cuPHY encoder setup: query descriptor/workspace sizes, allocate them once
// (static locals persist across calls — assumes the sizes never change between
// calls; TODO confirm), then build the launch configuration.
std::unique_ptr<cuphyLDPCEncodeLaunchConfig> ldpc_hndl = std::make_unique<cuphyLDPCEncodeLaunchConfig>();
size_t desc_size = 0;
size_t alloc_size = 0;
size_t workspace_size = 0; // in bytes
int max_UEs = PDSCH_MAX_UES_PER_CELL_GROUP;
int maxParityNodes = 0;
int rv = 0; // redundancy version 0: transmit from the start of the circular buffer
static cuphyStatus_t s = cuphyLDPCEncodeGetDescrInfo(&desc_size,
&alloc_size,
max_UEs,
&workspace_size);
if(s != CUPHY_STATUS_SUCCESS)
{
throw cuphy::cuphy_fn_exception(s, "cuphyLDPCEncodeGetDescrInfo()");
}
static cuphy::unique_device_ptr<uint8_t> d_ldpc_desc = cuphy::make_unique_device<uint8_t>(desc_size);
static cuphy::unique_pinned_ptr<uint8_t> h_ldpc_desc = cuphy::make_unique_pinned<uint8_t>(desc_size);
static cuphy::unique_device_ptr<uint8_t> d_workspace = cuphy::make_unique_device<uint8_t>(workspace_size);
static cuphy::unique_pinned_ptr<uint8_t> h_workspace = cuphy::make_unique_pinned<uint8_t>(workspace_size);
cudaStream_t cuda_strm = 0; // default stream: serializes with the kernels above
s = cuphySetupLDPCEncode(ldpc_hndl.get(), // launch config (output)
d_in_tensor.desc().handle(), // source descriptor
d_in_tensor.addr(), // source address
d_out_tensor.desc().handle(), // destination descriptor
d_out_tensor.addr(), // destination address
BG, // base graph
Zc, // lifting size
true, // puncture output bits
maxParityNodes, // max parity nodes
rv, // redundancy version
0,
1,
nullptr,
nullptr,
h_workspace.get(),
d_workspace.get(),
h_ldpc_desc.get(), // host descriptor
d_ldpc_desc.get(), // device descriptor
1, // do async copy during setup
cuda_strm);
if(CUPHY_STATUS_SUCCESS != s)
{
throw cuphy::cuphy_fn_exception(s, "cuphySetupLDPCEncode()");
}
// Run the actual encoder kernel (ldpc_encode_in_bit_kernel inside cuPHY) and
// wait for it to finish before reading the result.
launch_kernel(ldpc_hndl.get()->m_kernelNodeParams, cuda_strm);
CUDA_CHECK_EXCEPTION(cudaStreamSynchronize(cuda_strm));
// Unpack the bit tensor into one byte per bit. `size` is the number of packed
// *bytes* read from d_out_tensor; the kernel writes 8x that many bytes to
// gpuOutput.
// NOTE(review): this assumes consecutive CBs occupy (N+N_padding)/8 contiguous
// bytes in d_out_tensor — confirm against the CUPHY_BIT tensor's column stride.
int block_size = 1024;
int size = (N+N_padding)/8*C;
int grid_size = (size + block_size - 1) / block_size;
convertToBinary_8block<<<grid_size, block_size>>>((unsigned char*)d_out_tensor.addr(), gpuOutput, size);
// Copy each unpacked CB back to the caller's per-segment output buffer.
// cudaMemcpy is blocking on the default stream, so the kernel above has
// completed before the copy reads gpuOutput.
for (int i = 0; i < C; i++)
{
CUDA_CHECK(cudaMemcpy(output[i+macro_segment], &gpuOutput[(N+N_padding)*i], (N)* sizeof(unsigned char), cudaMemcpyDeviceToHost));
}
cudaFree(gpuOutput);
cudaFree(gpuInput);
}
```
**代码说明:**
输入参数中,Zc是lift size,Kb是基图的信息比特所占列数,K是CB块长度(比特数),BG是基图编号,macro_num和n_segments是用来计算每一次编码开头段和结束段,即每一次编码CB数(OAI中最大一次编码8个CB,GPU编码暂且按照这个标准,但是GPU编码并没有具体限制)。
开始先声明所需要的变量,其中包括GPU编码所需要的一些变量声明。在aerial中,GPU变量均是tensor_device形式,这种变量才能直接访问GPU内存,而不需要进行CPU到GPU的转换。
``` c++
cuphy::tensor_device d_in_tensor(CUPHY_BIT, K, C);
cuphy::tensor_device d_out_tensor(CUPHY_BIT, N, C);
```
输入的是二维数组,这里需要将其变为一维再传递给GPU,即
```C++
for(int j = 0; j < C; j++)
{
CUDA_CHECK(cudaMemcpy(&gpuInput[j*(K+K_padding)/8], input[j], (K+K_padding)/8, cudaMemcpyHostToDevice));
}
```
由于GPU输入不是按照顺序字节输入,即不是大端模式,而是小端模式,高字节放在高地址,低字节放在低地址,但是OAI中是大端,所以输入的信息比特需要按照每8比特翻转一次,即(OAI中以unsigned char数组存储,所以只需对数组中的元素进行比特翻转即可)。即代码中的
```C++
int block = 1024;
int size_h = (K+K_padding)*C;
int grid = (size_h + block - 1) / block;
reverse_bit<<<grid, block>>>((uint8_t*)d_in_tensor.addr(), gpuInput, size_h);
```
上述代码是使用CUDA C写的,是专门为GPU编程的(有兴趣可以了解)。
后面一直到
```c++
launch_kernel(ldpc_hndl.get()->m_kernelNodeParams, 0);
```
都是GPU编码前的一些准备,这些参数不需要改变,保持就行。而真正运行GPU LDPC编码的就是上述内核函数(如果想要看GPU如何编码,可以参见aerial/cuPHY/src/cuphy/error_correction/ldpc_encode.cu 140行ldpc_encode_in_bit_kernel函数)。
顺便说一下,GPU编译码的代码与matlab中nrLDPCEncode和nrLDPCDecode函数编译码方式相同,也可以参考。
编好的码字放在d_out_tensor中,在此变量中,码字是按照逐比特的方式来存放的,当我们以unsigned char类型将码字从GPU中拷到CPU中时,得到的是每8个比特的结果,因此需要将其改为逐字节的形式,此时得到的结果就是每一个比特占1个字节(比如在GPU中是255,需要将其变为11111111,这样就将1个unsigned char元素变成了8个)。此部分也使用CUDA编程,即
```c++
int block_size = 1024;
int size = (N+N_padding)/8*C;
int grid_size = (size + block_size - 1) / block_size;
convertToBinary_8block<<<grid_size, block_size>>>((unsigned char*)d_out_tensor.addr(), gpuOutput, size);
```
最后将数据拷到CPU,即
```c++
for (int i = 0; i < C; i++)
{
CUDA_CHECK(cudaMemcpy(output[i+macro_segment], &gpuOutput[(N+N_padding)*i], (N)* sizeof(unsigned char), cudaMemcpyDeviceToHost));
}
```
|
|