GPU LDPC 编码

bigbing 发表于 2023-6-22 17:13:42

本帖最后由 bigbing 于 2023-6-22 17:18 编辑

aerial中包含了整个5G物理层的GPU代码，但我们只需要其中涉及LDPC的部分。LDPC编码部分在 aerial/cuPHY/examples/ldpc_encode下。最终目的是在OAI中运行，因此需要重新编写该部分，这样才能在OAI中正常运行。

新建的文件（你自己建立）需要放在aerial/cuPHY/examples/error_correction，文件的名称后缀名需要以.cu结尾，否则不能运行CUDA编写的函数。以下是程序具体内容：

```c++
#include <stdio.h>
#include <limits>
#include "ldpc_decode_test_vec_gen.hpp"
#include "cuphy.h"
#include <stdlib.h>
#include <string>
#include <iostream>
#include <vector>
#include <getopt.h>
#include <bitset>
#include "cuphy.hpp"
#include "cuphy_hdf5.hpp"
#include "hdf5hpp.hpp"
#include <immintrin.h>

#define REVERSE_BIT(dat) \
(((dat & 0x01) << 7) | ((dat & 0x02) << 5) | ((dat & 0x04) << 3) | ((dat & 0x08) << 1) | \
((dat & 0x10) >> 1) | ((dat & 0x20) >> 3) | ((dat & 0x40) >> 5) | ((dat & 0x80) >> 7))

__global__ void reverse_bit(unsigned char *output, uint8_t* input, int size)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if(tid < size)
{
   output = REVERSE_BIT(input);
}
}

__global__ void reverse_bit_8block(unsigned char *input, int size)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if(tid < size)
{
   unsigned char value = input;
   input = REVERSE_BIT(value);
}
}

__global__ void convertToBinary_8block(unsigned char *input, unsigned char *output, int size)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if(tid < size)
{
   unsigned char value = input;
   for(int i=0; i<=7; i++)
   {
         output = (value & (1<<i)) ? 1 : 0;
   }
}
}

void gpu_LDPCEncode_8block(unsigned char** input, unsigned char** output, int Zc, int Kb, int K, int BG, int macro_num, int n_segments)
{
unsigned int macro_segment, macro_segment_end;
macro_segment = 8*macro_num;
macro_segment_end = (n_segments > 8*(macro_num+1)) ? 8*(macro_num+1) : n_segments;
int C = macro_segment_end - macro_segment;
if (C==0) {
   return;
}
int ncwnodes;
unsigned char* gpuOutput, *gpuInput;

if(BG == 1)
{
   ncwnodes = CUPHY_LDPC_MAX_BG1_UNPUNCTURED_VAR_NODES;
}
else
{
   ncwnodes = CUPHY_LDPC_MAX_BG2_UNPUNCTURED_VAR_NODES;
}
const int N = Zc * (ncwnodes + 2);

int K_padding = (K % 32 != 0) * (32 - (K % 32));
int N_padding = (N % 32 != 0) * (32 - (N % 32));
cudaMalloc((void**)&gpuOutput, (N+N_padding) * sizeof(unsigned char)*C);
cudaMalloc((void**)&gpuInput, (K+K_padding) * sizeof(unsigned char)*C);

cuphy::tensor_device d_in_tensor(CUPHY_BIT, K, C);
cuphy::tensor_device d_out_tensor(CUPHY_BIT, N, C);

for(int j = 0; j < C; j++)
{
   CUDA_CHECK(cudaMemcpy(&gpuInput, input, (K+K_padding)/8, cudaMemcpyHostToDevice));
}

int block = 1024;
int size_h = (K+K_padding)*C;
int grid = (size_h + block - 1) / block;
reverse_bit<<<grid, block>>>((uint8_t*)d_in_tensor.addr(), gpuInput, size_h);

std::unique_ptr<cuphyLDPCEncodeLaunchConfig> ldpc_hndl = std::make_unique<cuphyLDPCEncodeLaunchConfig>();
size_t                   desc_size= 0;
size_t                   alloc_size = 0;
size_t                   workspace_size = 0; // in bytes
int                      max_UEs = PDSCH_MAX_UES_PER_CELL_GROUP;
int                      maxParityNodes = 0;
int                      rv = 0;
static cuphyStatus_t             s       = cuphyLDPCEncodeGetDescrInfo(&desc_size,
                                                                     &alloc_size,
                                                                     max_UEs,
                                                                     &workspace_size);
if(s != CUPHY_STATUS_SUCCESS)
{
   throw cuphy::cuphy_fn_exception(s, "cuphyLDPCEncodeGetDescrInfo()");
}
static cuphy::unique_device_ptr<uint8_t> d_ldpc_desc = cuphy::make_unique_device<uint8_t>(desc_size);
static cuphy::unique_pinned_ptr<uint8_t> h_ldpc_desc = cuphy::make_unique_pinned<uint8_t>(desc_size);
static cuphy::unique_device_ptr<uint8_t> d_workspace = cuphy::make_unique_device<uint8_t>(workspace_size);
static cuphy::unique_pinned_ptr<uint8_t> h_workspace = cuphy::make_unique_pinned<uint8_t>(workspace_size);
cudaStream_t cuda_strm = 0;
s = cuphySetupLDPCEncode(ldpc_hndl.get(),    // launch config (output)
                        d_in_tensor.desc().handle(), // source descriptor
                        d_in_tensor.addr(),       // source address
                        d_out_tensor.desc().handle(), // destination descriptor
                        d_out_tensor.addr(),       // destination address
                        BG,                // base graph
                        Zc,                // lifting size
                        true,          // puncture output bits
                        maxParityNodes,    // max parity nodes
                        rv,                // redundancy version
                        0,
                        1,
                        nullptr,
                        nullptr,
                        h_workspace.get(),
                        d_workspace.get(),
                        h_ldpc_desc.get(), // host descriptor
                        d_ldpc_desc.get(), // device descriptor
                        1,                // do async copy during setup
                        cuda_strm);

if(CUPHY_STATUS_SUCCESS != s)
{
   throw cuphy::cuphy_fn_exception(s, "cuphySetupLDPCEncode()");
}

launch_kernel(ldpc_hndl.get()->m_kernelNodeParams, cuda_strm);

CUDA_CHECK_EXCEPTION(cudaStreamSynchronize(cuda_strm));

int block_size = 1024;
int size = (N+N_padding)/8*C;
int grid_size = (size + block_size - 1) / block_size;
convertToBinary_8block<<<grid_size, block_size>>>((unsigned char*)d_out_tensor.addr(), gpuOutput, size);

for (int i = 0; i < C; i++)
{
   CUDA_CHECK(cudaMemcpy(output, &gpuOutput[(N+N_padding)*i], (N)* sizeof(unsigned char), cudaMemcpyDeviceToHost));
}

cudaFree(gpuOutput);
cudaFree(gpuInput);
}

```

**代码说明：**

输入参数中，Zc是lift size, Kb是基图的信息比特所占列数，block_length是CB块长度，j和n_segments是用来计算每一次编码开头段和结束段，即每一次编码CB数（OAI中最大一次编码8个CB，GPU编码暂且按照这个标准，但是GPU编码并没有具体限制）。

开始先申明所需要的变量，其中包括GPU编码所需要的一些变量声明。在aerial中，GPU变量均是tensor_device形式，这种变量才能直接访问GPU内存，而不需要进行CPU到GPU的转换。

```c++
cuphy::tensor_device d_in_tensor(CUPHY_BIT, K, C);
cuphy::tensor_device d_out_tensor(CUPHY_BIT, N, C);
```

输入的是二维数组，这里需要将其变为一维再传递给GPU，即

```C++
for(int j = 0; j < C; j++)

{

CUDA_CHECK(cudaMemcpy(&gpuInput, input, (K+K_padding)/8, cudaMemcpyHostToDevice));

}
```

由于GPU输入不是按照顺序字节输入，即不是大端模式，而是小端模式，高字节放在高地址，低字节放在低地址，但是OAI中是大端，所以输入的信息比特需要按照每8比特翻转一次，即（OAI中以unsigned char数组存储，所以只需对数组中的元素进行比特翻转即可）。即代码中的

```C++

int block = 1024;

int size_h = (K+K_padding)*C;

int grid = (size_h + block - 1) / block;

reverse_bit<<<grid, block>>>((uint8_t*)d_in_tensor.addr(), gpuInput, size_h);
```

上述代码是使用CUDA C写的，是专门为GPU编程的（有兴趣可以了解）。

后面一直到

```c++
launch_kernel(ldpc_hndl.get()->m_kernelNodeParams, 0);
```

都是GPU编码前的一些准备，这些参数不需要改变，保持就行。而真正运行GPU LDPC编码的就是上述内核函数（如果想要看GPU如何编码，可以参见aerial/cuPHY/src/cuphy/error_correction/ldpc_encode.cu 140行ldpc_encode_in_bit_kernel函数）。

顺便说一下，GPU编译码的代码与matlab中nrLDPCEncode和nrLDPCDecode函数编译码方式相同，也可以参考。

编好的码字放在d_out_tensor中，在此变量中，码字是按照逐比特的方式来存放的，当我们以unsigned char类型将码字从GPU中拷到CPU中时，得到的是每8个比特的结果，因此需要将其改为逐字节的形式，此时得到的结果就是每一个比特占1个字节（比如在GPU中是255，需要将其变为11111111，这样就将1个unsigned char元素变成了8个）。此部分也使用CUDA编程，即

```c++
int block_size = 1024;

int size = (N+N_padding)/8*C;

int grid_size = (size + block_size - 1) / block_size;

convertToBinary_8block<<<grid_size, block_size>>>((unsigned char*)d_out_tensor.addr(), gpuOutput, size);
```

最后将数据拷到CPU，即

```c++

for (int i = 0; i < C; i++)

{

CUDA_CHECK(cudaMemcpy(output, &gpuOutput[(N+N_padding)*i], (N)* sizeof(unsigned char), cudaMemcpyDeviceToHost));

}
```

页: [1]

开源无线网络-OpenXG's Archiver

GPU LDPC 编码