bigbing 发表于 2023-6-22 17:19:35

GPU LDPC译码

LDPC译码部分在 aerial/cuPHY/examples/error_correction/cuphy_ex_ldpc.cpp中。新建的文件(你自己建立)需要放在aerial/cuPHY/examples/error_correction,文件的名称后缀名需要以.cu结尾,否则不能运行CUDA编写的函数。以下是程序具体内容:\#include <stdio.h>

\#include <limits>

\#include "ldpc_decode_test_vec_gen.hpp"

\#include "cuphy.h"

\#include <stdlib.h>

\#include <string>

\#include <iostream>

\#include <vector>

\#include <getopt.h>

\#include <bitset>

\#include "cuphy.hpp"

\#include "cuphy_hdf5.hpp"

\#include "hdf5hpp.hpp"

\#include <immintrin.h>

\#include <emmintrin.h>

\#include <cuda_fp16.h>

using namespace cuphy;



\#define REVERSE_BIT(dat) \

(((dat & 0x01) << 7) | ((dat & 0x02) << 5) | ((dat & 0x04) << 3) | ((dat & 0x08) << 1) | \

((dat & 0x10) >> 1) | ((dat & 0x20) >> 3) | ((dat & 0x40) >> 5) | ((dat & 0x80) >> 7))



__global__ void reverse_bit_8block_D(int8_t *input, int8_t *output, int size)

{

int tid = blockIdx.x * blockDim.x + threadIdx.x;

if(tid < size)

{

•    output = REVERSE_BIT(input);

}

}

__global__ void float_to_half(half *H, int8_t* SI, int size)

{

int tid = blockIdx.x * blockDim.x + threadIdx.x;

if(tid < size)

{

•    float b = (float)SI;

•    H = __float2half(b);

}

}

__global__ void convertToBinary_8block_D(unsigned char *input, unsigned char *output, int size)

{

int tid = blockIdx.x * blockDim.x + threadIdx.x;

if(tid < size)

{

•    unsigned char value = input;

•    for(int i=0; i<=7; i++)

•    {

•      output = (value & (1<<i)) ? 1 : 0;

•   

•    }

}

}



// get_QAM_log2_D()

int get_QAM_log2_D(const char* str)

{

if(0 == strcmp(str, "QAM256"))

{

•    return CUPHY_QAM_256;

}

else if(0 == strcmp(str, "QAM64"))

{

•    return CUPHY_QAM_64;

}

else if(0 == strcmp(str, "QAM16"))

{

•    return CUPHY_QAM_16;

}

else if(0 == strcmp(str, "QPSK"))

{

•    return CUPHY_QAM_4;

}

else if(0 == strcmp(str, "BPSK"))

{

•    return CUPHY_QAM_2;

}

else

{

•    std::runtime_error(std::string("Invalid modulation: ") + str);

•    return 0;

}

}



void gpu_LDPCDecode_single(int8_t* input, int8_t* output, int BG, int Kb, int Z, float code_rate, int numMaxIter, int K, const char* str)

{

int      numIterations   = numMaxIter;

int      algoIndex       = 0;

float   minSumNorm      = 0.0f;

int      log2QAM      = get_QAM_log2_D(str);

bool   chooseHighThroughput = false;

int N = std::lroundf(K / code_rate);

// tv.config().num_cw = C;

int K_padding = (K % 32 != 0) * (32 - (K % 32));



// N: Number of modulated bits

// B: Input block size

// mb: number of parity nodes

// P: punctured parity bits (0 <= P < Z)

//

// N = B + Z(mb - 2) - P

// N - B       P

// ----- + 2 = mb - ---

//Z      Z

//

//

// 0 <= P/Z < 1

//

// mb = ceil(2 + (N - B)/Z) = ceil((2Z + N - B) / Z)

int mb = static_cast<int>(std::ceil((N - K + (2 * Z)) / static_cast<float>(Z)));



const int       V      = mb + ((1 == BG) ? 22 : 10);

const int       NUM_SYMBOLS   = (Z*V + log2QAM - 1) / log2QAM;

const int       NUM_LLR = NUM_SYMBOLS * log2QAM;

int NUM_padding = (NUM_LLR % 32 != 0) * (32 - (NUM_LLR % 32));

cuphy::tensor_device tLLR_16(CUPHY_R_16F, NUM_LLR);

tensor_device tDecode(CUPHY_BIT, K);



int8_t* gpuOutput;

int8_t* gpuInput;

cudaMalloc((void**)&gpuInput, (NUM_LLR+NUM_padding) * sizeof(int8_t));

cudaMalloc((void**)&gpuOutput, (K+K_padding)/8 * sizeof(int8_t));



CUDA_CHECK(cudaMemcpy(gpuInput, input, NUM_LLR, cudaMemcpyHostToDevice));

int block = 1024;

int size_h = NUM_LLR+NUM_padding;

int grid = (size_h + block - 1) / block;

float_to_half<<<grid, block>>>((half*)tLLR_16.addr(), gpuInput, size_h);



// Create an LDPC decoder instance

static cuphy::context ctx;

static cuphy::LDPC_decoder dec(ctx);

//--------------------------------------------------------------

// Initialize an LDPC decode configuration. This is used for

// both the tensor and transport block interfaces.

uint32_t decode_flags = chooseHighThroughput ? CUPHY_LDPC_DECODE_CHOOSE_THROUGHPUT : 0;



cuphy::LDPC_decode_config dec_cfg(CUPHY_R_16F,    // LLR type (fp16 or fp32)

•                  mb,   // num parity nodes

•                  Z,    // lifting size

•                  numIterations, // max num iterations

•                  Kb,   // info nodes

•                  minSumNorm,   // normalization value

•                  decode_flags,// flags

•                  BG,   // base graph

•                  algoIndex,   // algorithm index

•                  nullptr);    // workspace address

//--------------------------------------------------------------

// If no normalization value was provided, query the library for

// an appropriate value.

if(minSumNorm <= 0.0f)

{

    dec.set_normalization(dec_cfg);

}

//--------------------------------------------------------------

// Initialize an LDPC decode descriptor structure. (This is only

// used when the transport block interface is selected.)

LDPC_decode_desc dec_desc(dec_cfg);

//--------------------------------------------------------------

// Initialize an LDPC decode tensor params structure. (This is

// only used when the tensor-based decoder interface is selected.)

LDPC_decode_tensor_params dec_tensor(dec_cfg,         // LDPC configuration

•                      tDecode.desc().handle(), // output descriptor

•                      tDecode.addr(),      // output address

•                      tLLR_16.desc().handle(),// LLR descriptor

•                      tLLR_16.addr());      // LLR address



dec.decode(dec_tensor);

int block_size = 1024;

int size = (K+K_padding)/8;

int grid_size = (size + block_size - 1) / block_size;

reverse_bit_8block_D<<<grid_size, block_size>>>((int8_t*)tDecode.addr(), gpuOutput, size);

CUDA_CHECK(cudaMemcpy(output, gpuOutput, K* sizeof(int8_t)/8, cudaMemcpyDeviceToHost));

cudaFree(gpuOutput);

cudaFree(gpuInput);

}

这里只讲解与译码相关部分,其余与编码部分类似,不再赘述。首先是输入参数,其中code_rate是码率,据此与block_length(即B)计算输入的总比特数。numMaxIter是译码迭代的最大次数,F是填充比特(由于有时B不是恰好等于KbZc,因此就需要填充一部分零使其等于KbZc),str代表的是调制方式(不同调制方式导致不同的符号数)。下面直到tensor_device声明都是一些变量设置,默认即可。先将输入二维数组变为一维并通过cudaMemcpy函数传递给GPU变量gpuInput。 CUDA_CHECK(cudaMemcpy(gpuInput, input, NUM_LLR, cudaMemcpyHostToDevice));
随后直接使用CUDA将输入的int8_t数据类型转变为half类型并赋给d_in_tensor(需要说明,在GPU译码中,输入译码器的格式是半精度浮点数,只有16位比特,相比于单精度的32位,这样做可以大大节省存储空间)。即int block = 1024;

int size_h = NUM_LLR+NUM_padding;

int grid = (size_h + block - 1) / block;

float_to_half<<<grid, block>>>((half*)tLLR_16.addr(), gpuInput, size_h);
此函数结构也很简单,如下:__global__ void float_to_half(half *H, int8_t* SI, int size)

{

int tid = blockIdx.x * blockDim.x + threadIdx.x;

if(tid < size)

{

•    float b = (float)SI;

•    H = __float2half(b);

}

}

先将int8_t变为float,再将float使用__float2half函数变为half类型,这样使用CUDA来进行转变速度非常快。随后就是一些GPU译码的参数设置,同样保持不变就行。最后将其从GPU中拷出,译码就完成了。
页: [1]
查看完整版本: GPU LDPC译码