From a1c96ae7901d8b24558a856a7847b9dcb4d6a84a Mon Sep 17 00:00:00 2001
From: inter
Date: Sun, 21 Sep 2025 20:19:11 +0800
Subject: [PATCH] Add File

---
 .../pointnet2_stack/src/group_points_gpu.cu   | 125 ++++++++++++++++++
 1 file changed, 125 insertions(+)
 create mode 100644 pcdet/ops/pointnet2/pointnet2_stack/src/group_points_gpu.cu

diff --git a/pcdet/ops/pointnet2/pointnet2_stack/src/group_points_gpu.cu b/pcdet/ops/pointnet2/pointnet2_stack/src/group_points_gpu.cu
new file mode 100644
index 0000000..62e341e
--- /dev/null
+++ b/pcdet/ops/pointnet2/pointnet2_stack/src/group_points_gpu.cu
@@ -0,0 +1,125 @@
+/*
+Stacked-batch-data version of point grouping, modified from the original implementation of the official PointNet++ code.
+Written by Shaoshuai Shi
+All Rights Reserved 2019-2020.
+*/
+
+
+#include <math.h>
+#include <stdio.h>
+
+#include "cuda_utils.h"
+#include "group_points_gpu.h"
+
+
+__global__ void group_points_grad_kernel_stack(int B, int M, int C, int N, int nsample,
+    const float *grad_out, const int *idx, const int *idx_batch_cnt, const int *features_batch_cnt, float *grad_features) {
+    // :param grad_out: (M1 + M2 ..., C, nsample) tensor of the gradients of the output from forward
+    // :param idx: (M1 + M2 ..., nsample) tensor containing the indices of features to group with
+    // :param idx_batch_cnt: (batch_size) [M1 + M2 ...] tensor containing the number of grouped points in each batch
+    // :param features_batch_cnt: (batch_size) [N1 + N2 ...] tensor containing the number of features in each batch
+    // :return:
+    //     grad_features: (N1 + N2 ..., C) gradient of the features
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    int sample_idx = index % nsample;
+    int C_idx = (index / nsample) % C;
+    int pt_idx = (index / nsample / C);
+
+    if (pt_idx >= M || C_idx >= C || sample_idx >= nsample) return;
+
+    int bs_idx = 0, pt_cnt = idx_batch_cnt[0];  // locate the batch this point falls into
+    for (int k = 1; k < B; k++){
+        if (pt_idx < pt_cnt) break;
+        pt_cnt += idx_batch_cnt[k];
+        bs_idx = k;
+    }
+
+    int features_batch_start_idx = 0;  // offset of this batch's features in the stacked tensor
+    for (int k = 0; k < bs_idx; k++) features_batch_start_idx += features_batch_cnt[k];
+
+    grad_out += pt_idx * C * nsample + C_idx * nsample + sample_idx;
+    idx += pt_idx * nsample + sample_idx;
+    grad_features += (features_batch_start_idx + idx[0]) * C + C_idx;
+
+    atomicAdd(grad_features, grad_out[0]);  // several samples may reference the same feature point
+}
+
+void group_points_grad_kernel_launcher_stack(int B, int M, int C, int N, int nsample,
+    const float *grad_out, const int *idx, const int *idx_batch_cnt, const int *features_batch_cnt, float *grad_features) {
+    // :param grad_out: (M1 + M2 ..., C, nsample) tensor of the gradients of the output from forward
+    // :param idx: (M1 + M2 ..., nsample) tensor containing the indices of features to group with
+    // :param idx_batch_cnt: (batch_size) [M1 + M2 ...] tensor containing the number of grouped points in each batch
+    // :param features_batch_cnt: (batch_size) [N1 + N2 ...] tensor containing the number of features in each batch
+    // :return:
+    //     grad_features: (N1 + N2 ..., C) gradient of the features
+
+    cudaError_t err;
+    // dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b);  // blockIdx.x(col), blockIdx.y(row)
+    dim3 blocks(DIVUP(M * C * nsample, THREADS_PER_BLOCK));  // blockIdx.x(col), blockIdx.y(row)
+    dim3 threads(THREADS_PER_BLOCK);
+
+    group_points_grad_kernel_stack<<<blocks, threads>>>(B, M, C, N, nsample, grad_out, idx, idx_batch_cnt, features_batch_cnt, grad_features);
+
+    err = cudaGetLastError();
+    if (cudaSuccess != err) {
+        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
+        exit(-1);
+    }
+}
+
+
+__global__ void group_points_kernel_stack(int B, int M, int C, int nsample,
+    const float *features, const int *features_batch_cnt, const int *idx, const int *idx_batch_cnt, float *out) {
+    // :param features: (N1 + N2 ..., C) tensor of features to group
+    // :param features_batch_cnt: (batch_size) [N1 + N2 ...] tensor containing the number of features in each batch
+    // :param idx: (M1 + M2 ..., nsample) tensor containing the indices of features to group with
+    // :param idx_batch_cnt: (batch_size) [M1 + M2 ...] tensor containing the number of grouped points in each batch
+    // :return:
+    //     output: (M1 + M2 ..., C, nsample) tensor
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    int sample_idx = index % nsample;
+    int C_idx = (index / nsample) % C;
+    int pt_idx = (index / nsample / C);
+
+    if (pt_idx >= M || C_idx >= C || sample_idx >= nsample) return;
+
+    int bs_idx = 0, pt_cnt = idx_batch_cnt[0];  // locate the batch this point falls into
+    for (int k = 1; k < B; k++){
+        if (pt_idx < pt_cnt) break;
+        pt_cnt += idx_batch_cnt[k];
+        bs_idx = k;
+    }
+
+    int features_batch_start_idx = 0;  // offset of this batch's features in the stacked tensor
+    for (int k = 0; k < bs_idx; k++) features_batch_start_idx += features_batch_cnt[k];
+    features += features_batch_start_idx * C;
+
+    idx += pt_idx * nsample + sample_idx;
+    int in_idx = idx[0] * C + C_idx;
+    int out_idx = pt_idx * C * nsample + C_idx * nsample + sample_idx;
+
+    out[out_idx] = features[in_idx];
+}
+
+
+void group_points_kernel_launcher_stack(int B, int M, int C, int nsample,
+    const float *features, const int *features_batch_cnt, const int *idx, const int *idx_batch_cnt, float *out) {
+    // :param features: (N1 + N2 ..., C) tensor of features to group
+    // :param features_batch_cnt: (batch_size) [N1 + N2 ...] tensor containing the number of features in each batch
+    // :param idx: (M1 + M2 ..., nsample) tensor containing the indices of features to group with
+    // :param idx_batch_cnt: (batch_size) [M1 + M2 ...] tensor containing the number of grouped points in each batch
+    // :return:
+    //     output: (M1 + M2 ..., C, nsample) tensor
+
+    cudaError_t err;
+    dim3 blocks(DIVUP(M * C * nsample, THREADS_PER_BLOCK));  // blockIdx.x(col), blockIdx.y(row)
+    dim3 threads(THREADS_PER_BLOCK);
+
+    group_points_kernel_stack<<<blocks, threads>>>(B, M, C, nsample, features, features_batch_cnt, idx, idx_batch_cnt, out);
+    // cudaDeviceSynchronize();  // for using printf in kernel function
+    err = cudaGetLastError();
+    if (cudaSuccess != err) {
+        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
+        exit(-1);
+    }
+}
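
Note (not part of the patch): to sanity-check the stacked indexing, a minimal host-side driver for the forward launcher might look like the sketch below, compiled with nvcc and linked against this .cu file. The tiny shapes, the input values, and the standalone main() build are all illustrative assumptions; in OpenPCDet these launchers are normally reached through the PyTorch wrappers, not called directly. The backward launcher follows the same calling pattern with grad_out/grad_features swapped in for features/out.

// driver.cu -- illustrative only; nvcc driver.cu group_points_gpu.cu
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

// Forward declaration matching the definition in group_points_gpu.cu.
void group_points_kernel_launcher_stack(int B, int M, int C, int nsample,
    const float *features, const int *features_batch_cnt, const int *idx, const int *idx_batch_cnt, float *out);

int main() {
    // Two stacked batches: N1 = 3, N2 = 2 feature points (C = 2 channels each),
    // M1 = 2, M2 = 1 query points, each grouping nsample = 2 neighbours.
    const int B = 2, N = 5, M = 3, C = 2, nsample = 2;
    std::vector<float> features = {0,1, 10,11, 20,21, 30,31, 40,41};  // (N, C), stacked
    std::vector<int>   features_batch_cnt = {3, 2};                   // [N1, N2]
    std::vector<int>   idx = {0,1, 1,2, 0,1};                         // (M, nsample), per-batch local indices
    std::vector<int>   idx_batch_cnt = {2, 1};                        // [M1, M2]

    float *d_features, *d_out;
    int *d_fcnt, *d_idx, *d_icnt;
    cudaMalloc(&d_features, N * C * sizeof(float));
    cudaMalloc(&d_out, M * C * nsample * sizeof(float));
    cudaMalloc(&d_fcnt, B * sizeof(int));
    cudaMalloc(&d_idx, M * nsample * sizeof(int));
    cudaMalloc(&d_icnt, B * sizeof(int));
    cudaMemcpy(d_features, features.data(), N * C * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_fcnt, features_batch_cnt.data(), B * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_idx, idx.data(), M * nsample * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_icnt, idx_batch_cnt.data(), B * sizeof(int), cudaMemcpyHostToDevice);

    group_points_kernel_launcher_stack(B, M, C, nsample, d_features, d_fcnt, d_idx, d_icnt, d_out);

    std::vector<float> out(M * C * nsample);
    cudaMemcpy(out.data(), d_out, out.size() * sizeof(float), cudaMemcpyDeviceToHost);
    // The last query point belongs to batch 1, so its local idx {0, 1} resolves to
    // stacked feature rows 3 and 4; channel 0 should read 30.0 and 40.0.
    printf("out[last point, channel 0] = %.1f %.1f\n",
           out[2 * C * nsample + 0], out[2 * C * nsample + 1]);

    cudaFree(d_features); cudaFree(d_out); cudaFree(d_fcnt); cudaFree(d_idx); cudaFree(d_icnt);
    return 0;
}

The atomicAdd in the backward kernel is the reason the two passes differ: the forward pass is a pure gather (each output element reads exactly one feature), while the backward pass is a scatter in which many (point, sample) pairs can accumulate into the same feature row, so unsynchronized writes would race.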