[LLM INFER] Append attn (moved to https://github.com/PaddlePaddle/PaddleNLP/pull/9244) #9242

Status: Closed. The pull request proposed merging 33 commits.

Commits (33)
- b072465 append_attention 0914 (yuanlehome, Sep 14, 2024)
- b915f95 paddle::empty to phi::allocator (yuanlehome, Sep 14, 2024)
- 9b1e1d8 Merge branch 'develop' of https://github.com/PaddlePaddle/PaddleNLP i… (yuanlehome, Sep 19, 2024)
- 140a509 append_attn 0919 (yuanlehome, Sep 20, 2024)
- 5272b6f 0920 fix split_kv_block (yuanlehome, Sep 20, 2024)
- a42157d my change for merge 4 to 1 (yuanlehome, Sep 23, 2024)
- bec8eef fix prev (yuanlehome, Sep 23, 2024)
- 8dab056 merge zhenyun 0923 (yuanlehome, Sep 23, 2024)
- d5047b5 fix prev (yuanlehome, Sep 23, 2024)
- 006a467 fix var name (yuanlehome, Sep 23, 2024)
- 73e2c06 update (yuanlehome, Sep 23, 2024)
- a8acb2b fix config (yuanlehome, Sep 24, 2024)
- ec46a89 fix (yuanlehome, Sep 24, 2024)
- cb02ee5 fix append_attn (lizhenyun01, Sep 27, 2024)
- 83a19a6 Merge branch 'develop' of https://github.com/PaddlePaddle/PaddleNLP i… (yuanlehome, Sep 27, 2024)
- 37fc7da fix --use_fake_parameter (yuanlehome, Sep 27, 2024)
- a3b265b refine paddle::empty(), fix memory error, support multi_stream for at… (yuanlehome, Sep 29, 2024)
- 68a09b6 fix and rename attention as append_attention (yuanlehome, Sep 29, 2024)
- 2bcd939 rename file (yuanlehome, Sep 29, 2024)
- 74941a0 fix (yuanlehome, Sep 29, 2024)
- 19a0bdb encoder GQANEOX rope support (lizhenyun01, Oct 8, 2024)
- a9078cb decoder a8w8c8 GQANEOX rope support (lizhenyun01, Oct 8, 2024)
- f64f962 merge get_block_shape and split_kv_block (yuanlehome, Oct 8, 2024)
- 7ba73f8 bf16 neox rope support (lizhenyun01, Oct 9, 2024)
- 6837c23 fix diff (lizhenyun01, Oct 9, 2024)
- 0a5ae96 separate compilation (lizhenyun01, Oct 9, 2024)
- e9cfc55 manual destroy stream (lizhenyun01, Oct 9, 2024)
- 478c517 fix multi stream (yuanlehome, Oct 10, 2024)
- aa1e96a Merge branch 'develop' of https://github.com/PaddlePaddle/PaddleNLP i… (yuanlehome, Oct 10, 2024)
- e8ddfe8 qwen/llama support weightonly (yuanlehome, Oct 10, 2024)
- 8798938 fix multi stream (yuanlehome, Oct 10, 2024)
- f6a64d0 qwen-moe and mixtral support append_attn (yuanlehome, Oct 10, 2024)
- 2292780 refine code (yuanlehome, Oct 11, 2024)
792 changes: 792 additions & 0 deletions csrc/gpu/append_attention.cu

Large diffs are not rendered by default.

51 changes: 51 additions & 0 deletions csrc/gpu/append_attn/append_attention_bfloat16_bfloat16_kernel.cu
@@ -0,0 +1,51 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "append_attention_kernel.h"

template void CascadeAppendAttentionKernel<paddle::bfloat16, paddle::bfloat16>(
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
const paddle::Tensor& cache_k, // [max_block_num, num_heads, block_size, head_dim]
const paddle::Tensor& cache_v, // [max_block_num, num_heads, head_dim, block_size]
const paddle::optional<paddle::Tensor>& attn_mask,
const paddle::optional<paddle::Tensor>& cache_k_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>& cache_v_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>& cache_k_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>& cache_v_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>& shift_bias, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>& smooth_weight, // [num_kv_heads, head_dim]
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
const std::string& cache_quant_type_str,
const int num_blocks,
const int block_shape_q,
const int max_seq_len,
const int max_dec_len,
const int num_heads,
const int kv_num_heads,
const int head_dim,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
const bool is_decoder,
const bool enable_prefill,
cudaStream_t& stream,
paddle::Tensor* out);
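
The per-dtype kernel files in this PR differ only in their template arguments: each one explicitly instantiates CascadeAppendAttentionKernel for a single (compute dtype, cache dtype) pair, which matches the "separate compilation" commit in the list above. Below is a minimal sketch of that pattern using a toy kernel with a simplified signature; AppendAttnToy is illustrative only, not the PR's actual code.

// toy_kernel.cuh: shared template definition, included by every
// per-dtype translation unit.
#include <cstdint>

template <typename T, typename CacheT>
__global__ void AppendAttnToy(const T* qkv, const CacheT* cache,
                              T* out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  // Toy body standing in for the real attention math.
  if (i < n) out[i] = qkv[i] + static_cast<T>(cache[i]);
}

// toy_kernel_float_int8.cu: one translation unit per type pair.
// The explicit instantiation forces nvcc to generate code for exactly
// this pair in this file, so each heavy template instance compiles in
// its own translation unit and the files build in parallel.
template __global__ void AppendAttnToy<float, int8_t>(
    const float*, const int8_t*, float*, int);

Splitting the instantiations across files trades a handful of near-identical boilerplate files for shorter, parallelizable nvcc builds, which matters for heavily templated attention kernels like this one.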
51 changes: 51 additions & 0 deletions csrc/gpu/append_attn/append_attention_bfloat16_int8_kernel.cu
@@ -0,0 +1,51 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "append_attention_kernel.h"

template void CascadeAppendAttentionKernel<paddle::bfloat16, int8_t>(
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
const paddle::Tensor& cache_k, // [max_block_num, num_heads, block_size, head_dim]
const paddle::Tensor& cache_v, // [max_block_num, num_heads, head_dim, block_size]
const paddle::optional<paddle::Tensor>& attn_mask,
const paddle::optional<paddle::Tensor>& cache_k_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>& cache_v_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>& cache_k_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>& cache_v_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>& shift_bias, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>& smooth_weight, // [num_kv_heads, head_dim]
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
const std::string& cache_quant_type_str,
const int num_blocks,
const int block_shape_q,
const int max_seq_len,
const int max_dec_len,
const int num_heads,
const int kv_num_heads,
const int head_dim,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
const bool is_decoder,
const bool enable_prefill,
cudaStream_t& stream,
paddle::Tensor* out);
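
On the call side, something has to map the runtime dtype and cache_quant_type_str onto one of these instantiations. The PR's actual dispatcher presumably lives in the unrendered append_attention.cu, so the sketch below is only an assumption about its shape; DispatchAppendAttnToy, RunAppendAttnToy, and the quant-type strings are hypothetical names.

#include <cstdint>
#include <stdexcept>
#include <string>

// Declared in a header; each (T, CacheT) pair is explicitly
// instantiated in its own .cu file, as in the two files shown above.
template <typename T, typename CacheT>
void RunAppendAttnToy();

// Hypothetical runtime dispatch from cache quantization mode to a
// compiled instantiation (string values are illustrative).
void DispatchAppendAttnToy(const std::string& cache_quant_type) {
  if (cache_quant_type == "none") {
    RunAppendAttnToy<float, float>();   // unquantized KV cache
  } else if (cache_quant_type == "int8") {
    RunAppendAttnToy<float, int8_t>();  // int8-quantized KV cache
  } else {
    throw std::runtime_error("unsupported cache_quant_type: " +
                             cache_quant_type);
  }
}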