From e9e535bf98b8e5557e716f69abefa4eafc40d91d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=B8=81=E5=B0=8F=E5=B8=85?= <56024577+dingxiaoshuai123@users.noreply.github.com> Date: Mon, 6 May 2024 17:46:09 +0800 Subject: [PATCH] feat: Merge branch import_braft (#300) * feat:import braft (#130) * feat: Implementing Redis-Raft commands to build a cluster. (#136) Co-authored-by: panlei-coder Co-authored-by: century <919745273@qq.com> Co-authored-by: panlei-coder <62509266+panlei-coder@users.noreply.github.com> Co-authored-by: alexstocks Co-authored-by: dingxiaoshuai123 <2486016589@qq.com> Co-authored-by: longfar --- .github/workflows/import_braft.yml | 60 ++ .github/workflows/pikiwidb.yml | 2 + CMakeLists.txt | 82 ++- README_CN.md | 1 - build.sh | 8 +- cmake/braft.cmake | 46 ++ cmake/brpc.cmake | 49 ++ cmake/findTools.cmake | 52 +- cmake/gflags.cmake | 22 +- cmake/gtest.cmake | 3 +- cmake/leveldb.cmake | 16 +- cmake/libevent.cmake | 4 + cmake/llhttp.cmake | 6 + cmake/openssl.cmake | 26 + cmake/protobuf.cmake | 169 +++++ cmake/rocksdb.cmake | 2 +- cmake/zlib.cmake | 41 ++ pikiwidb.conf | 10 +- save_load.sh | 18 + src/CMakeLists.txt | 28 +- src/base_cmd.h | 13 +- src/client.cc | 45 +- src/client.h | 6 +- src/cmd_admin.cc | 96 +++ src/cmd_admin.h | 16 +- src/cmd_raft.cc | 231 +++++++ src/cmd_raft.h | 83 +++ src/cmd_set.cc | 1 + src/cmd_set.h | 1 + src/cmd_table_manager.cc | 11 +- src/cmd_thread_pool.h | 2 +- src/config.cc | 3 +- src/config.h | 6 + src/db.cc | 94 ++- src/db.h | 27 +- src/net/CMakeLists.txt | 4 + src/pikiwidb.cc | 20 +- src/pikiwidb.h | 7 +- src/praft/CMakeLists.txt | 46 ++ src/praft/binlog.proto | 22 + src/praft/praft.cc | 651 ++++++++++++++++++ src/praft/praft.h | 169 +++++ src/praft/praft.proto | 13 + src/praft/praft_service.h | 26 + src/praft/psnapshot.cc | 102 +++ src/praft/psnapshot.h | 36 + src/pstd/CMakeLists.txt | 1 + src/pstd/pstd_string.cc | 6 + src/pstd/pstd_string.h | 2 + src/pstd/thread_pool.h | 10 +- src/replication.cc | 21 +- src/replication.h | 10 +- src/storage/CMakeLists.txt | 31 +- src/storage/include/storage/storage.h | 37 +- src/storage/include/storage/storage_define.h | 1 + src/storage/src/batch.h | 112 +++ src/storage/src/debug.h | 3 - src/storage/src/log_index.cc | 194 ++++++ src/storage/src/log_index.h | 256 +++++++ src/storage/src/redis.cc | 38 +- src/storage/src/redis.h | 26 + src/storage/src/redis_hashes.cc | 27 +- src/storage/src/redis_lists.cc | 26 +- src/storage/src/redis_sets.cc | 25 +- src/storage/src/redis_strings.cc | 7 +- src/storage/src/redis_zsets.cc | 19 +- src/storage/src/storage.cc | 225 +++++- src/storage/src/storage_murmur3.h | 2 +- src/storage/tests/CMakeLists.txt | 30 + src/storage/tests/flush_oldest_cf_test.cc | 484 +++++++++++++ src/storage/tests/log_index_collector_test.cc | 176 +++++ src/storage/tests/log_index_test.cc | 274 ++++++++ src/store.cc | 51 +- src/store.h | 44 +- tests/consistency_test.go | 325 +++++++++ tests/hash_test.go | 1 - tests/util/pikiwidb.go | 25 +- 77 files changed, 4626 insertions(+), 239 deletions(-) create mode 100644 .github/workflows/import_braft.yml create mode 100644 cmake/braft.cmake create mode 100644 cmake/brpc.cmake create mode 100644 cmake/openssl.cmake create mode 100644 cmake/protobuf.cmake create mode 100644 cmake/zlib.cmake create mode 100755 save_load.sh create mode 100644 src/cmd_raft.cc create mode 100644 src/cmd_raft.h create mode 100644 src/praft/CMakeLists.txt create mode 100644 src/praft/binlog.proto create mode 100644 src/praft/praft.cc create mode 100644 src/praft/praft.h create mode 100644 
src/praft/praft.proto create mode 100644 src/praft/praft_service.h create mode 100644 src/praft/psnapshot.cc create mode 100644 src/praft/psnapshot.h create mode 100644 src/storage/src/batch.h create mode 100644 src/storage/src/log_index.cc create mode 100644 src/storage/src/log_index.h create mode 100644 src/storage/tests/CMakeLists.txt create mode 100644 src/storage/tests/flush_oldest_cf_test.cc create mode 100644 src/storage/tests/log_index_collector_test.cc create mode 100644 src/storage/tests/log_index_test.cc create mode 100644 tests/consistency_test.go diff --git a/.github/workflows/import_braft.yml b/.github/workflows/import_braft.yml new file mode 100644 index 000000000..82e754f8d --- /dev/null +++ b/.github/workflows/import_braft.yml @@ -0,0 +1,60 @@ +name: Import BRaft Actions (Temporary) + +on: + push: + pull_request: + branches: [ "import-braft" ] + +jobs: + check_format: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Build + run: bash ci/build.sh + + - name: Check Format + working-directory: ${{ github.workspace }}/build + run: make check-format + + build_on_macos: + runs-on: macos-latest + needs: check_format + + steps: + - uses: actions/checkout@v4 + + - name: Build + env: + CPLUS_INCLUDE_PATH: /opt/homebrew/include + run: | + brew install autoconf + brew install go + sh build.sh + + - name: Run Go E2E Tests + working-directory: ${{ github.workspace }}/build + run: | + cd ../tests + go mod tidy + go test ./pikiwidb_suite_test.go ./consistency_test.go -v + + build_on_ubuntu: + runs-on: ubuntu-latest + needs: check_format + + steps: + - uses: actions/checkout@v4 + + - name: Build + run: | + bash build.sh + + - name: Run Go E2E Tests + working-directory: ${{ github.workspace }}/build + run: | + cd ../tests + go mod tidy + go test ./pikiwidb_suite_test.go ./consistency_test.go -v diff --git a/.github/workflows/pikiwidb.yml b/.github/workflows/pikiwidb.yml index 4f37da405..976862951 100644 --- a/.github/workflows/pikiwidb.yml +++ b/.github/workflows/pikiwidb.yml @@ -27,6 +27,8 @@ jobs: - uses: actions/checkout@v4 - name: Build + env: + CPLUS_INCLUDE_PATH: /opt/homebrew/include run: | brew install autoconf brew install go diff --git a/CMakeLists.txt b/CMakeLists.txt index 02e794285..a352f0600 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,12 @@ +# Copyright (c) 2023-present, Qihoo, Inc. All rights reserved. +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. An additional grant +# of patent rights can be found in the PATENTS file in the same directory. + CMAKE_MINIMUM_REQUIRED(VERSION 3.14) PROJECT(PikiwiDB) -SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated -g -D'GIT_COMMIT_ID=\"${GIT_COMMIT_ID}\"'") +SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=return-type -D'GIT_COMMIT_ID=\"${GIT_COMMIT_ID}\"'") # Avoid warning about DOWNLOAD_EXTRACT_TIMESTAMP in CMake 3.24: IF (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0") @@ -35,7 +40,7 @@ EXECUTE_PROCESS(COMMAND git rev-parse HEAD OUTPUT_VARIABLE GIT_COMMIT_ID OUTPUT_ ADD_DEFINITIONS(-DKPIKIWIDB_GIT_COMMIT_ID="${GIT_COMMIT_ID}") MESSAGE(STATUS "Git commit id: ${GIT_COMMIT_ID}") -############# You should enable sanitizer if you are developing pika ############# +############# You should enable sanitizer if you are developing pikiwidb ############# # Uncomment the following two lines to enable AddressSanitizer to detect memory leaks and other memory-related bugs. 
# SET(CMAKE_BUILD_TYPE "Debug") # SET(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=address -O0 -fno-omit-frame-pointer -fno-optimize-sibling-calls") @@ -78,7 +83,7 @@ ELSEIF (CMAKE_SYSTEM_NAME MATCHES "Linux") set(CMAKE_THREAD_LIBS_INIT "-lpthread") ELSEIF (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") SET(CMAKE_EXE_LINKER_FLAGS "-static-libgcc -static-libstdc++") - SET(CMAKE_CXX_FLAGS "-pthread -Wl,--no-as-needed -ldl -Wno-restrict") + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -Wl,--no-as-needed -ldl") ENDIF () ADD_DEFINITIONS(-DOS_LINUX) ELSE () @@ -101,28 +106,80 @@ SET(INSTALL_LIBDIR ${STAGED_INSTALL_PREFIX}/lib) SET(INSTALL_LIBDIR_64 ${STAGED_INSTALL_PREFIX}/lib64) SET(CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH} ${STAGED_INSTALL_PREFIX}) SET(BUILD_SUPPORT_DIR ${PROJECT_SOURCE_DIR}/build_support) +# make sure we use the same compiler for all dependencies +SET(CMAKE_POSITION_INDEPENDENT_CODE ON) MESSAGE(STATUS "${PROJECT_NAME} staged install: ${STAGED_INSTALL_PREFIX}") MESSAGE(STATUS "Current platform: ${OS_VERSION} ") CMAKE_HOST_SYSTEM_INFORMATION(RESULT CPU_CORE QUERY NUMBER_OF_LOGICAL_CORES) MESSAGE(STATUS "CPU core ${CPU_CORE}") +#openssl +FIND_PACKAGE(OpenSSL REQUIRED) + +MESSAGE(STATUS "ssl:" ${OPENSSL_SSL_LIBRARY}) +MESSAGE(STATUS "crypto:" ${OPENSSL_CRYPTO_LIBRARY}) + +ADD_LIBRARY(ssl SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${OPENSSL_SSL_LIBRARY}) + +ADD_LIBRARY(crypto SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${OPENSSL_CRYPTO_LIBRARY}) + +SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") + +SET(THIRD_PARTY_PATH ${CMAKE_CURRENT_BINARY_DIR}/third-party) +SET(THIRD_PARTY_BUILD_TYPE ${CMAKE_BUILD_TYPE}) +SET(EXTERNAL_PROJECT_LOG_ARGS + LOG_DOWNLOAD 0 + LOG_UPDATE 1 + LOG_CONFIGURE 0 + LOG_BUILD 0 + LOG_TEST 1 + LOG_INSTALL 0) + +IF(CMAKE_SYSTEM_NAME STREQUAL "Linux") + SET(LIB rt) +ELSEIF(CMAKE_SYSTEM_NAME STREQUAL "Darwin") + SET(LIB + pthread + "-framework CoreFoundation" + "-framework CoreGraphics" + "-framework CoreData" + "-framework CoreText" + "-framework Security" + "-framework Foundation" + "-Wl,-U,_MallocExtension_ReleaseFreeMemory" + "-Wl,-U,_ProfilerStart" + "-Wl,-U,_ProfilerStop" + "-Wl,-U,__Z13GetStackTracePPvii") +ENDIF() +SET(LIB ${LIB} CACHE INTERNAL "libs which should be linked for executable target") INCLUDE(FetchContent) -#include(cmake/CmakeLists.txt) -include(cmake/findTools.cmake) -include(cmake/libevent.cmake) -include(cmake/llhttp.cmake) -include(cmake/fmt.cmake) -include(cmake/spdlog.cmake) -include(cmake/gtest.cmake) -include(cmake/rocksdb.cmake) +INCLUDE(cmake/gflags.cmake) +INCLUDE(cmake/findTools.cmake) +INCLUDE(cmake/leveldb.cmake) +INCLUDE(cmake/libevent.cmake) +INCLUDE(cmake/llhttp.cmake) +INCLUDE(cmake/fmt.cmake) +INCLUDE(cmake/spdlog.cmake) +INCLUDE(cmake/gtest.cmake) +INCLUDE(cmake/rocksdb.cmake) +INCLUDE(cmake/zlib.cmake) +INCLUDE(cmake/protobuf.cmake) +INCLUDE(cmake/brpc.cmake) +INCLUDE(cmake/braft.cmake) + +ENABLE_TESTING() -enable_testing() +SET(PROTO_OUTPUT_DIR "${CMAKE_BINARY_DIR}/generated_pb") +FILE(MAKE_DIRECTORY "${PROTO_OUTPUT_DIR}") ADD_SUBDIRECTORY(src/pstd) ADD_SUBDIRECTORY(src/net) +ADD_SUBDIRECTORY(src/praft) ADD_SUBDIRECTORY(src/storage) ADD_SUBDIRECTORY(src) @@ -177,4 +234,3 @@ ADD_CUSTOM_TARGET(cpplint echo '${LINT_FILES}' | xargs -n12 -P8 --linelength=120 --filter=-legal/copyright,-build/header_guard,-runtime/references ) - diff --git a/README_CN.md b/README_CN.md index 45f29f4f9..698e51a62 100644 --- a/README_CN.md 
+++ b/README_CN.md @@ -69,7 +69,6 @@ PikiwiDB 可以和 Redis 之间进行复制,可以读取 Redis 的 rdb 文件 这些特性 PikiwiDB 都有:-) ## 持久化:内存不再是上限 - RocksDB 可以配置为 PikiwiDB 的持久化存储引擎,可以存储更多的数据。 ## 命令列表 diff --git a/build.sh b/build.sh index 0cf18b5b0..75744b714 100755 --- a/build.sh +++ b/build.sh @@ -6,7 +6,13 @@ C_GREEN="\033[32m" C_END="\033[0m" -BUILD_TYPE=release +BUILD_TIME=$(git log -1 --format=%ai) +BUILD_TIME=${BUILD_TIME: 0: 10} + +COMMIT_ID=$(git rev-parse HEAD) +SHORT_COMMIT_ID=${COMMIT_ID: 0: 8} + +BUILD_TYPE=Release VERBOSE=0 CMAKE_FLAGS="" MAKE_FLAGS="" diff --git a/cmake/braft.cmake b/cmake/braft.cmake new file mode 100644 index 000000000..288c637fe --- /dev/null +++ b/cmake/braft.cmake @@ -0,0 +1,46 @@ +# Copyright (c) 2023-present, Qihoo, Inc. All rights reserved. +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. An additional grant +# of patent rights can be found in the PATENTS file in the same directory. + +INCLUDE(ExternalProject) + +SET(BRAFT_SOURCES_DIR ${THIRD_PARTY_PATH}/braft) +SET(BRAFT_INSTALL_DIR ${THIRD_PARTY_PATH}/install/braft) +SET(BRAFT_INCLUDE_DIR "${BRAFT_INSTALL_DIR}/include" CACHE PATH "braft include directory." FORCE) +SET(BRAFT_LIBRARIES "${BRAFT_INSTALL_DIR}/lib/libbraft.a" CACHE FILEPATH "braft library." FORCE) + +SET(prefix_path "${THIRD_PARTY_PATH}/install/brpc|${CMAKE_CURRENT_BINARY_DIR}/_deps/gflags-build|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib|${CMAKE_CURRENT_BINARY_DIR}/_deps/leveldb-build|${CMAKE_CURRENT_BINARY_DIR}/_deps/leveldb-src") + +ExternalProject_Add( + extern_braft + ${EXTERNAL_PROJECT_LOG_ARGS} + DEPENDS brpc + GIT_REPOSITORY "https://github.com/pikiwidb/braft.git" + GIT_TAG master + PREFIX ${BRAFT_SOURCES_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_INSTALL_PREFIX=${BRAFT_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR=${BRAFT_INSTALL_DIR}/lib + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_PREFIX_PATH=${prefix_path} + -DBRPC_WITH_GLOG=OFF + ${EXTERNAL_OPTIONAL_ARGS} + LIST_SEPARATOR | + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${BRAFT_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR:PATH=${BRAFT_INSTALL_DIR}/lib + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + BUILD_IN_SOURCE 1 + BUILD_COMMAND $(MAKE) -j ${CPU_CORE} braft-static + INSTALL_COMMAND mkdir -p ${BRAFT_INSTALL_DIR}/lib/ COMMAND cp ${BRAFT_SOURCES_DIR}/src/extern_braft/output/lib/libbraft.a ${BRAFT_LIBRARIES} COMMAND cp -r ${BRAFT_SOURCES_DIR}/src/extern_braft/output/include ${BRAFT_INCLUDE_DIR}/ +) +ADD_DEPENDENCIES(extern_braft brpc) +ADD_LIBRARY(braft STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET braft PROPERTY IMPORTED_LOCATION ${BRAFT_LIBRARIES}) +ADD_DEPENDENCIES(braft extern_braft) \ No newline at end of file diff --git a/cmake/brpc.cmake b/cmake/brpc.cmake new file mode 100644 index 000000000..fbace60c8 --- /dev/null +++ b/cmake/brpc.cmake @@ -0,0 +1,49 @@ +# Copyright (c) 2023-present, Qihoo, Inc. All rights reserved. +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. An additional grant +# of patent rights can be found in the PATENTS file in the same directory. 
+ +INCLUDE(ExternalProject) + +SET(BRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/brpc) +SET(BRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/brpc) +SET(BRPC_INCLUDE_DIR "${BRPC_INSTALL_DIR}/include" CACHE PATH "brpc include directory." FORCE) +SET(BRPC_LIBRARIES "${BRPC_INSTALL_DIR}/lib/libbrpc.a" CACHE FILEPATH "brpc library." FORCE) + +# Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args +SET(prefix_path "${CMAKE_CURRENT_BINARY_DIR}/_deps/gflags-build|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib|${CMAKE_CURRENT_BINARY_DIR}/_deps/leveldb-build|${CMAKE_CURRENT_BINARY_DIR}/_deps/leveldb-src") +SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-declarations") +# If minimal .a is need, you can set WITH_DEBUG_SYMBOLS=OFF +EXTERNALPROJECT_ADD( + extern_brpc + ${EXTERNAL_PROJECT_LOG_ARGS} + DEPENDS ssl crypto zlib protobuf leveldb gflags + URL https://github.com/apache/brpc/archive/refs/tags/1.8.0.tar.gz + URL_HASH SHA256=13ffb2f1f57c679379a20367c744b3e597614a793ec036cd7580aae90798019d + PREFIX ${BRPC_SOURCES_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_INSTALL_PREFIX=${BRPC_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR=${BRPC_INSTALL_DIR}/lib + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_PREFIX_PATH=${prefix_path} + -DWITH_GLOG=OFF + -DDOWNLOAD_GTEST=OFF + ${EXTERNAL_OPTIONAL_ARGS} + LIST_SEPARATOR | + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${BRPC_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR:PATH=${BRPC_INSTALL_DIR}/lib + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + BUILD_IN_SOURCE 1 + BUILD_COMMAND $(MAKE) -j ${CPU_CORE} brpc-static + INSTALL_COMMAND mkdir -p ${BRPC_INSTALL_DIR}/lib/ COMMAND cp ${BRPC_SOURCES_DIR}/src/extern_brpc/output/lib/libbrpc.a ${BRPC_LIBRARIES} COMMAND cp -r ${BRPC_SOURCES_DIR}/src/extern_brpc/output/include ${BRPC_INCLUDE_DIR}/ +) +ADD_DEPENDENCIES(extern_brpc ssl crypto zlib protobuf leveldb gflags) +ADD_LIBRARY(brpc STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES}) +ADD_DEPENDENCIES(brpc extern_brpc) diff --git a/cmake/findTools.cmake b/cmake/findTools.cmake index 1373e978b..15c992000 100644 --- a/cmake/findTools.cmake +++ b/cmake/findTools.cmake @@ -1,53 +1,61 @@ +# Copyright (c) 2023-present, Qihoo, Inc. All rights reserved. +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. An additional grant +# of patent rights can be found in the PATENTS file in the same directory. 
+ FIND_PROGRAM(AUTOCONF autoconf PATHS /usr/bin /usr/local/bin) -IF(${AUTOCONF} MATCHES AUTOCONF-NOTFOUND) +IF (${AUTOCONF} MATCHES AUTOCONF-NOTFOUND) MESSAGE(FATAL_ERROR "not find autoconf on localhost") -ENDIF() +ENDIF () FIND_PROGRAM(CLANG_FORMAT_BIN - NAMES clang-format) -IF("${CLANG_FORMAT_BIN}" STREQUAL "CLANG_FORMAT_BIN-NOTFOUND") + NAMES clang-format + HINTS ${CLANG_SEARCH_PATH}) +IF ("${CLANG_FORMAT_BIN}" STREQUAL "CLANG_FORMAT_BIN-NOTFOUND") MESSAGE(WARNING "couldn't find clang-format.") -ELSE() +ELSE () MESSAGE(STATUS "found clang-format at ${CLANG_FORMAT_BIN}") -ENDIF() +ENDIF () FIND_PROGRAM(CLANG_TIDY_BIN - NAMES clang-tidy clang-tidy-12 clang-tidy-14) -IF("${CLANG_TIDY_BIN}" STREQUAL "CLANG_TIDY_BIN-NOTFOUND") + NAMES clang-tidy clang-tidy-12 clang-tidy-14 + HINTS ${CLANG_SEARCH_PATH}) +IF ("${CLANG_TIDY_BIN}" STREQUAL "CLANG_TIDY_BIN-NOTFOUND") MESSAGE(WARNING "couldn't find clang-tidy.") -ELSE() +ELSE () MESSAGE(STATUS "found clang-tidy at ${CLANG_TIDY_BIN}") -ENDIF() +ENDIF () FIND_PROGRAM(CPPLINT_BIN NAMES cpplint cpplint.py HINTS "${BUILD_SUPPORT_DIR}") -IF("${CPPLINT_BIN}" STREQUAL "CPPLINT_BIN-NOTFOUND") +IF ("${CPPLINT_BIN}" STREQUAL "CPPLINT_BIN-NOTFOUND") MESSAGE(WARNING "couldn't find cpplint.py") -ELSE() +ELSE () MESSAGE(STATUS "found cpplint at ${CPPLINT_BIN}") -ENDIF() +ENDIF () FIND_PROGRAM(CLANG_APPLY_REPLACEMENTS_BIN - NAMES clang-apply-replacements clang-apply-replacements-12 clang-apply-replacements-14) -IF("${CLANG_APPLY_REPLACEMENTS_BIN}" STREQUAL "CLANG_APPLY_REPLACEMENTS_BIN-NOTFOUND") + NAMES clang-apply-replacements clang-apply-replacements-12 clang-apply-replacements-14 + HINTS ${CLANG_SEARCH_PATH}) +IF ("${CLANG_APPLY_REPLACEMENTS_BIN}" STREQUAL "CLANG_APPLY_REPLACEMENTS_BIN-NOTFOUND") MESSAGE(WARNING "couldn't find clang-apply-replacements.") -ELSE() +ELSE () MESSAGE(STATUS "found clang-apply-replacements at ${CLANG_APPLY_REPLACEMENTS_BIN}") -ENDIF() +ENDIF () OPTION(WITH_COMMAND_DOCS "build with command docs support" OFF) -IF(WITH_COMMAND_DOCS) +IF (WITH_COMMAND_DOCS) ADD_DEFINITIONS(-DWITH_COMMAND_DOCS) -ENDIF() +ENDIF () -IF(${CMAKE_BUILD_TYPE} MATCHES "RELEASE") +IF (${CMAKE_BUILD_TYPE} MATCHES "RELEASE") MESSAGE(STATUS "make RELEASE version") ADD_DEFINITIONS(-DBUILD_RELEASE) SET(BuildType "Release") -ELSE() +ELSE () MESSAGE(STATUS "make DEBUG version") ADD_DEFINITIONS(-DBUILD_DEBUG) SET(BuildType "Debug") -ENDIF() +ENDIF () diff --git a/cmake/gflags.cmake b/cmake/gflags.cmake index 939701148..a144028fc 100644 --- a/cmake/gflags.cmake +++ b/cmake/gflags.cmake @@ -10,15 +10,17 @@ FetchContent_Declare(gflags URL_HASH SHA256=19713a36c9f32b33df59d1c79b4958434cb005b5b47dc5400a7a4b078111d9b5 ) -FetchContent_MakeAvailableWithArgs(gflags - GFLAGS_NAMESPACE=gflags - BUILD_STATIC_LIBS=ON - BUILD_SHARED_LIBS=OFF - BUILD_gflags_LIB=ON - BUILD_gflags_nothreads_LIB=OFF - BUILD_TESTING=OFF -) +SET(GFLAGS_BUILD_STATIC_LIBS ON CACHE BOOL "" FORCE) +SET(GFLAGS_BUILD_SHARED_LIBS OFF CACHE BOOL "" FORCE) +SET(GFLAGS_BUILD_gflags_LIB ON CACHE BOOL "" FORCE) +SET(GFLAGS_BUILD_gflags_nothreads_LIB OFF CACHE BOOL "" FORCE) +SET(GFLAGS_BUILD_TESTING OFF CACHE BOOL "" FORCE) +FETCHCONTENT_MAKEAVAILABLE(gflags) + +FIND_PACKAGE(Threads REQUIRED) -find_package(Threads REQUIRED) +TARGET_LINK_LIBRARIES(gflags_static Threads::Threads) -target_link_libraries(gflags_static Threads::Threads) +SET(GFLAGS_INCLUDE_PATH ${CMAKE_CURRENT_BINARY_DIR}/_deps/gflags-build/include) +SET(GFLAGS_LIBRARY ${CMAKE_CURRENT_BINARY_DIR}/_deps/gflags-build/libgflags.a) +SET(GFLAGS_LIB 
${CMAKE_CURRENT_BINARY_DIR}/_deps/gflags-build/libgflags.a) \ No newline at end of file diff --git a/cmake/gtest.cmake b/cmake/gtest.cmake index 6c6dfbcca..0c6140910 100644 --- a/cmake/gtest.cmake +++ b/cmake/gtest.cmake @@ -8,5 +8,6 @@ FETCHCONTENT_DECLARE( GIT_REPOSITORY https://github.com/google/googletest.git GIT_TAG v1.14.0 ) -set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) + +SET(gtest_force_shared_crt ON CACHE BOOL "" FORCE) FETCHCONTENT_MAKEAVAILABLE(gtest) diff --git a/cmake/leveldb.cmake b/cmake/leveldb.cmake index 8ad9c008e..939df493b 100644 --- a/cmake/leveldb.cmake +++ b/cmake/leveldb.cmake @@ -1,9 +1,19 @@ +# Copyright (c) 2023-present, Qihoo, Inc. All rights reserved. +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. An additional grant +# of patent rights can be found in the PATENTS file in the same directory. + +INCLUDE_GUARD() + FETCHCONTENT_DECLARE( - leveldb - GIT_REPOSITORY https://github.com/google/leveldb.git - GIT_TAG main + leveldb + GIT_REPOSITORY https://github.com/google/leveldb.git + GIT_TAG main ) SET(LEVELDB_BUILD_TESTS OFF CACHE BOOL "" FORCE) SET(LEVELDB_BUILD_BENCHMARKS OFF CACHE BOOL "" FORCE) SET(LEVELDB_INSTALL OFF CACHE BOOL "" FORCE) FETCHCONTENT_MAKEAVAILABLE(leveldb) + +SET(LEVELDB_INCLUDE_PATH ${CMAKE_CURRENT_BINARY_DIR}/_deps/leveldb-src/include) +SET(LEVELDB_LIB ${CMAKE_CURRENT_BINARY_DIR}/_deps/leveldb-build/libleveldb.a) \ No newline at end of file diff --git a/cmake/libevent.cmake b/cmake/libevent.cmake index 0d7220e71..beaf76a62 100644 --- a/cmake/libevent.cmake +++ b/cmake/libevent.cmake @@ -1,3 +1,7 @@ +# Copyright (c) 2023-present, Qihoo, Inc. All rights reserved. +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. An additional grant +# of patent rights can be found in the PATENTS file in the same directory. # libevent FETCHCONTENT_DECLARE( diff --git a/cmake/llhttp.cmake b/cmake/llhttp.cmake index b92290e38..a2a455625 100644 --- a/cmake/llhttp.cmake +++ b/cmake/llhttp.cmake @@ -1,3 +1,8 @@ +# Copyright (c) 2023-present, Qihoo, Inc. All rights reserved. +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. An additional grant +# of patent rights can be found in the PATENTS file in the same directory. + # nodejs/llhttp FETCHCONTENT_DECLARE( llhttp @@ -16,4 +21,5 @@ FETCHCONTENT_DECLARE( -DBUILD_SHARED_LIBS=OFF BUILD_COMMAND make -j${CPU_CORE} ) + FETCHCONTENT_MAKEAVAILABLE(llhttp) \ No newline at end of file diff --git a/cmake/openssl.cmake b/cmake/openssl.cmake new file mode 100644 index 000000000..127c1aa70 --- /dev/null +++ b/cmake/openssl.cmake @@ -0,0 +1,26 @@ +# Copyright (c) 2023-present, Qihoo, Inc. All rights reserved. +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. An additional grant +# of patent rights can be found in the PATENTS file in the same directory. 
+ +SET(OPENSSL_FETCH_INFO + URL https://www.openssl.org/source/openssl-1.1.1h.tar.gz + URL_HASH SHA256=5c9ca8774bd7b03e5784f26ae9e9e6d749c9da2438545077e6b3d755a06595d9 + ) +SET(OPENSSL_USE_STATIC_LIBS ON) + +FETCHCONTENT_DECLARE( + openssl + GIT_REPOSITORY https://github.com/jc-lab/openssl-cmake.git + GIT_TAG 39af37e0964d71c516da5b1836849dd0a03df7a4 # Change to the latest commit ID +) + +FETCHCONTENT_GETPROPERTIES(openssl) +IF (NOT openssl_POPULATED) + FETCHCONTENT_POPULATE(openssl) + ADD_SUBDIRECTORY(${openssl_SOURCE_DIR} ${openssl_BINARY_DIR}) +ENDIF () + +SET(OPENSSL_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/_deps/openssl_src-src/include) +SET(OPENSSL_ROOT_DIR ${CMAKE_CURRENT_BINARY_DIR}/_deps/openssl_src-src) +SET(OPENSSL_CRYPTO_LIBRARY ${CMAKE_CURRENT_BINARY_DIR}/_deps/openssl_src-src) \ No newline at end of file diff --git a/cmake/protobuf.cmake b/cmake/protobuf.cmake new file mode 100644 index 000000000..2754a7fc5 --- /dev/null +++ b/cmake/protobuf.cmake @@ -0,0 +1,169 @@ +# Copyright (c) 2023-present, Qihoo, Inc. All rights reserved. +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. An additional grant +# of patent rights can be found in the PATENTS file in the same directory. + +INCLUDE(ExternalProject) +# Always invoke `FIND_PACKAGE(Protobuf)` for importing function protobuf_generate_cpp +FIND_PACKAGE(Protobuf QUIET) +MACRO(UNSET_VAR VAR_NAME) + UNSET(${VAR_NAME} CACHE) + UNSET(${VAR_NAME}) +ENDMACRO() + +UNSET_VAR(PROTOBUF_INCLUDE_DIR) +UNSET_VAR(PROTOBUF_FOUND) +UNSET_VAR(PROTOBUF_PROTOC_EXECUTABLE) +UNSET_VAR(PROTOBUF_PROTOC_LIBRARY) +UNSET_VAR(PROTOBUF_LITE_LIBRARY) +UNSET_VAR(PROTOBUF_LIBRARY) +UNSET_VAR(PROTOBUF_INCLUDE_DIR) +UNSET_VAR(Protobuf_PROTOC_EXECUTABLE) + +# Print and set the protobuf library information, +# finish this cmake process and exit from this file. +MACRO(PROMPT_PROTOBUF_LIB) + SET(protobuf_DEPS ${ARGN}) + + MESSAGE(STATUS "Protobuf protoc executable: ${PROTOBUF_PROTOC_EXECUTABLE}") + MESSAGE(STATUS "Protobuf-lite library: ${PROTOBUF_LITE_LIBRARY}") + MESSAGE(STATUS "Protobuf library: ${PROTOBUF_LIBRARY}") + MESSAGE(STATUS "Protoc library: ${PROTOBUF_PROTOC_LIBRARY}") + MESSAGE(STATUS "Protobuf version: ${PROTOBUF_VERSION}") + INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR}) + + # Assuming that all the protobuf libraries are of the same type. 
+ IF (${PROTOBUF_LIBRARY} MATCHES ${CMAKE_STATIC_LIBRARY_SUFFIX}) + SET(protobuf_LIBTYPE STATIC) + ELSEIF (${PROTOBUF_LIBRARY} MATCHES "${CMAKE_SHARED_LIBRARY_SUFFIX}$") + SET(protobuf_LIBTYPE SHARED) + ELSE () + MESSAGE(FATAL_ERROR "Unknown library type: ${PROTOBUF_LIBRARY}") + ENDIF () + + ADD_LIBRARY(protobuf ${protobuf_LIBTYPE} IMPORTED GLOBAL) + SET_PROPERTY(TARGET protobuf PROPERTY IMPORTED_LOCATION ${PROTOBUF_LIBRARY}) + + ADD_LIBRARY(protobuf_lite ${protobuf_LIBTYPE} IMPORTED GLOBAL) + SET_PROPERTY(TARGET protobuf_lite PROPERTY IMPORTED_LOCATION ${PROTOBUF_LITE_LIBRARY}) + + ADD_LIBRARY(libprotoc ${protobuf_LIBTYPE} IMPORTED GLOBAL) + SET_PROPERTY(TARGET libprotoc PROPERTY IMPORTED_LOCATION ${PROTOC_LIBRARY}) + + ADD_EXECUTABLE(protoc IMPORTED GLOBAL) + SET_PROPERTY(TARGET protoc PROPERTY IMPORTED_LOCATION ${PROTOBUF_PROTOC_EXECUTABLE}) + SET(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE}) + + FOREACH (dep ${protobuf_DEPS}) + ADD_DEPENDENCIES(protobuf ${dep}) + ADD_DEPENDENCIES(protobuf_lite ${dep}) + ADD_DEPENDENCIES(libprotoc ${dep}) + ADD_DEPENDENCIES(protoc ${dep}) + ENDFOREACH () + + RETURN() +ENDMACRO() + +MACRO(SET_PROTOBUF_VERSION) + EXEC_PROGRAM(${PROTOBUF_PROTOC_EXECUTABLE} ARGS --version OUTPUT_VARIABLE PROTOBUF_VERSION) + STRING(REGEX MATCH "[0-9]+.[0-9]+" PROTOBUF_VERSION "${PROTOBUF_VERSION}") +ENDMACRO() + +SET(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf") + +IF (NOT "${PROTOBUF_ROOT}" STREQUAL "") + MESSAGE("found system protobuf") + + FIND_PATH(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include NO_DEFAULT_PATH) + FIND_LIBRARY(PROTOBUF_LIBRARY protobuf libprotobuf.lib PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH) + FIND_LIBRARY(PROTOBUF_LITE_LIBRARY protobuf-lite libprotobuf-lite.lib PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH) + FIND_LIBRARY(PROTOBUF_PROTOC_LIBRARY protoc libprotoc.lib PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH) + FIND_PROGRAM(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin NO_DEFAULT_PATH) + IF (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOBUF_LITE_LIBRARY AND PROTOBUF_PROTOC_LIBRARY AND PROTOBUF_PROTOC_EXECUTABLE) + MESSAGE(STATUS "Using custom protobuf library in ${PROTOBUF_ROOT}.") + SET(PROTOBUF_FOUND TRUE) + SET_PROTOBUF_VERSION() + PROMPT_PROTOBUF_LIB() + ELSE () + MESSAGE(WARNING "Cannot find protobuf library in ${PROTOBUF_ROOT}") + ENDIF () +ENDIF () + +FUNCTION(build_protobuf TARGET_NAME) + STRING(REPLACE "extern_" "" TARGET_DIR_NAME "${TARGET_NAME}") + SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/${TARGET_DIR_NAME}) + SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/${TARGET_DIR_NAME}) + + SET(${TARGET_NAME}_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" PARENT_SCOPE) + SET(PROTOBUF_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" PARENT_SCOPE) + SET(${TARGET_NAME}_LITE_LIBRARY + "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite${CMAKE_STATIC_LIBRARY_SUFFIX}" + PARENT_SCOPE) + SET(${TARGET_NAME}_LIBRARY + "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf${CMAKE_STATIC_LIBRARY_SUFFIX}" + PARENT_SCOPE) + SET(${TARGET_NAME}_PROTOC_LIBRARY + "${PROTOBUF_INSTALL_DIR}/lib/libprotoc${CMAKE_STATIC_LIBRARY_SUFFIX}" + PARENT_SCOPE) + SET(${TARGET_NAME}_PROTOC_EXECUTABLE + "${PROTOBUF_INSTALL_DIR}/bin/protoc${CMAKE_EXECUTABLE_SUFFIX}" + PARENT_SCOPE) + + set(prefix_path "${THIRD_PARTY_PATH}/install/zlib") + + # Make sure zlib's two headers are in your /Path/to/install/include path, + # and delete libz.so which we don't need + IF (CMAKE_SYSTEM_NAME MATCHES "Darwin") + FILE(WRITE 
${PROTOBUF_SOURCES_DIR}/src/config.sh + "rm -f ${THIRD_PARTY_PATH}/install/zlib/lib/*.dylib && mkdir -p ${THIRD_PARTY_PATH}/install/protobuf/include && cp ${THIRD_PARTY_PATH}/install/zlib/include/* ${THIRD_PARTY_PATH}/install/protobuf/include/ && ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/${TARGET_NAME}/cmake -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_C_FLAGS='${CMAKE_C_FLAGS}' -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} -DCMAKE_C_FLAGS_RELEASE='${CMAKE_C_FLAGS_RELEASE}' -DCMAKE_CXX_FLAGS='${CMAKE_CXX_FLAGS}' -DCMAKE_CXX_FLAGS_RELEASE='${CMAKE_CXX_FLAGS_RELEASE}' -DCMAKE_CXX_FLAGS_DEBUG='${CMAKE_CXX_FLAGS_DEBUG}' -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_SKIP_RPATH=ON -Dprotobuf_WITH_ZLIB=ON -DZLIB_INCLUDE_DIR=${THIRD_PARTY_PATH}/install/zlib/include -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_BUILD_SHARED_LIBS=OFF -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR} -DCMAKE_INSTALL_LIBDIR=lib -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH=${prefix_path} ${EXTERNAL_OPTIONAL_ARGS}" + ) + ELSEIF (CMAKE_SYSTEM_NAME MATCHES "Linux") + FILE(WRITE ${PROTOBUF_SOURCES_DIR}/src/config.sh + "rm -f ${THIRD_PARTY_PATH}/install/zlib/lib/libz.so* && mkdir -p ${THIRD_PARTY_PATH}/install/protobuf/include && cp ${THIRD_PARTY_PATH}/install/zlib/include/* ${THIRD_PARTY_PATH}/install/protobuf/include/ && ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/${TARGET_NAME}/cmake -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_C_FLAGS='${CMAKE_C_FLAGS}' -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} -DCMAKE_C_FLAGS_RELEASE='${CMAKE_C_FLAGS_RELEASE}' -DCMAKE_CXX_FLAGS='${CMAKE_CXX_FLAGS}' -DCMAKE_CXX_FLAGS_RELEASE='${CMAKE_CXX_FLAGS_RELEASE}' -DCMAKE_CXX_FLAGS_DEBUG='${CMAKE_CXX_FLAGS_DEBUG}' -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_SKIP_RPATH=ON -Dprotobuf_WITH_ZLIB=ON -DZLIB_INCLUDE_DIR=${THIRD_PARTY_PATH}/install/zlib/include -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_BUILD_SHARED_LIBS=OFF -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR} -DCMAKE_INSTALL_LIBDIR=lib -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH=${prefix_path} ${EXTERNAL_OPTIONAL_ARGS}" + ) + ELSE () + MESSAGE(FATAL_ERROR "only support linux or macOS") + ENDIF () + + ExternalProject_Add( + ${TARGET_NAME} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${PROTOBUF_SOURCES_DIR} + UPDATE_COMMAND "" + DEPENDS zlib + URL "https://github.com/protocolbuffers/protobuf/archive/v3.18.0.tar.gz" + URL_HASH SHA256=14e8042b5da37652c92ef6a2759e7d2979d295f60afd7767825e3de68c856c54 + CONFIGURE_COMMAND mv ../config.sh . COMMAND sh config.sh + CMAKE_CACHE_ARGS + -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR} + -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + ${OPTIONAL_CACHE_ARGS} + ) + +ENDFUNCTION() + +SET(PROTOBUF_VERSION 3.18.0) + +IF (NOT PROTOBUF_FOUND) + MESSAGE("build protobuf") + + build_protobuf(extern_protobuf) + + SET(PROTOBUF_INCLUDE_DIR ${extern_protobuf_INCLUDE_DIR} + CACHE PATH "protobuf include directory." FORCE) + SET(PROTOBUF_LITE_LIBRARY ${extern_protobuf_LITE_LIBRARY} + CACHE FILEPATH "protobuf lite library." FORCE) + SET(PROTOBUF_LIBRARY ${extern_protobuf_LIBRARY} + CACHE FILEPATH "protobuf library." FORCE) + SET(PROTOBUF_LIBRARIES ${extern_protobuf_LIBRARY} + CACHE FILEPATH "protobuf library." FORCE) + SET(PROTOBUF_PROTOC_LIBRARY ${extern_protobuf_PROTOC_LIBRARY} + CACHE FILEPATH "protoc library." 
FORCE) + + SET(PROTOBUF_PROTOC_EXECUTABLE ${extern_protobuf_PROTOC_EXECUTABLE} + CACHE FILEPATH "protobuf executable." FORCE) + PROMPT_PROTOBUF_LIB(extern_protobuf zlib) +ENDIF (NOT PROTOBUF_FOUND) + diff --git a/cmake/rocksdb.cmake b/cmake/rocksdb.cmake index 69b4546e7..47cda8075 100644 --- a/cmake/rocksdb.cmake +++ b/cmake/rocksdb.cmake @@ -5,7 +5,7 @@ INCLUDE_GUARD() -FetchContent_Declare( +FETCHCONTENT_DECLARE( rocksdb GIT_REPOSITORY https://github.com/facebook/rocksdb.git GIT_TAG v8.3.3 diff --git a/cmake/zlib.cmake b/cmake/zlib.cmake new file mode 100644 index 000000000..b1e300009 --- /dev/null +++ b/cmake/zlib.cmake @@ -0,0 +1,41 @@ +# Copyright (c) 2023-present, Qihoo, Inc. All rights reserved. +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. An additional grant +# of patent rights can be found in the PATENTS file in the same directory. + +INCLUDE(ExternalProject) + +SET(ZLIB_SOURCES_DIR ${THIRD_PARTY_PATH}/zlib) +SET(ZLIB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/zlib) +# SET(ZLIB_ROOT ${ZLIB_INSTALL_DIR} CACHE FILEPATH "zlib root directory." FORCE) +# SET(ZLIB_INCLUDE_DIR "${ZLIB_INSTALL_DIR}/include" CACHE PATH "zlib include directory." FORCE) + +ExternalProject_Add( + extern_zlib + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/madler/zlib.git" + GIT_TAG "v1.2.8" + PREFIX ${ZLIB_SOURCES_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_INSTALL_PREFIX=${ZLIB_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR=${ZLIB_INSTALL_DIR}/lib + -DBUILD_SHARED_LIBS=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_MACOSX_RPATH=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ZLIB_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR:PATH=${ZLIB_INSTALL_DIR}/lib + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} +) + +SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." 
FORCE) + +ADD_LIBRARY(zlib STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET zlib PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES}) +ADD_DEPENDENCIES(zlib extern_zlib) \ No newline at end of file diff --git a/pikiwidb.conf b/pikiwidb.conf index 9be37fe90..a88b8d03c 100644 --- a/pikiwidb.conf +++ b/pikiwidb.conf @@ -338,4 +338,12 @@ rocksdb-number-levels 7 rocksdb-enable-pipelined-write no rocksdb-level0-slowdown-writes-trigger 20 rocksdb-level0-stop-writes-trigger 36 - +# default 86400 * 7 +rocksdb-ttl-second 604800 +# default 86400 * 3 +rocksdb-periodic-second 259200 + +############################### RAFT ############################### +use-raft no +# braft communicates through brpc, which listens on the server port plus this offset +raft-port-offset 10 diff --git a/save_load.sh b/save_load.sh new file mode 100755 index 000000000..1bcad5b74 --- /dev/null +++ b/save_load.sh @@ -0,0 +1,18 @@ +#!/bin/bash +killall -9 pikiwidb +mkdir leader follower1 + +cd leader && ulimit -n 99999 && rm -fr * && ../bin/pikiwidb ../pikiwidb.conf --port 7777 & +cd follower1 && ulimit -n 99999 && rm -fr * && ../bin/pikiwidb ../pikiwidb.conf --port 8888 & +sleep 5 + +redis-cli -p 7777 raft.cluster init + +redis-benchmark -p 7777 -c 5 -n 10000 -r 10000 -d 1024 -t hset +redis-cli -p 7777 raft.node dosnapshot +redis-cli -p 7777 raft.node dosnapshot + +sleep 10 + + +redis-cli -p 8888 raft.cluster join 127.0.0.1:7777
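A note on the two raft options above: pikiwidb keeps serving Redis clients on `port`, while the braft/brpc endpoint is derived from it by adding `raft-port-offset`. A minimal sketch of that arithmetic; the helper names are ours, and the real code reads both values from g_config (as cmd_raft.cc does further down):

#include <cstdint>

// pikiwidb serves clients on `port`; braft/brpc listens on `port + offset`.
// `port` and `raft_port_offset` stand in for g_config.port and
// g_config.raft_port_offset. Note the sum must stay below 65536.
uint16_t BraftPort(uint16_t port, uint16_t raft_port_offset) {
  return static_cast<uint16_t>(port + raft_port_offset);
}

// Going the other way, as DoCmdRemove below does with the leader's braft address:
uint16_t RedisPort(uint16_t braft_port, uint16_t raft_port_offset) {
  return static_cast<uint16_t>(braft_port - raft_port_offset);
}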
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index d08915c3b..f7cdd7d27 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,17 +1,25 @@ +# Copyright (c) 2023-present, Qihoo, Inc. All rights reserved. +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. An additional grant +# of patent rights can be found in the PATENTS file in the same directory. AUX_SOURCE_DIRECTORY(. PIKIWIDB_SRC) ADD_EXECUTABLE(pikiwidb ${PIKIWIDB_SRC}) SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/bin) -TARGET_INCLUDE_DIRECTORIES(pikiwidb PRIVATE - ${PROJECT_SOURCE_DIR}/src - ${PROJECT_SOURCE_DIR}/src/pstd - ${PROJECT_SOURCE_DIR}/src/net - ${PROJECT_SOURCE_DIR}/src/storage/include - ${rocksdb_SOURCE_DIR}/ - ${rocksdb_SOURCE_DIR}/include -) +TARGET_INCLUDE_DIRECTORIES(pikiwidb + PRIVATE ${PROJECT_SOURCE_DIR}/src + PRIVATE ${PROJECT_SOURCE_DIR}/src/pstd + PRIVATE ${PROJECT_SOURCE_DIR}/src/net + PRIVATE ${PROJECT_SOURCE_DIR}/src/storage/include + PRIVATE ${rocksdb_SOURCE_DIR}/ + PRIVATE ${rocksdb_SOURCE_DIR}/include + PRIVATE ${BRAFT_INCLUDE_DIR} + PRIVATE ${BRPC_INCLUDE_DIR} + ) -TARGET_LINK_LIBRARIES(pikiwidb net; dl; fmt; storage; rocksdb) -SET_TARGET_PROPERTIES(pikiwidb PROPERTIES LINKER_LANGUAGE CXX) + +TARGET_LINK_LIBRARIES(pikiwidb net; dl; fmt; storage; rocksdb; pstd braft brpc ssl crypto zlib protobuf leveldb gflags z praft praft_pb "${LIB}") + +SET_TARGET_PROPERTIES(pikiwidb PROPERTIES LINKER_LANGUAGE CXX) \ No newline at end of file diff --git a/src/base_cmd.h b/src/base_cmd.h index d0a55242c..280d7d848 100644 --- a/src/base_cmd.h +++ b/src/base_cmd.h @@ -30,13 +30,17 @@ const std::string kCmdNameExists = "exists"; const std::string kCmdNameType = "type"; const std::string kCmdNameExpire = "expire"; const std::string kCmdNameTtl = "ttl"; +const std::string kCmdNamePttl = "pttl"; const std::string kCmdNamePExpire = "pexpire"; const std::string kCmdNameExpireat = "expireat"; const std::string kCmdNamePExpireat = "pexpireat"; const std::string kCmdNamePersist = "persist"; const std::string kCmdNameKeys = "keys"; -const std::string kCmdNamePttl = "pttl"; +// raft cmd +const std::string kCmdNameRaftCluster = "raft.cluster"; +const std::string kCmdNameRaftNode = "raft.node"; + // string cmd const std::string kCmdNameSet = "set"; const std::string kCmdNameGet = "get"; @@ -75,6 +79,7 @@ const std::string kCmdNameFlushall = "flushall"; const std::string kCmdNameAuth = "auth"; const std::string kCmdNameSelect = "select"; const std::string kCmdNameShutdown = "shutdown"; +const std::string kCmdNameInfo = "info"; // hash cmd const std::string kCmdNameHSet = "hset"; @@ -160,6 +165,7 @@ enum CmdFlags { kCmdFlagsModuleNoCluster = (1 << 13), // No cluster mode support kCmdFlagsNoMulti = (1 << 14), // Cannot be pipelined kCmdFlagsExclusive = (1 << 15), // May change Storage pointer, like pika's kCmdFlagsSuspend + kCmdFlagsRaft = (1 << 16), // raft }; enum AclCategory { @@ -183,7 +189,8 @@ kAclCategoryDangerous = (1 << 17), kAclCategoryConnection = (1 << 18), kAclCategoryTransaction = (1 << 19), - kAclCategoryScripting = (1 << 20) + kAclCategoryScripting = (1 << 20), + kAclCategoryRaft = (1 << 21), }; /** @@ -295,6 +302,8 @@ class BaseCmd : public std::enable_shared_from_this<BaseCmd> { uint32_t GetCmdID() const; + bool isExclusive() { return static_cast<bool>(flag_ & kCmdFlagsExclusive); } + protected: // Execute a specific command virtual void DoCmd(PClient* client) = 0;
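The new kCmdFlagsRaft bit and kAclCategoryRaft category follow the existing pattern in base_cmd.h: each command ORs bit flags together at construction, and helpers test a single bit, as isExclusive() does above. A self-contained sketch of the idiom; values and the CmdHeader struct are illustrative only, not the real class:

#include <cstdint>
#include <string>

// Each flag occupies one bit so they can be combined with | and tested with &.
enum CmdFlags : uint32_t {
  kCmdFlagsReadonly = (1 << 1),    // illustrative values only
  kCmdFlagsExclusive = (1 << 15),
  kCmdFlagsRaft = (1 << 16),
};

struct CmdHeader {
  std::string name;
  uint32_t flag;
  bool IsRaft() const { return (flag & kCmdFlagsRaft) != 0; }
  bool IsExclusive() const { return (flag & kCmdFlagsExclusive) != 0; }
};

// Usage: CmdHeader{"raft.node", kCmdFlagsRaft}.IsRaft() == true, mirroring how
// RaftNodeCmd below is constructed with kCmdFlagsRaft.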
diff --git a/src/client.cc b/src/client.cc index 749cc2380..fbb358779 100644 --- a/src/client.cc +++ b/src/client.cc @@ -5,16 +5,19 @@ * of patent rights can be found in the PATENTS file in the same directory. */ +#include "client.h" + #include #include -#include "client.h" +#include "fmt/core.h" +#include "praft/praft.h" +#include "pstd/log.h" +#include "pstd/pstd_string.h" + +#include "base_cmd.h" #include "config.h" -#include "log.h" #include "pikiwidb.h" -#include "pstd_string.h" -#include "slow_log.h" -#include "store.h" namespace pikiwidb { @@ -130,6 +133,10 @@ void CmdRes::SetRes(CmdRes::CmdRet _ret, const std::string& content) { case kInvalidCursor: AppendStringRaw("-ERR invalid cursor"); break; + case kWrongLeader: + AppendStringRaw("-ERR wrong leader"); + AppendStringRaw(content); + AppendStringRaw(CRLF); + break; default: break; } @@ -191,7 +198,6 @@ static int ProcessMaster(const char* start, const char* end) { // discard all requests before sync; // or continue serve with old data? TODO return static_cast<int>(end - start); - case kPReplStateWaitAuth: if (end - start >= 5) { if (strncasecmp(start, "+OK\r\n", 5) == 0) { @@ -261,10 +267,19 @@ int PClient::handlePacket(const char* start, int bytes) { const char* ptr = start; if (isPeerMaster()) { - // check slave state - auto recved = ProcessMaster(start, end); - if (recved != -1) { - return recved; + if (isClusterCmdTarget()) { + // Process the whole packet in one pass. + int len = PRAFT.ProcessClusterCmdResponse(this, start, bytes); // @todo + if (len > 0) { + return len; + } + } else { + // Process the whole packet in one pass. + // check slave state + auto recved = ProcessMaster(start, end); + if (recved != -1) { + return recved; + } } } @@ -412,7 +427,6 @@ PClient::PClient(TcpConnection* obj) int PClient::HandlePackets(pikiwidb::TcpConnection* obj, const char* start, int size) { int total = 0; - while (total < size) { auto processed = handlePacket(start + total, size - total); if (processed <= 0) { @@ -440,6 +454,10 @@ void PClient::OnConnect() { if (g_config.master_auth.empty()) { SetAuth(); } + + if (isClusterCmdTarget()) { + PRAFT.SendNodeRequest(this); + } } else { if (g_config.password.empty()) { SetAuth(); @@ -522,6 +540,10 @@ bool PClient::isPeerMaster() const { return repl_addr.GetIP() == PeerIP() && repl_addr.GetPort() == PeerPort(); } +bool PClient::isClusterCmdTarget() const { + return PRAFT.GetClusterCmdCtx().GetPeerIp() == PeerIP() && PRAFT.GetClusterCmdCtx().GetPort() == PeerPort(); +} + int PClient::uniqueID() const { if (auto c = getTcpConnection(); c) { return c->GetUniqueId(); @@ -686,6 +708,7 @@ void PClient::FeedMonitors(const std::vector<std::string>& params) { } } } + void PClient::SetKey(std::vector<std::string>& names) { keys_ = std::move(names); // use std::move to avoid a copy }
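The kWrongLeader branch added above defines the wire format of a leader redirect: the leader id passed in as `content` is appended directly after the fixed error text, before the terminating CRLF ("\r\n"). A sketch of what a client actually receives (the helper name is ours):

#include <string>

// Mirrors the SetRes(kWrongLeader, ...) path above: for leader "127.0.0.1:9221"
// the client sees the single line "-ERR wrong leader127.0.0.1:9221\r\n", so a
// redirect-aware client must strip the "-ERR wrong leader" prefix to recover
// the leader's address.
std::string WrongLeaderReply(const std::string& leader_id) {
  return "-ERR wrong leader" + leader_id + "\r\n";
}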
diff --git a/src/client.h b/src/client.h index 7e0940eaa..7f9eb9823 100644 --- a/src/client.h +++ b/src/client.h @@ -13,10 +13,10 @@ #include #include "common.h" +#include "net/tcp_connection.h" #include "proto_parser.h" #include "replication.h" #include "storage/storage.h" -#include "tcp_connection.h" namespace pikiwidb { @@ -48,6 +48,7 @@ class CmdRes { kErrOther, KIncrByOverFlow, kInvalidCursor, + kWrongLeader, }; CmdRes() = default; @@ -202,6 +203,7 @@ class PClient : public std::enable_shared_from_this<PClient>, public CmdRes { void SetAuth() { auth_ = true; } bool GetAuth() const { return auth_; } void RewriteCmd(std::vector<std::string>& params) { parser_.SetParams(params); } + void Reexecutecommand() { this->executeCommand(); } inline size_t ParamsSize() const { return params_.size(); } @@ -222,6 +224,8 @@ class PClient : public std::enable_shared_from_this<PClient>, public CmdRes { bool isPeerMaster() const; int uniqueID() const; + bool isClusterCmdTarget() const; + // TcpConnection's life is undetermined, so use weak ptr for safety. std::weak_ptr<TcpConnection> tcp_connection_; diff --git a/src/cmd_admin.cc b/src/cmd_admin.cc index b363201b4..e7a99b11e 100644 --- a/src/cmd_admin.cc +++ b/src/cmd_admin.cc @@ -6,7 +6,12 @@ */ #include "cmd_admin.h" + +#include "braft/raft.h" +#include "rocksdb/version.h" + #include "pikiwidb.h" +#include "praft/praft.h" #include "store.h" namespace pikiwidb { @@ -119,4 +124,95 @@ bool PingCmd::DoInitial(PClient* client) { return true; } void PingCmd::DoCmd(PClient* client) { client->SetRes(CmdRes::kPong, "PONG"); } +InfoCmd::InfoCmd(const std::string& name, int16_t arity) + : BaseCmd(name, arity, kCmdFlagsAdmin | kCmdFlagsReadonly, kAclCategoryAdmin) {} + +bool InfoCmd::DoInitial(PClient* client) { return true; } + +// @todo Currently only INFO RAFT and INFO DATA are supported +void InfoCmd::DoCmd(PClient* client) { + if (client->argv_.size() <= 1) { + return client->SetRes(CmdRes::kWrongNum, client->CmdName()); + } + + auto cmd = client->argv_[1]; + if (!strcasecmp(cmd.c_str(), "RAFT")) { + InfoRaft(client); + } else if (!strcasecmp(cmd.c_str(), "data")) { + InfoData(client); + } else { + client->SetRes(CmdRes::kErrOther, "the cmd is not supported"); + } +} + +/* +* INFO raft +* Querying Node Information. +* Reply: +* raft_node_id:595100767 + raft_state:up + raft_role:follower + raft_is_voting:yes + raft_leader_id:1733428433 + raft_current_term:1 + raft_num_nodes:2 + raft_num_voting_nodes:2 + raft_node1:id=1733428433,state=connected,voting=yes,addr=localhost,port=5001,last_conn_secs=5,conn_errors=0,conn_oks=1 +*/ +void InfoCmd::InfoRaft(PClient* client) { + if (client->argv_.size() != 2) { + return client->SetRes(CmdRes::kWrongNum, client->CmdName()); + } + + if (!PRAFT.IsInitialized()) { + return client->SetRes(CmdRes::kErrOther, "Not a cluster member yet"); + } + + auto node_status = PRAFT.GetNodeStatus(); + if (node_status.state == braft::State::STATE_END) { + return client->SetRes(CmdRes::kErrOther, "Node is not initialized"); + } + + std::string message; + message += "raft_group_id:" + PRAFT.GetGroupID() + "\r\n"; + message += "raft_node_id:" + PRAFT.GetNodeID() + "\r\n"; + message += "raft_peer_id:" + PRAFT.GetPeerID() + "\r\n"; + if (braft::is_active_state(node_status.state)) { + message += "raft_state:up\r\n"; + } else { + message += "raft_state:down\r\n"; + } + message += "raft_role:" + std::string(braft::state2str(node_status.state)) + "\r\n"; + message += "raft_leader_id:" + node_status.leader_id.to_string() + "\r\n"; + message += "raft_current_term:" + std::to_string(node_status.term) + "\r\n"; + + if (PRAFT.IsLeader()) { + std::vector<braft::PeerId> peers; + auto status = PRAFT.GetListPeers(&peers); + if (!status.ok()) { + return client->SetRes(CmdRes::kErrOther, status.error_str()); + } + + for (size_t i = 0; i < peers.size(); i++) { + message += "raft_node" + std::to_string(i) + ":addr=" + butil::ip2str(peers[i].addr.ip).c_str() + + ",port=" + std::to_string(peers[i].addr.port) + "\r\n"; + } + } + + client->AppendString(message); +} + +void InfoCmd::InfoData(PClient* client) { + if (client->argv_.size() != 2) { + return client->SetRes(CmdRes::kWrongNum, client->CmdName()); + } + + std::string message; + message += DATABASES_NUM + std::string(":") + std::to_string(pikiwidb::g_config.databases) + "\r\n"; + message += ROCKSDB_NUM + std::string(":") + std::to_string(pikiwidb::g_config.db_instance_num) + "\r\n"; + message += ROCKSDB_VERSION + std::string(":") + ROCKSDB_NAMESPACE::GetRocksVersionAsString() + "\r\n"; + + client->AppendString(message); +} + }  // namespace pikiwidb
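The INFO RAFT reply built above is a flat payload of "key:value" lines terminated by CRLF (raft_group_id, raft_node_id, raft_state, and so on). A minimal client-side sketch for folding such a payload into a map; standard library only, and not part of the patch itself:

#include <map>
#include <sstream>
#include <string>

std::map<std::string, std::string> ParseInfoPayload(const std::string& payload) {
  std::map<std::string, std::string> kv;
  std::istringstream in(payload);
  std::string line;
  while (std::getline(in, line)) {              // splits on '\n'
    if (!line.empty() && line.back() == '\r') {
      line.pop_back();                          // strip the CR of CRLF
    }
    auto pos = line.find(':');                  // first ':' separates key from value
    if (pos != std::string::npos) {
      kv.emplace(line.substr(0, pos), line.substr(pos + 1));
    }
  }
  return kv;  // e.g. kv["raft_state"] == "up"
}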
diff --git a/src/cmd_admin.h b/src/cmd_admin.h index d3093dd7d..e23c570be 100644 --- a/src/cmd_admin.h +++ b/src/cmd_admin.h @@ -104,4 +104,18 @@ class PingCmd : public BaseCmd { void DoCmd(PClient* client) override; }; -} // namespace pikiwidb \ No newline at end of file +class InfoCmd : public BaseCmd { + public: + InfoCmd(const std::string& name, int16_t arity); + + protected: + bool DoInitial(PClient* client) override; + + private: + void DoCmd(PClient* client) override; + + void InfoRaft(PClient* client); + void InfoData(PClient* client); +}; + +} // namespace pikiwidb diff --git a/src/cmd_raft.cc b/src/cmd_raft.cc new file mode 100644 index 000000000..9bfaabc19 --- /dev/null +++ b/src/cmd_raft.cc @@ -0,0 +1,231 @@ +/* + * Copyright (c) 2023-present, Qihoo, Inc. All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. An additional grant + * of patent rights can be found in the PATENTS file in the same directory. + */ + +#include "cmd_raft.h" + +#include +#include +#include + +#include "net/event_loop.h" +#include "praft/praft.h" +#include "pstd/log.h" +#include "pstd/pstd_string.h" + +#include "client.h" +#include "config.h" +#include "pikiwidb.h" +#include "replication.h" + +namespace pikiwidb { + +RaftNodeCmd::RaftNodeCmd(const std::string& name, int16_t arity) + : BaseCmd(name, arity, kCmdFlagsRaft, kAclCategoryRaft) {} + +bool RaftNodeCmd::DoInitial(PClient* client) { + auto cmd = client->argv_[1]; + pstd::StringToUpper(cmd); + + if (cmd != kAddCmd && cmd != kRemoveCmd && cmd != kDoSnapshot) { + client->SetRes(CmdRes::kErrOther, "RAFT.NODE supports ADD / REMOVE / DOSNAPSHOT only"); + return false; + } + return true; +} + +void RaftNodeCmd::DoCmd(PClient* client) { + auto cmd = client->argv_[1]; + pstd::StringToUpper(cmd); + if (cmd == kAddCmd) { + DoCmdAdd(client); + } else if (cmd == kRemoveCmd) { + DoCmdRemove(client); + } else if (cmd == kDoSnapshot) { + DoCmdSnapshot(client); + } else { + client->SetRes(CmdRes::kErrOther, "RAFT.NODE supports ADD / REMOVE / DOSNAPSHOT only"); + } +} + +void RaftNodeCmd::DoCmdAdd(PClient* client) { + // Check whether this node is the leader. If not, return the leader's address so the client can redirect. + if (!PRAFT.IsLeader()) { + client->SetRes(CmdRes::kWrongLeader, PRAFT.GetLeaderID()); + return; + } + + if (client->argv_.size() != 4) { + client->SetRes(CmdRes::kWrongNum, client->CmdName()); + return; + } + + // RedisRaft assigns each node a numeric node id, but in braft the node id is simply ip:port, + // so we do not need to parse and use a node id the way Redis does. + auto s = PRAFT.AddPeer(client->argv_[3]); + if (s.ok()) { + client->SetRes(CmdRes::kOK); + } else { + client->SetRes(CmdRes::kErrOther, fmt::format("Failed to add peer: {}", s.error_str())); + } +}
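DoCmdAdd above hands argv_[3] to PRAFT.AddPeer() unparsed, so the peer argument must already be a well-formed ip:port endpoint. A small sketch of a caller-side sanity check that reuses braft's own parsing, the same pattern RaftClusterCmd::DoCmdJoin uses below; it assumes the braft headers are on the include path:

#include <string>

#include "braft/raft.h"  // pulls in braft::PeerId, as cmd_admin.cc does above

// A default-constructed-from-garbage PeerId stays empty, so is_empty() doubles
// as a parse check -- the same test DoCmdJoin applies to its endpoint argument.
bool LooksLikePeerEndpoint(const std::string& addr) {
  return !braft::PeerId(addr).is_empty();
}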
+void RaftNodeCmd::DoCmdRemove(PClient* client) { + // A node can only be removed if it is already a member of a cluster. + if (!PRAFT.IsInitialized()) { + client->SetRes(CmdRes::kErrOther, "Not a cluster member yet"); + return; + } + + if (client->argv_.size() != 3) { + client->SetRes(CmdRes::kWrongNum, client->CmdName()); + return; + } + + // Check whether this node is the leader. If not, forward the remove request to the leader. + if (!PRAFT.IsLeader()) { + // Get the leader information + braft::PeerId leader_peer_id(PRAFT.GetLeaderID()); + // @todo The leader address may be invalid; we need to consider how to deal with that + if (leader_peer_id.is_empty()) { + client->SetRes(CmdRes::kErrOther, + "The leader address of the cluster is incorrect, try again or delete the node from another node"); + return; + } + + // Connect target + std::string peer_ip = butil::ip2str(leader_peer_id.addr.ip).c_str(); + auto port = leader_peer_id.addr.port - pikiwidb::g_config.raft_port_offset; + auto peer_id = client->argv_[2]; + auto ret = + PRAFT.GetClusterCmdCtx().Set(ClusterCmdType::kRemove, client, std::move(peer_ip), port, std::move(peer_id)); + if (!ret) { // another client is already removing a node + return client->SetRes(CmdRes::kErrOther, "Another client is already removing a node"); + } + PRAFT.GetClusterCmdCtx().ConnectTargetNode(); + INFO("Sent remove request to leader successfully"); + + // Do not reply here; we will reply after the connection is established. + client->Clear(); + return; + } + + auto s = PRAFT.RemovePeer(client->argv_[2]); + if (s.ok()) { + client->SetRes(CmdRes::kOK); + } else { + client->SetRes(CmdRes::kErrOther, fmt::format("Failed to remove peer: {}", s.error_str())); + } +} + +void RaftNodeCmd::DoCmdSnapshot(PClient* client) { + auto s = PRAFT.DoSnapshot(); + if (s.ok()) { + client->SetRes(CmdRes::kOK); + } else { + client->SetRes(CmdRes::kErrOther, fmt::format("Failed to do snapshot: {}", s.error_str())); + } +} + +RaftClusterCmd::RaftClusterCmd(const std::string& name, int16_t arity) + : BaseCmd(name, arity, kCmdFlagsRaft, kAclCategoryRaft) {} + +bool RaftClusterCmd::DoInitial(PClient* client) { + auto cmd = client->argv_[1]; + pstd::StringToUpper(cmd); + if (cmd != kInitCmd && cmd != kJoinCmd) { + client->SetRes(CmdRes::kErrOther, "RAFT.CLUSTER supports INIT/JOIN only"); + return false; + } + return true; +} + +void RaftClusterCmd::DoCmd(PClient* client) { + if (PRAFT.IsInitialized()) { + return client->SetRes(CmdRes::kErrOther, "Already a cluster member"); + } + + auto cmd = client->argv_[1]; + pstd::StringToUpper(cmd); + if (cmd == kInitCmd) { + DoCmdInit(client); + } else { + DoCmdJoin(client); + } +} + +void RaftClusterCmd::DoCmdInit(PClient* client) { + if (client->argv_.size() != 2 && client->argv_.size() != 3) { + return client->SetRes(CmdRes::kWrongNum, client->CmdName()); + } + + std::string cluster_id; + if (client->argv_.size() == 3) { + cluster_id = client->argv_[2]; + if (cluster_id.size() != RAFT_GROUPID_LEN) { + return client->SetRes(CmdRes::kInvalidParameter, + "Cluster id must be " + std::to_string(RAFT_GROUPID_LEN) + " characters"); + } + } else { + cluster_id = pstd::RandomHexChars(RAFT_GROUPID_LEN); + } + auto s = PRAFT.Init(cluster_id, false); + if (!s.ok()) { + return client->SetRes(CmdRes::kErrOther, fmt::format("Failed to init node: {}", s.error_str())); + } + client->SetRes(CmdRes::kOK); +} + +static inline std::optional<std::pair<std::string, int32_t>> GetIpAndPortFromEndPoint(const std::string& endpoint) { + auto pos = endpoint.find(':'); + if (pos == std::string::npos) { + return std::nullopt; + } + + int32_t ret = 0; + pstd::String2int(endpoint.substr(pos + 1), &ret); + return {{endpoint.substr(0, pos), ret}}; +}
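GetIpAndPortFromEndPoint above ignores the return value of pstd::String2int, so a non-numeric port silently comes back as 0. A standalone variant using std::stoi shows the intended split without the pstd dependency; the names here are ours:

#include <cstdint>
#include <iostream>
#include <optional>
#include <string>
#include <utility>

// Split "ip:port" into its two parts; nullopt if there is no ':' or the port
// is not a valid integer.
static std::optional<std::pair<std::string, int32_t>> SplitEndpoint(const std::string& endpoint) {
  auto pos = endpoint.find(':');
  if (pos == std::string::npos) {
    return std::nullopt;
  }
  try {
    return {{endpoint.substr(0, pos), std::stoi(endpoint.substr(pos + 1))}};
  } catch (const std::exception&) {
    return std::nullopt;  // non-numeric or out-of-range port
  }
}

int main() {
  if (auto ip_port = SplitEndpoint("127.0.0.1:7777")) {
    auto& [ip, port] = *ip_port;  // structured bindings, as DoCmdJoin uses below
    std::cout << ip << " / " << port << "\n";  // prints: 127.0.0.1 / 7777
  }
}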
+void RaftClusterCmd::DoCmdJoin(PClient* client) { + // A node that is already initialized must leave its current cluster before it can join a new one. + if (PRAFT.IsInitialized()) { + return client->SetRes(CmdRes::kErrOther, + "A node that has been added to a cluster must be removed \ + from the old cluster before it can be added to the new cluster"); + } + + if (client->argv_.size() < 3) { + return client->SetRes(CmdRes::kWrongNum, client->CmdName()); + } + + // (KKorpse)TODO: Support multiple nodes join at the same time. + if (client->argv_.size() > 3) { + return client->SetRes(CmdRes::kInvalidParameter, "Too many arguments"); + } + + auto addr = client->argv_[2]; + if (braft::PeerId(addr).is_empty()) { + return client->SetRes(CmdRes::kErrOther, fmt::format("Invalid ip:port: {}", addr)); + } + + auto ip_port = GetIpAndPortFromEndPoint(addr); + if (!ip_port.has_value()) { + return client->SetRes(CmdRes::kErrOther, fmt::format("Invalid ip:port: {}", addr)); + } + auto& [peer_ip, port] = *ip_port; + + // Connect target + auto ret = PRAFT.GetClusterCmdCtx().Set(ClusterCmdType::kJoin, client, std::move(peer_ip), port); + if (!ret) { // another client is already joining + return client->SetRes(CmdRes::kErrOther, "Another client is already joining a cluster"); + } + PRAFT.GetClusterCmdCtx().ConnectTargetNode(); + INFO("Sent join request to leader successfully"); + + // Do not reply here; we will reply after the connection is established. + client->Clear(); +} + +} // namespace pikiwidb diff --git a/src/cmd_raft.h b/src/cmd_raft.h new file mode 100644 index 000000000..6a4c1f869 --- /dev/null +++ b/src/cmd_raft.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2023-present, Qihoo, Inc. All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. An additional grant + * of patent rights can be found in the PATENTS file in the same directory. + */ + +#pragma once + +#include + +#include "base_cmd.h" + +namespace pikiwidb { + +/* RAFT.NODE ADD [id] [address:port] + * Add a new node to the cluster. The [id] can be an explicit non-zero value, + * or zero to let the cluster choose one. + * Reply: + * -NOCLUSTER || + * -LOADING || + * -CLUSTERDOWN || + * -MOVED : || + * *2 + * : + * : + * + * RAFT.NODE REMOVE [id] + * Remove an existing node from the cluster. + * Reply: + * -NOCLUSTER || + * -LOADING || + * -CLUSTERDOWN || + * -MOVED : || + * +OK + */ +class RaftNodeCmd : public BaseCmd { + public: + RaftNodeCmd(const std::string &name, int16_t arity); + + protected: + bool DoInitial(PClient *client) override; + + private: + void DoCmd(PClient *client) override; + void DoCmdAdd(PClient *client); + void DoCmdRemove(PClient *client); + void DoCmdSnapshot(PClient *client); + + static constexpr std::string_view kAddCmd = "ADD"; + static constexpr std::string_view kRemoveCmd = "REMOVE"; + static constexpr std::string_view kDoSnapshot = "DOSNAPSHOT"; +}; + +/* RAFT.CLUSTER INIT + * Initializes a new Raft cluster. + * is an optional 32 character string, if set, cluster will use it for the id + * Reply: + * +OK [group_id] + * + * RAFT.CLUSTER JOIN [addr:port] + * Join an existing cluster. + * The operation is asynchronous and may take place/retry in the background.
+ * Reply: + * +OK + */ +class RaftClusterCmd : public BaseCmd { + public: + RaftClusterCmd(const std::string &name, int16_t arity); + + protected: + bool DoInitial(PClient *client) override; + + private: + void DoCmd(PClient *client) override; + void DoCmdInit(PClient *client); + void DoCmdJoin(PClient *client); + + static constexpr std::string_view kInitCmd = "INIT"; + static constexpr std::string_view kJoinCmd = "JOIN"; +}; + +} // namespace pikiwidb diff --git a/src/cmd_set.cc b/src/cmd_set.cc index 352b8e695..8a750dbb7 100644 --- a/src/cmd_set.cc +++ b/src/cmd_set.cc @@ -376,4 +376,5 @@ void SScanCmd::DoCmd(PClient* client) { client->AppendString(member); } } + } // namespace pikiwidb diff --git a/src/cmd_set.h b/src/cmd_set.h index 9b0ad6019..8f6253b19 100644 --- a/src/cmd_set.h +++ b/src/cmd_set.h @@ -178,4 +178,5 @@ class SScanCmd : public BaseCmd { static constexpr const char *kMatchSymbol = "match"; static constexpr const char *kCountSymbol = "count"; }; + } // namespace pikiwidb diff --git a/src/cmd_table_manager.cc b/src/cmd_table_manager.cc index f8429b836..26f910854 100644 --- a/src/cmd_table_manager.cc +++ b/src/cmd_table_manager.cc @@ -5,6 +5,8 @@ * of patent rights can be found in the PATENTS file in the same directory. */ +#include "cmd_table_manager.h" + #include #include "cmd_admin.h" @@ -12,8 +14,8 @@ #include "cmd_keys.h" #include "cmd_kv.h" #include "cmd_list.h" +#include "cmd_raft.h" #include "cmd_set.h" -#include "cmd_table_manager.h" #include "cmd_zset.h" namespace pikiwidb { @@ -45,6 +47,13 @@ void CmdTableManager::InitCmdTable() { ADD_COMMAND(Select, 2); ADD_COMMAND(Shutdown, 1); + // info + ADD_COMMAND(Info, -1); + + // raft + ADD_COMMAND(RaftCluster, -1); + ADD_COMMAND(RaftNode, -2); + // keyspace ADD_COMMAND(Del, -2); ADD_COMMAND(Exists, -2); diff --git a/src/cmd_thread_pool.h b/src/cmd_thread_pool.h index 3b65d6c87..3a0e867da 100644 --- a/src/cmd_thread_pool.h +++ b/src/cmd_thread_pool.h @@ -14,7 +14,7 @@ #include #include #include "base_cmd.h" -#include "pstd_status.h" +#include "pstd/pstd_status.h" namespace pikiwidb { diff --git a/src/config.cc b/src/config.cc index e8f1e03ff..96d2d8fe9 100644 --- a/src/config.cc +++ b/src/config.cc @@ -5,7 +5,6 @@ * of patent rights can be found in the PATENTS file in the same directory. 
*/ -#include #include #include #include @@ -106,6 +105,7 @@ PConfig::PConfig() { AddBool("daemonize", &CheckYesNo, false, &daemonize); AddString("ip", false, {&ip}); AddNumberWihLimit("port", false, &port, PORT_LIMIT_MIN, PORT_LIMIT_MAX); + AddNumber("raft-port-offset", true, &raft_port_offset); AddNumber("timeout", true, &timeout); AddString("db-path", false, {&db_path}); AddStrinWithFunc("loglevel", &CheckLogLevel, false, {&log_level}); @@ -124,6 +124,7 @@ PConfig::PConfig() { AddString("runid", false, {&run_id}); AddNumber("small-compaction-threshold", true, &small_compaction_threshold); AddNumber("small-compaction-duration-threshold", true, &small_compaction_duration_threshold); + AddBool("use-raft", &CheckYesNo, false, &use_raft); // rocksdb config AddNumber("rocksdb-max-subcompactions", false, &rocksdb_max_subcompactions); diff --git a/src/config.h b/src/config.h index d0dd2b041..7b1196388 100644 --- a/src/config.h +++ b/src/config.h @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -27,6 +28,7 @@ namespace pikiwidb { using Status = rocksdb::Status; using CheckFunc = std::function; class PConfig; + extern PConfig g_config; class BaseValue { @@ -144,6 +146,7 @@ class PConfig { AtomicString pid_file = "./pikiwidb.pid"; AtomicString ip = "127.0.0.1"; std::atomic_uint16_t port = 9221; + std::atomic_uint16_t raft_port_offset = 10; AtomicString db_path = "./db/"; AtomicString log_dir = "stdout"; // the log directory, differ from redis AtomicString log_level = "warning"; @@ -152,6 +155,7 @@ class PConfig { std::atomic_uint32_t worker_threads_num = 2; std::atomic_uint32_t slave_threads_num = 2; std::atomic db_instance_num = 3; + std::atomic_bool use_raft = true; std::atomic_uint32_t rocksdb_max_subcompactions = 0; // default 2 @@ -167,6 +171,8 @@ class PConfig { std::atomic_bool rocksdb_enable_pipelined_write = false; std::atomic_int rocksdb_level0_slowdown_writes_trigger = 20; std::atomic_int rocksdb_level0_stop_writes_trigger = 36; + std::atomic_uint64_t rocksdb_ttl_second = 604800; // default 86400 * 7 + std::atomic_uint64_t rocksdb_periodic_second = 259200; // default 86400 * 3 rocksdb::Options GetRocksDBOptions(); diff --git a/src/db.cc b/src/db.cc index 6fdbfe5ee..9b3c63b7f 100644 --- a/src/db.cc +++ b/src/db.cc @@ -6,33 +6,105 @@ */ #include "db.h" + #include "config.h" +#include "praft/praft.h" +#include "pstd/log.h" extern pikiwidb::PConfig g_config; namespace pikiwidb { -DB::DB(int db_id, const std::string &db_path) : db_id_(db_id), db_path_(db_path + std::to_string(db_id) + '/') { +DB::DB(int db_index, const std::string& db_path) + : db_index_(db_index), db_path_(db_path + std::to_string(db_index_) + '/') { storage::StorageOptions storage_options; storage_options.options = g_config.GetRocksDBOptions(); - // some options obj for all RocksDB in one DB. 
- auto cap = storage_options.db_instance_num * kColumnNum * storage_options.options.write_buffer_size * - storage_options.options.max_write_buffer_number; - storage_options.options.write_buffer_manager = std::make_shared<rocksdb::WriteBufferManager>(cap); + storage_options.db_instance_num = g_config.db_instance_num.load(); + storage_options.db_id = db_index_; + + // options for CF + storage_options.options.ttl = g_config.rocksdb_ttl_second.load(std::memory_order_relaxed); + storage_options.options.periodic_compaction_seconds = + g_config.rocksdb_periodic_second.load(std::memory_order_relaxed); + if (g_config.use_raft.load(std::memory_order_relaxed)) { + storage_options.append_log_function = [&r = PRAFT](const Binlog& log, std::promise<rocksdb::Status>&& promise) { + r.AppendLog(log, std::move(promise)); + }; + storage_options.do_snapshot_function = + std::bind(&pikiwidb::PRaft::DoSnapshot, &pikiwidb::PRAFT, std::placeholders::_1, std::placeholders::_2); + } + storage_ = std::make_unique<storage::Storage>(); + + if (auto s = storage_->Open(storage_options, db_path_); !s.ok()) { + ERROR("Storage open failed! {}", s.ToString()); + abort(); + } + + opened_ = true; + INFO("Open DB{} success!", db_index_); +} + +void DB::CreateCheckpoint(const std::string& checkpoint_path, bool sync) { + auto checkpoint_sub_path = checkpoint_path + '/' + std::to_string(db_index_); + if (0 != pstd::CreatePath(checkpoint_sub_path)) { + WARN("Create dir {} failed!", checkpoint_sub_path); + return; + } - storage_options.table_options = g_config.GetRocksDBBlockBasedTableOptions(); + std::shared_lock sharedLock(storage_mutex_); + auto result = storage_->CreateCheckpoint(checkpoint_sub_path); + if (sync) { + for (auto& r : result) { + r.get(); + } + } +} + +void DB::LoadDBFromCheckpoint(const std::string& checkpoint_path, bool sync [[maybe_unused]]) { + auto checkpoint_sub_path = checkpoint_path + '/' + std::to_string(db_index_); + if (0 != pstd::IsDir(checkpoint_sub_path)) { + WARN("Checkpoint dir {} does not exist!", checkpoint_sub_path); + return; + } + if (0 != pstd::IsDir(db_path_)) { + if (0 != pstd::CreateDir(db_path_)) { + WARN("Create dir {} failed!", db_path_); + return; + } + } + + std::lock_guard lock(storage_mutex_); + opened_ = false; + auto result = storage_->LoadCheckpoint(checkpoint_sub_path, db_path_); + + for (auto& r : result) { + r.get(); + } - storage_options.small_compaction_threshold = g_config.small_compaction_threshold.load(); - storage_options.small_compaction_duration_threshold = g_config.small_compaction_duration_threshold.load(); - storage_options.db_instance_num = g_config.db_instance_num; - storage_options.db_id = db_id; + storage::StorageOptions storage_options; + storage_options.options = g_config.GetRocksDBOptions(); + storage_options.db_instance_num = g_config.db_instance_num.load(); + storage_options.db_id = db_index_; + // options for CF + storage_options.options.ttl = g_config.rocksdb_ttl_second.load(std::memory_order_relaxed); + storage_options.options.periodic_compaction_seconds = + g_config.rocksdb_periodic_second.load(std::memory_order_relaxed); + if (g_config.use_raft.load(std::memory_order_relaxed)) { + storage_options.append_log_function = [&r = PRAFT](const Binlog& log, std::promise<rocksdb::Status>&& promise) { + r.AppendLog(log, std::move(promise)); + }; + storage_options.do_snapshot_function = + std::bind(&pikiwidb::PRaft::DoSnapshot, &pikiwidb::PRAFT, std::placeholders::_1, std::placeholders::_2); + } storage_ = std::make_unique<storage::Storage>(); + if (auto s = storage_->Open(storage_options, db_path_); !s.ok()) { + ERROR("Storage open failed! 
{}", s.ToString()); abort(); } + opened_ = true; + INFO("DB{} load a checkpoint from {} success!", db_index_, checkpoint_path); } - } // namespace pikiwidb diff --git a/src/db.h b/src/db.h index 24e983171..6ed80d058 100644 --- a/src/db.h +++ b/src/db.h @@ -7,17 +7,18 @@ #pragma once +#include #include -#include "log.h" +#include "pstd/log.h" #include "pstd/noncopyable.h" #include "storage/storage.h" namespace pikiwidb { -constexpr int kColumnNum = 10; + class DB { public: - DB(int db_id, const std::string& db_path); + DB(int db_index, const std::string& db_path); std::unique_ptr& GetStorage() { return storage_; } @@ -29,10 +30,15 @@ class DB { void UnLockShared() { storage_mutex_.unlock_shared(); } + void CreateCheckpoint(const std::string& path, bool sync); + + void LoadDBFromCheckpoint(const std::string& path, bool sync = true); + + int GetDbIndex() { return db_index_; } + private: - const int db_id_ = 0; + const int db_index_ = 0; const std::string db_path_; - /** * If you want to change the pointer that points to storage, * you must first acquire a mutex lock. @@ -42,17 +48,6 @@ class DB { std::shared_mutex storage_mutex_; std::unique_ptr storage_; bool opened_ = false; - - /** - * If you want to change the status below,you must first acquire - * a mutex lock. - * If you only want to access the status below, - * you just need to obtain a shared lock. - */ - std::shared_mutex checkpoint_mutex_; - bool checkpoint_in_process_ = false; - int64_t last_checkpoint_time_ = -1; - bool last_checkpoint_success_ = false; }; } // namespace pikiwidb diff --git a/src/net/CMakeLists.txt b/src/net/CMakeLists.txt index 2a9bbc76f..8aceed319 100644 --- a/src/net/CMakeLists.txt +++ b/src/net/CMakeLists.txt @@ -1,3 +1,7 @@ +# Copyright (c) 2023-present, Qihoo, Inc. All rights reserved. +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. An additional grant +# of patent rights can be found in the PATENTS file in the same directory. AUX_SOURCE_DIRECTORY(. 
NET_SRC) AUX_SOURCE_DIRECTORY(./lzf NET_SRC) diff --git a/src/pikiwidb.cc b/src/pikiwidb.cc index 87b51747c..d078880df 100644 --- a/src/pikiwidb.cc +++ b/src/pikiwidb.cc @@ -8,24 +8,24 @@ // // PikiwiDB.cc +#include "pikiwidb.h" + #include #include #include #include #include -#include "log.h" +#include "praft/praft.h" +#include "pstd/log.h" +#include "pstd/pstd_util.h" #include "client.h" -#include "store.h" - #include "config.h" -#include "slow_log.h" - #include "helper.h" -#include "pikiwidb.h" #include "pikiwidb_logo.h" -#include "pstd_util.h" +#include "slow_log.h" +#include "store.h" std::unique_ptr g_pikiwidb; using namespace pikiwidb; @@ -154,7 +154,7 @@ bool PikiwiDB::Init() { return false; } - PSTORE.Init(g_config.databases); + PSTORE.Init(g_config.databases.load(std::memory_order_relaxed)); PSlowLog::Instance().SetThreshold(g_config.slow_log_time.load()); PSlowLog::Instance().SetLogLimit(static_cast(g_config.slow_log_max_len.load())); @@ -194,6 +194,9 @@ void PikiwiDB::Run() { } void PikiwiDB::Stop() { + pikiwidb::PRAFT.ShutDown(); + pikiwidb::PRAFT.Join(); + pikiwidb::PRAFT.Clear(); slave_threads_.Exit(); worker_threads_.Exit(); cmd_threads_.Stop(); @@ -231,7 +234,6 @@ static void closeStd() { int main(int ac, char* av[]) { g_pikiwidb = std::make_unique(); - if (!g_pikiwidb->ParseArgs(ac - 1, av + 1)) { Usage(); return -1; diff --git a/src/pikiwidb.h b/src/pikiwidb.h index 5201fb875..64666aea3 100644 --- a/src/pikiwidb.h +++ b/src/pikiwidb.h @@ -8,9 +8,8 @@ #include "cmd_table_manager.h" #include "cmd_thread_pool.h" #include "common.h" -#include "event_loop.h" #include "io_thread_pool.h" -#include "tcp_connection.h" +#include "net/tcp_connection.h" #define KPIKIWIDB_VERSION "4.0.0" @@ -20,6 +19,10 @@ # define KPIKIWIDB_BUILD_TYPE "RELEASE" #endif +namespace pikiwidb { +class PRaft; +} // namespace pikiwidb + class PikiwiDB final { public: PikiwiDB() = default; diff --git a/src/praft/CMakeLists.txt b/src/praft/CMakeLists.txt new file mode 100644 index 000000000..45cf62f8c --- /dev/null +++ b/src/praft/CMakeLists.txt @@ -0,0 +1,46 @@ +# Copyright (c) 2024-present, Qihoo, Inc. All rights reserved. +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. An additional grant +# of patent rights can be found in the PATENTS file in the same directory. 
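+# The two ADD_CUSTOM_COMMAND blocks below drive build-time protoc code generation: +# binlog.pb.cc and praft.pb.cc are emitted into ${PROTO_OUTPUT_DIR} and wrapped in the +# small static libraries binlog_pb and praft_pb that praft (and storage) link against.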
 + +ADD_CUSTOM_COMMAND( + OUTPUT "${PROTO_OUTPUT_DIR}/binlog.pb.cc" + DEPENDS extern_protobuf + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} + ARGS -I ${CMAKE_CURRENT_SOURCE_DIR} + --cpp_out ${PROTO_OUTPUT_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/binlog.proto +) +ADD_LIBRARY(binlog_pb STATIC "${PROTO_OUTPUT_DIR}/binlog.pb.cc") + +ADD_CUSTOM_COMMAND( + OUTPUT "${PROTO_OUTPUT_DIR}/praft.pb.cc" + DEPENDS extern_protobuf + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} + ARGS -I ${CMAKE_CURRENT_SOURCE_DIR} + --cpp_out ${PROTO_OUTPUT_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/praft.proto +) +ADD_LIBRARY(praft_pb STATIC "${PROTO_OUTPUT_DIR}/praft.pb.cc") + +FILE(GLOB PRAFT_SRC + "${CMAKE_CURRENT_SOURCE_DIR}/*.cc" +) +SET(LIBRARY_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/bin) +ADD_LIBRARY(praft ${PRAFT_SRC}) + +TARGET_INCLUDE_DIRECTORIES(praft + PRIVATE ${PROJECT_SOURCE_DIR}/src + PRIVATE ${rocksdb_SOURCE_DIR}/include + PRIVATE ${BRAFT_INCLUDE_DIR} + PRIVATE ${BRPC_INCLUDE_DIR} + PRIVATE ${PROTO_OUTPUT_DIR} +) + +IF(CMAKE_SYSTEM_NAME STREQUAL "Linux") + SET(PRAFT_LIB ${PRAFT_LIB} rt) +ENDIF() + +TARGET_LINK_LIBRARIES(praft net dl fmt storage pstd braft brpc ssl crypto zlib protobuf leveldb gflags rocksdb z ${PRAFT_LIB}) + +SET_TARGET_PROPERTIES(praft PROPERTIES LINKER_LANGUAGE CXX) diff --git a/src/praft/binlog.proto b/src/praft/binlog.proto new file mode 100644 index 000000000..8f1dc3c99 --- /dev/null +++ b/src/praft/binlog.proto @@ -0,0 +1,22 @@ +syntax = "proto3"; +package pikiwidb; +option optimize_for = LITE_RUNTIME; + +enum OperateType { + kNoOperate = 0; + kPut = 1; + kDelete = 2; +} + +message BinlogEntry { + uint32 cf_idx = 1; + OperateType op_type = 2; + bytes key = 3; + optional bytes value = 4; +} + +message Binlog { + uint32 db_id = 1; + uint32 slot_idx = 2; + repeated BinlogEntry entries = 3; +} diff --git a/src/praft/praft.cc b/src/praft/praft.cc new file mode 100644 index 000000000..26239c743 --- /dev/null +++ b/src/praft/praft.cc @@ -0,0 +1,651 @@ +/* + * Copyright (c) 2024-present, Qihoo, Inc. All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. An additional grant + * of patent rights can be found in the PATENTS file in the same directory. 
 + */ + +#include "praft.h" + +#include + +#include "braft/snapshot.h" +#include "braft/util.h" +#include "brpc/server.h" + +#include "pstd/log.h" +#include "pstd/pstd_string.h" + +#include "binlog.pb.h" +#include "config.h" +#include "pikiwidb.h" +#include "replication.h" +#include "store.h" + +#include "praft_service.h" +#include "psnapshot.h" + +#define ERROR_LOG_AND_STATUS(msg) \ + ({ \ + ERROR(msg); \ + butil::Status(EINVAL, msg); \ + }) + +namespace pikiwidb { + +bool ClusterCmdContext::Set(ClusterCmdType cluster_cmd_type, PClient* client, std::string&& peer_ip, int port, + std::string&& peer_id) { + std::unique_lock lck(mtx_); + if (client_ != nullptr) { + return false; + } + assert(client); + cluster_cmd_type_ = cluster_cmd_type; + client_ = client; + peer_ip_ = std::move(peer_ip); + port_ = port; + peer_id_ = std::move(peer_id); + return true; +} + +void ClusterCmdContext::Clear() { + std::unique_lock lck(mtx_); + cluster_cmd_type_ = ClusterCmdType::kNone; + client_ = nullptr; + peer_ip_.clear(); + port_ = 0; + peer_id_.clear(); +} + +bool ClusterCmdContext::IsEmpty() { + std::unique_lock lck(mtx_); + return client_ == nullptr; +} + +void ClusterCmdContext::ConnectTargetNode() { + auto ip = PREPL.GetMasterAddr().GetIP(); + auto port = PREPL.GetMasterAddr().GetPort(); + if (ip == peer_ip_ && port == port_ && PREPL.GetMasterState() == kPReplStateConnected) { + PRAFT.SendNodeRequest(PREPL.GetMaster()); + return; + } + + // reconnect + auto fail_cb = [&](EventLoop*, const char* peer_ip, int port) { + PRAFT.OnClusterCmdConnectionFailed(EventLoop::Self(), peer_ip, port); + }; + PREPL.SetFailCallback(fail_cb); + PREPL.SetMasterState(kPReplStateNone); + PREPL.SetMasterAddr(peer_ip_.c_str(), port_); +} + +PRaft& PRaft::Instance() { + static PRaft store; + return store; +} + +butil::Status PRaft::Init(std::string& group_id, bool initial_conf_is_null) { + if (node_ && server_) { + return {0, "OK"}; + } + + server_ = std::make_unique<brpc::Server>(); + auto port = g_config.port + pikiwidb::g_config.raft_port_offset; + // Add your service into the RPC server (static: the server keeps a raw pointer to it) + static DummyServiceImpl service(&PRAFT); + if (server_->AddService(&service, brpc::SERVER_DOESNT_OWN_SERVICE) != 0) { + server_.reset(); + return ERROR_LOG_AND_STATUS("Failed to add service"); + } + // raft can share the same RPC server. Notice the second parameter, because + // adding services into a running server is not allowed and the listen + // address of this server is impossible to get before the server starts. You + // have to specify the address of the server. + if (braft::add_service(server_.get(), port) != 0) { + server_.reset(); + return ERROR_LOG_AND_STATUS("Failed to add raft service"); + } + + // It's recommended to start the server before PRaft is started to avoid + // the case that it becomes the leader while the service is unreachable by + // clients. + // Notice that the default server options are used here. Check out details from + // the doc of brpc if you would like to change some options. + if (server_->Start(port, nullptr) != 0) { + server_.reset(); + return ERROR_LOG_AND_STATUS("Failed to start server"); + } + // It's ok to start PRaft now. + assert(group_id.size() == RAFT_GROUPID_LEN); + this->group_id_ = group_id; + + // FIXME: g_config.ip defaults to 127.0.0.1, which may not work in a cluster. 
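 + // Illustration of the offset (assuming the defaults in config.h, port = 9221 and + // raft-port-offset = 10): clients keep talking to 9221 while braft/brpc listen on 9231.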
 + raw_addr_ = g_config.ip.ToString() + ":" + std::to_string(port); + butil::ip_t ip; + auto ret = butil::str2ip(g_config.ip.ToString().c_str(), &ip); + if (ret != 0) { + server_.reset(); + return ERROR_LOG_AND_STATUS("Failed to convert str_ip to butil::ip_t"); + } + butil::EndPoint addr(ip, port); + + // Default init in one node. + // initial_conf takes effect only when the replication group is started from an empty node. + // The Configuration is restored from the snapshot and log files when the data in the replication group is not empty. + // initial_conf is used only to create replication groups. + // The first node adds itself to initial_conf and then calls add_peer to add other nodes. + // Set initial_conf to empty for other nodes. + // You can also start empty nodes simultaneously by setting the same initial_conf(ip:port of multiple nodes) for + // multiple nodes. + std::string initial_conf; + if (!initial_conf_is_null) { + initial_conf = raw_addr_ + ":0,"; + } + if (node_options_.initial_conf.parse_from(initial_conf) != 0) { + server_.reset(); + return ERROR_LOG_AND_STATUS("Failed to parse configuration"); + } + + // node_options_.election_timeout_ms = FLAGS_election_timeout_ms; + node_options_.fsm = this; + node_options_.node_owns_fsm = false; + node_options_.snapshot_interval_s = 0; + std::string prefix = "local://" + g_config.db_path.ToString() + "_praft"; + node_options_.log_uri = prefix + "/log"; + node_options_.raft_meta_uri = prefix + "/raft_meta"; + node_options_.snapshot_uri = prefix + "/snapshot"; + // node_options_.disable_cli = FLAGS_disable_cli; + snapshot_adaptor_ = new PPosixFileSystemAdaptor(); + node_options_.snapshot_file_system_adaptor = &snapshot_adaptor_; + + node_ = std::make_unique<braft::Node>("pikiwidb", braft::PeerId(addr)); // group_id + if (node_->init(node_options_) != 0) { + server_.reset(); + node_.reset(); + return ERROR_LOG_AND_STATUS("Failed to init raft node"); + } + + return {0, "OK"}; +} + +bool PRaft::IsLeader() const { + if (!node_) { + ERROR("Node is not initialized"); + return false; + } + return node_->is_leader(); +} + +std::string PRaft::GetLeaderID() const { + if (!node_) { + ERROR("Node is not initialized"); + return "Failed to get leader id"; + } + return node_->leader_id().to_string(); +} + +std::string PRaft::GetLeaderAddress() const { + if (!node_) { + ERROR("Node is not initialized"); + return "Failed to get leader address"; + } + auto id = node_->leader_id(); + id.addr.port -= g_config.raft_port_offset; + auto addr = butil::endpoint2str(id.addr); + return addr.c_str(); +} + +std::string PRaft::GetNodeID() const { + if (!node_) { + ERROR("Node is not initialized"); + return "Failed to get node id"; + } + return node_->node_id().to_string(); +} + +std::string PRaft::GetPeerID() const { + if (!node_) { + ERROR("Node is not initialized"); + return "Failed to get node id"; + } + + auto node_id = node_->node_id().to_string(); + auto pos = node_id.find(':'); + auto peer_id = node_id.substr(pos + 1, node_id.size()); + return peer_id; +} + +std::string PRaft::GetGroupID() const { + if (!node_) { + ERROR("Node is not initialized"); + return "Failed to get cluster id"; + } + return group_id_; +} + +braft::NodeStatus PRaft::GetNodeStatus() const { + braft::NodeStatus node_status; + if (!node_) { + ERROR("Node is not initialized"); + } else { + node_->get_status(&node_status); + } + + return node_status; +} + +butil::Status PRaft::GetListPeers(std::vector<braft::PeerId>* peers) { + if (!node_) { + return ERROR_LOG_AND_STATUS("Node is not initialized"); + } + return 
node_->list_peers(peers); +} + +void PRaft::SendNodeRequest(PClient* client) { + assert(client); + + auto cluster_cmd_type = cluster_cmd_ctx_.GetClusterCmdType(); + switch (cluster_cmd_type) { + case ClusterCmdType::kJoin: + SendNodeInfoRequest(client, "DATA"); + break; + case ClusterCmdType::kRemove: + SendNodeRemoveRequest(client); + break; + default: + client->SetRes(CmdRes::kErrOther, "the command sent to the leader is incorrect"); + break; + } +} + +// Gets the cluster id, which is used to initialize the node +void PRaft::SendNodeInfoRequest(PClient* client, const std::string& info_type) { + assert(client); + + const std::string cmd_str = "INFO " + info_type + "\r\n"; + client->SendPacket(cmd_str); + client->Clear(); +} + +void PRaft::SendNodeAddRequest(PClient* client) { + assert(client); + + // Node ids in braft are ip:port; the node id param in the RAFT.NODE ADD cmd will be ignored. + int unused_node_id = 0; + auto port = g_config.port + pikiwidb::g_config.raft_port_offset; + auto raw_addr = g_config.ip.ToString() + ":" + std::to_string(port); + UnboundedBuffer req; + req.PushData("RAFT.NODE ADD ", 14); + req.PushData(std::to_string(unused_node_id).c_str(), std::to_string(unused_node_id).size()); + req.PushData(" ", 1); + req.PushData(raw_addr.data(), raw_addr.size()); + req.PushData("\r\n", 2); + client->SendPacket(req); + client->Clear(); +} + +void PRaft::SendNodeRemoveRequest(PClient* client) { + assert(client); + + UnboundedBuffer req; + req.PushData("RAFT.NODE REMOVE ", 17); + req.PushData(cluster_cmd_ctx_.GetPeerID().c_str(), cluster_cmd_ctx_.GetPeerID().size()); + req.PushData("\r\n", 2); + client->SendPacket(req); + client->Clear(); +} + +int PRaft::ProcessClusterCmdResponse(PClient* client, const char* start, int len) { + auto cluster_cmd_type = cluster_cmd_ctx_.GetClusterCmdType(); + int ret = 0; + switch (cluster_cmd_type) { + case ClusterCmdType::kJoin: + ret = PRAFT.ProcessClusterJoinCmdResponse(client, start, len); + break; + case ClusterCmdType::kRemove: + ret = PRAFT.ProcessClusterRemoveCmdResponse(client, start, len); + break; + default: + client->SetRes(CmdRes::kErrOther, "RAFT.CLUSTER response supports JOIN/REMOVE only"); + break; + } + + return ret; +} + +void PRaft::CheckRocksDBConfiguration(PClient* client, PClient* join_client, const std::string& reply) { + int databases_num = 0; + int rocksdb_num = 0; + std::string rocksdb_version; + std::string line; + std::istringstream iss(reply); + + while (std::getline(iss, line)) { + std::string::size_type pos = line.find(':'); + if (pos != std::string::npos) { + std::string key = line.substr(0, pos); + std::string value = line.substr(pos + 1); + + if (key == DATABASES_NUM && pstd::String2int(value, &databases_num) == 0) { + join_client->SetRes(CmdRes::kErrOther, "Config of databases_num invalid"); + join_client->SendPacket(join_client->Message()); + join_client->Clear(); + // If the join fails, clear clusterContext and set it again by using the join command + cluster_cmd_ctx_.Clear(); + } else if (key == ROCKSDB_NUM && pstd::String2int(value, &rocksdb_num) == 0) { + join_client->SetRes(CmdRes::kErrOther, "Config of rocksdb_num invalid"); + join_client->SendPacket(join_client->Message()); + join_client->Clear(); + // If the join fails, clear clusterContext and set it again by using the join command + cluster_cmd_ctx_.Clear(); + } else if (key == ROCKSDB_VERSION) { + rocksdb_version = pstd::StringTrimRight(value, "\r"); + } + } + } + + int current_databases_num = pikiwidb::g_config.databases; + int current_rocksdb_num = 
pikiwidb::g_config.db_instance_num; + std::string current_rocksdb_version = ROCKSDB_NAMESPACE::GetRocksVersionAsString(); + if (current_databases_num != databases_num || current_rocksdb_num != rocksdb_num || + current_rocksdb_version != rocksdb_version) { + join_client->SetRes(CmdRes::kErrOther, "Config of databases_num, rocksdb_num or rocksdb_version mismatch"); + join_client->SendPacket(join_client->Message()); + join_client->Clear(); + // If the join fails, clear clusterContext and set it again by using the join command + cluster_cmd_ctx_.Clear(); + } else { + SendNodeInfoRequest(client, "RAFT"); + } +} + +void PRaft::LeaderRedirection(PClient* join_client, const std::string& reply) { + // Resolve the ip address of the leader + std::string leader_addr = pstd::StringTrimLeft(reply, WRONG_LEADER); + leader_addr = pstd::StringTrim(leader_addr); + braft::PeerId peerId; + peerId.parse(leader_addr); + auto peer_ip = std::string(butil::ip2str(peerId.addr.ip).c_str()); + auto port = peerId.addr.port; + + // Reset the target of the connection + cluster_cmd_ctx_.Clear(); + auto ret = PRAFT.GetClusterCmdCtx().Set(ClusterCmdType::kJoin, join_client, std::move(peer_ip), port); + if (!ret) { // other clients have joined + join_client->SetRes(CmdRes::kErrOther, "Other clients have joined"); + join_client->SendPacket(join_client->Message()); + join_client->Clear(); + return; + } + PRAFT.GetClusterCmdCtx().ConnectTargetNode(); + + // Do not reply any message here; we will reply after the connection is established. + join_client->Clear(); +} + +void PRaft::InitializeNodeBeforeAdd(PClient* client, PClient* join_client, const std::string& reply) { + std::string prefix = RAFT_GROUP_ID; + std::string::size_type prefix_length = prefix.length(); + std::string::size_type group_id_start = reply.find(prefix); + group_id_start += prefix_length; // locate the start location of "raft_group_id" + std::string::size_type group_id_end = reply.find("\r\n", group_id_start); + if (group_id_end != std::string::npos) { + std::string raft_group_id = reply.substr(group_id_start, group_id_end - group_id_start); + // initialize the slave node + auto s = PRAFT.Init(raft_group_id, true); + if (!s.ok()) { + join_client->SetRes(CmdRes::kErrOther, s.error_str()); + join_client->SendPacket(join_client->Message()); + join_client->Clear(); + // If the join fails, clear clusterContext and set it again by using the join command + cluster_cmd_ctx_.Clear(); + return; + } + + PRAFT.SendNodeAddRequest(client); + } else { + ERROR("Joining Raft cluster failed because of an invalid raft_group_id"); + join_client->SetRes(CmdRes::kErrOther, "Invalid raft_group_id"); + join_client->SendPacket(join_client->Message()); + join_client->Clear(); + // If the join fails, clear clusterContext and set it again by using the join command + cluster_cmd_ctx_.Clear(); + } +} + +int PRaft::ProcessClusterJoinCmdResponse(PClient* client, const char* start, int len) { + assert(start); + auto join_client = cluster_cmd_ctx_.GetClient(); + if (!join_client) { + WARN("No client when processing cluster join cmd response."); + return 0; + } + + std::string reply(start, len); + if (reply.find(OK_STR) != std::string::npos) { + INFO("Joined Raft cluster, node id: {}, group_id: {}", PRAFT.GetNodeID(), PRAFT.group_id_); + join_client->SetRes(CmdRes::kOK); + join_client->SendPacket(join_client->Message()); + join_client->Clear(); + // The join is done; clear the cluster command context + cluster_cmd_ctx_.Clear(); + } else if (reply.find(DATABASES_NUM) != std::string::npos) { 
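 + // The peer answered with its INFO DATA section; databases_num, rocksdb_num and + // rocksdb_version must match this node's configuration before the join continues.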
CheckRocksDBConfiguration(client, join_client, reply); + } else if (reply.find(WRONG_LEADER) != std::string::npos) { + LeaderRedirection(join_client, reply); + } else if (reply.find(RAFT_GROUP_ID) != std::string::npos) { + InitializeNodeBeforeAdd(client, join_client, reply); + } else { + ERROR("Joining Raft cluster failed, reply: {}", reply); + join_client->SetRes(CmdRes::kErrOther, reply); + join_client->SendPacket(join_client->Message()); + join_client->Clear(); + // If the join fails, clear clusterContext and set it again by using the join command + cluster_cmd_ctx_.Clear(); + } + + return len; +} + +int PRaft::ProcessClusterRemoveCmdResponse(PClient* client, const char* start, int len) { + assert(start); + auto remove_client = cluster_cmd_ctx_.GetClient(); + if (!remove_client) { + WARN("No client when processing cluster remove cmd response."); + return 0; + } + + std::string reply(start, len); + if (reply.find(OK_STR) != std::string::npos) { + INFO("Removed from Raft cluster, node id: {}, group_id: {}", PRAFT.GetNodeID(), PRAFT.group_id_); + ShutDown(); + Join(); + Clear(); + + remove_client->SetRes(CmdRes::kOK); + remove_client->SendPacket(remove_client->Message()); + remove_client->Clear(); + } else if (reply.find(NOT_LEADER) != std::string::npos) { + remove_client->Clear(); + remove_client->Reexecutecommand(); + } else { + ERROR("Removing the node from the Raft cluster failed, reply: {}", reply); + remove_client->SetRes(CmdRes::kErrOther, reply); + remove_client->SendPacket(remove_client->Message()); + remove_client->Clear(); + } + + // If the remove fails, clear clusterContext and set it again by using the remove command + cluster_cmd_ctx_.Clear(); + + return len; +} + +butil::Status PRaft::AddPeer(const std::string& peer) { + if (!node_) { + return ERROR_LOG_AND_STATUS("Node is not initialized"); + } + + braft::SynchronizedClosure done; + node_->add_peer(peer, &done); + done.wait(); + + if (!done.status().ok()) { + WARN("Failed to add peer {} to node {}, status: {}", peer, node_->node_id().to_string(), done.status().error_str()); + return done.status(); + } + + return {0, "OK"}; +} + +butil::Status PRaft::RemovePeer(const std::string& peer) { + if (!node_) { + return ERROR_LOG_AND_STATUS("Node is not initialized"); + } + + braft::SynchronizedClosure done; + node_->remove_peer(peer, &done); + done.wait(); + + if (!done.status().ok()) { + WARN("Failed to remove peer {} from node {}, status: {}", peer, node_->node_id().to_string(), + done.status().error_str()); + return done.status(); + } + + return {0, "OK"}; +} + +butil::Status PRaft::DoSnapshot(int64_t self_snapshot_index, bool is_sync) { + if (!node_) { + return ERROR_LOG_AND_STATUS("Node is not initialized"); + } + braft::SynchronizedClosure done; + node_->snapshot(&done, self_snapshot_index); + done.wait(); + return done.status(); +} + +void PRaft::OnClusterCmdConnectionFailed([[maybe_unused]] EventLoop* loop, const char* peer_ip, int port) { + auto cli = cluster_cmd_ctx_.GetClient(); + if (cli) { + cli->SetRes(CmdRes::kErrOther, "Failed to connect to cluster for join or remove, please check logs " + + std::string(peer_ip) + ":" + std::to_string(port)); + cli->SendPacket(cli->Message()); + cli->Clear(); + } + cluster_cmd_ctx_.Clear(); + + PREPL.GetMasterAddr().Clear(); +} + +// Shut this node and server down. +void PRaft::ShutDown() { + if (node_) { + node_->shutdown(nullptr); + } + + if (server_) { + server_->Stop(0); + } +} + +// Block this thread until the node is eventually down. 
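+// Typical teardown, mirroring PikiwiDB::Stop() above (sketch): +//   PRAFT.ShutDown();  // ask the raft node and rpc server to stop +//   PRAFT.Join();      // block until they have fully exited +//   PRAFT.Clear();     // release the braft/brpc objects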
+void PRaft::Join() { + if (node_) { + node_->join(); + } + + if (server_) { + server_->Join(); + } +} + +void PRaft::AppendLog(const Binlog& log, std::promise<rocksdb::Status>&& promise) { + assert(node_); + assert(node_->is_leader()); + butil::IOBuf data; + butil::IOBufAsZeroCopyOutputStream wrapper(&data); + auto done = new PRaftWriteDoneClosure(std::move(promise)); + if (!log.SerializeToZeroCopyStream(&wrapper)) { + done->SetStatus(rocksdb::Status::Incomplete("Failed to serialize binlog")); + done->Run(); + return; + } + DEBUG("append binlog: {}", log.ShortDebugString()); + braft::Task task; + task.data = &data; + task.done = done; + node_->apply(task); +} + +// @braft::StateMachine +void PRaft::Clear() { + if (node_) { + node_.reset(); + } + + if (server_) { + server_.reset(); + } +} + +void PRaft::on_apply(braft::Iterator& iter) { + // A batch of tasks has been committed; each one must be applied to the state machine. + for (; iter.valid(); iter.next()) { + auto done = iter.done(); + brpc::ClosureGuard done_guard(done); + + Binlog log; + butil::IOBufAsZeroCopyInputStream wrapper(iter.data()); + bool success = log.ParseFromZeroCopyStream(&wrapper); + DEBUG("apply binlog{}: {}", iter.index(), log.ShortDebugString()); + + if (!success) { + static constexpr std::string_view kMsg = "Failed to parse from protobuf when on_apply"; + ERROR(kMsg); + if (done) { // in leader + dynamic_cast<PRaftWriteDoneClosure*>(done)->SetStatus(rocksdb::Status::Incomplete(kMsg)); + } + braft::run_closure_in_bthread(done_guard.release()); + return; + } + + auto s = PSTORE.GetBackend(log.db_id())->GetStorage()->OnBinlogWrite(log, iter.index()); + if (done) { // in leader + dynamic_cast<PRaftWriteDoneClosure*>(done)->SetStatus(s); + } + // _applied_index = iter.index(); // consider maintaining a member applied_idx + braft::run_closure_in_bthread(done_guard.release()); + } +} + +void PRaft::on_snapshot_save(braft::SnapshotWriter* writer, braft::Closure* done) { + assert(writer); + brpc::ClosureGuard done_guard(done); +} + +int PRaft::on_snapshot_load(braft::SnapshotReader* reader) { + CHECK(!IsLeader()) << "Leader is not supposed to load snapshot"; + assert(reader); + auto reader_path = reader->get_path(); // xx/snapshot_0000001 + auto path = g_config.db_path.ToString() + std::to_string(db_id_); // db/db_id + TasksVector tasks(1, {TaskType::kLoadDBFromCheckpoint, db_id_, {{TaskArg::kCheckpointPath, reader_path}}, true}); + PSTORE.HandleTaskSpecificDB(tasks); + return 0; +} + +void PRaft::on_leader_start(int64_t term) { + WARN("Node {} starts to be leader, term={}", node_->node_id().to_string(), term); +} + +void PRaft::on_leader_stop(const butil::Status& status) {} + +void PRaft::on_shutdown() {} +void PRaft::on_error(const ::braft::Error& e) {} +void PRaft::on_configuration_committed(const ::braft::Configuration& conf) {} +void PRaft::on_stop_following(const ::braft::LeaderChangeContext& ctx) {} +void PRaft::on_start_following(const ::braft::LeaderChangeContext& ctx) {} + +} // namespace pikiwidb diff --git a/src/praft/praft.h b/src/praft/praft.h new file mode 100644 index 000000000..65cc14d4f --- /dev/null +++ b/src/praft/praft.h @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2023-present, Qihoo, Inc. All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. An additional grant + * of patent rights can be found in the PATENTS file in the same directory. 
 + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "braft/file_system_adaptor.h" +#include "braft/raft.h" +#include "brpc/server.h" +#include "rocksdb/status.h" + +#include "client.h" + +namespace pikiwidb { + +#define RAFT_GROUPID_LEN 32 + +#define OK_STR "+OK" +#define DATABASES_NUM "databases_num" +#define ROCKSDB_NUM "rocksdb_num" +#define ROCKSDB_VERSION "rocksdb_version" +#define WRONG_LEADER "-ERR wrong leader" +#define RAFT_GROUP_ID "raft_group_id:" +#define NOT_LEADER "Not leader" + +#define PRAFT PRaft::Instance() + +class EventLoop; +class Binlog; + +enum ClusterCmdType { + kNone, + kJoin, + kRemove, +}; + +class ClusterCmdContext { + friend class PRaft; + + public: + ClusterCmdContext() = default; + ~ClusterCmdContext() = default; + + bool Set(ClusterCmdType cluster_cmd_type, PClient* client, std::string&& peer_ip, int port, + std::string&& peer_id = ""); + + void Clear(); + + // @todo this function seems unused + bool IsEmpty(); + + ClusterCmdType GetClusterCmdType() { return cluster_cmd_type_; } + PClient* GetClient() { return client_; } + const std::string& GetPeerIp() { return peer_ip_; } + int GetPort() { return port_; } + const std::string& GetPeerID() { return peer_id_; } + + void ConnectTargetNode(); + + private: + ClusterCmdType cluster_cmd_type_ = ClusterCmdType::kNone; + std::mutex mtx_; + PClient* client_ = nullptr; + std::string peer_ip_; + int port_ = 0; + std::string peer_id_; +}; + +class PRaftWriteDoneClosure : public braft::Closure { + public: + explicit PRaftWriteDoneClosure(std::promise<rocksdb::Status>&& promise) : promise_(std::move(promise)) {} + + void Run() override { + promise_.set_value(result_); + delete this; + } + void SetStatus(rocksdb::Status status) { result_ = std::move(status); } + + private: + std::promise<rocksdb::Status> promise_; + rocksdb::Status result_{rocksdb::Status::Aborted("Unknown error")}; +}; + +class PRaft : public braft::StateMachine { + public: + PRaft() = default; + ~PRaft() override = default; + + static PRaft& Instance(); + + //===--------------------------------------------------------------------===// + // Braft API + //===--------------------------------------------------------------------===// + butil::Status Init(std::string& group_id, bool initial_conf_is_null); + butil::Status AddPeer(const std::string& peer); + butil::Status RemovePeer(const std::string& peer); + butil::Status DoSnapshot(int64_t self_snapshot_index = 0, bool is_sync = true); + + void ShutDown(); + void Join(); + void AppendLog(const Binlog& log, std::promise<rocksdb::Status>&& promise); + void Clear(); + + //===--------------------------------------------------------------------===// + // Cluster command + //===--------------------------------------------------------------------===// + ClusterCmdContext& GetClusterCmdCtx() { return cluster_cmd_ctx_; } + void SendNodeRequest(PClient* client); + void SendNodeInfoRequest(PClient* client, const std::string& info_type); + void SendNodeAddRequest(PClient* client); + void SendNodeRemoveRequest(PClient* client); + + int ProcessClusterCmdResponse(PClient* client, const char* start, int len); + void CheckRocksDBConfiguration(PClient* client, PClient* join_client, const std::string& reply); + void LeaderRedirection(PClient* join_client, const std::string& reply); + void InitializeNodeBeforeAdd(PClient* client, PClient* join_client, const std::string& reply); + int ProcessClusterJoinCmdResponse(PClient* client, const char* start, int len); + int ProcessClusterRemoveCmdResponse(PClient* client, const char* start, int 
len); + + void OnClusterCmdConnectionFailed(EventLoop*, const char* peer_ip, int port); + + bool IsLeader() const; + std::string GetLeaderAddress() const; + std::string GetLeaderID() const; + std::string GetNodeID() const; + std::string GetPeerID() const; + std::string GetGroupID() const; + braft::NodeStatus GetNodeStatus() const; + butil::Status GetListPeers(std::vector<braft::PeerId>* peers); + + bool IsInitialized() const { return node_ != nullptr && server_ != nullptr; } + + private: + void on_apply(braft::Iterator& iter) override; + void on_snapshot_save(braft::SnapshotWriter* writer, braft::Closure* done) override; + int on_snapshot_load(braft::SnapshotReader* reader) override; + + void on_leader_start(int64_t term) override; + void on_leader_stop(const butil::Status& status) override; + + void on_shutdown() override; + void on_error(const ::braft::Error& e) override; + void on_configuration_committed(const ::braft::Configuration& conf) override; + void on_stop_following(const ::braft::LeaderChangeContext& ctx) override; + void on_start_following(const ::braft::LeaderChangeContext& ctx) override; + + private: + std::unique_ptr<brpc::Server> server_{nullptr}; // brpc + std::unique_ptr<braft::Node> node_{nullptr}; + braft::NodeOptions node_options_; // options for raft node + std::string raw_addr_; // ip:port of this node + + scoped_refptr<braft::FileSystemAdaptor> snapshot_adaptor_ = nullptr; + ClusterCmdContext cluster_cmd_ctx_; // context for cluster join/remove command + std::string group_id_; // group id + int db_id_ = 0; // db_id +}; + +} // namespace pikiwidb diff --git a/src/praft/praft.proto b/src/praft/praft.proto new file mode 100644 index 000000000..61a495f21 --- /dev/null +++ b/src/praft/praft.proto @@ -0,0 +1,13 @@ +syntax="proto3"; +package pikiwidb; +option cc_generic_services = true; + +message DummyRequest { +}; + +message DummyResponse { +}; + +service DummyService { + rpc DummyMethod(DummyRequest) returns (DummyResponse); +}; diff --git a/src/praft/praft_service.h b/src/praft/praft_service.h new file mode 100644 index 000000000..d7b655a21 --- /dev/null +++ b/src/praft/praft_service.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2024-present, Qihoo, Inc. All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. An additional grant + * of patent rights can be found in the PATENTS file in the same directory. + */ + +#pragma once + +#include "praft.pb.h" + +namespace pikiwidb { + +class PRaft; + +class DummyServiceImpl : public DummyService { + public: + explicit DummyServiceImpl(PRaft* praft) : praft_(praft) {} + void DummyMethod(::google::protobuf::RpcController* controller, const ::pikiwidb::DummyRequest* request, + ::pikiwidb::DummyResponse* response, ::google::protobuf::Closure* done) override {} + + private: + PRaft* praft_ = nullptr; +}; + +} // namespace pikiwidb diff --git a/src/praft/psnapshot.cc b/src/praft/psnapshot.cc new file mode 100644 index 000000000..4cfc36d55 --- /dev/null +++ b/src/praft/psnapshot.cc @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2024-present, Qihoo, Inc. All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. An additional grant + * of patent rights can be found in the PATENTS file in the same directory. 
 + */ + +// +// psnapshot.cc + +#include "psnapshot.h" + +#include "braft/local_file_meta.pb.h" +#include "butil/files/file_path.h" + +#include "pstd/log.h" + +#include "config.h" +#include "store.h" + +namespace pikiwidb { + +extern PConfig g_config; + +braft::FileAdaptor* PPosixFileSystemAdaptor::open(const std::string& path, int oflag, + const ::google::protobuf::Message* file_meta, butil::File::Error* e) { + if ((oflag & IS_RDONLY) == 0) { // This is a read operation + bool snapshots_exists = false; + std::string snapshot_path; + + // parse snapshot path + butil::FilePath parse_snapshot_path(path); + std::vector components; + parse_snapshot_path.GetComponents(&components); + for (const auto& component : components) { + snapshot_path += component + "/"; + if (component.find("snapshot_") != std::string::npos) { + break; + } + } + // check whether snapshots have been created + std::lock_guard guard(mutex_); + if (!snapshot_path.empty()) { + for (const auto& entry : std::filesystem::directory_iterator(snapshot_path)) { + std::string filename = entry.path().filename().string(); + if (entry.is_regular_file() || entry.is_directory()) { + if (filename != "." && filename != ".." && filename.find(PRAFT_SNAPSHOT_META_FILE) == std::string::npos) { + // If the path directory contains files other than raft_snapshot_meta, snapshots have been generated + snapshots_exists = true; + break; + } + } + } + } + + // Snapshot generation + if (!snapshots_exists) { + braft::LocalSnapshotMetaTable snapshot_meta_memtable; + std::string meta_path = snapshot_path + "/" PRAFT_SNAPSHOT_META_FILE; + INFO("start to generate snapshot in path {}", snapshot_path); + braft::FileSystemAdaptor* fs = braft::default_file_system(); + assert(fs); + snapshot_meta_memtable.load_from_file(fs, meta_path); + + TasksVector tasks(1, {TaskType::kCheckpoint, 0, {{TaskArg::kCheckpointPath, snapshot_path}}, true}); + PSTORE.HandleTaskSpecificDB(tasks); + AddAllFiles(snapshot_path, &snapshot_meta_memtable, snapshot_path); + + auto rc = snapshot_meta_memtable.save_to_file(fs, meta_path); + if (rc == 0) { + INFO("Succeeded to save snapshot in path {}", snapshot_path); + } else { + ERROR("Failed to save snapshot in path {}", snapshot_path); + } + INFO("generate snapshot completed in path {}", snapshot_path); + } + } + + return braft::PosixFileSystemAdaptor::open(path, oflag, file_meta, e); +} + +void PPosixFileSystemAdaptor::AddAllFiles(const std::filesystem::path& dir, + braft::LocalSnapshotMetaTable* snapshot_meta_memtable, + const std::string& path) { + assert(snapshot_meta_memtable); + for (const auto& entry : std::filesystem::directory_iterator(dir)) { + if (entry.is_directory()) { + if (entry.path() != "." && entry.path() != "..") { + INFO("dir_path = {}", entry.path().string()); + AddAllFiles(entry.path(), snapshot_meta_memtable, path); + } + } else { + INFO("file_path = {}", std::filesystem::relative(entry.path(), path).string()); + braft::LocalFileMeta meta; + if (snapshot_meta_memtable->add_file(std::filesystem::relative(entry.path(), path), meta) != 0) { + WARN("Failed to add file"); + } + } + } +} + +} // namespace pikiwidb diff --git a/src/praft/psnapshot.h b/src/praft/psnapshot.h new file mode 100644 index 000000000..3b544d53d --- /dev/null +++ b/src/praft/psnapshot.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2024-present, Qihoo, Inc. All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
An additional grant + * of patent rights can be found in the PATENTS file in the same directory. + */ + +#pragma once + +#include + +#include "braft/file_system_adaptor.h" +#include "braft/macros.h" +#include "braft/snapshot.h" + +#define PRAFT_SNAPSHOT_META_FILE "__raft_snapshot_meta" +#define PRAFT_SNAPSHOT_PATH "snapshot/snapshot_" +#define IS_RDONLY 0x01 + +namespace pikiwidb { + +class PPosixFileSystemAdaptor : public braft::PosixFileSystemAdaptor { + public: + PPosixFileSystemAdaptor() {} + ~PPosixFileSystemAdaptor() {} + + braft::FileAdaptor* open(const std::string& path, int oflag, const ::google::protobuf::Message* file_meta, + butil::File::Error* e) override; + void AddAllFiles(const std::filesystem::path& dir, braft::LocalSnapshotMetaTable* snapshot_meta_memtable, + const std::string& path); + + private: + braft::raft_mutex_t mutex_; +}; + +} // namespace pikiwidb diff --git a/src/pstd/CMakeLists.txt b/src/pstd/CMakeLists.txt index a67210604..002b84aa8 100644 --- a/src/pstd/CMakeLists.txt +++ b/src/pstd/CMakeLists.txt @@ -11,6 +11,7 @@ ADD_SUBDIRECTORY(tests) TARGET_INCLUDE_DIRECTORIES(pstd PRIVATE ${rocksdb_SOURCE_DIR}/include + PRIVATE ${GLOG_INCLUDE_DIR} ) TARGET_LINK_LIBRARIES(pstd; spdlog pthread) diff --git a/src/pstd/pstd_string.cc b/src/pstd/pstd_string.cc index 165b1d5d8..15e33b47b 100644 --- a/src/pstd/pstd_string.cc +++ b/src/pstd/pstd_string.cc @@ -652,4 +652,10 @@ bool IsValidNumber(const std::string& str) { return true; } +void TrimSlash(std::string& dirName) { + while (!dirName.empty() && dirName.back() == '/') { + dirName.pop_back(); + } +} + } // namespace pstd diff --git a/src/pstd/pstd_string.h b/src/pstd/pstd_string.h index d1ccb14fb..de139dfe3 100755 --- a/src/pstd/pstd_string.h +++ b/src/pstd/pstd_string.h @@ -95,4 +95,6 @@ bool StringHasSpaces(const std::string& str); bool IsValidNumber(const std::string& str); +void TrimSlash(std::string& dirName); + } // namespace pstd diff --git a/src/pstd/thread_pool.h b/src/pstd/thread_pool.h index 77f188f4f..331e47a37 100644 --- a/src/pstd/thread_pool.h +++ b/src/pstd/thread_pool.h @@ -27,7 +27,7 @@ class ThreadPool final { void operator=(const ThreadPool&) = delete; template <typename F, typename... Args> - auto ExecuteTask(F&& f, Args&&... args) -> std::future<typename std::invoke_result<F, Args...>::type>; + auto ExecuteTask(F&& f, Args&&... args) -> std::future<std::invoke_result_t<F, Args...>>; void JoinAll(); void SetMaxIdleThread(unsigned int m); @@ -48,17 +48,17 @@ class ThreadPool final { std::condition_variable cond_; unsigned waiters_; bool shutdown_; - std::deque<std::function<void ()> > tasks_; + std::deque<std::function<void ()>> tasks_; static const int kMaxThreads = 256; }; template <typename F, typename... Args> -auto ThreadPool::ExecuteTask(F&& f, Args&&... args) -> std::future<typename std::invoke_result<F, Args...>::type> { +auto ThreadPool::ExecuteTask(F&& f, Args&&... 
args) -> std::future<std::invoke_result_t<F, Args...>> { - using resultType = typename std::invoke_result<F, Args...>::type; + using resultType = std::invoke_result_t<F, Args...>; auto task = - std::make_shared<std::packaged_task<resultType()> >(std::bind(std::forward<F>(f), std::forward<Args>(args)...)); + std::make_shared<std::packaged_task<resultType()>>(std::bind(std::forward<F>(f), std::forward<Args>(args)...)); { std::unique_lock guard(mutex_); diff --git a/src/replication.cc b/src/replication.cc index 3c10aac92..a1f1125f7 100644 --- a/src/replication.cc +++ b/src/replication.cc @@ -192,6 +192,7 @@ void PReplication::Cron() { g_pikiwidb->OnNewConnection(obj); } }; + auto fail_cb = [&](EventLoop*, const char* peer_ip, int port) { WARN("OnCallback: Connect master {}:{} failed", peer_ip, port); @@ -199,6 +200,11 @@ void PReplication::Cron() { if (!masterInfo_.downSince) { masterInfo_.downSince = ::time(nullptr); } + + if (on_fail_) { + on_fail_(EventLoop::Self(), peer_ip, port); + on_fail_ = nullptr; + } }; auto loop = EventLoop::Self(); @@ -208,20 +214,7 @@ void PReplication::Cron() { } break; case kPReplStateConnected: - if (!g_config.master_auth.empty()) { - if (auto master = master_.lock()) { - UnboundedBuffer req; - req.PushData("auth "); - req.PushData(g_config.master_auth.ToString().data(), g_config.master_auth.ToString().size()); - req.PushData("\r\n"); - master->SendPacket(req); - INFO("send auth with password {}", g_config.master_auth.ToString()); - - masterInfo_.state = kPReplStateWaitAuth; - break; - } - } - // fall through to next case. + break; case kPReplStateWaitAuth: { auto master = master_.lock(); diff --git a/src/replication.h b/src/replication.h index 9d390f8b4..56ae7c15b 100644 --- a/src/replication.h +++ b/src/replication.h @@ -12,9 +12,10 @@ #include #include "common.h" -#include "memory_file.h" +#include "net/tcp_connection.h" +#include "net/unbounded_buffer.h" #include "net/util.h" -#include "unbounded_buffer.h" +#include "pstd/memory_file.h" namespace pikiwidb { @@ -126,12 +127,14 @@ class PReplication { void SendToSlaves(const std::vector<PString>& params); // slave side + void SetFailCallback(TcpConnectionFailCallback cb) { on_fail_ = std::move(cb); } void SaveTmpRdb(const char* data, std::size_t& len); void SetMaster(const std::shared_ptr<PClient>& cli); void SetMasterState(PReplState s); void SetMasterAddr(const char* ip, uint16_t port); void SetRdbSize(std::size_t s); PReplState GetMasterState() const; + PClient* GetMaster() const { return master_.lock().get(); } SocketAddr GetMasterAddr() const; std::size_t GetRdbSize() const; @@ -151,6 +154,9 @@ class PReplication { PMasterInfo masterInfo_; std::weak_ptr<PClient> master_; pstd::OutputMemoryFile rdb_; + + // Callback invoked when connecting to the master node fails + TcpConnectionFailCallback on_fail_ = nullptr; }; } // namespace pikiwidb diff --git a/src/storage/CMakeLists.txt b/src/storage/CMakeLists.txt index 87f0b3ca8..48773729d 100644 --- a/src/storage/CMakeLists.txt +++ b/src/storage/CMakeLists.txt @@ -1,4 +1,8 @@ -#AUX_SOURCE_DIRECTORY(./src STORAGE_SRC) +# Copyright (c) 2023-present, Qihoo, Inc. All rights reserved. +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. An additional grant +# of patent rights can be found in the PATENTS file in the same directory. 
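+# Note: storage now links braft/brpc/binlog_pb/protobuf because its write path can emit +# binlog entries (see src/storage/src/batch.h) that are replicated through the raft log.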
 + + FILE(GLOB STORAGE_SRC "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cc" "${CMAKE_CURRENT_SOURCE_DIR}/src/*.h" @@ -9,12 +13,27 @@ ADD_LIBRARY(storage ${STORAGE_SRC}) TARGET_INCLUDE_DIRECTORIES(storage PUBLIC ${CMAKE_SOURCE_DIR}/src - PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} - PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include - PRIVATE ${rocksdb_SOURCE_DIR} + PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} + PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include + PRIVATE ${rocksdb_SOURCE_DIR}/ PRIVATE ${rocksdb_SOURCE_DIR}/include - ) + PRIVATE ${PROTO_OUTPUT_DIR} +) -TARGET_LINK_LIBRARIES(storage pstd rocksdb) +TARGET_LINK_LIBRARIES (storage + pstd + braft + brpc + ssl + crypto + zlib + leveldb + gflags + rocksdb + binlog_pb + protobuf +) SET_TARGET_PROPERTIES(storage PROPERTIES LINKER_LANGUAGE CXX) + +ADD_SUBDIRECTORY(tests) diff --git a/src/storage/include/storage/storage.h b/src/storage/include/storage/storage.h index 3a8af8f6b..4536a7e41 100644 --- a/src/storage/include/storage/storage.h +++ b/src/storage/include/storage/storage.h @@ -7,7 +7,10 @@ #define INCLUDE_STORAGE_STORAGE_H_ #include -#include +#include +#include +#include +#include #include #include #include @@ -22,9 +25,14 @@ #include "rocksdb/status.h" #include "rocksdb/table.h" +#include "pstd/env.h" #include "pstd/pstd_mutex.h" #include "storage/slot_indexer.h" +namespace pikiwidb { +class Binlog; +} + namespace storage { inline constexpr double ZSET_SCORE_MAX = std::numeric_limits<double>::max(); @@ -37,10 +45,15 @@ inline const std::string PROPERTY_TYPE_ROCKSDB_BACKGROUND_ERRORS = "rocksdb.back inline constexpr size_t BATCH_DELETE_LIMIT = 100; inline constexpr size_t COMPACT_THRESHOLD_COUNT = 2000; +inline constexpr uint64_t kNoFlush = std::numeric_limits<uint64_t>::max(); +inline constexpr uint64_t kFlush = 0; + using Options = rocksdb::Options; using BlockBasedTableOptions = rocksdb::BlockBasedTableOptions; using Status = rocksdb::Status; using Slice = rocksdb::Slice; +using Env = rocksdb::Env; +using LogIndex = int64_t; class Redis; enum class OptionType; @@ -48,8 +61,11 @@ enum class OptionType; template <typename T1, typename T2> class LRUCache; +using AppendLogFunction = std::function<void(const pikiwidb::Binlog&, std::promise<rocksdb::Status>&&)>; +using DoSnapshotFunction = std::function<void(int64_t, bool)>; + struct StorageOptions { - rocksdb::Options options; + mutable rocksdb::Options options; rocksdb::BlockBasedTableOptions table_options; size_t block_cache_size = 0; bool share_block_cache = false; @@ -57,7 +73,13 @@ struct StorageOptions { size_t small_compaction_threshold = 5000; size_t small_compaction_duration_threshold = 10000; size_t db_instance_num = 3; // default = 3 - int db_id; + int db_id = 0; + AppendLogFunction append_log_function = nullptr; + DoSnapshotFunction do_snapshot_function = nullptr; + + uint32_t raft_timeout_s = std::numeric_limits<uint32_t>::max(); + int64_t max_gap = 1000; + uint64_t mem_manager_size = 100000000; Status ResetOptions(const OptionType& option_type, const std::unordered_map<std::string, std::string>& options_map); }; @@ -164,6 +186,14 @@ class Storage { Status Open(const StorageOptions& storage_options, const std::string& db_path); + std::vector<std::future<Status>> CreateCheckpoint(const std::string& checkpoint_path); + + Status CreateCheckpointInternal(const std::string& checkpoint_path, int db_index); + + std::vector<std::future<Status>> LoadCheckpoint(const std::string& checkpoint_path, const std::string& db_path); + + Status LoadCheckpointInternal(const std::string& dump_path, const std::string& db_path, int index); + Status LoadCursorStartKey(const DataType& dtype, int64_t cursor, char* type, std::string* start_key); Status StoreCursorStartKey(const DataType& dtype, int64_t cursor, char type, const 
std::string& next_key); @@ -1078,6 +1108,7 @@ class Storage { Status SetOptions(const OptionType& option_type, const std::unordered_map<std::string, std::string>& options); void GetRocksDBInfo(std::string& info); + Status OnBinlogWrite(const pikiwidb::Binlog& log, LogIndex log_idx); private: std::vector<std::unique_ptr<Redis>> insts_; diff --git a/src/storage/include/storage/storage_define.h b/src/storage/include/storage/storage_define.h index 053177c17..4b27c860c 100644 --- a/src/storage/include/storage/storage_define.h +++ b/src/storage/include/storage/storage_define.h @@ -42,6 +42,7 @@ enum ColumnFamilyIndex { kZsetsMetaCF = 7, kZsetsDataCF = 8, kZsetsScoreCF = 9, + kColumnFamilyNum = 10, }; const static char kNeedTransformCharacter = '\u0000'; diff --git a/src/storage/src/batch.h b/src/storage/src/batch.h new file mode 100644 index 000000000..ad4df2d9f --- /dev/null +++ b/src/storage/src/batch.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2024-present, Qihoo, Inc. All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. An additional grant + * of patent rights can be found in the PATENTS file in the same directory. + */ + +#pragma once + +#include +#include +#include +#include + +#include "rocksdb/db.h" + +#include "binlog.pb.h" +#include "src/redis.h" +#include "storage/storage.h" +#include "storage/storage_define.h" + +namespace storage { + +class Batch { + public: + virtual ~Batch() = default; + + virtual void Put(ColumnFamilyIndex cf_idx, const Slice& key, const Slice& val) = 0; + virtual void Delete(ColumnFamilyIndex cf_idx, const Slice& key) = 0; + virtual Status Commit() = 0; + int32_t Count() const { return cnt_; } + + static auto CreateBatch(Redis* redis) -> std::unique_ptr<Batch>; + + protected: + uint32_t cnt_ = 0; +}; + +class RocksBatch : public Batch { + public: + RocksBatch(rocksdb::DB* db, const rocksdb::WriteOptions& options, + const std::vector<rocksdb::ColumnFamilyHandle*>& handles) + : db_(db), options_(options), handles_(handles) {} + + void Put(ColumnFamilyIndex cf_idx, const Slice& key, const Slice& val) override { + batch_.Put(handles_[cf_idx], key, val); + cnt_++; + } + void Delete(ColumnFamilyIndex cf_idx, const Slice& key) override { + batch_.Delete(handles_[cf_idx], key); + cnt_++; + } + Status Commit() override { return db_->Write(options_, &batch_); } + + private: + rocksdb::WriteBatch batch_; + rocksdb::DB* db_ = nullptr; + const rocksdb::WriteOptions& options_; + const std::vector<rocksdb::ColumnFamilyHandle*>& handles_; +}; + +class BinlogBatch : public Batch { + public: + BinlogBatch(AppendLogFunction func, int32_t index, uint32_t seconds = 10) + : func_(std::move(func)), seconds_(seconds) { + binlog_.set_db_id(0); + binlog_.set_slot_idx(index); + } + + void Put(ColumnFamilyIndex cf_idx, const Slice& key, const Slice& value) override { + auto entry = binlog_.add_entries(); + entry->set_cf_idx(cf_idx); + entry->set_op_type(pikiwidb::OperateType::kPut); + entry->set_key(key.ToString()); + entry->set_value(value.ToString()); + cnt_++; + } + + void Delete(ColumnFamilyIndex cf_idx, const Slice& key) override { + auto entry = binlog_.add_entries(); + entry->set_cf_idx(cf_idx); + entry->set_op_type(pikiwidb::OperateType::kDelete); + entry->set_key(key.ToString()); + cnt_++; + } + + Status Commit() override { + // FIXME(longfar): We should make sure that in non-RAFT mode, the code doesn't run here + std::promise<Status> promise; + auto future = promise.get_future(); + func_(binlog_, std::move(promise)); + auto status = future.wait_for(std::chrono::seconds(seconds_)); + if (status == 
std::future_status::timeout) {
+      return Status::Incomplete("Wait for write timeout");
+    }
+    return future.get();
+  }
+
+ private:
+  AppendLogFunction func_;
+  pikiwidb::Binlog binlog_;
+  uint32_t seconds_ = 10;
+};
+
+inline auto Batch::CreateBatch(Redis* redis) -> std::unique_ptr<Batch> {
+  if (redis->GetAppendLogFunction()) {
+    return std::make_unique<BinlogBatch>(redis->GetAppendLogFunction(), redis->GetIndex(), redis->GetRaftTimeout());
+  }
+  return std::make_unique<RocksBatch>(redis->GetDB(), redis->GetWriteOptions(), redis->GetColumnFamilyHandles());
+}
+
+}  // namespace storage
diff --git a/src/storage/src/debug.h b/src/storage/src/debug.h
index 882b718d1..8d2888a67 100644
--- a/src/storage/src/debug.h
+++ b/src/storage/src/debug.h
@@ -7,10 +7,7 @@
 #ifndef NDEBUG
 #  define TRACE(M, ...) fprintf(stderr, "[TRACE] (%s:%d) " M "\n", __FILE__, __LINE__, ##__VA_ARGS__)
-#  define DEBUG(M, ...) fprintf(stderr, "[Debug] (%s:%d) " M "\n", __FILE__, __LINE__, ##__VA_ARGS__)
 #else
 #  define TRACE(M, ...) \
     {}
-#  define DEBUG(M, ...) \
-    {}
 #endif  // NDEBUG
diff --git a/src/storage/src/log_index.cc b/src/storage/src/log_index.cc
new file mode 100644
index 000000000..1dede3013
--- /dev/null
+++ b/src/storage/src/log_index.cc
@@ -0,0 +1,194 @@
+/*
+ * Copyright (c) 2024-present, Qihoo, Inc. All rights reserved.
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree. An additional grant
+ * of patent rights can be found in the PATENTS file in the same directory.
+ */
+
+#include "log_index.h"
+
+#include
+#include
+#include
+
+#include "redis.h"
+
+namespace storage {
+
+rocksdb::Status storage::LogIndexOfColumnFamilies::Init(Redis *db) {
+  for (int i = 0; i < cf_.size(); i++) {
+    rocksdb::TablePropertiesCollection collection;
+    auto s = db->GetDB()->GetPropertiesOfAllTables(db->GetColumnFamilyHandles()[i], &collection);
+    if (!s.ok()) {
+      return s;
+    }
+    auto res = LogIndexTablePropertiesCollector::GetLargestLogIndexFromTableCollection(collection);
+    if (res.has_value()) {
+      auto log_index = res->GetAppliedLogIndex();
+      auto sequence_number = res->GetSequenceNumber();
+      cf_[i].applied_index.SetLogIndexSeqnoPair(log_index, sequence_number);
+      cf_[i].flushed_index.SetLogIndexSeqnoPair(log_index, sequence_number);
+    }
+  }
+  return Status::OK();
+}
+
+LogIndexOfColumnFamilies::SmallestIndexRes LogIndexOfColumnFamilies::GetSmallestLogIndex(int flush_cf) const {
+  SmallestIndexRes res;
+  for (int i = 0; i < cf_.size(); i++) {
+    if (i != flush_cf && cf_[i].flushed_index >= cf_[i].applied_index) {
+      continue;
+    }
+    auto applied_log_index = cf_[i].applied_index.GetLogIndex();
+    auto flushed_log_index = cf_[i].flushed_index.GetLogIndex();
+    auto flushed_seqno = cf_[i].flushed_index.GetSequenceNumber();
+    if (applied_log_index < res.smallest_applied_log_index) {
+      res.smallest_applied_log_index = applied_log_index;
+      res.smallest_applied_log_index_cf = i;
+    }
+    if (flushed_log_index < res.smallest_flushed_log_index) {
+      res.smallest_flushed_log_index = flushed_log_index;
+      res.smallest_flushed_seqno = flushed_seqno;
+      res.smallest_flushed_log_index_cf = i;
+    }
+  }
+  return res;
+}
+
+size_t LogIndexOfColumnFamilies::GetPendingFlushGap() const {
+  std::set<LogIndex> s;
+  for (int i = 0; i < kColumnFamilyNum; i++) {
+    s.insert(cf_[i].applied_index.GetLogIndex());
+    s.insert(cf_[i].flushed_index.GetLogIndex());
+  }
+  assert(!s.empty());
+  if (s.size() == 1) {
+    return 0;  // every column family agrees on one index, so there is no gap
+  }
+  auto iter_first = s.begin();
+  auto iter_last = s.end();
+  return *std::prev(iter_last) -
*iter_first; +}; + +std::atomic_int64_t LogIndexAndSequenceCollector::max_gap_ = 1000; + +std::optional storage::LogIndexTablePropertiesCollector::ReadStatsFromTableProps( + const std::shared_ptr &table_props) { + const auto &user_properties = table_props->user_collected_properties; + const auto it = user_properties.find(kPropertyName.data()); + if (it == user_properties.end()) { + return std::nullopt; + } + std::string s = it->second; + LogIndex applied_log_index; + SequenceNumber largest_seqno; + auto res = sscanf(s.c_str(), "%" PRIi64 "/%" PRIu64 "", &applied_log_index, &largest_seqno); + assert(res == 2); + + return LogIndexAndSequencePair(applied_log_index, largest_seqno); +} + +LogIndex LogIndexAndSequenceCollector::FindAppliedLogIndex(SequenceNumber seqno) const { + if (seqno == 0) { // the seqno will be 0 when executing compaction + return 0; + } + std::shared_lock gd(mutex_); + if (list_.empty() || seqno < list_.front().GetSequenceNumber()) { + return 0; + } + if (seqno >= list_.back().GetSequenceNumber()) { + return list_.back().GetAppliedLogIndex(); + } + + auto it = std::lower_bound( + list_.begin(), list_.end(), seqno, + [](const LogIndexAndSequencePair &p, SequenceNumber tar) { return p.GetSequenceNumber() <= tar; }); + if (it->GetSequenceNumber() > seqno) { + --it; + } + assert(it->GetSequenceNumber() <= seqno); + return it->GetAppliedLogIndex(); +} + +void LogIndexAndSequenceCollector::Update(LogIndex smallest_applied_log_index, SequenceNumber smallest_flush_seqno) { + // If step length > 1, log index is sampled and sacrifice precision to save memory usage. + // It means that extra applied log may be applied again on start stage. + if ((smallest_applied_log_index & step_length_mask_) == 0) { + std::lock_guard gd(mutex_); + list_.emplace_back(smallest_applied_log_index, smallest_flush_seqno); + } +} + +// TODO(longfar): find the iterator which should be deleted and erase from begin to the iterator +void LogIndexAndSequenceCollector::Purge(LogIndex smallest_applied_log_index) { + // The reason that we use smallest applied log index of all column families instead of smallest flushed log index is + // that the log index corresponding to the largest sequence number in the next flush must be greater than or equal to + // the smallest applied log index at this moment. + // So we just need to make sure that there is an element in the queue which is less than or equal to the smallest + // applied log index to ensure that we can find a correct log index while doing next flush. + std::lock_guard gd(mutex_); + if (list_.size() < 2) { + return; + } + auto second = std::next(list_.begin()); + while (list_.size() >= 2 && second->GetAppliedLogIndex() <= smallest_applied_log_index) { + list_.pop_front(); + ++second; + } +} + +auto LogIndexTablePropertiesCollector::GetLargestLogIndexFromTableCollection( + const rocksdb::TablePropertiesCollection &collection) -> std::optional { + LogIndex max_flushed_log_index{-1}; + rocksdb::SequenceNumber seqno{}; + for (const auto &[_, props] : collection) { + auto res = LogIndexTablePropertiesCollector::ReadStatsFromTableProps(props); + if (res.has_value() && res->GetAppliedLogIndex() > max_flushed_log_index) { + max_flushed_log_index = res->GetAppliedLogIndex(); + seqno = res->GetSequenceNumber(); + } + } + return max_flushed_log_index == -1 ? 
std::nullopt + : std::make_optional(max_flushed_log_index, seqno); +} + +void LogIndexAndSequenceCollectorPurger::OnFlushCompleted(rocksdb::DB *db, + const rocksdb::FlushJobInfo &flush_job_info) { + cf_->SetFlushedLogIndex(flush_job_info.cf_id, collector_->FindAppliedLogIndex(flush_job_info.largest_seqno), + flush_job_info.largest_seqno); + + auto [smallest_applied_log_index_cf, smallest_applied_log_index, smallest_flushed_log_index_cf, + smallest_flushed_log_index, smallest_flushed_seqno] = cf_->GetSmallestLogIndex(flush_job_info.cf_id); + collector_->Purge(smallest_applied_log_index); + + if (smallest_flushed_log_index_cf != -1) { + cf_->SetFlushedLogIndexGlobal(smallest_flushed_log_index, smallest_flushed_seqno); + } + auto count = count_.fetch_add(1); + + if (count % 10 == 0) { + callback_(smallest_flushed_log_index, false); + } + + if (flush_job_info.cf_id == manul_flushing_cf_.load()) { + manul_flushing_cf_.store(-1); + } + + auto flushing_cf = manul_flushing_cf_.load(); + if (flushing_cf != -1 || !collector_->IsFlushPending()) { + return; + } + + assert(flushing_cf == -1); + + if (!manul_flushing_cf_.compare_exchange_strong(flushing_cf, smallest_flushed_log_index_cf)) { + return; + } + + assert(manul_flushing_cf_.load() == smallest_flushed_log_index_cf); + rocksdb::FlushOptions flush_option; + flush_option.wait = false; + db->Flush(flush_option, column_families_->at(smallest_flushed_log_index_cf)); +} + +} // namespace storage \ No newline at end of file diff --git a/src/storage/src/log_index.h b/src/storage/src/log_index.h new file mode 100644 index 000000000..e7eb31cbc --- /dev/null +++ b/src/storage/src/log_index.h @@ -0,0 +1,256 @@ +/* + * Copyright (c) 2024-present, Qihoo, Inc. All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. An additional grant + * of patent rights can be found in the PATENTS file in the same directory. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "fmt/core.h" +#include "rocksdb/db.h" +#include "rocksdb/listener.h" +#include "rocksdb/table_properties.h" +#include "rocksdb/types.h" + +#include "storage/storage_define.h" + +namespace storage { + +using LogIndex = int64_t; +using rocksdb::SequenceNumber; +class Redis; + +class LogIndexAndSequencePair { + public: + LogIndexAndSequencePair(LogIndex applied_log_index, SequenceNumber seqno) + : applied_log_index_(applied_log_index), seqno_(seqno) {} + + void SetAppliedLogIndex(LogIndex applied_log_index) { applied_log_index_ = applied_log_index; } + void SetSequenceNumber(SequenceNumber seqno) { seqno_ = seqno; } + + LogIndex GetAppliedLogIndex() const { return applied_log_index_; } + SequenceNumber GetSequenceNumber() const { return seqno_; } + + private: + LogIndex applied_log_index_ = 0; + SequenceNumber seqno_ = 0; +}; + +struct LogIndexSeqnoPair { + std::atomic log_index = 0; + std::atomic seqno = 0; + + LogIndex GetLogIndex() const { return log_index.load(); } + + SequenceNumber GetSequenceNumber() const { return seqno.load(); } + + void SetLogIndexSeqnoPair(LogIndex l, SequenceNumber s) { + log_index.store(l); + seqno.store(s); + } + + LogIndexSeqnoPair() = default; + + bool operator==(const LogIndexSeqnoPair &other) const { return seqno.load() == other.seqno.load(); } + + bool operator<=(const LogIndexSeqnoPair &other) const { return seqno.load() <= other.seqno.load(); } + + bool operator>=(const LogIndexSeqnoPair &other) const { return seqno.load() >= other.seqno.load(); } + + bool operator<(const LogIndexSeqnoPair &other) const { return seqno.load() < other.seqno.load(); } +}; + +class LogIndexOfColumnFamilies { + struct LogIndexPair { + LogIndexSeqnoPair applied_index; // newest record in memtable. + LogIndexSeqnoPair flushed_index; // newest record in sst file. 
+ }; + + struct SmallestIndexRes { + int smallest_applied_log_index_cf = -1; + LogIndex smallest_applied_log_index = std::numeric_limits::max(); + + int smallest_flushed_log_index_cf = -1; + LogIndex smallest_flushed_log_index = std::numeric_limits::max(); + SequenceNumber smallest_flushed_seqno = std::numeric_limits::max(); + }; + + public: + // Read the largest log index of each column family from all sst files + rocksdb::Status Init(Redis *db); + + SmallestIndexRes GetSmallestLogIndex(int flush_cf) const; + + void SetFlushedLogIndex(size_t cf_id, LogIndex log_index, SequenceNumber seqno) { + cf_[cf_id].flushed_index.log_index.store(std::max(cf_[cf_id].flushed_index.log_index.load(), log_index)); + cf_[cf_id].flushed_index.seqno.store(std::max(cf_[cf_id].flushed_index.seqno.load(), seqno)); + } + + void SetFlushedLogIndexGlobal(LogIndex log_index, SequenceNumber seqno) { + SetLastFlushIndex(log_index, seqno); + for (int i = 0; i < kColumnFamilyNum; i++) { + if (cf_[i].flushed_index <= last_flush_index_) { + auto flush_log_index = std::max(cf_[i].flushed_index.GetLogIndex(), last_flush_index_.GetLogIndex()); + auto flush_sequence_number = + std::max(cf_[i].flushed_index.GetSequenceNumber(), last_flush_index_.GetSequenceNumber()); + cf_[i].flushed_index.SetLogIndexSeqnoPair(flush_log_index, flush_sequence_number); + } + } + } + + bool IsApplied(size_t cf_id, LogIndex cur_log_index) const { + return cur_log_index < cf_[cf_id].applied_index.GetLogIndex(); + } + + void Update(size_t cf_id, LogIndex cur_log_index, SequenceNumber cur_seqno) { + if (cf_[cf_id].flushed_index <= last_flush_index_ && cf_[cf_id].flushed_index == cf_[cf_id].applied_index) { + auto flush_log_index = std::max(cf_[cf_id].flushed_index.GetLogIndex(), last_flush_index_.GetLogIndex()); + auto flush_sequence_number = + std::max(cf_[cf_id].flushed_index.GetSequenceNumber(), last_flush_index_.GetSequenceNumber()); + cf_[cf_id].flushed_index.SetLogIndexSeqnoPair(flush_log_index, flush_sequence_number); + } + + cf_[cf_id].applied_index.SetLogIndexSeqnoPair(cur_log_index, cur_seqno); + } + + bool IsPendingFlush() const; + + size_t GetPendingFlushGap() const; + + void SetLastFlushIndex(LogIndex flushed_logindex, SequenceNumber flushed_seqno) { + auto lastest_flush_log_index = std::max(last_flush_index_.GetLogIndex(), flushed_logindex); + auto lastest_flush_sequence_number = std::max(last_flush_index_.GetSequenceNumber(), flushed_seqno); + last_flush_index_.SetLogIndexSeqnoPair(lastest_flush_log_index, lastest_flush_sequence_number); + } + + // for gtest + LogIndexSeqnoPair &GetLastFlushIndex() { return last_flush_index_; } + + LogIndexPair &GetCFStatus(size_t cf) { return cf_[cf]; } + + private: + std::array cf_; + LogIndexSeqnoPair last_flush_index_; +}; + +class LogIndexAndSequenceCollector { + public: + explicit LogIndexAndSequenceCollector(uint8_t step_length_bit = 0) { step_length_mask_ = (1 << step_length_bit) - 1; } + + // find the index of log which contain seqno or before it + LogIndex FindAppliedLogIndex(SequenceNumber seqno) const; + + // if there's a new pair, add it to list; otherwise, do nothing + void Update(LogIndex smallest_applied_log_index, SequenceNumber smallest_flush_seqno); + + // purge out dated log index after memtable flushed. + void Purge(LogIndex smallest_applied_log_index); + + // Is manual flushing required? 
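SetFlushedLogIndex and SetLastFlushIndex above merge a new (log index, seqno) pair by loading each atomic, taking the max, and storing it back; that is only race-free while flush notifications arrive one at a time. A lock-free way to express the same monotonic merge, should concurrent callers ever be allowed, is a compare-and-swap loop. The sketch below is illustrative only; `StoreMax` is a hypothetical helper, not part of this patch.

```cpp
#include <atomic>

// Hypothetical fetch-max helper: raise `target` to at least `value` even if
// several threads race; compare_exchange_weak refreshes `cur` on failure.
template <typename T>
void StoreMax(std::atomic<T>& target, T value) {
  T cur = target.load(std::memory_order_relaxed);
  while (cur < value && !target.compare_exchange_weak(cur, value)) {
    // retry: either we win, or another thread already stored a larger value
  }
}
```

The `IsFlushPending()` check that follows then simply compares the collector's queue size against `max_gap_`.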
+  bool IsFlushPending() const { return GetSize() >= max_gap_; }
+
+  // for gtest
+  uint64_t GetSize() const {
+    std::shared_lock lock(mutex_);
+    return list_.size();
+  }
+
+  std::deque<LogIndexAndSequencePair> &GetList() {
+    std::shared_lock lock(mutex_);
+    return list_;
+  }
+
+ public:
+  static std::atomic_int64_t max_gap_;
+
+ private:
+  uint64_t step_length_mask_ = 0;
+  mutable std::shared_mutex mutex_;
+  std::deque<LogIndexAndSequencePair> list_;
+};
+
+class LogIndexTablePropertiesCollector : public rocksdb::TablePropertiesCollector {
+ public:
+  static constexpr std::string_view kPropertyName = "LargestLogIndex/LargestSequenceNumber";
+
+  explicit LogIndexTablePropertiesCollector(const LogIndexAndSequenceCollector &collector) : collector_(collector) {}
+
+  rocksdb::Status AddUserKey(const rocksdb::Slice &key, const rocksdb::Slice &value, rocksdb::EntryType type,
+                             SequenceNumber seq, uint64_t file_size) override {
+    largest_seqno_ = std::max(largest_seqno_, seq);
+    return rocksdb::Status::OK();
+  }
+  rocksdb::Status Finish(rocksdb::UserCollectedProperties *properties) override {
+    properties->insert(Materialize());
+    return rocksdb::Status::OK();
+  }
+  const char *Name() const override { return "LogIndexTablePropertiesCollector"; }
+  rocksdb::UserCollectedProperties GetReadableProperties() const override {
+    return rocksdb::UserCollectedProperties{Materialize()};
+  }
+
+  static std::optional<LogIndexAndSequencePair> ReadStatsFromTableProps(
+      const std::shared_ptr<const rocksdb::TableProperties> &table_props);
+
+  static auto GetLargestLogIndexFromTableCollection(const rocksdb::TablePropertiesCollection &collection)
+      -> std::optional<LogIndexAndSequencePair>;
+
+ private:
+  std::pair<std::string, std::string> Materialize() const {
+    if (-1 == cache_) {
+      cache_ = collector_.FindAppliedLogIndex(largest_seqno_);
+    }
+    return std::make_pair(static_cast<std::string>(kPropertyName), fmt::format("{}/{}", cache_, largest_seqno_));
+  }
+
+ private:
+  const LogIndexAndSequenceCollector &collector_;
+  SequenceNumber largest_seqno_ = 0;
+  mutable LogIndex cache_{-1};
+};
+
+class LogIndexTablePropertiesCollectorFactory : public rocksdb::TablePropertiesCollectorFactory {
+ public:
+  explicit LogIndexTablePropertiesCollectorFactory(const LogIndexAndSequenceCollector &collector)
+      : collector_(collector) {}
+  ~LogIndexTablePropertiesCollectorFactory() override = default;
+
+  rocksdb::TablePropertiesCollector *CreateTablePropertiesCollector(
+      [[maybe_unused]] rocksdb::TablePropertiesCollectorFactory::Context context) override {
+    return new LogIndexTablePropertiesCollector(collector_);
+  }
+  const char *Name() const override { return "LogIndexTablePropertiesCollectorFactory"; }
+
+ private:
+  const LogIndexAndSequenceCollector &collector_;
+};
+
+class LogIndexAndSequenceCollectorPurger : public rocksdb::EventListener {
+ public:
+  explicit LogIndexAndSequenceCollectorPurger(std::vector<rocksdb::ColumnFamilyHandle *> *column_families,
+                                              LogIndexAndSequenceCollector *collector, LogIndexOfColumnFamilies *cf,
+                                              std::function<void(int64_t, bool)> callback)
+      : column_families_(column_families), collector_(collector), cf_(cf), callback_(callback) {}
+
+  void OnFlushCompleted(rocksdb::DB *db, const rocksdb::FlushJobInfo &flush_job_info) override;
+
+ private:
+  std::vector<rocksdb::ColumnFamilyHandle *> *column_families_ = nullptr;
+  LogIndexAndSequenceCollector *collector_ = nullptr;
+  LogIndexOfColumnFamilies *cf_ = nullptr;
+  std::atomic_uint64_t count_ = 0;
+  std::atomic<int64_t> manul_flushing_cf_ = -1;
+  std::function<void(int64_t, bool)> callback_;
+};
+
+}  // namespace storage
\ No newline at end of file
diff --git a/src/storage/src/redis.cc b/src/storage/src/redis.cc
index c1a0daf16..6417f07bb 100644
--- a/src/storage/src/redis.cc
+++ b/src/storage/src/redis.cc
@@ -7,13 +7,16
@@ #include "rocksdb/env.h" -#include "config.h" #include "src/base_filter.h" #include "src/lists_filter.h" #include "src/redis.h" #include "src/strings_filter.h" #include "src/zsets_filter.h" +#define ADD_TABLE_PROPERTY_COLLECTOR_FACTORY(type) \ + type##_cf_ops.table_properties_collector_factories.push_back( \ + std::make_shared(log_index_collector_)); + namespace storage { const rocksdb::Comparator* ListsDataKeyComparator() { static ListsDataKeyComparatorImpl ldkc; @@ -48,10 +51,17 @@ Redis::~Redis() { for (auto handle : tmp_handles) { delete handle; } + // delete env_; delete db_; + + if (default_compact_range_options_.canceled) { + delete default_compact_range_options_.canceled; + } } Status Redis::Open(const StorageOptions& storage_options, const std::string& db_path) { + append_log_function_ = storage_options.append_log_function; + raft_timeout_s_ = storage_options.raft_timeout_s; statistics_store_->SetCapacity(storage_options.statistics_max_size); small_compaction_threshold_ = storage_options.small_compaction_threshold; @@ -136,6 +146,24 @@ Status Redis::Open(const StorageOptions& storage_options, const std::string& db_ zset_data_cf_ops.table_factory.reset(rocksdb::NewBlockBasedTableFactory(zset_data_cf_table_ops)); zset_score_cf_ops.table_factory.reset(rocksdb::NewBlockBasedTableFactory(zset_score_cf_table_ops)); + if (append_log_function_) { + // Add log index table property collector factory to each column family + ADD_TABLE_PROPERTY_COLLECTOR_FACTORY(string); + ADD_TABLE_PROPERTY_COLLECTOR_FACTORY(hash_meta); + ADD_TABLE_PROPERTY_COLLECTOR_FACTORY(hash_data); + ADD_TABLE_PROPERTY_COLLECTOR_FACTORY(list_meta); + ADD_TABLE_PROPERTY_COLLECTOR_FACTORY(list_data); + ADD_TABLE_PROPERTY_COLLECTOR_FACTORY(set_meta); + ADD_TABLE_PROPERTY_COLLECTOR_FACTORY(set_data); + ADD_TABLE_PROPERTY_COLLECTOR_FACTORY(zset_meta); + ADD_TABLE_PROPERTY_COLLECTOR_FACTORY(zset_data); + ADD_TABLE_PROPERTY_COLLECTOR_FACTORY(zset_score); + + // Add a listener on flush to purge log index collector + db_ops.listeners.push_back(std::make_shared( + &handles_, &log_index_collector_, &log_index_of_all_cfs_, storage_options.do_snapshot_function)); + } + std::vector column_families; column_families.emplace_back(rocksdb::kDefaultColumnFamilyName, string_cf_ops); // hash CF @@ -151,7 +179,13 @@ Status Redis::Open(const StorageOptions& storage_options, const std::string& db_ column_families.emplace_back("zset_meta_cf", zset_meta_cf_ops); column_families.emplace_back("zset_data_cf", zset_data_cf_ops); column_families.emplace_back("zset_score_cf", zset_score_cf_ops); - return rocksdb::DB::Open(db_ops, db_path, column_families, &handles_, &db_); + + auto s = rocksdb::DB::Open(db_ops, db_path, column_families, &handles_, &db_); + if (!s.ok()) { + return s; + } + assert(!handles_.empty()); + return log_index_of_all_cfs_.Init(this); } Status Redis::GetScanStartPoint(const DataType& type, const Slice& key, const Slice& pattern, int64_t cursor, diff --git a/src/storage/src/redis.h b/src/storage/src/redis.h index e5439042a..b60878c29 100644 --- a/src/storage/src/redis.h +++ b/src/storage/src/redis.h @@ -14,6 +14,7 @@ #include "rocksdb/slice.h" #include "rocksdb/status.h" +#include "log_index.h" #include "pstd/env.h" #include "pstd/log.h" #include "src/custom_comparator.h" @@ -103,6 +104,12 @@ class Redis { const ColumnFamilyType& type = kMetaAndData); virtual Status GetProperty(const std::string& property, uint64_t* out); + bool IsApplied(size_t cf_idx, LogIndex logidx) const { return log_index_of_all_cfs_.IsApplied(cf_idx, 
logidx); } + void UpdateAppliedLogIndexOfColumnFamily(size_t cf_idx, LogIndex logidx, SequenceNumber seqno) { + log_index_of_all_cfs_.Update(cf_idx, logidx, seqno); + } + bool IsRestarting() const { return is_starting_; } + void StartingPhaseEnd() { is_starting_ = false; } Status ScanKeyNum(std::vector* key_info); Status ScanStringsKeyNum(KeyInfo* key_info); @@ -210,6 +217,10 @@ class Redis { Status SetSmallCompactionThreshold(uint64_t small_compaction_threshold); Status SetSmallCompactionDurationThreshold(uint64_t small_compaction_duration_threshold); void GetRocksDBInfo(std::string& info, const char* prefix); + auto GetWriteOptions() const -> const rocksdb::WriteOptions& { return default_write_options_; } + auto GetColumnFamilyHandles() const -> const std::vector& { return handles_; } + auto GetRaftTimeout() const -> uint32_t { return raft_timeout_s_; } + auto GetAppendLogFunction() const -> const AppendLogFunction& { return append_log_function_; } // Sets Commands Status SAdd(const Slice& key, const std::vector& members, int32_t* ret); @@ -295,6 +306,10 @@ class Redis { void ScanZsets(); void ScanSets(); + void UpdateLogIndex(LogIndex applied_log_index, SequenceNumber seqno) { + log_index_collector_.Update(applied_log_index, seqno); + } + TypeIterator* CreateIterator(const DataType& type, const std::string& pattern, const Slice* lower_bound, const Slice* upper_bound) { return CreateIterator(DataTypeTag[type], pattern, lower_bound, upper_bound); @@ -329,6 +344,10 @@ class Redis { return nullptr; } + LogIndexOfColumnFamilies& GetLogIndexOfColumnFamilies() { return log_index_of_all_cfs_; } + + LogIndexAndSequenceCollector& GetCollector() { return log_index_collector_; } + private: int32_t index_ = 0; Storage* const storage_; @@ -354,6 +373,13 @@ class Redis { std::atomic_uint64_t small_compaction_duration_threshold_; std::unique_ptr> statistics_store_; + // For raft + uint32_t raft_timeout_s_ = 10; + AppendLogFunction append_log_function_; + LogIndexAndSequenceCollector log_index_collector_; + LogIndexOfColumnFamilies log_index_of_all_cfs_; + bool is_starting_{true}; + Status UpdateSpecificKeyStatistics(const DataType& dtype, const std::string& key, uint64_t count); Status UpdateSpecificKeyDuration(const DataType& dtype, const std::string& key, uint64_t duration); Status AddCompactKeyTaskIfNeeded(const DataType& dtype, const std::string& key, uint64_t count, uint64_t duration); diff --git a/src/storage/src/redis_hashes.cc b/src/storage/src/redis_hashes.cc index 9abf51e87..e479e0c26 100644 --- a/src/storage/src/redis_hashes.cc +++ b/src/storage/src/redis_hashes.cc @@ -12,6 +12,7 @@ #include +#include "batch.h" #include "pstd/log.h" #include "src/base_data_key_format.h" #include "src/base_data_value_format.h" @@ -118,7 +119,7 @@ Status Redis::HDel(const Slice& key, const std::vector& fields, int } } - rocksdb::WriteBatch batch; + auto batch = Batch::CreateBatch(this); rocksdb::ReadOptions read_options; const rocksdb::Snapshot* snapshot; @@ -145,7 +146,7 @@ Status Redis::HDel(const Slice& key, const std::vector& fields, int if (s.ok()) { del_cnt++; statistic++; - batch.Delete(handles_[kHashesDataCF], hashes_data_key.Encode()); + batch->Delete(kHashesDataCF, hashes_data_key.Encode()); } else if (s.IsNotFound()) { continue; } else { @@ -157,7 +158,7 @@ Status Redis::HDel(const Slice& key, const std::vector& fields, int return Status::InvalidArgument("hash size overflow"); } parsed_hashes_meta_value.ModifyCount(-del_cnt); - batch.Put(handles_[kHashesMetaCF], base_meta_key.Encode(), meta_value); 
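The hash hunks around this point show the shape of the whole migration: a concrete rocksdb::WriteBatch addressed through handles_[cf] becomes a polymorphic Batch addressed by ColumnFamilyIndex, and the trailing db_->Write() becomes batch->Commit(). A condensed sketch of the new call shape follows; `ExampleWrite` is a made-up free function, while the real patch applies this pattern inside Redis member functions such as HDel and HSet.

```cpp
#include "src/batch.h"  // storage::Batch, storage::Redis

// CreateBatch() decides the backend at runtime: RocksBatch writes to the
// local rocksdb instance, BinlogBatch ships the operations through the
// raft log and only "commits" once the log entry is applied.
storage::Status ExampleWrite(storage::Redis* redis, const storage::Slice& key,
                             const storage::Slice& value) {
  auto batch = storage::Batch::CreateBatch(redis);
  batch->Put(storage::kHashesMetaCF, key, value);  // CF addressed by enum index
  batch->Delete(storage::kHashesDataCF, key);      // no raw handle lookup
  return batch->Commit();                          // replaces db_->Write(...)
}
```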
+ batch->Put(kHashesMetaCF, base_meta_key.Encode(), meta_value); } } else if (s.IsNotFound()) { *ret = 0; @@ -165,7 +166,7 @@ Status Redis::HDel(const Slice& key, const std::vector& fields, int } else { return s; } - s = db_->Write(default_write_options_, &batch); + s = batch->Commit(); UpdateSpecificKeyStatistics(DataType::kHashes, key.ToString(), statistic); return s; } @@ -616,7 +617,7 @@ Status Redis::HMSet(const Slice& key, const std::vector& fvs) { } Status Redis::HSet(const Slice& key, const Slice& field, const Slice& value, int32_t* res) { - rocksdb::WriteBatch batch; + auto batch = Batch::CreateBatch(this); ScopeRecordLock l(lock_mgr_, key); uint64_t version = 0; @@ -631,10 +632,10 @@ Status Redis::HSet(const Slice& key, const Slice& field, const Slice& value, int if (parsed_hashes_meta_value.IsStale() || parsed_hashes_meta_value.Count() == 0) { version = parsed_hashes_meta_value.InitialMetaValue(); parsed_hashes_meta_value.SetCount(1); - batch.Put(handles_[kHashesMetaCF], base_meta_key.Encode(), meta_value); + batch->Put(kHashesMetaCF, base_meta_key.Encode(), meta_value); HashesDataKey data_key(key, version, field); BaseDataValue internal_value(value); - batch.Put(handles_[kHashesDataCF], data_key.Encode(), internal_value.Encode()); + batch->Put(kHashesDataCF, data_key.Encode(), internal_value.Encode()); *res = 1; } else { version = parsed_hashes_meta_value.Version(); @@ -647,7 +648,7 @@ Status Redis::HSet(const Slice& key, const Slice& field, const Slice& value, int return Status::OK(); } else { BaseDataValue internal_value(value); - batch.Put(handles_[kHashesDataCF], hashes_data_key.Encode(), internal_value.Encode()); + batch->Put(kHashesDataCF, hashes_data_key.Encode(), internal_value.Encode()); statistic++; } } else if (s.IsNotFound()) { @@ -656,8 +657,8 @@ Status Redis::HSet(const Slice& key, const Slice& field, const Slice& value, int } parsed_hashes_meta_value.ModifyCount(1); BaseDataValue internal_value(value); - batch.Put(handles_[kHashesMetaCF], base_meta_key.Encode(), meta_value); - batch.Put(handles_[kHashesDataCF], hashes_data_key.Encode(), internal_value.Encode()); + batch->Put(kHashesMetaCF, base_meta_key.Encode(), meta_value); + batch->Put(kHashesDataCF, hashes_data_key.Encode(), internal_value.Encode()); *res = 1; } else { return s; @@ -667,15 +668,15 @@ Status Redis::HSet(const Slice& key, const Slice& field, const Slice& value, int EncodeFixed32(meta_value_buf, 1); HashesMetaValue meta_value(Slice(meta_value_buf, sizeof(int32_t))); version = meta_value.UpdateVersion(); - batch.Put(handles_[kHashesMetaCF], base_meta_key.Encode(), meta_value.Encode()); + batch->Put(kHashesMetaCF, base_meta_key.Encode(), meta_value.Encode()); HashesDataKey data_key(key, version, field); BaseDataValue internal_value(value); - batch.Put(handles_[kHashesDataCF], data_key.Encode(), internal_value.Encode()); + batch->Put(kHashesDataCF, data_key.Encode(), internal_value.Encode()); *res = 1; } else { return s; } - s = db_->Write(default_write_options_, &batch); + s = batch->Commit(); UpdateSpecificKeyStatistics(DataType::kHashes, key.ToString(), statistic); return s; } diff --git a/src/storage/src/redis_lists.cc b/src/storage/src/redis_lists.cc index 2ec1da18e..2a641da5e 100644 --- a/src/storage/src/redis_lists.cc +++ b/src/storage/src/redis_lists.cc @@ -8,6 +8,7 @@ #include #include "pstd/log.h" #include "src/base_data_value_format.h" +#include "src/batch.h" #include "src/lists_filter.h" #include "src/redis.h" #include "src/scope_record_lock.h" @@ -263,7 +264,7 @@ Status 
Redis::LPop(const Slice& key, int64_t count, std::vector* el uint32_t statistic = 0; elements->clear(); - rocksdb::WriteBatch batch; + auto batch = Batch::CreateBatch(this); ScopeRecordLock l(lock_mgr_, key); std::string meta_value; @@ -288,20 +289,17 @@ Status Redis::LPop(const Slice& key, int64_t count, std::vector* el statistic++; ParsedBaseDataValue parsed_base_data_value(iter->value()); elements->push_back(parsed_base_data_value.UserValue().ToString()); - batch.Delete(handles_[kListsDataCF], iter->key()); + batch->Delete(kListsDataCF, iter->key()); parsed_lists_meta_value.ModifyCount(-1); parsed_lists_meta_value.ModifyLeftIndex(-1); } - batch.Put(handles_[kListsMetaCF], base_meta_key.Encode(), meta_value); + batch->Put(kListsMetaCF, base_meta_key.Encode(), meta_value); delete iter; } } - if (batch.Count() != 0U) { - s = db_->Write(default_write_options_, &batch); - if (s.ok()) { - batch.Clear(); - } + if (batch->Count() != 0U) { + s = batch->Commit(); UpdateSpecificKeyStatistics(DataType::kLists, key.ToString(), statistic); } return s; @@ -309,7 +307,7 @@ Status Redis::LPop(const Slice& key, int64_t count, std::vector* el Status Redis::LPush(const Slice& key, const std::vector& values, uint64_t* ret) { *ret = 0; - rocksdb::WriteBatch batch; + auto batch = Batch::CreateBatch(this); ScopeRecordLock l(lock_mgr_, key); uint64_t index = 0; @@ -331,9 +329,9 @@ Status Redis::LPush(const Slice& key, const std::vector& values, ui parsed_lists_meta_value.ModifyCount(1); ListsDataKey lists_data_key(key, version, index); BaseDataValue i_val(value); - batch.Put(handles_[kListsDataCF], lists_data_key.Encode(), i_val.Encode()); + batch->Put(kListsDataCF, lists_data_key.Encode(), i_val.Encode()); } - batch.Put(handles_[kListsMetaCF], base_meta_key.Encode(), meta_value); + batch->Put(kListsMetaCF, base_meta_key.Encode(), meta_value); *ret = parsed_lists_meta_value.Count(); } else if (s.IsNotFound()) { char str[8]; @@ -345,14 +343,14 @@ Status Redis::LPush(const Slice& key, const std::vector& values, ui lists_meta_value.ModifyLeftIndex(1); ListsDataKey lists_data_key(key, version, index); BaseDataValue i_val(value); - batch.Put(handles_[kListsDataCF], lists_data_key.Encode(), i_val.Encode()); + batch->Put(kListsDataCF, lists_data_key.Encode(), i_val.Encode()); } - batch.Put(handles_[kListsMetaCF], base_meta_key.Encode(), lists_meta_value.Encode()); + batch->Put(kListsMetaCF, base_meta_key.Encode(), lists_meta_value.Encode()); *ret = lists_meta_value.RightIndex() - lists_meta_value.LeftIndex() - 1; } else { return s; } - return db_->Write(default_write_options_, &batch); + return batch->Commit(); } Status Redis::LPushx(const Slice& key, const std::vector& values, uint64_t* len) { diff --git a/src/storage/src/redis_sets.cc b/src/storage/src/redis_sets.cc index a3bccd1d2..c7541fcb6 100644 --- a/src/storage/src/redis_sets.cc +++ b/src/storage/src/redis_sets.cc @@ -3,6 +3,7 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
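BinlogBatch::Commit (defined in batch.h earlier in this patch) turns each of these local writes into a raft round-trip: it hands the assembled Binlog plus a std::promise to append_log_function and then blocks on the matching future, bounded by raft_timeout_s. The pattern reduced to its core is sketched below; `WaitBounded` is a hypothetical name used only for illustration.

```cpp
#include <chrono>
#include <future>

#include "rocksdb/status.h"

// Sketch of the bounded promise/future handshake used by BinlogBatch::Commit:
// the raft apply path fulfills the promise once the entry is committed, while
// the writer refuses to wait longer than `secs` seconds.
rocksdb::Status WaitBounded(std::future<rocksdb::Status>&& fut, uint32_t secs) {
  if (fut.wait_for(std::chrono::seconds(secs)) == std::future_status::timeout) {
    return rocksdb::Status::Incomplete("Wait for write timeout");
  }
  return fut.get();  // status set by the raft side via the promise
}
```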
+#include "src/batch.h" #include "src/redis.h" #include @@ -116,7 +117,7 @@ rocksdb::Status Redis::SAdd(const Slice& key, const std::vector& me } } - rocksdb::WriteBatch batch; + auto batch = Batch::CreateBatch(this); ScopeRecordLock l(lock_mgr_, key); uint64_t version = 0; std::string meta_value; @@ -131,11 +132,11 @@ rocksdb::Status Redis::SAdd(const Slice& key, const std::vector& me return Status::InvalidArgument("set size overflow"); } parsed_sets_meta_value.SetCount(static_cast(filtered_members.size())); - batch.Put(handles_[kSetsMetaCF], base_meta_key.Encode(), meta_value); + batch->Put(kSetsMetaCF, base_meta_key.Encode(), meta_value); for (const auto& member : filtered_members) { SetsMemberKey sets_member_key(key, version, member); BaseDataValue iter_value(Slice{}); - batch.Put(handles_[kSetsDataCF], sets_member_key.Encode(), iter_value.Encode()); + batch->Put(kSetsDataCF, sets_member_key.Encode(), iter_value.Encode()); } *ret = static_cast(filtered_members.size()); } else { @@ -149,7 +150,7 @@ rocksdb::Status Redis::SAdd(const Slice& key, const std::vector& me } else if (s.IsNotFound()) { cnt++; BaseDataValue iter_value(Slice{}); - batch.Put(handles_[kSetsDataCF], sets_member_key.Encode(), iter_value.Encode()); + batch->Put(kSetsDataCF, sets_member_key.Encode(), iter_value.Encode()); } else { return s; } @@ -162,7 +163,7 @@ rocksdb::Status Redis::SAdd(const Slice& key, const std::vector& me return Status::InvalidArgument("set size overflow"); } parsed_sets_meta_value.ModifyCount(cnt); - batch.Put(handles_[kSetsMetaCF], base_meta_key.Encode(), meta_value); + batch->Put(kSetsMetaCF, base_meta_key.Encode(), meta_value); } } } else if (s.IsNotFound()) { @@ -170,17 +171,17 @@ rocksdb::Status Redis::SAdd(const Slice& key, const std::vector& me EncodeFixed32(str, filtered_members.size()); SetsMetaValue sets_meta_value(Slice(str, sizeof(int32_t))); version = sets_meta_value.UpdateVersion(); - batch.Put(handles_[kSetsMetaCF], base_meta_key.Encode(), sets_meta_value.Encode()); + batch->Put(kSetsMetaCF, base_meta_key.Encode(), sets_meta_value.Encode()); for (const auto& member : filtered_members) { SetsMemberKey sets_member_key(key, version, member); BaseDataValue i_val(Slice{}); - batch.Put(handles_[kSetsDataCF], sets_member_key.Encode(), i_val.Encode()); + batch->Put(kSetsDataCF, sets_member_key.Encode(), i_val.Encode()); } *ret = static_cast(filtered_members.size()); } else { return s; } - return db_->Write(default_write_options_, &batch); + return batch->Commit(); } rocksdb::Status Redis::SCard(const Slice& key, int32_t* ret) { @@ -939,7 +940,7 @@ rocksdb::Status Redis::SRandmember(const Slice& key, int32_t count, std::vector< rocksdb::Status Redis::SRem(const Slice& key, const std::vector& members, int32_t* ret) { *ret = 0; - rocksdb::WriteBatch batch; + auto batch = Batch::CreateBatch(this); ScopeRecordLock l(lock_mgr_, key); uint64_t version = 0; @@ -964,7 +965,7 @@ rocksdb::Status Redis::SRem(const Slice& key, const std::vector& me if (s.ok()) { cnt++; statistic++; - batch.Delete(handles_[kSetsDataCF], sets_member_key.Encode()); + batch->Delete(kSetsDataCF, sets_member_key.Encode()); } else if (s.IsNotFound()) { } else { return s; @@ -975,7 +976,7 @@ rocksdb::Status Redis::SRem(const Slice& key, const std::vector& me return Status::InvalidArgument("set size overflow"); } parsed_sets_meta_value.ModifyCount(-cnt); - batch.Put(handles_[kSetsMetaCF], base_meta_key.Encode(), meta_value); + batch->Put(kSetsMetaCF, base_meta_key.Encode(), meta_value); } } else if (s.IsNotFound()) { *ret = 0; 
@@ -983,7 +984,7 @@ rocksdb::Status Redis::SRem(const Slice& key, const std::vector<std::string>& me
   } else {
     return s;
   }
-  s = db_->Write(default_write_options_, &batch);
+  s = batch->Commit();
   UpdateSpecificKeyStatistics(DataType::kSets, key.ToString(), statistic);
   return s;
 }
diff --git a/src/storage/src/redis_strings.cc b/src/storage/src/redis_strings.cc
index 51d6872c6..24a677921 100644
--- a/src/storage/src/redis_strings.cc
+++ b/src/storage/src/redis_strings.cc
@@ -3,17 +3,18 @@
 // LICENSE file in the root directory of this source tree. An additional grant
 // of patent rights can be found in the PATENTS file in the same directory.
 
-#include
 #include
 #include
 
 #include "pstd/log.h"
 #include "src/base_key_format.h"
+#include "src/batch.h"
 #include "src/redis.h"
 #include "src/scope_record_lock.h"
 #include "src/scope_snapshot.h"
 #include "src/strings_filter.h"
+#include "storage/storage_define.h"
 #include "storage/util.h"
 
 namespace storage {
@@ -630,10 +631,12 @@ Status Redis::MSetnx(const std::vector<KeyValue>& kvs, int32_t* ret) {
 
 Status Redis::Set(const Slice& key, const Slice& value) {
   StringsValue strings_value(value);
+  auto batch = Batch::CreateBatch(this);
   ScopeRecordLock l(lock_mgr_, key);
 
   BaseKey base_key(key);
-  return db_->Put(default_write_options_, base_key.Encode(), strings_value.Encode());
+  batch->Put(kStringsCF, base_key.Encode(), strings_value.Encode());
+  return batch->Commit();
 }
 
 Status Redis::Setxx(const Slice& key, const Slice& value, int32_t* ret, const uint64_t ttl) {
diff --git a/src/storage/src/redis_zsets.cc b/src/storage/src/redis_zsets.cc
index ea4864ae1..3532b4cca 100644
--- a/src/storage/src/redis_zsets.cc
+++ b/src/storage/src/redis_zsets.cc
@@ -14,6 +14,7 @@
 #include "pstd/log.h"
 #include "src/base_data_value_format.h"
 #include "src/base_key_format.h"
+#include "src/batch.h"
 #include "src/redis.h"
 #include "src/scope_record_lock.h"
 #include "src/scope_snapshot.h"
@@ -216,7 +217,7 @@ Status Redis::ZAdd(const Slice& key, const std::vector<ScoreMember>& score_membe
   char score_buf[8];
   uint64_t version = 0;
   std::string meta_value;
-  rocksdb::WriteBatch batch;
+  auto batch = Batch::CreateBatch(this);
   ScopeRecordLock l(lock_mgr_, key);
 
   BaseMetaKey base_meta_key(key);
@@ -250,7 +251,7 @@ Status Redis::ZAdd(const Slice& key, const std::vector<ScoreMember>& score_membe
             continue;
           } else {
             ZSetsScoreKey zsets_score_key(key, version, old_score, sm.member);
-            batch.Delete(handles_[kZsetsScoreCF], zsets_score_key.Encode());
+            batch->Delete(kZsetsScoreCF, zsets_score_key.Encode());
             // delete the old zsets_score_key and overwrite zsets_member_key,
             // but they are in different column families so we accumulate 1
             statistic++;
@@ -263,11 +264,11 @@
         const void* ptr_score = reinterpret_cast<const void*>(&sm.score);
         EncodeFixed64(score_buf, *reinterpret_cast<const uint64_t*>(ptr_score));
         BaseDataValue zsets_member_i_val(Slice(score_buf, sizeof(uint64_t)));
-        batch.Put(handles_[kZsetsDataCF], zsets_member_key.Encode(), zsets_member_i_val.Encode());
+        batch->Put(kZsetsDataCF, zsets_member_key.Encode(), zsets_member_i_val.Encode());
 
         ZSetsScoreKey zsets_score_key(key, version, sm.score, sm.member);
         BaseDataValue zsets_score_i_val(Slice{});
-        batch.Put(handles_[kZsetsScoreCF], zsets_score_key.Encode(), zsets_score_i_val.Encode());
+        batch->Put(kZsetsScoreCF, zsets_score_key.Encode(), zsets_score_i_val.Encode());
         if (not_found) {
           cnt++;
         }
@@ -276,30 +277,30 @@ Status Redis::ZAdd(const Slice& key, const std::vector<ScoreMember>& score_membe
         return Status::InvalidArgument("zset size overflow");
       }
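ZAdd persists a double score by reinterpreting its eight raw bytes as a fixed-width integer before writing the member value. The same bit-level conversion can be written with the well-defined std::memcpy idiom; `ScoreBits` is a hypothetical helper shown only for clarity.

```cpp
#include <cstdint>
#include <cstring>

// Return the raw IEEE-754 bit pattern of `score`; these are the 8 bytes that
// EncodeFixed64 lays down in the zsets data column family.
uint64_t ScoreBits(double score) {
  static_assert(sizeof(uint64_t) == sizeof(double));
  uint64_t bits = 0;
  std::memcpy(&bits, &score, sizeof(bits));
  return bits;
}
```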
parsed_zsets_meta_value.ModifyCount(cnt); - batch.Put(handles_[kZsetsMetaCF], base_meta_key.Encode(), meta_value); + batch->Put(kZsetsMetaCF, base_meta_key.Encode(), meta_value); *ret = cnt; } else if (s.IsNotFound()) { char buf[4]; EncodeFixed32(buf, filtered_score_members.size()); ZSetsMetaValue zsets_meta_value(Slice(buf, sizeof(int32_t))); version = zsets_meta_value.UpdateVersion(); - batch.Put(handles_[kZsetsMetaCF], base_meta_key.Encode(), zsets_meta_value.Encode()); + batch->Put(kZsetsMetaCF, base_meta_key.Encode(), zsets_meta_value.Encode()); for (const auto& sm : filtered_score_members) { ZSetsMemberKey zsets_member_key(key, version, sm.member); const void* ptr_score = reinterpret_cast(&sm.score); EncodeFixed64(score_buf, *reinterpret_cast(ptr_score)); BaseDataValue zsets_member_i_val(Slice(score_buf, sizeof(uint64_t))); - batch.Put(handles_[kZsetsDataCF], zsets_member_key.Encode(), zsets_member_i_val.Encode()); + batch->Put(kZsetsDataCF, zsets_member_key.Encode(), zsets_member_i_val.Encode()); ZSetsScoreKey zsets_score_key(key, version, sm.score, sm.member); BaseDataValue zsets_score_i_val(Slice{}); - batch.Put(handles_[kZsetsScoreCF], zsets_score_key.Encode(), zsets_score_i_val.Encode()); + batch->Put(kZsetsScoreCF, zsets_score_key.Encode(), zsets_score_i_val.Encode()); } *ret = static_cast(filtered_score_members.size()); } else { return s; } - s = db_->Write(default_write_options_, &batch); + s = batch->Commit(); UpdateSpecificKeyStatistics(DataType::kZSets, key.ToString(), statistic); return s; } diff --git a/src/storage/src/storage.cc b/src/storage/src/storage.cc index b46601c8b..342f7aeb7 100644 --- a/src/storage/src/storage.cc +++ b/src/storage/src/storage.cc @@ -4,11 +4,18 @@ // of patent rights can be found in the PATENTS file in the same directory. #include +#include +#include +#include #include +#include +#include "binlog.pb.h" #include "config.h" #include "pstd/log.h" #include "pstd/pikiwidb_slot.h" +#include "pstd/pstd_string.h" +#include "rocksdb/utilities/checkpoint.h" #include "scope_snapshot.h" #include "src/lru_cache.h" #include "src/mutex_impl.h" @@ -20,6 +27,9 @@ #include "storage/storage.h" #include "storage/util.h" +#define PRAFT_SNAPSHOT_META_FILE "__raft_snapshot_meta" +#define SST_FILE_EXTENSION ".sst" + namespace storage { extern std::string BitOpOperate(BitOpType op, const std::vector& src_values, int64_t max_len); class Redis; @@ -61,13 +71,14 @@ Storage::Storage() { } Storage::~Storage() { - bg_tasks_should_exit_ = true; + bg_tasks_should_exit_.store(true); bg_tasks_cond_var_.notify_one(); - - if (is_opened_) { - for (auto& inst : insts_) { - inst.reset(); + if (is_opened_.load()) { + int ret = 0; + if (ret = pthread_join(bg_tasks_thread_id_, nullptr); ret != 0) { + ERROR("pthread_join failed with bgtask thread error : {}", ret); } + insts_.clear(); } } @@ -79,9 +90,49 @@ static std::string AppendSubDirectory(const std::string& db_path, int index) { } } +static int RecursiveLinkAndCopy(const std::filesystem::path& source, const std::filesystem::path& destination) { + if (std::filesystem::is_regular_file(source)) { + if (source.filename() == PRAFT_SNAPSHOT_META_FILE) { + return 0; + } else if (source.extension() == SST_FILE_EXTENSION) { + // Create a hard link + if (::link(source.c_str(), destination.c_str()) != 0) { + WARN("hard link file {} fail", source.string()); + return -1; + } + DEBUG("hard link success! 
source_file = {} , destination_file = {}", source.string(), destination.string()); + } else { + // Copy the file + if (!std::filesystem::copy_file(source, destination, std::filesystem::copy_options::overwrite_existing)) { + WARN("copy file {} fail", source.string()); + return -1; + } + DEBUG("copy success! source_file = {} , destination_file = {}", source.string(), destination.string()); + } + } else { + if (!pstd::FileExists(destination)) { + if (pstd::CreateDir(destination) != 0) { + WARN("create dir {} fail", destination.string()); + return -1; + } + } + + for (const auto& entry : std::filesystem::directory_iterator(source)) { + if (RecursiveLinkAndCopy(entry.path(), destination / entry.path().filename()) != 0) { + return -1; + } + } + } + return 0; +} + Status Storage::Open(const StorageOptions& storage_options, const std::string& db_path) { mkpath(db_path.c_str(), 0755); db_instance_num_ = storage_options.db_instance_num; + // Temporarily set to 100000 + LogIndexAndSequenceCollector::max_gap_.store(storage_options.max_gap); + storage_options.options.write_buffer_manager = + std::make_shared(storage_options.mem_manager_size); for (size_t index = 0; index < db_instance_num_; index++) { insts_.emplace_back(std::make_unique(this, index)); Status s = insts_.back()->Open(storage_options, AppendSubDirectory(db_path, index)); @@ -99,6 +150,119 @@ Status Storage::Open(const StorageOptions& storage_options, const std::string& d return Status::OK(); } +std::vector> Storage::CreateCheckpoint(const std::string& checkpoint_path) { + INFO("DB{} begin to generate a checkpoint to {}", db_id_, checkpoint_path); + // auto source_dir = AppendSubDirectory(checkpoint_path, db_id_); + + std::vector> result; + result.reserve(db_instance_num_); + for (int i = 0; i < db_instance_num_; ++i) { + // In a new thread, create a checkpoint for the specified rocksdb i. + auto res = std::async(std::launch::async, &Storage::CreateCheckpointInternal, this, checkpoint_path, i); + result.push_back(std::move(res)); + } + return result; +} + +Status Storage::CreateCheckpointInternal(const std::string& checkpoint_path, int index) { + auto source_dir = AppendSubDirectory(checkpoint_path, index); + + auto tmp_dir = source_dir + ".tmp"; + // 1) Make sure the temporary directory does not exist + if (!pstd::DeleteDirIfExist(tmp_dir)) { + WARN("DB{}'s RocksDB {} delete directory fail!", db_id_, index); + return Status::IOError("DeleteDirIfExist() fail! dir_name : {} ", tmp_dir); + } + + // 2) Create checkpoint object of this RocksDB + rocksdb::Checkpoint* checkpoint = nullptr; + auto db = insts_[index]->GetDB(); + rocksdb::Status s = rocksdb::Checkpoint::Create(db, &checkpoint); + if (!s.ok()) { + WARN("DB{}'s RocksDB {} create checkpoint object failed!. Error: ", db_id_, index, s.ToString()); + return s; + } + + // 3) Create a checkpoint + std::unique_ptr checkpoint_guard(checkpoint); + s = checkpoint->CreateCheckpoint(tmp_dir, kFlush, nullptr); + if (!s.ok()) { + WARN("DB{}'s RocksDB {} create checkpoint failed!. Error: {}", db_id_, index, s.ToString()); + return s; + } + + // 4) Make sure the source directory does not exist + if (!pstd::DeleteDirIfExist(source_dir)) { + WARN("DB{}'s RocksDB {} delete directory {} fail!", db_id_, index, source_dir); + if (!pstd::DeleteDirIfExist(tmp_dir)) { + WARN("DB{}'s RocksDB {} fail to delete the temporary directory {} ", db_id_, index, tmp_dir); + } + return Status::IOError("DeleteDirIfExist() fail! 
dir_name : {} ", source_dir); + } + + // 5) Rename the temporary directory to source directory + if (auto status = pstd::RenameFile(tmp_dir, source_dir); status != 0) { + WARN("DB{}'s RocksDB {} rename temporary directory {} to source directory {} fail!", db_id_, index, tmp_dir, + source_dir); + if (!pstd::DeleteDirIfExist(tmp_dir)) { + WARN("DB{}'s RocksDB {} fail to delete the rename failed directory {} ", db_id_, index, tmp_dir); + } + return Status::IOError("Rename directory {} fail!", tmp_dir); + } + + INFO("DB{}'s RocksDB {} create checkpoint {} success!", db_id_, index, source_dir); + return Status::OK(); +} + +std::vector> Storage::LoadCheckpoint(const std::string& checkpoint_sub_path, + const std::string& db_sub_path) { + INFO("DB{} begin to load a checkpoint from {} to {}", db_id_, checkpoint_sub_path, db_sub_path); + std::vector> result; + result.reserve(db_instance_num_); + for (int i = 0; i < db_instance_num_; ++i) { + // In a new thread, Load a checkpoint for the specified rocksdb i + auto res = + std::async(std::launch::async, &Storage::LoadCheckpointInternal, this, checkpoint_sub_path, db_sub_path, i); + result.push_back(std::move(res)); + } + return result; +} + +Status Storage::LoadCheckpointInternal(const std::string& checkpoint_sub_path, const std::string& db_sub_path, + int index) { + auto rocksdb_path = AppendSubDirectory(db_sub_path, index); // ./db/db_id/index + auto tmp_rocksdb_path = rocksdb_path + ".tmp"; // ./db/db_id/index.tmp + insts_[index].reset(); + + auto source_dir = AppendSubDirectory(checkpoint_sub_path, index); + // 1) Rename the original db to db.tmp, and only perform the maximum possible recovery of data + // when loading the checkpoint fails. + if (auto status = pstd::RenameFile(rocksdb_path, tmp_rocksdb_path); status != 0) { + WARN("DB{}'s RocksDB {} rename db directory {} to temporary directory {} fail!", db_id_, index, rocksdb_path, + tmp_rocksdb_path); + return Status::IOError("Rename directory {} fail!", rocksdb_path); + } + + // 2) Create a db directory to save the checkpoint. + if (0 != pstd::CreatePath(rocksdb_path)) { + pstd::RenameFile(tmp_rocksdb_path, rocksdb_path); + WARN("DB{}'s RocksDB {} load a checkpoint from {} fail!", db_id_, index, checkpoint_sub_path); + return Status::IOError("Create directory {} fail!", rocksdb_path); + } + if (RecursiveLinkAndCopy(source_dir, rocksdb_path) != 0) { + pstd::DeleteDir(rocksdb_path); + pstd::RenameFile(tmp_rocksdb_path, rocksdb_path); + WARN("DB{}'s RocksDB {} load a checkpoint from {} fail!", db_id_, index, source_dir); + return Status::IOError("recursive link and copy directory {} fail!", rocksdb_path); + } + + // 3) Destroy the db.tmp directory. 
+ if (auto s = rocksdb::DestroyDB(tmp_rocksdb_path, rocksdb::Options()); !s.ok()) { + WARN("Failure to destroy the old DB, path = {}", tmp_rocksdb_path); + } + return Status::OK(); +} + Status Storage::LoadCursorStartKey(const DataType& dtype, int64_t cursor, char* type, std::string* start_key) { std::string index_key = DataTypeTag[dtype] + std::to_string(cursor); std::string index_value; @@ -1923,9 +2087,9 @@ Status Storage::AddBGTask(const BGTask& bg_task) { Status Storage::RunBGTask() { BGTask task; - while (!bg_tasks_should_exit_) { + while (!bg_tasks_should_exit_.load()) { std::unique_lock lock(bg_tasks_mutex_); - bg_tasks_cond_var_.wait(lock, [this]() { return !bg_tasks_queue_.empty() || bg_tasks_should_exit_; }); + bg_tasks_cond_var_.wait(lock, [this]() { return !bg_tasks_queue_.empty() || bg_tasks_should_exit_.load(); }); if (!bg_tasks_queue_.empty()) { task = bg_tasks_queue_.front(); @@ -1933,7 +2097,7 @@ Status Storage::RunBGTask() { } lock.unlock(); - if (bg_tasks_should_exit_) { + if (bg_tasks_should_exit_.load()) { return Status::Incomplete("bgtask return with bg_tasks_should_exit true"); } @@ -2199,4 +2363,49 @@ void Storage::DisableWal(const bool is_wal_disable) { } } +Status Storage::OnBinlogWrite(const pikiwidb::Binlog& log, LogIndex log_idx) { + auto& inst = insts_[log.slot_idx()]; + + rocksdb::WriteBatch batch; + bool is_finished_start = true; + auto seqno = inst->GetDB()->GetLatestSequenceNumber(); + for (const auto& entry : log.entries()) { + if (inst->IsRestarting() && inst->IsApplied(entry.cf_idx(), log_idx)) [[unlikely]] { + // If the starting phase is over, the log must not have been applied + // If the starting phase is not over and the log has been applied, skip it. + WARN("Log {} has been applied", log_idx); + is_finished_start = false; + continue; + } + + switch (entry.op_type()) { + case pikiwidb::OperateType::kPut: { + assert(entry.has_value()); + batch.Put(inst->GetColumnFamilyHandles()[entry.cf_idx()], entry.key(), entry.value()); + } break; + case pikiwidb::OperateType::kDelete: { + assert(!entry.has_value()); + batch.Delete(inst->GetColumnFamilyHandles()[entry.cf_idx()], entry.key()); + } break; + default: + static constexpr std::string_view msg = "Unknown operate type in binlog"; + ERROR(msg); + return Status::Incomplete(msg); + } + inst->UpdateAppliedLogIndexOfColumnFamily(entry.cf_idx(), log_idx, ++seqno); + } + if (inst->IsRestarting() && is_finished_start) [[unlikely]] { + INFO("Redis {} finished start phase", inst->GetIndex()); + inst->StartingPhaseEnd(); + } + auto first_seqno = inst->GetDB()->GetLatestSequenceNumber() + 1; + auto s = inst->GetDB()->Write(inst->GetWriteOptions(), &batch); + if (!s.ok()) { + // TODO(longfar): What we should do if the write operation failed ? 💥 + return s; + } + inst->UpdateLogIndex(log_idx, first_seqno); + return s; +} + } // namespace storage diff --git a/src/storage/src/storage_murmur3.h b/src/storage/src/storage_murmur3.h index 958c5dbf1..f899a86cd 100644 --- a/src/storage/src/storage_murmur3.h +++ b/src/storage/src/storage_murmur3.h @@ -148,4 +148,4 @@ extern } // namespace storage -#endif +#endif \ No newline at end of file diff --git a/src/storage/tests/CMakeLists.txt b/src/storage/tests/CMakeLists.txt new file mode 100644 index 000000000..2c8f258f5 --- /dev/null +++ b/src/storage/tests/CMakeLists.txt @@ -0,0 +1,30 @@ +# Copyright (c) 2023-present, Qihoo, Inc. All rights reserved. +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
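Storage::OnBinlogWrite above is the apply side of the loop: it replays a committed Binlog into the owning rocksdb instance, and during the restart phase it must be idempotent, skipping entries whose log index a column family has already absorbed. The guard condition, isolated into a sketch (`ShouldApply` is a hypothetical helper, built on the IsRestarting/IsApplied accessors this patch adds to Redis):

```cpp
#include <cstdint>

#include "src/redis.h"

// An entry is skipped only while the instance is still replaying its backlog
// after a restart *and* the target column family already contains this index.
bool ShouldApply(const storage::Redis& inst, size_t cf_idx, int64_t log_idx) {
  return !(inst.IsRestarting() && inst.IsApplied(cf_idx, log_idx));
}
```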
An additional grant +# of patent rights can be found in the PATENTS file in the same directory. + +INCLUDE(GoogleTest) + +FILE(GLOB_RECURSE TEST_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/*test.cc") + +FOREACH (TEST_SOURCE ${TEST_SOURCES}) + GET_FILENAME_COMPONENT(TEST_FILENAME ${TEST_SOURCE} NAME) + STRING(REPLACE ".cc" "" TEST_NAME ${TEST_FILENAME}) + + ADD_EXECUTABLE(${TEST_NAME} ${TEST_SOURCE}) + + TARGET_INCLUDE_DIRECTORIES(${TEST_NAME} + PUBLIC storage + PRIVATE ${rocksdb_SOURCE_DIR} + PRIVATE ${rocksdb_SOURCE_DIR}/include + PRIVATE ${BRAFT_INCLUDE_DIR} + PRIVATE ${BRPC_INCLUDE_DIR} + ) + TARGET_LINK_LIBRARIES(${TEST_NAME} + PUBLIC storage + PRIVATE gtest + PRIVATE gtest_main + PRIVATE fmt + ${LIB} + ) +ENDFOREACH() diff --git a/src/storage/tests/flush_oldest_cf_test.cc b/src/storage/tests/flush_oldest_cf_test.cc new file mode 100644 index 000000000..2558dfc11 --- /dev/null +++ b/src/storage/tests/flush_oldest_cf_test.cc @@ -0,0 +1,484 @@ +/* + * Copyright (c) 2024-present, Qihoo, Inc. All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. An additional grant + * of patent rights can be found in the PATENTS file in the same directory. + */ + +#include "gtest/gtest.h" + +#include +#include +#include +#include +#include +#include + +#include "fmt/core.h" +#include "gtest/gtest.h" +#include "rocksdb/db.h" +#include "rocksdb/listener.h" +#include "rocksdb/metadata.h" +#include "rocksdb/options.h" + +#include "pstd/log.h" +#include "pstd/thread_pool.h" +#include "src/log_index.h" +#include "src/redis.h" +#include "storage/storage.h" +#include "storage/util.h" + +class LogIniter { + public: + LogIniter() { + logger::Init("./flush_oldest_cf_test.log"); + spdlog::set_level(spdlog::level::info); + } +}; + +LogIniter log_initer; + +using LogIndex = int64_t; + +class LogQueue : public pstd::noncopyable { + public: + using WriteCallback = std::function; + + explicit LogQueue(WriteCallback&& cb) : write_cb_(std::move(cb)) { consumer_.SetMaxIdleThread(1); } + + void AppendLog(const pikiwidb::Binlog& log, std::promise&& promise) { + auto task = [&] { + auto idx = next_log_idx_.fetch_add(1); + auto s = write_cb_(log, idx); + promise.set_value(s); + }; + consumer_.ExecuteTask(std::move(task)); + } + + private: + WriteCallback write_cb_ = nullptr; + pstd::ThreadPool consumer_; + std::atomic next_log_idx_{1}; +}; + +class FlushOldestCFTest : public ::testing::Test { + public: + FlushOldestCFTest() + : log_queue_([this](const pikiwidb::Binlog& log, LogIndex log_idx) { return db_.OnBinlogWrite(log, log_idx); }) { + options_.options.create_if_missing = true; + options_.options.max_background_jobs = 10; + options_.db_instance_num = 1; + options_.raft_timeout_s = 9000000; + options_.append_log_function = [this](const pikiwidb::Binlog& log, std::promise&& promise) { + log_queue_.AppendLog(log, std::move(promise)); + }; + options_.do_snapshot_function = [](int64_t log_index, bool sync) {}; + options_.max_gap = 15; + write_options_.disableWAL = true; + } + + ~FlushOldestCFTest() { rocksdb::DestroyDB(db_path_, rocksdb::Options()); } + + void SetUp() override { + if (access(db_path_.c_str(), F_OK) == 0) { + std::filesystem::remove_all(db_path_.c_str()); + } + mkdir(db_path_.c_str(), 0755); + auto s = db_.Open(options_, db_path_); + ASSERT_TRUE(s.ok()); + } + + std::string db_path_{"./test_db/flush_oldest_cf_test"}; + storage::StorageOptions options_; + storage::Storage db_; + uint32_t test_times_ = 100; + std::string 
key_ = "flush-oldest-cf-test"; + std::string key_prefix = "key_"; + std::string field_prefix_ = "field_"; + std::string value_prefix_ = "value_"; + rocksdb::WriteOptions write_options_; + rocksdb::ReadOptions read_options_; + LogQueue log_queue_; +}; + +TEST_F(FlushOldestCFTest, SimpleTest) { + const auto& rocksdb = db_.GetDBInstance(key_); + + auto add_kvs = [&](int start, int end) { + for (int i = start; i < end; i++) { + auto key = key_prefix + std::to_string(i); + auto v = value_prefix_ + std::to_string(i); + auto s = rocksdb->Set(key, v); + ASSERT_TRUE(s.ok()); + } + }; + + auto add_hash = [&](int start, int end) { + for (int i = start; i < end; i++) { + auto key = key_prefix + std::to_string(i); + auto v = value_prefix_ + std::to_string(i); + auto f = field_prefix_ + std::to_string(i); + int32_t res{}; + auto s = rocksdb->HSet(key, v, f, &res); + ASSERT_TRUE(s.ok()); + ASSERT_EQ(res, 1); + } + }; + + auto flush_cf = [&](size_t cf) { + auto s = rocksdb->GetDB()->Flush(rocksdb::FlushOptions(), rocksdb->GetColumnFamilyHandles()[cf]); + ASSERT_TRUE(s.ok()); + }; + + { + // type kv kv + // entry [1:1] -> ... [10:10] + // + // cf flushed_log_index flushed_sequence_number applied_log_index applied_sequence_number + // 0 0 0 10 10 + // other 0 0 0 0 + // + // last_flush_index log_index sequencenumber + // 0 0 + add_kvs(0, 10); + auto& last_flush_index = rocksdb->GetLogIndexOfColumnFamilies().GetLastFlushIndex(); + ASSERT_EQ(last_flush_index.log_index.load(), 0); + ASSERT_EQ(last_flush_index.seqno.load(), 0); + + auto& cf_0_status = rocksdb->GetLogIndexOfColumnFamilies().GetCFStatus(storage::kStringsCF); + ASSERT_EQ(cf_0_status.flushed_index.log_index, 0); + ASSERT_EQ(cf_0_status.flushed_index.seqno, 0); + ASSERT_EQ(cf_0_status.applied_index.log_index, 10); + ASSERT_EQ(cf_0_status.applied_index.seqno, 10); + + auto [smallest_applied_log_index_cf, smallest_applied_log_index, smallest_flushed_log_index_cf, + smallest_flushed_log_index, smallest_flushed_seqno] = + rocksdb->GetLogIndexOfColumnFamilies().GetSmallestLogIndex(-1); + + ASSERT_EQ(smallest_flushed_log_index, 0); + ASSERT_EQ(smallest_flushed_seqno, 0); + ASSERT_EQ(smallest_applied_log_index, 10); + auto size = rocksdb->GetCollector().GetSize(); + ASSERT_EQ(size, 10); + } + + { + // type kv kv hash hash hash + // entry [1:1] -> ... [10:10] -> [11:11] -> [12:13] -> ... 
+  {
+    // type   kv           kv         hash       hash              hash
+    // entry  [1:1] -> ... [10:10] -> [11:11] -> [12:13] -> ... -> [30:49]
+    //
+    // cf     flushed_log_index  flushed_sequence_number  applied_log_index  applied_sequence_number
+    // 0      0                  0                        10                 10
+    // 1      0                  0                        30                 49
+    // 2      0                  0                        30                 50
+    // other  0                  0                        0                  0
+    //
+    // last_flush_index  log_index  sequence_number
+    //                   0          0
+    add_hash(10, 30);
+    auto& last_flush_index = rocksdb->GetLogIndexOfColumnFamilies().GetLastFlushIndex();
+    ASSERT_EQ(last_flush_index.log_index.load(), 0);
+    ASSERT_EQ(last_flush_index.seqno.load(), 0);
+
+    auto& cf_1_status = rocksdb->GetLogIndexOfColumnFamilies().GetCFStatus(storage::kHashesMetaCF);
+    ASSERT_EQ(cf_1_status.flushed_index.log_index, 0);
+    ASSERT_EQ(cf_1_status.flushed_index.seqno, 0);
+    ASSERT_EQ(cf_1_status.applied_index.log_index, 30);
+    ASSERT_EQ(cf_1_status.applied_index.seqno, 49);
+
+    auto& cf_2_status = rocksdb->GetLogIndexOfColumnFamilies().GetCFStatus(storage::kHashesDataCF);
+    ASSERT_EQ(cf_2_status.flushed_index.log_index, 0);
+    ASSERT_EQ(cf_2_status.flushed_index.seqno, 0);
+    ASSERT_EQ(cf_2_status.applied_index.log_index, 30);
+    ASSERT_EQ(cf_2_status.applied_index.seqno, 50);
+
+    auto [smallest_applied_log_index_cf, smallest_applied_log_index, smallest_flushed_log_index_cf,
+          smallest_flushed_log_index, smallest_flushed_seqno] =
+        rocksdb->GetLogIndexOfColumnFamilies().GetSmallestLogIndex(-1);
+
+    ASSERT_EQ(smallest_flushed_log_index, 0);
+    ASSERT_EQ(smallest_flushed_seqno, 0);
+    ASSERT_EQ(smallest_applied_log_index, 10);
+
+    auto size = rocksdb->GetCollector().GetSize();
+    ASSERT_EQ(size, 30);
+
+    auto is_pending_flush = rocksdb->GetCollector().IsFlushPending();
+    ASSERT_TRUE(is_pending_flush);
+  }
+
+  {
+    // type   kv           kv         hash       hash              hash
+    // entry  [1:1] -> ... [10:10] -> [11:11] -> [12:13] -> ... -> [30:49]
+    auto cur_par = rocksdb->GetCollector().GetList().begin();
+    auto logindex = 1;
+    auto seq = 1;
+    for (int i = 1; i <= 10; i++) {
+      ASSERT_EQ(cur_par->GetAppliedLogIndex(), logindex);
+      ASSERT_EQ(cur_par->GetSequenceNumber(), seq);
+      cur_par = std::next(cur_par);
+      logindex++;
+      seq++;
+    }
+
+    for (int i = 11; i <= 30; i++) {
+      ASSERT_EQ(cur_par->GetAppliedLogIndex(), logindex);
+      ASSERT_EQ(cur_par->GetSequenceNumber(), seq);
+      seq += 2;
+      logindex++;
+      cur_par = std::next(cur_par);
+    }
+  }
+
+  {
+    // type   kv           kv         hash       hash              hash
+    // entry  [1:1] -> ... [10:10] -> [11:11] -> [12:13] -> ... -> [30:49]
+    //
+    // cf     flushed_log_index  flushed_sequence_number  applied_log_index  applied_sequence_number
+    // 0      0                  0                        10                 10
+    // 1      0                  0                        30                 49
+    // 2      0                  0                        30                 50
+    // other  0                  0                        0                  0
+    //
+    // last_flush_index  log_index  sequence_number
+    //                   0          0
+
+    auto gap = rocksdb->GetLogIndexOfColumnFamilies().GetPendingFlushGap();
+    ASSERT_EQ(gap, 30);
+    flush_cf(1);
+    sleep(5);  // wait for the flush to complete.
+    // 1) cf 1's latest SequenceNumber is 49, which maps to log index 30, so cf 1's
+    //    flushed_log_index and flushed_sequence_number are set to 30 and 49.
+    //
+    // type   kv           kv         hash       hash              hash
+    // entry  [1:1] -> ... [10:10] -> [11:11] -> [12:13] -> ... -> [30:49]
+    //
+    // cf     flushed_log_index  flushed_sequence_number  applied_log_index  applied_sequence_number
+    // 0      0                  0                        10                 10
+    // 1      30                 49                       30                 49
+    // 2      0                  0                        30                 50
+    // other  0                  0                        0                  0
+    //
+    // last_flush_index  log_index  sequence_number
+    //                   0          0
+
+    // 2) At this point smallest_applied_log_index_cf = 0, smallest_applied_log_index = 10,
+    //    smallest_flushed_log_index_cf = 0, smallest_flushed_log_index = 0, smallest_flushed_seqno = 0.
+    //    Driven by smallest_applied_log_index = 10, entries with log_index < 10 keep being removed
+    //    as long as the queue length stays >= 2.
+    //
+    // type   kv          hash       hash              hash
+    // entry  [10:10] -> [11:11] -> [12:13] -> ... -> [30:49]
+    //
+    // cf     flushed_log_index  flushed_sequence_number  applied_log_index  applied_sequence_number
+    // 0      0                  0                        10                 10
+    // 1      30                 49                       30                 49
+    // 2      0                  0                        30                 50
+    // other  0                  0                        0                  0
+    //
+    // last_flush_index  log_index  sequence_number
+    //                   0          0
+    // 3) From smallest_flushed_log_index_cf = 0, smallest_flushed_log_index = 0 and
+    //    smallest_flushed_seqno = 0, last_flush_index is set to (0, 0).
+    //
+    // type   kv          hash       hash              hash
+    // entry  [10:10] -> [11:11] -> [12:13] -> ... -> [30:49]
+    //
+    // cf     flushed_log_index  flushed_sequence_number  applied_log_index  applied_sequence_number
+    // 0      0                  0                        10                 10
+    // 1      30                 49                       30                 49
+    // 2      0                  0                        30                 50
+    // other  0                  0                        0                  0
+    //
+    // last_flush_index  log_index  sequence_number
+    //                   0          0

+    // 4) The largest log-index gap in the queue still exceeds the threshold, which triggers a flush
+    //    of smallest_flushed_log_index_cf, cf 0 in this case. cf 0's latest SequenceNumber is 10,
+    //    which maps to log index 10, so cf 0's flushed_log_index and flushed_sequence_number are
+    //    set to 10 and 10.
+    //
+    // type   kv          hash       hash              hash
+    // entry  [10:10] -> [11:11] -> [12:13] -> ... -> [30:49]
+    //
+    // cf     flushed_log_index  flushed_sequence_number  applied_log_index  applied_sequence_number
+    // 0      10                 10                       10                 10
+    // 1      30                 49                       30                 49
+    // 2      0                  0                        30                 50
+    // other  0                  0                        0                  0
+    //
+    // last_flush_index  log_index  sequence_number
+    //                   0          0

+    // 5) Now smallest_applied_log_index_cf = 0, smallest_applied_log_index = 10,
+    //    smallest_flushed_log_index_cf = 2, smallest_flushed_log_index = 0, smallest_flushed_seqno = 0.
+    //    Removing entries with log_index < 10 (with queue length >= 2) changes nothing.
+    //
+    // type   kv          hash       hash              hash
+    // entry  [10:10] -> [11:11] -> [12:13] -> ... -> [30:49]
+    //
+    // cf     flushed_log_index  flushed_sequence_number  applied_log_index  applied_sequence_number
+    // 0      10                 10                       10                 10
+    // 1      30                 49                       30                 49
+    // 2      0                  0                        30                 50
+    // other  0                  0                        0                  0
+    //
+    // last_flush_index  log_index  sequence_number
+    //                   0          0

+    // 6) The gap still exceeds the threshold, so smallest_flushed_log_index_cf is flushed again,
+    //    cf 2 this time. cf 2's latest SequenceNumber is 50, which maps to log index 30, so cf 2's
+    //    flushed_log_index and flushed_sequence_number are set to 30 and 50.
+    //
+    // type   kv          hash       hash              hash
+    // entry  [10:10] -> [11:11] -> [12:13] -> ... -> [30:49]
+    //
+    // cf     flushed_log_index  flushed_sequence_number  applied_log_index  applied_sequence_number
+    // 0      10                 10                       10                 10
+    // 1      30                 49                       30                 49
+    // 2      30                 50                       30                 50
+    // other  0                  0                        0                  0
+    //
+    // last_flush_index  log_index  sequence_number
+    //                   0          0

+    // 7) Now smallest_applied_log_index_cf = 2, smallest_applied_log_index = 30,
+    //    smallest_flushed_log_index_cf = 2, smallest_flushed_log_index = 30, smallest_flushed_seqno = 50.
+    //    Driven by smallest_applied_log_index = 30, entries with log_index < 30 are removed while
+    //    the queue length stays >= 2.
+    //
+    // type   hash
+    // entry  [30:49]
+    //
+    // cf     flushed_log_index  flushed_sequence_number  applied_log_index  applied_sequence_number
+    // 0      10                 10                       10                 10
+    // 1      30                 49                       30                 49
+    // 2      30                 50                       30                 50
+    // other  0                  0                        0                  0
+    //
+    // last_flush_index  log_index  sequence_number
+    //                   0          0

+    // 8) From smallest_flushed_log_index_cf = 2, smallest_flushed_log_index = 30 and
+    //    smallest_flushed_seqno = 50, last_flush_index is set to (30, 50).
+    //
+    // type   hash
+    // entry  [30:49]
+    //
+    // cf     flushed_log_index  flushed_sequence_number  applied_log_index  applied_sequence_number
+    // 0      10                 10                       10                 10
+    // 1      30                 49                       30                 49
+    // 2      30                 50                       30                 50
+    // other  0                  0                        0                  0
+    //
+    // last_flush_index  log_index  sequence_number
+    //                   30         50
+    // 9) Setting last_flush_index to (30, 50) also raises the flushed_index of every column family
+    //    that holds no unflushed data. Here cf 0's flushed_index goes from (10, 10) to (30, 50),
+    //    cf 1's goes from (30, 49) to (30, 50),
+    //    and the flushed_index of every cf that was never written goes from (0, 0) to (30, 50).
+    //
+    // type   hash
+    // entry  [30:49]
+    //
+    // cf     flushed_log_index  flushed_sequence_number  applied_log_index  applied_sequence_number
+    // 0      30                 50                       10                 10
+    // 1      30                 50                       30                 49
+    // 2      30                 50                       30                 50
+    // other  30                 50                       0                  0
+    //
+    // last_flush_index  log_index  sequence_number
+    //                   30         50

+    // 10) The pending gap in the queue no longer exceeds the threshold, so the flush loop stops.
+    auto after_flush_size = rocksdb->GetCollector().GetSize();
+    ASSERT_EQ(after_flush_size, 1);
+
+    auto& cf_0_status = rocksdb->GetLogIndexOfColumnFamilies().GetCFStatus(storage::kStringsCF);
+    ASSERT_EQ(cf_0_status.flushed_index.log_index, 30);
+    ASSERT_EQ(cf_0_status.flushed_index.seqno, 50);
+    ASSERT_EQ(cf_0_status.applied_index.log_index, 10);
+    ASSERT_EQ(cf_0_status.applied_index.seqno, 10);
+
+    auto& cf_1_status = rocksdb->GetLogIndexOfColumnFamilies().GetCFStatus(storage::kHashesMetaCF);
+    ASSERT_EQ(cf_1_status.flushed_index.log_index, 30);
+    ASSERT_EQ(cf_1_status.flushed_index.seqno, 50);
+    ASSERT_EQ(cf_1_status.applied_index.log_index, 30);
+    ASSERT_EQ(cf_1_status.applied_index.seqno, 49);
+
+    auto& cf_2_status = rocksdb->GetLogIndexOfColumnFamilies().GetCFStatus(storage::kHashesDataCF);
+    ASSERT_EQ(cf_2_status.flushed_index.log_index, 30);
+    ASSERT_EQ(cf_2_status.flushed_index.seqno, 50);
+    ASSERT_EQ(cf_2_status.applied_index.log_index, 30);
+    ASSERT_EQ(cf_2_status.applied_index.seqno, 50);
+
+    auto& last_flush_index = rocksdb->GetLogIndexOfColumnFamilies().GetLastFlushIndex();
+    ASSERT_EQ(last_flush_index.log_index.load(), 30);
+    ASSERT_EQ(last_flush_index.seqno.load(), 50);
+
+    auto& cf_3_status = rocksdb->GetLogIndexOfColumnFamilies().GetCFStatus(storage::kSetsMetaCF);
+    ASSERT_EQ(cf_3_status.flushed_index.log_index, 30);
+    ASSERT_EQ(cf_3_status.flushed_index.seqno, 50);
+    ASSERT_EQ(cf_3_status.applied_index.log_index, 0);
+    ASSERT_EQ(cf_3_status.applied_index.seqno, 0);
+  }
+
+  {
+    add_kvs(30, 35);
+    // type   hash       kv                 kv
+    // entry  [30:49] -> [31:51] -> ... -> [35:55]
+    //
+    // cf     flushed_log_index  flushed_sequence_number  applied_log_index  applied_sequence_number
+    // 0      30                 50                       35                 55
+    // 1      30                 50                       30                 49
+    // 2      30                 50                       30                 50
+    // other  30                 50                       0                  0
+    //
+    // last_flush_index  log_index  sequence_number
+    //                   30         50
+    auto& last_flush_index = rocksdb->GetLogIndexOfColumnFamilies().GetLastFlushIndex();
+    ASSERT_EQ(last_flush_index.log_index.load(), 30);
+    ASSERT_EQ(last_flush_index.seqno.load(), 50);
+
+    auto& cf_0_status = rocksdb->GetLogIndexOfColumnFamilies().GetCFStatus(storage::kStringsCF);
+    ASSERT_EQ(cf_0_status.flushed_index.log_index, 30);
+    ASSERT_EQ(cf_0_status.flushed_index.seqno, 50);
+    ASSERT_EQ(cf_0_status.applied_index.log_index, 35);
+    ASSERT_EQ(cf_0_status.applied_index.seqno, 55);
+
+    auto& cf_1_status = rocksdb->GetLogIndexOfColumnFamilies().GetCFStatus(storage::kHashesMetaCF);
+    ASSERT_EQ(cf_1_status.flushed_index.log_index, 30);
+    ASSERT_EQ(cf_1_status.flushed_index.seqno, 50);
+    ASSERT_EQ(cf_1_status.applied_index.log_index, 30);
+    ASSERT_EQ(cf_1_status.applied_index.seqno, 49);
+
+    auto& cf_2_status = rocksdb->GetLogIndexOfColumnFamilies().GetCFStatus(storage::kHashesDataCF);
+    ASSERT_EQ(cf_2_status.flushed_index.log_index, 30);
+    ASSERT_EQ(cf_2_status.flushed_index.seqno, 50);
+    ASSERT_EQ(cf_2_status.applied_index.log_index, 30);
+    ASSERT_EQ(cf_2_status.applied_index.seqno, 50);
+
+    auto& cf_3_status = rocksdb->GetLogIndexOfColumnFamilies().GetCFStatus(storage::kSetsMetaCF);
+    ASSERT_EQ(cf_3_status.flushed_index.log_index, 30);
+    ASSERT_EQ(cf_3_status.flushed_index.seqno, 50);
+    ASSERT_EQ(cf_3_status.applied_index.log_index, 0);
+    ASSERT_EQ(cf_3_status.applied_index.seqno, 0);
+
+    auto [smallest_applied_log_index_cf, smallest_applied_log_index, smallest_flushed_log_index_cf,
+          smallest_flushed_log_index, smallest_flushed_seqno] =
+        rocksdb->GetLogIndexOfColumnFamilies().GetSmallestLogIndex(-1);
+
+    // Apart from cf 0, no column family has any unflushed data, so the others are excluded
+    // from these statistics.
+    ASSERT_EQ(smallest_applied_log_index_cf, 0);
+    ASSERT_EQ(smallest_applied_log_index, 35);
+
+    ASSERT_EQ(smallest_flushed_log_index_cf, 0);
+    ASSERT_EQ(smallest_flushed_log_index, 30);
+    ASSERT_EQ(smallest_flushed_seqno, 50);
+
+    auto size = rocksdb->GetCollector().GetSize();
+    ASSERT_EQ(size, 6);
+
+    auto is_pending_flush = rocksdb->GetCollector().IsFlushPending();
+    ASSERT_FALSE(is_pending_flush);
+  }
+}
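Editor's note: the walkthrough above can be condensed into a small model of the per-column-family bookkeeping. The names below are hypothetical (the real logic lives in src/storage/src/log_index.cc); this is only a sketch of the invariant the test asserts: a queue entry may be purged once every column family has either flushed past it or never saw it.

// Sketch: per-CF applied/flushed (log index, seqno) pairs.
#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>

struct IndexPair {
  int64_t log_index = 0;
  uint64_t seqno = 0;
};

template <size_t kCFCount>
struct CFIndexTable {
  std::array<IndexPair, kCFCount> applied;
  std::array<IndexPair, kCFCount> flushed;

  // Called when CF `cf` finishes a flush that persisted everything
  // up to (log_index, seqno).
  void OnFlushDone(size_t cf, IndexPair up_to) { flushed[cf] = up_to; }

  // The purge bound: only CFs that have seen writes participate,
  // mirroring "other cfs are excluded from these statistics" above.
  int64_t SmallestAppliedLogIndex() const {
    int64_t smallest = INT64_MAX;
    for (const auto& p : applied) {
      if (p.log_index > 0) smallest = std::min(smallest, p.log_index);
    }
    return smallest == INT64_MAX ? 0 : smallest;
  }
};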
diff --git a/src/storage/tests/log_index_collector_test.cc b/src/storage/tests/log_index_collector_test.cc
new file mode 100644
index 000000000..6646b5447
--- /dev/null
+++ b/src/storage/tests/log_index_collector_test.cc
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2024-present, Qihoo, Inc.  All rights reserved.
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree. An additional grant
+ * of patent rights can be found in the PATENTS file in the same directory.
+ */
+
+#include <atomic>
+#include <chrono>
+#include <cstddef>
+#include <cstdint>
+#include <string_view>
+
+#include "fmt/core.h"
+#include "gtest/gtest.h"
+
+#include "src/log_index.h"
+
+using namespace storage;  // NOLINT
+
+template <typename T, T STEP = 1>
+class NumberCreator {
+ public:
+  explicit NumberCreator(T start = 0) : next_num_(start) {}
+  auto Next() -> T { return next_num_.fetch_add(STEP); }
+
+ private:
+  std::atomic<T> next_num_;
+};
+using SequenceNumberCreator = NumberCreator<rocksdb::SequenceNumber, 2>;
+using LogIndexCreator = NumberCreator<LogIndex>;
+
+TEST(LogIndexAndSequenceCollectorTest, OneStepTest) {  // NOLINT
+  LogIndexAndSequenceCollector collector;
+  SequenceNumberCreator seqno_creator(100);
+  LogIndexCreator logidx_creator(4);
+  for (int i = 0; i < 100; i++) {
+    collector.Update(logidx_creator.Next(), seqno_creator.Next());
+  }
+
+  // the target seqno is smaller than the smallest seqno in the list, should return 0
+  for (rocksdb::SequenceNumber seq = 0; seq < 100; seq++) {
+    EXPECT_EQ(collector.FindAppliedLogIndex(seq), 0);
+  }
+  // the target seqno is in the list's range, should return the correct idx
+  for (rocksdb::SequenceNumber seq = 100; seq < 300; seq++) {
+    EXPECT_EQ(collector.FindAppliedLogIndex(seq), (seq - 100) / 2 + 4);
+  }
+  // the target seqno is larger than the largest seqno in the list, should return the largest idx
+  for (rocksdb::SequenceNumber seq = 300; seq < 400; seq++) {
+    EXPECT_EQ(collector.FindAppliedLogIndex(seq), 103);
+  }
+
+  // if the smallest flushed log index is 44, whose seqnos are 180 and 181
+  collector.Purge(44);
+  for (rocksdb::SequenceNumber seq = 0; seq < 180; seq++) {
+    EXPECT_EQ(collector.FindAppliedLogIndex(seq), 0);
+  }
+  for (rocksdb::SequenceNumber seq = 180; seq < 300; seq++) {
+    EXPECT_EQ(collector.FindAppliedLogIndex(seq), (seq - 100) / 2 + 4);
+  }
+  for (rocksdb::SequenceNumber seq = 300; seq < 400; seq++) {
+    EXPECT_EQ(collector.FindAppliedLogIndex(seq), 103);
+  }
+  collector.Purge(46);  // should remove log 44 and log 45
+  for (rocksdb::SequenceNumber seq = 0; seq < 184; seq++) {
+    EXPECT_EQ(collector.FindAppliedLogIndex(seq), 0);
+  }
+  for (rocksdb::SequenceNumber seq = 184; seq < 300; seq++) {
+    EXPECT_EQ(collector.FindAppliedLogIndex(seq), (seq - 100) / 2 + 4);
+  }
+  for (rocksdb::SequenceNumber seq = 300; seq < 400; seq++) {
+    EXPECT_EQ(collector.FindAppliedLogIndex(seq), 103);
+  }
+}
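Editor's note: the test above pins down the lookup contract: given a sequence number, return the largest recorded log index whose first seqno is <= the target, and 0 if the target precedes everything still retained. A minimal sketch of that contract (hypothetical container; the real collector guards its list for concurrent readers):

#include <cstdint>
#include <deque>

struct Pair {
  int64_t log_index;
  uint64_t seqno;  // first sequence number produced by this log entry
};

// Return the largest log index whose seqno <= target, or 0 if none.
// `list` is kept in ascending seqno (and log index) order.
int64_t FindAppliedLogIndex(const std::deque<Pair>& list, uint64_t target) {
  int64_t res = 0;
  for (const auto& p : list) {
    if (p.seqno > target) break;
    res = p.log_index;
  }
  return res;
}

// Purge drops pairs below the smallest flushed log index, so earlier
// seqnos now resolve to 0, exactly as the Purge(44)/Purge(46) checks expect.
void Purge(std::deque<Pair>& list, int64_t smallest_flushed_log_index) {
  while (!list.empty() && list.front().log_index < smallest_flushed_log_index) {
    list.pop_front();
  }
}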
+TEST(LogIndexAndSequenceCollectorTest, MultiStepTest) {  // NOLINT
+  SequenceNumberCreator seqno_creator(100);
+  LogIndexCreator logidx_creator(4);
+  LogIndexAndSequenceCollector collector(2);  // update only when the log index is a multiple of 4
+  for (int i = 0; i < 100; i++) {
+    collector.Update(logidx_creator.Next(), seqno_creator.Next());
+  }
+
+  // the target seqno is smaller than the smallest seqno in the list, should return 0
+  for (rocksdb::SequenceNumber seq = 0; seq < 100; seq++) {
+    EXPECT_EQ(collector.FindAppliedLogIndex(seq), 0);
+  }
+  // the target seqno is in the list's range, should return the correct idx
+  for (rocksdb::SequenceNumber seq = 100; seq < 300; seq++) {
+    EXPECT_EQ(collector.FindAppliedLogIndex(seq), (seq - 100) / 8 * 4 + 4);
+  }
+  // the target seqno is larger than the largest seqno in the list, should return the largest idx
+  for (rocksdb::SequenceNumber seq = 300; seq < 400; seq++) {
+    EXPECT_EQ(collector.FindAppliedLogIndex(seq), 100);
+  }
+
+  // if the smallest flushed log index is 44, whose seqnos are 180 and 181
+  collector.Purge(44);
+  for (rocksdb::SequenceNumber seq = 0; seq < 180; seq++) {
+    EXPECT_EQ(collector.FindAppliedLogIndex(seq), 0);
+  }
+  for (rocksdb::SequenceNumber seq = 180; seq < 300; seq++) {
+    EXPECT_EQ(collector.FindAppliedLogIndex(seq), (seq - 100) / 8 * 4 + 4);
+  }
+  for (rocksdb::SequenceNumber seq = 300; seq < 400; seq++) {
+    EXPECT_EQ(collector.FindAppliedLogIndex(seq), 100);
+  }
+  collector.Purge(45);  // should do nothing
+  for (rocksdb::SequenceNumber seq = 0; seq < 180; seq++) {
+    EXPECT_EQ(collector.FindAppliedLogIndex(seq), 0);
+  }
+  for (rocksdb::SequenceNumber seq = 180; seq < 300; seq++) {
+    EXPECT_EQ(collector.FindAppliedLogIndex(seq), (seq - 100) / 8 * 4 + 4);
+  }
+  for (rocksdb::SequenceNumber seq = 300; seq < 400; seq++) {
+    EXPECT_EQ(collector.FindAppliedLogIndex(seq), 100);
+  }
+  collector.Purge(49);  // should remove log 44
+  for (rocksdb::SequenceNumber seq = 0; seq < 188; seq++) {
+    EXPECT_EQ(collector.FindAppliedLogIndex(seq), 0);
+  }
+  for (rocksdb::SequenceNumber seq = 188; seq < 300; seq++) {
+    EXPECT_EQ(collector.FindAppliedLogIndex(seq), (seq - 100) / 8 * 4 + 4);
+  }
+  for (rocksdb::SequenceNumber seq = 300; seq < 400; seq++) {
+    EXPECT_EQ(collector.FindAppliedLogIndex(seq), 100);
+  }
+}
+
+struct TimerGuard {
+  TimerGuard(std::string_view name = "Test") : name_(name), start_(std::chrono::system_clock::now()) {}
+  ~TimerGuard() {
+    auto end = std::chrono::system_clock::now();
+    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start_);
+    fmt::println("{} cost {}ms", name_, duration.count());
+  }
+
+  std::string_view name_;
+  std::chrono::time_point<std::chrono::system_clock> start_;
+};
+
+TEST(LogIndexAndSequenceCollectorTest, FindBenchmark) {
+  LogIndexAndSequenceCollector collector;
+  SequenceNumberCreator seq_creator(1);
+  LogIndexCreator log_creator(4);
+  size_t size = 0;
+  {
+    for (; size < 100; size++) {
+      collector.Update(log_creator.Next(), seq_creator.Next());
+    }
+    // There are 100 pairs in the collector: 1:4, 3:5, 5:6, 7:7, 9:8, ..., 199:103
+    constexpr int kFindTimes = 100;
+    TimerGuard timer("100 size test");
+    for (int i = 0; i < kFindTimes; i++) {
+      for (int n = 1; n <= 200; n++) {
+        auto res = collector.FindAppliedLogIndex(n);
+        ASSERT_EQ(res, (n - 1) / 2 + 4);
+      }
+    }
+  }
+  {
+    for (; size < 1000; size++) {
+      collector.Update(log_creator.Next(), seq_creator.Next());
+    }
+    // There are 1000 pairs in the collector: 1:4, 3:5, 5:6, 7:7, 9:8, ..., 1999:1003
+    constexpr int kFindTimes = 100;
+    TimerGuard timer("1000 size test");
+    for (int i = 0; i < kFindTimes; i++) {
+      for (int n = 1; n <= 2000; n++) {
+        auto res = collector.FindAppliedLogIndex(n);
+        ASSERT_EQ(res, (n - 1) / 2 + 4);
+      }
+    }
+  }
+}
diff --git a/src/storage/tests/log_index_test.cc b/src/storage/tests/log_index_test.cc
new file mode 100644
index 000000000..54e656979
--- /dev/null
+++ b/src/storage/tests/log_index_test.cc
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2024-present, Qihoo, Inc.  All rights reserved.
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree. An additional grant
+ * of patent rights can be found in the PATENTS file in the same directory.
+ */
+
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <atomic>
+#include <cassert>
+#include <filesystem>
+#include <functional>
+#include <future>
+#include <random>
+
+#include "fmt/core.h"
+#include "gtest/gtest.h"
+#include "rocksdb/db.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/metadata.h"
+#include "rocksdb/options.h"
+
+#include "pstd/log.h"
+#include "pstd/thread_pool.h"
+#include "src/log_index.h"
+#include "src/redis.h"
+#include "storage/storage.h"
+#include "storage/util.h"
+
+using namespace storage;  // NOLINT
+
+class LogIniter {
+ public:
+  LogIniter() {
+    logger::Init("./log_index_test.log");
+    spdlog::set_level(spdlog::level::info);
+  }
+};
+static LogIniter initer;
+
+TEST(TablePropertyTest, SimpleTest) {
+  constexpr const char* kDbPath = "./log_index_test_db";
+  rocksdb::Options options;
+  options.create_if_missing = true;
+  LogIndexAndSequenceCollector collector;
+  options.table_properties_collector_factories.push_back(
+      std::make_shared<LogIndexTablePropertiesCollectorFactory>(collector));
+  rocksdb::DB* db{nullptr};
+  auto s = rocksdb::DB::Open(options, kDbPath, &db);
+  EXPECT_TRUE(s.ok());
+
+  std::string key = "table-property-test";
+  s = db->Put(rocksdb::WriteOptions(), key, key);
+  EXPECT_TRUE(s.ok());
+  std::string res;
+  s = db->Get(rocksdb::ReadOptions(), key, &res);
+  EXPECT_TRUE(s.ok());
+  EXPECT_EQ(key, res);
+  collector.Update(233333, db->GetLatestSequenceNumber());
+  db->Flush(rocksdb::FlushOptions());
+
+  rocksdb::TablePropertiesCollection properties;
+  s = db->GetPropertiesOfAllTables(&properties);
+  EXPECT_TRUE(s.ok());
+  EXPECT_TRUE(properties.size() == 1);
+  for (auto& [name, prop] : properties) {
+    const auto& user_props = prop->user_collected_properties;
+    auto it = user_props.find(static_cast<std::string>(LogIndexTablePropertiesCollector::kPropertyName));
+    EXPECT_NE(it, user_props.cend());
+    EXPECT_EQ(it->second, "233333/" + std::to_string(db->GetLatestSequenceNumber()));
+  }
+
+  db->Close();
+  DeleteFiles(kDbPath);
+}
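Editor's note: TablePropertyTest above relies on each SST file carrying a "<log index>/<seqno>" string in its user-collected properties. A minimal sketch of how such a collector can be written against the standard RocksDB interface follows; the class and property names here are illustrative, not the ones this PR adds (those are LogIndexTablePropertiesCollector and its factory in src/storage/src/log_index.h):

#include <cstdint>
#include <string>

#include "rocksdb/table_properties.h"

// Sketch: persist "<log_index>/<seqno>" into each SST's user properties.
// The real collector snapshots the in-memory collector's latest pair at
// flush time; here the pair is simply injected through the constructor.
class PairPropertyCollector : public rocksdb::TablePropertiesCollector {
 public:
  PairPropertyCollector(int64_t log_index, uint64_t seqno)
      : log_index_(log_index), seqno_(seqno) {}

  rocksdb::Status AddUserKey(const rocksdb::Slice&, const rocksdb::Slice&,
                             rocksdb::EntryType, rocksdb::SequenceNumber,
                             uint64_t) override {
    return rocksdb::Status::OK();  // nothing per-key to track in this sketch
  }

  rocksdb::Status Finish(rocksdb::UserCollectedProperties* properties) override {
    properties->emplace("pair-property",
                        std::to_string(log_index_) + "/" + std::to_string(seqno_));
    return rocksdb::Status::OK();
  }

  rocksdb::UserCollectedProperties GetReadableProperties() const override { return {}; }

  const char* Name() const override { return "PairPropertyCollector"; }

 private:
  int64_t log_index_;
  uint64_t seqno_;
};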
+
+class LogQueue : public pstd::noncopyable {
+ public:
+  using WriteCallback = std::function<rocksdb::Status(const pikiwidb::Binlog&, LogIndex)>;
+
+  explicit LogQueue(WriteCallback&& cb) : write_cb_(std::move(cb)) { consumer_.SetMaxIdleThread(1); }
+
+  void AppendLog(const pikiwidb::Binlog& log, std::promise<rocksdb::Status>&& promise) {
+    auto task = [&] {
+      auto idx = next_log_idx_.fetch_add(1);
+      auto s = write_cb_(log, idx);
+      promise.set_value(s);
+    };
+    consumer_.ExecuteTask(std::move(task));
+  }
+
+ private:
+  WriteCallback write_cb_ = nullptr;
+  pstd::ThreadPool consumer_;
+  std::atomic<LogIndex> next_log_idx_{1};
+};
+
+class LogIndexTest : public ::testing::Test {
+ public:
+  LogIndexTest()
+      : log_queue_([this](const pikiwidb::Binlog& log, LogIndex log_idx) { return db_.OnBinlogWrite(log, log_idx); }) {
+    options_.options.create_if_missing = true;
+    options_.db_instance_num = 1;
+    options_.raft_timeout_s = 10000;
+    options_.append_log_function = [this](const pikiwidb::Binlog& log, std::promise<rocksdb::Status>&& promise) {
+      log_queue_.AppendLog(log, std::move(promise));
+    };
+    options_.do_snapshot_function = [](int64_t log_index, bool sync) {};
+  }
+  ~LogIndexTest() override { DeleteFiles(db_path_.c_str()); }
+
+  void SetUp() override {
+    if (access(db_path_.c_str(), F_OK) == 0) {
+      std::filesystem::remove_all(db_path_.c_str());
+    }
+    mkdir(db_path_.c_str(), 0755);
+    auto s = db_.Open(options_, db_path_);
+    ASSERT_TRUE(s.ok());
+  }
+
+  std::string db_path_{"./test_db/log_index_test"};
+  StorageOptions options_;
+  Storage db_;
+  uint32_t test_times_ = 100;
+  std::string key_ = "log-index-test";
+  std::string field_prefix_ = "field";
+  std::string value_prefix_ = "value";
+  rocksdb::WriteOptions write_options_;
+  rocksdb::ReadOptions read_options_;
+  LogQueue log_queue_;
+
+  auto CreateRandomKey(int i, size_t length) -> std::string {
+    auto res = CreateRandomFieldValue(i, length);
+    res.append(key_);
+    return res;
+  }
+  static auto CreateRandomFieldValue(int i, size_t length) -> std::string {
+    std::mt19937 gen(i);
+    std::string str(length, 0);
+    for (size_t j = 0; j < length; j++) {
+      str[j] = chars[gen() % (sizeof(chars) / sizeof(char))];
+    }
+    return str;
+  }
+  constexpr static char chars[] = {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p',
+                                   'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F',
+                                   'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
+                                   'W', 'X', 'Y', 'Z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'};
+};
+
+TEST_F(LogIndexTest, DoNothing) {}
+
+TEST_F(LogIndexTest, SimpleTest) {  // NOLINT
+  auto& redis = db_.GetDBInstance(key_);
+  auto add_kvs = [&](int start, int end) {
+    for (int i = start; i < end; i++) {
+      auto key = CreateRandomKey(i, 256);
+      auto fv = CreateRandomFieldValue(i, 512);
+      int32_t res{};
+      auto s = redis->HSet(key, fv, fv, &res);
+      ASSERT_TRUE(s.ok());
+      ASSERT_EQ(1, res);
+
+      std::string get_res;
+      s = redis->HGet(key, fv, &get_res);
+      ASSERT_TRUE(s.ok());
+      ASSERT_EQ(fv, get_res);
+    }
+  };
+  auto flushdb = [&]() {
+    auto s = redis->GetDB()->Flush(rocksdb::FlushOptions(), redis->GetColumnFamilyHandles()[kHashesMetaCF]);
+    ASSERT_TRUE(s.ok());
+    s = redis->GetDB()->Flush(rocksdb::FlushOptions(), redis->GetColumnFamilyHandles()[kHashesDataCF]);
+    ASSERT_TRUE(s.ok());
+  };
+
+  // one key test
+  {
+    add_kvs(0, 1);
+    flushdb();
+
+    rocksdb::TablePropertiesCollection properties;
+    auto s = redis->GetDB()->GetPropertiesOfAllTables(redis->GetColumnFamilyHandles()[kHashesMetaCF], &properties);
+    ASSERT_TRUE(s.ok());
+    ASSERT_TRUE(properties.size() == 1);
+    auto res = LogIndexTablePropertiesCollector::GetLargestLogIndexFromTableCollection(properties);
+    EXPECT_TRUE(res.has_value());
+    assert(res.has_value());
+    EXPECT_EQ(res->GetAppliedLogIndex(), 1);
+    EXPECT_EQ(res->GetSequenceNumber(), 1);
+
+    properties.clear();
+    s = redis->GetDB()->GetPropertiesOfAllTables(redis->GetColumnFamilyHandles()[kHashesDataCF], &properties);
+    ASSERT_TRUE(s.ok());
+    ASSERT_TRUE(properties.size() == 1);
+    res = LogIndexTablePropertiesCollector::GetLargestLogIndexFromTableCollection(properties);
+    EXPECT_TRUE(res.has_value());
+    assert(res.has_value());
+    EXPECT_EQ(res->GetAppliedLogIndex(), 1);
+    EXPECT_EQ(res->GetSequenceNumber(), 2);
+  }
+
+  // more keys
+  {
+    add_kvs(1, 10000);
+    flushdb();
+
+    rocksdb::TablePropertiesCollection properties;
+    auto s = redis->GetDB()->GetPropertiesOfAllTables(redis->GetColumnFamilyHandles()[kHashesMetaCF], &properties);
+    ASSERT_TRUE(s.ok());
+    auto res = LogIndexTablePropertiesCollector::GetLargestLogIndexFromTableCollection(properties);
+    EXPECT_TRUE(res.has_value());
+    assert(res.has_value());
+    EXPECT_EQ(res->GetAppliedLogIndex(), 10000);
+    EXPECT_EQ(res->GetSequenceNumber(), 19999);
+
+    properties.clear();
+    s = redis->GetDB()->GetPropertiesOfAllTables(redis->GetColumnFamilyHandles()[kHashesDataCF], &properties);
+    ASSERT_TRUE(s.ok());
+    res = LogIndexTablePropertiesCollector::GetLargestLogIndexFromTableCollection(properties);
+    EXPECT_TRUE(res.has_value());
+    assert(res.has_value());
+    EXPECT_EQ(res->GetAppliedLogIndex(), 10000);
+    EXPECT_EQ(res->GetSequenceNumber(), 20000);
+  }
+
+  // more flush
+  {
+    for (int i = 1; i < 20; i++) {
+      fmt::println("==================i={} start==========================", i);
+      auto start = i * 10000;
+      auto end = start + 10000;
+
+      add_kvs(start, end);
+      flushdb();
+      // sleep(1);
+
+      {
+        rocksdb::TablePropertiesCollection properties;
+        auto s = redis->GetDB()->GetPropertiesOfAllTables(redis->GetColumnFamilyHandles()[kHashesMetaCF], &properties);
+        s = redis->GetDB()->GetPropertiesOfAllTables(redis->GetColumnFamilyHandles()[kHashesDataCF], &properties);
+        std::vector<rocksdb::LiveFileMetaData> metas;
+        redis->GetDB()->GetLiveFilesMetaData(&metas);
+        for (const auto& meta : metas) {
+          auto file = meta.directory + meta.name;
+          if (!properties.contains(file)) {
+            fmt::println("{}: L{}, {}, not contains", file, meta.level, meta.column_family_name);
+            continue;
+          }
+          auto res = LogIndexTablePropertiesCollector::ReadStatsFromTableProps(properties.at(file));
+          assert(res.has_value());
+          fmt::println("{}: L{}, {}, logidx={}", file, meta.level, meta.column_family_name, res->GetAppliedLogIndex());
+        }
+      }
+
+      rocksdb::TablePropertiesCollection properties;
+      auto s = redis->GetDB()->GetPropertiesOfAllTables(redis->GetColumnFamilyHandles()[kHashesMetaCF], &properties);
+      ASSERT_TRUE(s.ok());
+      auto res = LogIndexTablePropertiesCollector::GetLargestLogIndexFromTableCollection(properties);
+      EXPECT_TRUE(res.has_value());
+      assert(res.has_value());
+      EXPECT_EQ(res->GetAppliedLogIndex(), end);
+      EXPECT_EQ(res->GetSequenceNumber(), end * 2 - 1);
+
+      properties.clear();
+      s = redis->GetDB()->GetPropertiesOfAllTables(redis->GetColumnFamilyHandles()[kHashesDataCF], &properties);
+      ASSERT_TRUE(s.ok());
+      res = LogIndexTablePropertiesCollector::GetLargestLogIndexFromTableCollection(properties);
+      EXPECT_TRUE(res.has_value());
+      assert(res.has_value());
+      EXPECT_EQ(res->GetAppliedLogIndex(), end);
+      EXPECT_EQ(res->GetSequenceNumber(), end * 2);
+    }
+  }
+}
diff --git a/src/store.cc b/src/store.cc
index 22cf42001..a8186793b 100644
--- a/src/store.cc
+++ b/src/store.cc
@@ -5,12 +5,15 @@
  * of patent rights can be found in the PATENTS file in the same directory.
  */
 
+#include "store.h"
+
+#include <algorithm>
 #include <memory>
 
 #include "config.h"
 #include "db.h"
-#include "log.h"
-#include "store.h"
+#include "pstd/log.h"
+#include "pstd/pstd_string.h"
 
 namespace pikiwidb {
 
@@ -19,9 +22,10 @@ PStore& PStore::Instance() {
   return store;
 }
 
-void PStore::Init(int dbNum) {
-  backends_.reserve(dbNum);
-  for (int i = 0; i < dbNum; i++) {
+void PStore::Init(int db_number) {
+  db_number_ = db_number;
+  backends_.reserve(db_number_);
+  for (int i = 0; i < db_number_; i++) {
     auto db = std::make_unique<DB>(i, g_config.db_path);
     backends_.push_back(std::move(db));
     INFO("Open DB_{} success!", i);
@@ -29,4 +33,41 @@ void PStore::Init(int dbNum) {
   INFO("STORE Init success!");
 }
 
+void PStore::HandleTaskSpecificDB(const TasksVector& tasks) {
+  std::for_each(tasks.begin(), tasks.end(), [this](const auto& task) {
+    if (task.db < 0 || task.db >= db_number_) {
+      WARN("The database index is out of range.");
+      return;
+    }
+    auto& db = backends_.at(task.db);
+    switch (task.type) {
+      case kCheckpoint: {
+        auto it = task.args.find(kCheckpointPath);
+        if (it == task.args.end()) {
+          WARN("The critical parameter 'path' is missing for the checkpoint task.");
+          return;
+        }
+        auto path = it->second;
+        pstd::TrimSlash(path);
+        db->CreateCheckpoint(path, task.sync);
+        break;
+      }
+      case kLoadDBFromCheckpoint: {
+        auto it = task.args.find(kCheckpointPath);
+        if (it == task.args.end()) {
+          WARN("The critical parameter 'path' is missing for loading a checkpoint.");
+          return;
+        }
+        auto path = it->second;
+        pstd::TrimSlash(path);
+        db->LoadDBFromCheckpoint(path, task.sync);
+        break;
+      }
+      case kEmpty: {
+        WARN("An empty task was passed in; not doing anything.");
+        break;
+      }
+      default:
+        break;
+    }
+  });
+}
 }  // namespace pikiwidb
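Editor's note: a short usage sketch of the task API added above. The path is illustrative, and the std::map<TaskArg, std::string> key type follows the reconstruction of TaskContext in the store.h hunk below, so treat both as assumptions:

// Sketch: ask DB 0 to write a checkpoint, then DB 1 to reload from it.
// kCheckpointPath carries the directory; sync = true blocks until done.
#include <map>
#include <string>

#include "store.h"

void CheckpointExample() {
  pikiwidb::TasksVector tasks;
  tasks.emplace_back(pikiwidb::kCheckpoint, 0,
                     std::map<pikiwidb::TaskArg, std::string>{
                         {pikiwidb::kCheckpointPath, "/tmp/pikiwidb-ckpt"}},
                     true);
  tasks.emplace_back(pikiwidb::kLoadDBFromCheckpoint, 1,
                     std::map<pikiwidb::TaskArg, std::string>{
                         {pikiwidb::kCheckpointPath, "/tmp/pikiwidb-ckpt"}},
                     false);
  pikiwidb::PStore::Instance().HandleTaskSpecificDB(tasks);
}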
diff --git a/src/store.h b/src/store.h
index d702517b0..8e8590adb 100644
--- a/src/store.h
+++ b/src/store.h
@@ -9,18 +9,36 @@
 
 #define GLOG_NO_ABBREVIATED_SEVERITIES
 
-#include "common.h"
-#include "db.h"
-#include "storage/storage.h"
-
 #include <map>
-#include <mutex>
-#include <shared_mutex>
 #include <memory>
 #include <vector>
 
+#include "common.h"
+#include "db.h"
+#include "storage/storage.h"
+
 namespace pikiwidb {
 
+enum TaskType { kCheckpoint = 0, kLoadDBFromCheckpoint, kEmpty };
+
+enum TaskArg {
+  kCheckpointPath = 0,
+};
+
+struct TaskContext {
+  TaskType type = kEmpty;
+  int db = -1;
+  std::map<TaskArg, std::string> args;
+  bool sync = false;
+  TaskContext() = delete;
+  TaskContext(TaskType t, bool s = false) : type(t), sync(s) {}
+  TaskContext(TaskType t, int d, bool s = false) : type(t), db(d), sync(s) {}
+  TaskContext(TaskType t, int d, const std::map<TaskArg, std::string>& a, bool s = false)
+      : type(t), db(d), args(a), sync(s) {}
+};
+
+using TasksVector = std::vector<TaskContext>;
+
 class PStore {
  public:
   static PStore& Instance();
@@ -28,21 +46,17 @@ class PStore {
   PStore(const PStore&) = delete;
   void operator=(const PStore&) = delete;
 
-  void Init(int dbNum);
+  void Init(int db_number);
 
   std::unique_ptr<DB>& GetBackend(int32_t index) { return backends_[index]; };
 
-  std::shared_mutex& SharedMutex() { return dbs_mutex_; }
+  void HandleTaskSpecificDB(const TasksVector& tasks);
+
+  int GetDBNumber() const { return db_number_; }
 
  private:
   PStore() = default;
-
-  /**
-   * If you want to access all the DBs at the same time,
-   * then you must hold the lock.
-   * For example: you want to execute flushall or bgsave.
-   */
-  std::shared_mutex dbs_mutex_;
+  int db_number_ = 0;
 
   std::vector<std::unique_ptr<DB>> backends_;
 };
diff --git a/tests/consistency_test.go b/tests/consistency_test.go
new file mode 100644
index 000000000..a2fd0d02f
--- /dev/null
+++ b/tests/consistency_test.go
@@ -0,0 +1,325 @@
+/*
+ * Copyright (c) 2024-present, Qihoo, Inc.  All rights reserved.
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree. An additional grant
+ * of patent rights can be found in the PATENTS file in the same directory.
+ */
+
+package pikiwidb_test
+
+import (
+	"bufio"
+	"context"
+	"log"
+	"strconv"
+	"strings"
+	"syscall"
+	"time"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+	"github.com/redis/go-redis/v9"
+
+	"github.com/OpenAtomFoundation/pikiwidb/tests/util"
+)
+
+var (
+	followers []*redis.Client
+	leader    *redis.Client
+)
+
+var _ = Describe("Consistency", Ordered, func() {
+	var (
+		ctx     = context.TODO()
+		servers []*util.Server
+	)
+
+	BeforeAll(func() {
+		// Raise the open-file limit for this process. Running `ulimit` in a
+		// child process would not affect the test process itself.
+		var rlim syscall.Rlimit
+		if err := syscall.Getrlimit(syscall.RLIMIT_NOFILE, &rlim); err == nil {
+			rlim.Cur = rlim.Max
+			_ = syscall.Setrlimit(syscall.RLIMIT_NOFILE, &rlim)
+		}
+		for i := 0; i < 3; i++ {
+			config := util.GetConfPath(false, int64(i))
+			s := util.StartServer(config, map[string]string{"port": strconv.Itoa(12000 + (i+1)*111),
+				"use-raft": "yes"}, true)
+			Expect(s).NotTo(BeNil())
+			servers = append(servers, s)
+
+			if i == 0 {
+				leader = s.NewClient()
+				Expect(leader).NotTo(BeNil())
+				Expect(leader.FlushDB(ctx).Err()).NotTo(HaveOccurred())
+			} else {
+				c := s.NewClient()
+				Expect(c).NotTo(BeNil())
+				Expect(c.FlushDB(ctx).Err()).NotTo(HaveOccurred())
+				followers = append(followers, c)
+			}
+		}
+
+		res, err := leader.Do(ctx, "RAFT.CLUSTER", "INIT").Result()
+		Expect(err).NotTo(HaveOccurred())
+		msg, ok := res.(string)
+		Expect(ok).To(BeTrue())
+		Expect(msg).To(Equal("OK"))
+		err = leader.Close()
+		Expect(err).NotTo(HaveOccurred())
+		leader = nil
+
+		for _, f := range followers {
+			res, err := f.Do(ctx, "RAFT.CLUSTER", "JOIN", "127.0.0.1:12111").Result()
+			Expect(err).NotTo(HaveOccurred())
+			msg, ok := res.(string)
+			Expect(ok).To(BeTrue())
+			Expect(msg).To(Equal("OK"))
+			err = f.Close()
+			Expect(err).NotTo(HaveOccurred())
+		}
+		followers = nil
+	})
+
+	AfterAll(func() {
+		for _, s := range servers {
+			err := s.Close()
+			if err != nil {
+				log.Println("Failed to close server.", err.Error())
+				return
+			}
+		}
+	})
+
+	BeforeEach(func() {
+		for i, s := range servers {
+			if i == 0 {
+				leader = s.NewClient()
+				Expect(leader).NotTo(BeNil())
+				Expect(leader.FlushDB(ctx).Err()).NotTo(HaveOccurred())
+			} else {
+				c := s.NewClient()
+				Expect(c).NotTo(BeNil())
+				// Expect(c.FlushDB(ctx).Err().Error()).To(Equal("ERR MOVED 127.0.0.1:12111"))
+				followers = append(followers, c)
+			}
+		}
+	})
+
+	AfterEach(func() {
+		err := leader.Close()
+		Expect(err).NotTo(HaveOccurred())
+		leader = nil
+
+		for _, f := range followers {
+			err = f.Close()
+			Expect(err).NotTo(HaveOccurred())
+		}
+		followers = nil
+	})
+
+	It("HSet & HDel Consistency Test", func() {
+		const testKey = "HashConsistencyTest"
+		testValue := map[string]string{
+			"fa": "va",
+			"fb": "vb",
+			"fc": "vc",
+		}
+		{
+			// hset write on leader
+			set, err := leader.HSet(ctx, testKey, testValue).Result()
+			Expect(err).NotTo(HaveOccurred())
+			Expect(set).To(Equal(int64(3)))
+
+			// read check
+			readChecker(func(c *redis.Client) {
+				getall, err := c.HGetAll(ctx, testKey).Result()
+				Expect(err).NotTo(HaveOccurred())
+				Expect(getall).To(Equal(testValue))
+			})
+		}
+
+		{
+			// hdel write on leader
+			del, err := leader.HDel(ctx, testKey, "fb").Result()
Expect(err).NotTo(HaveOccurred()) + Expect(del).To(Equal(int64(1))) + + // read check + readChecker(func(c *redis.Client) { + getall, err := c.HGetAll(ctx, testKey).Result() + Expect(err).NotTo(HaveOccurred()) + Expect(getall).To(Equal(map[string]string{ + "fa": "va", + "fc": "vc", + })) + }) + } + }) + + It("SAdd & SRem Consistency Test", func() { + const testKey = "SetsConsistencyTestKey" + testValues := []string{"sa", "sb", "sc", "sd"} + + { + // sadd write on leader + sadd, err := leader.SAdd(ctx, testKey, testValues).Result() + Expect(err).NotTo(HaveOccurred()) + Expect(sadd).To(Equal(int64(len(testValues)))) + + // read check + readChecker(func(c *redis.Client) { + smembers, err := c.SMembers(ctx, testKey).Result() + Expect(err).NotTo(HaveOccurred()) + Expect(smembers).To(Equal(testValues)) + }) + } + + { + // srem write on leader + srem, err := leader.SRem(ctx, testKey, []string{"sb", "sd"}).Result() + Expect(err).NotTo(HaveOccurred()) + Expect(srem).To(Equal(int64(2))) + + // read check + readChecker(func(c *redis.Client) { + smembers, err := c.SMembers(ctx, testKey).Result() + Expect(err).NotTo(HaveOccurred()) + Expect(smembers).To(Equal([]string{"sa", "sc"})) + }) + } + }) + + It("LPush & LPop Consistency Test", func() { + const testKey = "ListsConsistencyTestKey" + testValues := []string{"la", "lb", "lc", "ld"} + + { + // lpush write on leader + lpush, err := leader.LPush(ctx, testKey, testValues).Result() + Expect(err).NotTo(HaveOccurred()) + Expect(lpush).To(Equal(int64(len(testValues)))) + + // read check + readChecker(func(c *redis.Client) { + lrange, err := c.LRange(ctx, testKey, 0, 10).Result() + Expect(err).NotTo(HaveOccurred()) + Expect(lrange).To(Equal(reverse(testValues))) + }) + } + + { + // lpop write on leader + lpop, err := leader.LPop(ctx, testKey).Result() + Expect(err).NotTo(HaveOccurred()) + Expect(lpop).To(Equal("ld")) + lpop, err = leader.LPop(ctx, testKey).Result() + Expect(err).NotTo(HaveOccurred()) + Expect(lpop).To(Equal("lc")) + + // read check + readChecker(func(c *redis.Client) { + lrange, err := c.LRange(ctx, testKey, 0, 10).Result() + Expect(err).NotTo(HaveOccurred()) + Expect(lrange).To(Equal([]string{"lb", "la"})) + }) + } + }) + + It("ZAdd Consistency Test", func() { + const testKey = "ZSetsConsistencyTestKey" + testData := []redis.Z{ + {Score: 4, Member: "z4"}, + {Score: 8, Member: "z8"}, + {Score: 5, Member: "z5"}, + } + expectData := []redis.Z{ + {Score: 8, Member: "z8"}, + {Score: 5, Member: "z5"}, + {Score: 4, Member: "z4"}, + } + { + // zadd write on leader + zadd, err := leader.ZAdd(ctx, testKey, testData...).Result() + Expect(err).NotTo(HaveOccurred()) + Expect(zadd).To(Equal(int64(len(testData)))) + + // read check + readChecker(func(c *redis.Client) { + zrange, err := c.ZRevRangeWithScores(ctx, testKey, 0, -1).Result() + Expect(err).NotTo(HaveOccurred()) + Expect(zrange).To(Equal(expectData)) + }) + } + }) + + It("Set Consistency Test", func() { + const testKey = "StringsConsistencyTestKey" + const testValue = "StringsConsistencyTestKey" + { + // set write on leader + set, err := leader.Set(ctx, testKey, testValue, 0).Result() + Expect(err).NotTo(HaveOccurred()) + Expect(set).To(Equal("OK")) + + // read check + readChecker(func(c *redis.Client) { + get, err := c.Get(ctx, testKey).Result() + Expect(err).NotTo(HaveOccurred()) + Expect(get).To(Equal(testValue)) + }) + } + }) + + It("ThreeNodesClusterConstructionTest", func() { + for _, follower := range followers { + info, err := follower.Do(ctx, "info", "raft").Result() + 
Expect(err).NotTo(HaveOccurred())
+			infoStr := info.(string)
+			scanner := bufio.NewScanner(strings.NewReader(infoStr))
+			var peerID string
+			var isMember bool
+			for scanner.Scan() {
+				line := scanner.Text()
+				if strings.Contains(line, "raft_peer_id") {
+					parts := strings.Split(line, ":")
+					if len(parts) >= 2 {
+						peerID = parts[1]
+						isMember = true
+						break
+					}
+				}
+			}
+
+			if isMember {
+				ret, err := follower.Do(ctx, "raft.node", "remove", peerID).Result()
+				Expect(err).NotTo(HaveOccurred())
+				Expect(ret).To(Equal(OK))
+			}
+		}
+	})
+
+})
+
+func readChecker(check func(*redis.Client)) {
+	// read on leader
+	check(leader)
+	time.Sleep(10000 * time.Millisecond)
+
+	// read on followers
+	followerChecker(followers, check)
+}
+
+func followerChecker(fs []*redis.Client, check func(*redis.Client)) {
+	for _, f := range fs {
+		check(f)
+	}
+}
+
+func reverse(src []string) []string {
+	a := make([]string, len(src))
+	copy(a, src)
+
+	for i := len(a)/2 - 1; i >= 0; i-- {
+		opp := len(a) - 1 - i
+		a[i], a[opp] = a[opp], a[i]
+	}
+
+	return a
+}
diff --git a/tests/hash_test.go b/tests/hash_test.go
index 7dffb6306..ca28c585f 100644
--- a/tests/hash_test.go
+++ b/tests/hash_test.go
@@ -12,7 +12,6 @@ import (
 	"log"
 	"strconv"
 	"time"
-
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 	"github.com/redis/go-redis/v9"
diff --git a/tests/util/pikiwidb.go b/tests/util/pikiwidb.go
index 3d53aa48e..8c11c6ad8 100644
--- a/tests/util/pikiwidb.go
+++ b/tests/util/pikiwidb.go
@@ -62,9 +62,8 @@ func GetConfPath(copy bool, t int64) string {
 
 func checkCondition(c *redis.Client) bool {
 	ctx := context.TODO()
-	//TODO(dingxiaoshuai) use Cmd PING
-	r, e := c.Set(ctx, "key", "value", 0).Result()
-	return r == "OK" && e == nil
+	_, err := c.Ping(ctx).Result()
+	return err == nil
 }
 
 type Server struct {
@@ -168,20 +167,36 @@ func StartServer(config string, options map[string]string, delete bool) *Server
 	}
 
 	if runtime.GOOS == "darwin" {
-		cmd = exec.Command("sed", "-i", "", "s|db-path ./db|db-path "+d+"/db"+"|", n)
+		cmd = exec.Command("sed", "-i", "", "s|db-path ./db|db-path "+d+"/db|", n)
 	} else {
-		cmd = exec.Command("sed", "-i", "s|db-path ./db|db-path "+d+"/db"+"|", n)
+		cmd = exec.Command("sed", "-i", "s|db-path ./db|db-path "+d+"/db|", n)
 	}
 	err = cmd.Run()
 	if err != nil {
 		log.Println("The configuration file cannot be used.", err.Error())
 		return nil
 	}
+	value, isExist := options["use-raft"]
+	if isExist && value == "yes" {
+		if runtime.GOOS == "darwin" {
+			cmd = exec.Command("sed", "-i", "", "s|use-raft no|use-raft yes|", n)
+		} else {
+			cmd = exec.Command("sed", "-i", "s|use-raft no|use-raft yes|", n)
+		}
+		err = cmd.Run()
+		if err != nil {
+			log.Println("Failed to enable use-raft in the configuration file.", err.Error())
+			return nil
+		}
+	}
 	c.Args = append(c.Args, n)
 }
 
 for k, v := range options {
+	if k == "use-raft" {
+		continue
+	}
 	c.Args = append(c.Args, fmt.Sprintf("--%s", k), v)
 }