cmake_minimum_required(VERSION 3.24)

project(tgi-llama-cpp-backend VERSION 1.0.0 LANGUAGES CXX)

set(CMAKE_CXX_STANDARD 23)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

include(FetchContent)

# llama.cpp release tag to fetch; this drives the FetchContent URL below.
set(LLAMA_CPP_TARGET_VERSION "b4215" CACHE STRING "Version of llama.cpp to build against")
set(LLAMA_CPP_TARGET_CUDA_ARCHS "75-real;80-real;86-real;89-real;90-real" CACHE STRING "CUDA arch(s) to build")
option(LLAMA_CPP_BUILD_OFFLINE_RUNNER "Flag to build the standalone c++ backend runner" OFF)
option(LLAMA_CPP_BUILD_CUDA "Flag to build CUDA enabled inference through llama.cpp" OFF)

if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND CMAKE_SYSTEM_NAME STREQUAL "Linux")
    message(STATUS "Targeting libc++")
    # Prepend as a string: list-style set() would inject ';' into the flags.
    set(CMAKE_CXX_FLAGS "-stdlib=libc++ ${CMAKE_CXX_FLAGS}")
else ()
    message(STATUS "Not using libc++ ${CMAKE_CXX_COMPILER_ID} ${CMAKE_SYSTEM_NAME}")
endif ()

# Add dependencies
include(cmake/numa.cmake)
include(cmake/spdlog.cmake)

if (LLAMA_CPP_BUILD_CUDA)
    message(STATUS "Enabling llama.cpp CUDA support")
    # Respect a user-provided architecture list; otherwise use our default set.
    if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
        set(CMAKE_CUDA_ARCHITECTURES ${LLAMA_CPP_TARGET_CUDA_ARCHS})
    endif ()
    # Seen by the llama.cpp subproject pulled in via FetchContent below.
    set(GGML_CUDA ON)
endif ()

# Download the llama.cpp repo at the tag pinned by LLAMA_CPP_TARGET_VERSION.
# DOWNLOAD_EXTRACT_TIMESTAMP avoids the CMP0135 warning with CMake >= 3.24.
FetchContent_Declare(
        llama
        URL https://github.com/ggerganov/llama.cpp/archive/refs/tags/${LLAMA_CPP_TARGET_VERSION}.tar.gz
        DOWNLOAD_EXTRACT_TIMESTAMP TRUE
)
FetchContent_MakeAvailable(llama)

add_library(tgi_llamacpp_backend_impl STATIC csrc/backend.hpp csrc/backend.cpp)
target_compile_features(tgi_llamacpp_backend_impl PRIVATE cxx_std_23)
target_link_libraries(tgi_llamacpp_backend_impl PUBLIC spdlog::spdlog llama)

if (NUMA_FOUND)
    target_link_libraries(tgi_llamacpp_backend_impl PUBLIC numa)
endif ()

install(TARGETS tgi_llamacpp_backend_impl spdlog llama)

# Per-config definition via genex: testing CMAKE_BUILD_TYPE breaks when the
# variable is unset (malformed if()) and under multi-config generators.
target_compile_definitions(tgi_llamacpp_backend_impl PRIVATE
        $<$<CONFIG:Debug>:TGI_LLAMACPP_BACKEND_DEBUG=1>)

if (LLAMA_CPP_BUILD_OFFLINE_RUNNER)
    message(STATUS "Building llama.cpp offline runner")
    add_executable(tgi_llamacpp_offline_runner offline/main.cpp)
    target_link_libraries(tgi_llamacpp_offline_runner PUBLIC
            tgi_llamacpp_backend_impl llama spdlog::spdlog)
endif ()