@@ -3269,151 +3269,3 @@ struct server_context {
32693269 };
32703270 }
32713271};
3272-
3273- static void common_params_handle_model_default (std::string &model, const std::string &model_url, std::string &hf_repo,
3274- std::string &hf_file, const std::string &hf_token) {
3275- if (!hf_repo.empty ()) {
3276- // short-hand to avoid specifying --hf-file -> default it to --model
3277- if (hf_file.empty ()) {
3278- if (model.empty ()) {
3279- auto auto_detected = common_get_hf_file (hf_repo, hf_token);
3280- if (auto_detected.first .empty () || auto_detected.second .empty ()) {
3281- exit (1 ); // built without CURL, error message already printed
3282- }
3283- hf_repo = auto_detected.first ;
3284- hf_file = auto_detected.second ;
3285- } else {
3286- hf_file = model;
3287- }
3288- }
3289- // make sure model path is present (for caching purposes)
3290- if (model.empty ()) {
3291- // this is to avoid different repo having same file name, or same file name in different subdirs
3292- std::string filename = hf_repo + " _" + hf_file;
3293- // to make sure we don't have any slashes in the filename
3294- string_replace_all (filename, " /" , " _" );
3295- model = fs_get_cache_file (filename);
3296- }
3297- } else if (!model_url.empty ()) {
3298- if (model.empty ()) {
3299- auto f = string_split<std::string>(model_url, ' #' ).front ();
3300- f = string_split<std::string>(f, ' ?' ).front ();
3301- model = fs_get_cache_file (string_split<std::string>(f, ' /' ).back ());
3302- }
3303- } else if (model.empty ()) {
3304- model = DEFAULT_MODEL_PATH;
3305- }
3306- }
3307-
3308- // parse the given jparams (see de.kherud.llama.args.ModelParameters#toString()) from JSON to the required C++ struct.
3309- static void server_params_parse (json jparams, common_params ¶ms) {
3310- common_params default_params;
3311-
3312- params.sampling .seed = json_value (jparams, " seed" , default_params.sampling .seed );
3313- params.cpuparams .n_threads = json_value (jparams, " n_threads" , default_params.cpuparams .n_threads );
3314- params.speculative .cpuparams .n_threads =
3315- json_value (jparams, " n_threads_draft" , default_params.speculative .cpuparams .n_threads );
3316- params.cpuparams_batch .n_threads = json_value (jparams, " n_threads_batch" , default_params.cpuparams_batch .n_threads );
3317- params.speculative .cpuparams_batch .n_threads =
3318- json_value (jparams, " n_threads_batch_draft" , default_params.speculative .cpuparams_batch .n_threads );
3319- params.n_predict = json_value (jparams, " n_predict" , default_params.n_predict );
3320- params.n_ctx = json_value (jparams, " n_ctx" , default_params.n_ctx );
3321- params.n_batch = json_value (jparams, " n_batch" , default_params.n_batch );
3322- params.n_ubatch = json_value (jparams, " n_ubatch" , default_params.n_ubatch );
3323- params.n_keep = json_value (jparams, " n_keep" , default_params.n_keep );
3324-
3325- params.speculative .n_max = json_value (jparams, " n_draft" , default_params.speculative .n_max );
3326- params.speculative .n_min = json_value (jparams, " n_draft_min" , default_params.speculative .n_min );
3327-
3328- params.n_chunks = json_value (jparams, " n_chunks" , default_params.n_chunks );
3329- params.n_parallel = json_value (jparams, " n_parallel" , default_params.n_parallel );
3330- params.n_sequences = json_value (jparams, " n_sequences" , default_params.n_sequences );
3331- params.speculative .p_split = json_value (jparams, " p_split" , default_params.speculative .p_split );
3332- params.grp_attn_n = json_value (jparams, " grp_attn_n" , default_params.grp_attn_n );
3333- params.grp_attn_w = json_value (jparams, " grp_attn_w" , default_params.grp_attn_w );
3334- params.n_print = json_value (jparams, " n_print" , default_params.n_print );
3335- params.rope_freq_base = json_value (jparams, " rope_freq_base" , default_params.rope_freq_base );
3336- params.rope_freq_scale = json_value (jparams, " rope_freq_scale" , default_params.rope_freq_scale );
3337- params.yarn_ext_factor = json_value (jparams, " yarn_ext_factor" , default_params.yarn_ext_factor );
3338- params.yarn_attn_factor = json_value (jparams, " yarn_attn_factor" , default_params.yarn_attn_factor );
3339- params.yarn_beta_fast = json_value (jparams, " yarn_beta_fast" , default_params.yarn_beta_fast );
3340- params.yarn_beta_slow = json_value (jparams, " yarn_beta_slow" , default_params.yarn_beta_slow );
3341- params.yarn_orig_ctx = json_value (jparams, " yarn_orig_ctx" , default_params.yarn_orig_ctx );
3342- params.defrag_thold = json_value (jparams, " defrag_thold" , default_params.defrag_thold );
3343- params.numa = json_value (jparams, " numa" , default_params.numa );
3344- params.rope_scaling_type = json_value (jparams, " rope_scaling_type" , default_params.rope_scaling_type );
3345- params.pooling_type = json_value (jparams, " pooling_type" , default_params.pooling_type );
3346- params.model = json_value (jparams, " model" , default_params.model );
3347- params.speculative .model = json_value (jparams, " model_draft" , default_params.speculative .model );
3348- params.model_alias = json_value (jparams, " model_alias" , default_params.model_alias );
3349- params.model_url = json_value (jparams, " model_url" , default_params.model_url );
3350- params.hf_repo = json_value (jparams, " hf_repo" , default_params.hf_repo );
3351- params.hf_file = json_value (jparams, " hf_file" , default_params.hf_file );
3352- params.prompt = json_value (jparams, " prompt" , default_params.prompt );
3353- params.prompt_file = json_value (jparams, " prompt_file" , default_params.prompt_file );
3354- params.path_prompt_cache = json_value (jparams, " path_prompt_cache" , default_params.path_prompt_cache );
3355- params.input_prefix = json_value (jparams, " input_prefix" , default_params.input_prefix );
3356- params.input_suffix = json_value (jparams, " input_suffix" , default_params.input_suffix );
3357- params.antiprompt = json_value (jparams, " antiprompt" , default_params.antiprompt );
3358- params.lookup_cache_static = json_value (jparams, " lookup_cache_static" , default_params.lookup_cache_static );
3359- params.lookup_cache_dynamic = json_value (jparams, " lookup_cache_dynamic" , default_params.lookup_cache_dynamic );
3360- params.logits_file = json_value (jparams, " logits_file" , default_params.logits_file );
3361- // params.lora_adapters = json_value(jparams, "lora_adapter", default_params.lora_adapters);
3362- params.embedding = json_value (jparams, " embedding" , default_params.embedding );
3363- params.escape = json_value (jparams, " escape" , default_params.escape );
3364- params.cont_batching = json_value (jparams, " cont_batching" , default_params.cont_batching );
3365- params.flash_attn = json_value (jparams, " flash_attn" , default_params.flash_attn );
3366- params.input_prefix_bos = json_value (jparams, " input_prefix_bos" , default_params.input_prefix_bos );
3367- params.sampling .ignore_eos = json_value (jparams, " ignore_eos" , default_params.sampling .ignore_eos );
3368- params.use_mmap = json_value (jparams, " use_mmap" , default_params.use_mmap );
3369- params.use_mlock = json_value (jparams, " use_mlock" , default_params.use_mlock );
3370- params.no_kv_offload = json_value (jparams, " no_kv_offload" , default_params.no_kv_offload );
3371- params.chat_template = json_value (jparams, " chat_template" , default_params.chat_template );
3372-
3373- if (jparams.contains (" n_gpu_layers" )) {
3374- if (llama_supports_gpu_offload ()) {
3375- params.n_gpu_layers = json_value (jparams, " n_gpu_layers" , default_params.n_gpu_layers );
3376- params.speculative .n_gpu_layers =
3377- json_value (jparams, " n_gpu_layers_draft" , default_params.speculative .n_gpu_layers );
3378- } else {
3379- SRV_WRN (" Not compiled with GPU offload support, --n-gpu-layers option will be ignored. "
3380- " See main README.md for information on enabling GPU BLAS support: %s = %d" ,
3381- " n_gpu_layers" , params.n_gpu_layers );
3382- }
3383- }
3384-
3385- if (jparams.contains (" split_mode" )) {
3386- params.split_mode = json_value (jparams, " split_mode" , default_params.split_mode );
3387- // todo: the definition checks here currently don't work due to cmake visibility reasons
3388- #ifndef GGML_USE_CUDA
3389- fprintf (stderr, " warning: llama.cpp was compiled without CUDA. Setting the split mode has no effect.\n " );
3390- #endif
3391- }
3392-
3393- if (jparams.contains (" tensor_split" )) {
3394- #if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
3395- std::vector<float > tensor_split = jparams[" tensor_split" ].get <std::vector<float >>();
3396- GGML_ASSERT (tensor_split.size () <= llama_max_devices ());
3397-
3398- for (size_t i_device = 0 ; i_device < llama_max_devices (); ++i_device) {
3399- if (i_device < tensor_split.size ()) {
3400- params.tensor_split [i_device] = tensor_split.at (i_device);
3401- } else {
3402- params.tensor_split [i_device] = 0 .0f ;
3403- }
3404- }
3405- #else
3406- SRV_WRN (" %s" , " llama.cpp was compiled without CUDA. It is not possible to set a tensor split.\n " );
3407- #endif // GGML_USE_CUDA
3408- }
3409-
3410- if (jparams.contains (" main_gpu" )) {
3411- #if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
3412- params.main_gpu = json_value (jparams, " main_gpu" , default_params.main_gpu );
3413- #else
3414- SRV_WRN (" %s" , " llama.cpp was compiled without CUDA. It is not possible to set a main GPU." );
3415- #endif
3416- }
3417-
3418- common_params_handle_model_default (params.model , params.model_url , params.hf_repo , params.hf_file , params.hf_token );
3419- }
0 commit comments