Commit 7137368

Author: Vaijanath Rao (committed)

adding re-ranking
1 parent a1a7474 commit 7137368

5 files changed, +163 -12 lines

pom.xml

Lines changed: 23 additions & 11 deletions
@@ -1,14 +1,16 @@
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
     <modelVersion>4.0.0</modelVersion>
 
     <groupId>de.kherud</groupId>
     <artifactId>llama</artifactId>
-    <version>4.0.0</version>
+    <version>4.0.1</version>
     <packaging>jar</packaging>
 
     <name>${project.groupId}:${project.artifactId}</name>
-    <description>Java Bindings for llama.cpp - A Port of Facebook's LLaMA model in C/C++.</description>
+    <description>Java Bindings for llama.cpp - A Port of Facebook's LLaMA model
+        in C/C++.</description>
     <url>https://github.com/kherud/java-llama.cpp</url>
 
     <licenses>
@@ -39,7 +41,8 @@
         </snapshotRepository>
         <repository>
             <id>ossrh</id>
-            <url>https://s01.oss.sonatype.org/service/local/staging/deploy/maven2/</url>
+            <url>
+                https://s01.oss.sonatype.org/service/local/staging/deploy/maven2/</url>
         </repository>
     </distributionManagement>
 
@@ -62,6 +65,7 @@
             <version>24.1.0</version>
             <scope>compile</scope>
         </dependency>
+
     </dependencies>
 
     <build>
@@ -71,17 +75,21 @@
                 <artifactId>maven-compiler-plugin</artifactId>
                 <version>3.13.0</version>
                 <executions>
-                    <!-- We have to perform a separate build pass for cuda classifier -->
+                    <!-- We have to perform a separate build pass for cuda
+                        classifier -->
                     <execution>
                         <id>gpu</id>
                         <phase>compile</phase>
-                        <goals><goal>compile</goal></goals>
+                        <goals>
+                            <goal>compile</goal>
+                        </goals>
                         <configuration>
                             <compilerArgs>
                                 <arg>-h</arg>
                                 <arg>src/main/cpp</arg>
                             </compilerArgs>
-                            <outputDirectory>${project.build.outputDirectory}_cuda</outputDirectory>
+                            <outputDirectory>
+                                ${project.build.outputDirectory}_cuda</outputDirectory>
                         </configuration>
                     </execution>
                 </executions>
@@ -98,10 +106,12 @@
                             <goal>copy-resources</goal>
                         </goals>
                         <configuration>
-                            <outputDirectory>${project.build.outputDirectory}_cuda</outputDirectory>
+                            <outputDirectory>
+                                ${project.build.outputDirectory}_cuda</outputDirectory>
                             <resources>
                                 <resource>
-                                    <directory>${basedir}/src/main/resources_linux_cuda/</directory>
+                                    <directory>
+                                        ${basedir}/src/main/resources_linux_cuda/</directory>
                                     <includes>
                                         <include>**/*.*</include>
                                     </includes>
@@ -176,7 +186,8 @@
                 <artifactId>maven-jar-plugin</artifactId>
                 <version>3.4.2</version>
                 <executions>
-                    <!-- Pick class files AND libs from custom output directory -->
+                    <!-- Pick class files AND libs from custom output
+                        directory -->
                     <execution>
                         <id>cuda</id>
                         <phase>package</phase>
@@ -185,7 +196,8 @@
                         </goals>
                         <configuration>
                             <classifier>cuda12-linux-x86-64</classifier>
-                            <classesDirectory>${project.build.outputDirectory}_cuda</classesDirectory>
+                            <classesDirectory>
+                                ${project.build.outputDirectory}_cuda</classesDirectory>
                         </configuration>
                     </execution>
                 </executions>

src/main/cpp/jllama.cpp

Lines changed: 110 additions & 1 deletion
@@ -112,6 +112,26 @@ char **parse_string_array(JNIEnv *env, const jobjectArray string_array, const jsize length) {
     return result;
 }
 
+std::vector<std::string> parse_string_array_for_rerank(JNIEnv *env, const jobjectArray string_array, const jsize length) {
+    std::vector<std::string> result;
+    result.reserve(length); // Reserve memory for efficiency
+
+    for (jsize i = 0; i < length; i++) {
+        jstring javaString = static_cast<jstring>(env->GetObjectArrayElement(string_array, i));
+        if (javaString == nullptr) continue;
+
+        const char *cString = env->GetStringUTFChars(javaString, nullptr);
+        if (cString != nullptr) {
+            result.emplace_back(cString); // Add to vector
+            env->ReleaseStringUTFChars(javaString, cString);
+        }
+
+        env->DeleteLocalRef(javaString); // Avoid memory leaks
+    }
+
+    return result;
+}
+
 void free_string_array(char **array, jsize length) {
     if (array != nullptr) {
         for (jsize i = 0; i < length; i++) {
@@ -239,6 +259,7 @@ JNIEXPORT jint JNICALL JNI_OnLoad(JavaVM *vm, void *reserved) {
     cc_integer = env->GetMethodID(c_integer, "<init>", "(I)V");
     cc_float = env->GetMethodID(c_float, "<init>", "(F)V");
 
+
     if (!(cc_output && cc_hash_map && cc_integer && cc_float)) {
         goto error;
     }
@@ -634,7 +655,6 @@ JNIEXPORT jfloatArray JNICALL Java_de_kherud_llama_LlamaModel_embed(JNIEnv *env, jobject obj, jstring jprompt) {
     json error = nullptr;
 
     server_task_result_ptr result = ctx_server->queue_results.recv(id_task);
-    ctx_server->queue_results.remove_waiting_task_id(id_task);
 
     json response_str = result->to_json();
     if (result->is_error()) {
@@ -643,6 +663,11 @@ JNIEXPORT jfloatArray JNICALL Java_de_kherud_llama_LlamaModel_embed(JNIEnv *env, jobject obj, jstring jprompt) {
         env->ThrowNew(c_llama_error, response.c_str());
         return nullptr;
     }
+
+    if (result->is_stop()) {
+        ctx_server->queue_results.remove_waiting_task_id(id_task);
+    }
+
 
     const auto out_res = result->to_json();
 
@@ -679,6 +704,90 @@ JNIEXPORT jfloatArray JNICALL Java_de_kherud_llama_LlamaModel_embed(JNIEnv *env, jobject obj, jstring jprompt) {
     return j_embedding;
 }
 
+JNIEXPORT jobject JNICALL Java_de_kherud_llama_LlamaModel_rerank(JNIEnv *env, jobject obj, jstring jprompt, jobjectArray documents) {
+    jlong server_handle = env->GetLongField(obj, f_model_pointer);
+    auto *ctx_server = reinterpret_cast<server_context *>(server_handle); // NOLINT(*-no-int-to-ptr)
+
+    if (!ctx_server->params_base.reranking || ctx_server->params_base.embedding) {
+        env->ThrowNew(c_llama_error,
+                      "This server does not support reranking. Start it with `--reranking` and without `--embedding`");
+        return nullptr;
+    }
+
+    const std::string prompt = parse_jstring(env, jprompt);
+
+    const auto tokenized_query = tokenize_mixed(ctx_server->vocab, prompt, true, true);
+
+    json responses = json::array();
+    bool error = false;
+
+    std::vector<server_task> tasks;
+    const jsize argc = env->GetArrayLength(documents);
+    std::vector<std::string> documentsArray = parse_string_array_for_rerank(env, documents, argc);
+
+    std::vector<llama_tokens> tokenized_docs = tokenize_input_prompts(ctx_server->vocab, documentsArray, true, true);
+
+    tasks.reserve(tokenized_docs.size());
+    for (size_t i = 0; i < tokenized_docs.size(); i++) {
+        server_task task = server_task(SERVER_TASK_TYPE_RERANK);
+        task.id = ctx_server->queue_tasks.get_new_id();
+        task.index = i;
+        task.prompt_tokens = format_rerank(ctx_server->vocab, tokenized_query, tokenized_docs[i]);
+        tasks.push_back(task);
+    }
+    ctx_server->queue_results.add_waiting_tasks(tasks);
+    ctx_server->queue_tasks.post(tasks);
+
+    // get the result
+    std::unordered_set<int> task_ids = server_task::get_list_id(tasks);
+    std::vector<server_task_result_ptr> results(task_ids.size());
+
+    // Create a new HashMap instance
+    jobject o_probabilities = env->NewObject(c_hash_map, cc_hash_map);
+    if (o_probabilities == nullptr) {
+        env->ThrowNew(c_llama_error, "Failed to create HashMap object.");
+        return nullptr;
+    }
+
+    for (int i = 0; i < (int)task_ids.size(); i++) {
+        server_task_result_ptr result = ctx_server->queue_results.recv(task_ids);
+        if (result->is_error()) {
+            std::string response = result->to_json()["message"].get<std::string>();
+            for (const int id_task : task_ids) {
+                ctx_server->queue_results.remove_waiting_task_id(id_task);
+            }
+            env->ThrowNew(c_llama_error, response.c_str());
+            return nullptr;
+        }
+
+        const auto out_res = result->to_json();
+
+        std::cout << out_res.dump(4) << std::endl;
+
+        if (result->is_stop()) {
+            for (const int id_task : task_ids) {
+                ctx_server->queue_results.remove_waiting_task_id(id_task);
+            }
+        }
+
+        int index = out_res["index"].get<int>();
+        float score = out_res["score"].get<float>();
+        std::string tok_str = documentsArray[index];
+        jstring jtok_str = env->NewStringUTF(tok_str.c_str());
+
+        jobject jprob = env->NewObject(c_float, cc_float, score);
+        env->CallObjectMethod(o_probabilities, m_map_put, jtok_str, jprob);
+        env->DeleteLocalRef(jtok_str);
+        env->DeleteLocalRef(jprob);
+    }
+    jbyteArray jbytes = parse_jbytes(env, prompt);
+    return env->NewObject(c_output, cc_output, jbytes, o_probabilities, true);
+}
+
 JNIEXPORT jintArray JNICALL Java_de_kherud_llama_LlamaModel_encode(JNIEnv *env, jobject obj, jstring jprompt) {
     jlong server_handle = env->GetLongField(obj, f_model_pointer);
     auto *ctx_server = reinterpret_cast<server_context *>(server_handle); // NOLINT(*-no-int-to-ptr)
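Note on the result shape: the native rerank call above collects the per-document scores into an unordered Java HashMap keyed by the document text, so ordering the candidates is left to the caller. A minimal caller-side sketch (not part of the commit) of turning such a map into a descending ranking:

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Map;

// Sketch only, not part of the commit: order the document -> score map
// produced by the rerank binding from most to least relevant.
public final class RerankRanking {

    public static List<Map.Entry<String, Float>> byDescendingScore(Map<String, Float> scores) {
        List<Map.Entry<String, Float>> ranked = new ArrayList<>(scores.entrySet());
        // Highest relevance score first.
        ranked.sort(Map.Entry.<String, Float>comparingByValue(Comparator.reverseOrder()));
        return ranked;
    }
}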

src/main/cpp/jllama.h

Lines changed: 7 additions & 0 deletions
Some generated files are not rendered by default.

src/main/java/de/kherud/llama/LlamaModel.java

Lines changed: 3 additions & 0 deletions
@@ -5,6 +5,7 @@
 
 import java.lang.annotation.Native;
 import java.nio.charset.StandardCharsets;
+import java.util.List;
 import java.util.function.BiConsumer;
 
 /**
@@ -137,4 +138,6 @@ public void close() {
     public static String jsonSchemaToGrammar(String schema) {
         return new String(jsonSchemaToGrammarBytes(schema), StandardCharsets.UTF_8);
     }
+
+    public native LlamaOutput rerank(String query, String... documents);
 }
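For context, a hedged usage sketch of the new binding (not part of the commit). Only rerank(String, String...) and the LlamaModel/LlamaOutput types come from this diff; the model path and the setModel(...) builder call are illustrative assumptions, enableReRanking() is the setup hint given in the test's Javadoc below, and the probabilities field name is inferred from the LlamaOutput constructor used by the JNI code above.

import de.kherud.llama.LlamaModel;
import de.kherud.llama.LlamaOutput;
import de.kherud.llama.ModelParameters;

// Sketch only, not part of the commit. Builder calls and the model path are assumptions.
public final class RerankExample {

    public static void main(String[] args) {
        ModelParameters params = new ModelParameters()
                .setModel("models/jina-reranker-v1-tiny-en.Q4_K_M.gguf") // assumed path and setter name
                .enableReRanking();                                      // named in the test's Javadoc below
        try (LlamaModel model = new LlamaModel(params)) {
            LlamaOutput scores = model.rerank("Machine learning is",
                    "Machine learning studies statistical algorithms that learn from data.",
                    "Paris is the capital of France.");
            // Inferred: LlamaOutput exposes the document -> relevance-score map as `probabilities`.
            scores.probabilities.forEach((doc, score) -> System.out.println(score + "  " + doc));
        }
    }
}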

src/test/java/de/kherud/llama/LlamaModelTest.java

Lines changed: 20 additions & 0 deletions
@@ -158,6 +158,26 @@ public void testEmbedding() {
     float[] embedding = model.embed(prefix);
     Assert.assertEquals(4096, embedding.length);
 }
+
+
+@Ignore
+/**
+ * To run this test, download the model from https://huggingface.co/mradermacher/jina-reranker-v1-tiny-en-GGUF/tree/main,
+ * remove .enableEmbedding() from the model setup, add .enableReRanking(), and then enable the test.
+ */
+public void testReRanking() {
+
+    String query = "Machine learning is";
+    String[] TEST_DOCUMENTS = new String[] {
+            "A machine is a physical system that uses power to apply forces and control movement to perform an action. The term is commonly applied to artificial devices, such as those employing engines or motors, but also to natural biological macromolecules, such as molecular machines.",
+            "Learning is the process of acquiring new understanding, knowledge, behaviors, skills, values, attitudes, and preferences. The ability to learn is possessed by humans, non-human animals, and some machines; there is also evidence for some kind of learning in certain plants.",
+            "Machine learning is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions.",
+            "Paris, capitale de la France, est une grande ville européenne et un centre mondial de l'art, de la mode, de la gastronomie et de la culture. Son paysage urbain du XIXe siècle est traversé par de larges boulevards et la Seine."
+    };
+    LlamaOutput llamaOutput = model.rerank(query, TEST_DOCUMENTS[0], TEST_DOCUMENTS[1], TEST_DOCUMENTS[2], TEST_DOCUMENTS[3]);
+
+    System.out.println(llamaOutput);
+}
 
 @Test
 public void testTokenization() {
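The added test only prints the output. A small follow-up sketch (not part of the commit) of assertions that could go inside testReRanking() once it is enabled, based on what the JNI code above builds: one score per distinct document, keyed by the document text. The probabilities accessor on LlamaOutput is an inference from its constructor call in the native code.

// Sketch only, not part of the commit: possible assertions inside testReRanking().
// `probabilities` is inferred from the LlamaOutput constructor used by the native rerank code.
LlamaOutput llamaOutput = model.rerank(query, TEST_DOCUMENTS);
Assert.assertEquals(TEST_DOCUMENTS.length, llamaOutput.probabilities.size());
for (String document : TEST_DOCUMENTS) {
    Assert.assertTrue(llamaOutput.probabilities.containsKey(document));
}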
