Skip to content
Merged
Show file tree
Hide file tree
Changes from 122 commits
Commits
Show all changes
123 commits
Select commit Hold shift + click to select a range
672d7a0
init
ducviet00 May 17, 2025
96761e9
add modular
ducviet00 May 17, 2025
ef738f3
fixup
ducviet00 May 19, 2025
b9fd337
update configuration
ducviet00 May 19, 2025
a11c71e
add processing file
ducviet00 May 19, 2025
dc245e7
update auto files
ducviet00 May 19, 2025
8e7aa37
update
ducviet00 May 19, 2025
52e0b85
update modular
ducviet00 May 20, 2025
f022ac0
green setup_and_quality ci
ducviet00 May 20, 2025
e4674c8
it works
ducviet00 May 20, 2025
80f1378
fix some tests
ducviet00 May 20, 2025
cc32ba4
commit florence2
ducviet00 May 20, 2025
dc85010
update test
ducviet00 May 20, 2025
86d103d
make test cases done - 16 left
ducviet00 May 20, 2025
4d12924
style
ducviet00 May 20, 2025
2fc5916
fix few test cases
ducviet00 May 20, 2025
6c52d98
fix some tests
ducviet00 May 21, 2025
37bc385
Merge branch 'main' into feat_add_florence2
ducviet00 May 21, 2025
a81df22
fix init test
ducviet00 May 21, 2025
2c8095a
update florence2 vision style
ducviet00 May 22, 2025
88e0d6e
hope is green
ducviet00 May 22, 2025
505a878
fix init test
ducviet00 May 22, 2025
c2519ad
fix init
ducviet00 May 23, 2025
d9c7c1c
update modular
ducviet00 May 23, 2025
be4bcb0
refactor vision module
ducviet00 May 24, 2025
88ba6c7
fix: channel attention use dynamic scale
ducviet00 May 24, 2025
dc92ba2
Merge branch 'main' into feat_add_florence2
ducviet00 May 24, 2025
daa4170
update modular
ducviet00 May 24, 2025
ae01f45
update
ducviet00 May 25, 2025
dc35c65
update attention mask
ducviet00 May 26, 2025
019c3c5
Merge branch 'main' into feat_add_florence2
ducviet00 May 27, 2025
f761844
Merge branch 'main' into feat_add_florence2
ducviet00 May 28, 2025
c013749
update
ducviet00 May 28, 2025
68b8953
Merge branch 'main' into feat_add_florence2
ducviet00 May 30, 2025
00c5347
Merge branch 'main' into feat_add_florence2
ducviet00 Jun 3, 2025
326dd1c
fix naming
ducviet00-h2 Jun 9, 2025
576ba8a
Update src/transformers/models/florence2/processing_florence2.py
ducviet00 Jun 9, 2025
ccf8e17
Merge branch 'main' into feat_add_florence2
ducviet00 Jun 11, 2025
5475908
spatial block works
ducviet00-h2 Jun 12, 2025
5bed246
more beautiful
ducviet00-h2 Jun 12, 2025
ee8e823
more more beautiful
ducviet00-h2 Jun 20, 2025
5ce23f4
Merge branch 'main' into feat_add_florence2
ducviet00 Jun 20, 2025
30c9e97
merge main
ducviet00 Jun 20, 2025
e0cfb35
merge main and fixup
ducviet00 Jun 20, 2025
317232e
fix typing hint
ducviet00 Jun 20, 2025
af63c68
Merge branch 'main' into feat_add_florence2
ducviet00 Jun 28, 2025
e557dc6
update modeling
ducviet00 Jun 28, 2025
8a5e7dc
fix eager matches sdpa
ducviet00 Jun 29, 2025
223b04e
fix style
ducviet00 Jun 29, 2025
e2fc7de
fix compile test - all green
ducviet00 Jun 30, 2025
b2809a4
remove florence2 language
ducviet00 Jun 30, 2025
9184b44
remove Florence2LanguageModel things
ducviet00 Jun 30, 2025
93056d3
fix style
ducviet00 Jun 30, 2025
aa88d1f
Merge branch 'main' into feat_add_florence2
ducviet00 Jun 30, 2025
19978e8
update florence2 model
ducviet00 Jul 1, 2025
aa5d795
Merge branch 'main' into feat_add_florence2
ducviet00 Jul 9, 2025
7da8bed
override prepare encoder_decoder for generation
ducviet00 Jul 10, 2025
8ccedcf
add weight conversion script
ducviet00 Jul 10, 2025
7eac0a5
rewrite channel attention to use sdpa
ducviet00 Jul 11, 2025
c10af87
eleminate 1 tranpose op
ducviet00 Jul 11, 2025
b01e3a9
support fa2
ducviet00-h2 Jul 11, 2025
7c2cf80
fix quality check
ducviet00 Jul 11, 2025
ec7229c
chore: reformat `test_modeling_florence2.py`
ducviet00 Jul 11, 2025
6fbbe30
some refactor for processor
ducviet00 Jul 13, 2025
c2ee7b6
some refactor for processor
ducviet00 Jul 13, 2025
f6937b0
Merge branch 'main' into feat_add_florence2
ducviet00 Jul 15, 2025
89ef8b9
update naming convention and remove BC
ducviet00 Jul 15, 2025
7fa4895
make it pass the test
ducviet00 Jul 15, 2025
e53f5b2
fix: correct Embedding Cosine
ducviet00 Jul 15, 2025
9ee7917
update comments and docstring
ducviet00 Jul 15, 2025
beb1ac5
support input_embeds
ducviet00 Jul 16, 2025
b299250
support input embeds ideally
ducviet00 Jul 16, 2025
a8d2f04
fix style
ducviet00 Jul 16, 2025
e71c2ec
fix style
ducviet00 Jul 16, 2025
a138e9e
fix style again :D
ducviet00 Jul 17, 2025
6442077
add test prcoessor
ducviet00 Jul 26, 2025
7451af7
refactor processor and add test for processor
ducviet00 Jul 26, 2025
78c8bf2
reformat test processor
ducviet00 Jul 26, 2025
de5c991
Merge remote-tracking branch 'upstream/main' into feat_add_florence2
ducviet00 Jul 26, 2025
13f536b
make fixup
ducviet00 Jul 26, 2025
60b4b0d
fix schema check
ducviet00 Jul 26, 2025
ba89428
Merge branch 'main' into feat_add_florence2
ducviet00 Jul 28, 2025
27d945c
remove image_token
ducviet00 Jul 29, 2025
af8077e
Merge branch 'main' into feat_add_florence2
ducviet00 Jul 30, 2025
2d782e4
Merge branch 'main' into feat_add_florence2
ducviet00 Jul 31, 2025
97d7277
ensure image token in tokenizer and fix integration tests
ducviet00-h2 Jul 31, 2025
9c8c553
fix processor test
ducviet00-h2 Jul 31, 2025
fe84671
add more integration tests for large model and rename test_processor …
ducviet00-h2 Jul 31, 2025
70475c4
test_assisted_decoding_sample should pass
ducviet00-h2 Aug 1, 2025
bd248f4
update doc and make model work with image text to text pipeline
ducviet00-h2 Aug 1, 2025
4bf97fc
docs: add sdpa bagde
ducviet00-h2 Aug 1, 2025
9672bcb
Merge branch 'main' into feat_add_florence2
ducviet00 Aug 5, 2025
ca0e4e1
resolve cyril's comments
ducviet00-h2 Aug 5, 2025
0d8e313
fix import torch error
ducviet00-h2 Aug 5, 2025
10f3085
Merge branch 'huggingface:main' into feat_add_florence2
ducviet00 Aug 5, 2025
51512ae
Merge branch 'main' into feat_add_florence2
ducviet00 Aug 5, 2025
a04b959
add helper get_placeholder_mask
ducviet00-h2 Aug 5, 2025
82f6371
inherit from llava
ducviet00-h2 Aug 5, 2025
f0458d7
florence2 may not _supports_attention_backend because of bart ...
ducviet00-h2 Aug 5, 2025
e498af8
Merge branch 'main' into feat_add_florence2
ducviet00 Aug 6, 2025
d2a2a74
Merge branch 'main' into feat_add_florence2
ducviet00 Aug 6, 2025
1eef7c8
move florence2 model card to multimodal
ducviet00-h2 Aug 6, 2025
7a0c1e0
let base model always return_dict
ducviet00-h2 Aug 7, 2025
97be161
fix style
ducviet00-h2 Aug 7, 2025
f6bb575
tiny update doc
ducviet00-h2 Aug 7, 2025
3adcadd
set _checkpoint_conversion_mapping = {}
ducviet00-h2 Aug 7, 2025
1e70103
Merge branch 'main' into feat_add_florence2
ducviet00 Aug 12, 2025
a8655f6
Merge branch 'main' into feat_add_florence2
ducviet00 Aug 12, 2025
a98d2ab
Merge branch 'main' into feat_add_florence2
ducviet00 Aug 12, 2025
5d79f54
fix code quality
ducviet00-h2 Aug 12, 2025
5b2b504
Merge branch 'main' into feat_add_florence2
ducviet00 Aug 13, 2025
936e970
Merge branch 'main' into feat_add_florence2
ducviet00 Aug 13, 2025
c6a7534
Merge branch 'main' into feat_add_florence2
ducviet00 Aug 14, 2025
8fc8eae
Merge branch 'main' into feat_add_florence2
ducviet00 Aug 15, 2025
ca8cc31
support flex and compile graph and move external func to internal func
ducviet00-h2 Aug 18, 2025
698b97c
remove condition because it always true
ducviet00-h2 Aug 18, 2025
7f8999e
remove window funcs
ducviet00-h2 Aug 18, 2025
3ca3cbf
move post processor config out
ducviet00-h2 Aug 18, 2025
7e9decf
Merge branch 'main' into feat_add_florence2
ducviet00 Aug 18, 2025
2561b2b
fix ci
ducviet00-h2 Aug 18, 2025
0d7031d
new intro to trigger test
ducviet00 Aug 18, 2025
5eb389f
Merge branch 'main' into feat_add_florence2
ducviet00 Aug 18, 2025
478ee13
remove `kernel_size` argument
ducviet00 Aug 20, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/source/en/_toctree.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1005,6 +1005,8 @@
title: Evolla
- local: model_doc/flava
title: FLAVA
- local: model_doc/florence2
title: Florence2
- local: model_doc/gemma3
title: Gemma3
- local: model_doc/gemma3n
Expand Down
185 changes: 185 additions & 0 deletions docs/source/en/model_doc/florence2.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->

<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
</div>

# Florence-2

[Florence-2](https://arxiv.org/abs/2311.06242) is an advanced vision foundation model that uses a prompt-based approach to handle a wide range of vision and vision-language tasks. Florence-2 can interpret simple text prompts to perform tasks like captioning, object detection, and segmentation. It leverages the FLD-5B dataset, containing 5.4 billion annotations across 126 million images, to master multi-task learning. The model's sequence-to-sequence architecture enables it to excel in both zero-shot and fine-tuned settings, proving to be a competitive vision foundation model.

You can find all the original Florence-2 checkpoints under the [Florence-2](https://huggingface.co/models?other=florence-2) collection.

> [!TIP]
> This model was contributed by [ducviet00](https://huggingface.co/ducviet00).
> Click on the Florence-2 models in the right sidebar for more examples of how to apply Florence-2 to different vision and language tasks.

The example below demonstrates how to perform object detection with [`Pipeline`] or the [`AutoModel`] class.

<hfoptions id="usage">
<hfoption id="Pipeline">

```py
import torch
import requests
from PIL import Image
from transformers import pipeline

pipeline = pipeline(
"image-text-to-text",
model="ducviet00/Florence-2-base-hf",
device=0,
torch_dtype=torch.bfloat16
)

pipeline(
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true",
text="<OD>"
)
```

</hfoption>
<hfoption id="AutoModel">

```py
import torch
import requests
from PIL import Image
from transformers import AutoProcessor, Florence2ForConditionalGeneration

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")

model = Florence2ForConditionalGeneration.from_pretrained("microsoft/Florence-2-base", torch_dtype=torch.bfloat16, device_map="auto")
processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base")

task_prompt = "<OD>"
inputs = processor(text=task_prompt, images=image, return_tensors="pt").to(model.device)

generated_ids = model.generate(
**inputs,
max_new_tokens=1024,
num_beams=3,
)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]

image_size = image.size
parsed_answer = processor.post_process_generation(generated_text, task=task_prompt, image_size=image_size)
print(parsed_answer)
```

</hfoption>
</hfoptions>

Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.

The example below uses [bitsandbytes](../quantization/bitsandbytes) to quantize the model to 4-bit.

```py
# pip install bitsandbytes
import torch
import requests
from PIL import Image
from transformers import AutoProcessor, Florence2ForConditionalGeneration, BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(load_in_4bit=True)

model = Florence2ForConditionalGeneration.from_pretrained(
"microsoft/Florence-2-large",
torch_dtype=torch.bfloat16,
device_map="auto",
quantization_config=quantization_config
)
processor = AutoProcessor.from_pretrained("microsoft/Florence-2-large")

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")

task_prompt = "<OD>"
inputs = processor(text=task_prompt, images=image, return_tensors="pt").to(model.device, torch.bfloat16)

generated_ids = model.generate(
**inputs,
max_new_tokens=1024,
num_beams=3,
)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]

image_size = image.size
parsed_answer = processor.post_process_generation(generated_text, task=task_prompt, image_size=image_size)

print(parsed_answer)
```

<div class="flex justify-center">
<img src=""/>
</div>

## Notes

- Florence-2 is a prompt-based model. You need to provide a task prompt to tell the model what to do. Supported tasks are:
    - `<OCR>`
    - `<OCR_WITH_REGION>`
    - `<CAPTION>`
    - `<DETAILED_CAPTION>`
    - `<MORE_DETAILED_CAPTION>`
    - `<OD>`
    - `<DENSE_REGION_CAPTION>`
    - `<CAPTION_TO_PHRASE_GROUNDING>`
    - `<REFERRING_EXPRESSION_SEGMENTATION>`
    - `<REGION_TO_SEGMENTATION>`
    - `<OPEN_VOCABULARY_DETECTION>`
    - `<REGION_TO_CATEGORY>`
    - `<REGION_TO_DESCRIPTION>`
    - `<REGION_TO_OCR>`
    - `<REGION_PROPOSAL>`
- The raw output of the model is a string that needs to be parsed. The [`Florence2Processor`] has a [`~Florence2Processor.post_process_generation`] method that can parse the string into a more usable format, like bounding boxes and labels for object detection.

## Resources

- [Florence-2 technical report](https://arxiv.org/abs/2311.06242)
- [Jupyter Notebook for inference and visualization of Florence-2-large model](https://huggingface.co/microsoft/Florence-2-large/blob/main/sample_inference.ipynb)

## Florence2VisionConfig

[[autodoc]] Florence2VisionConfig

## Florence2Config

[[autodoc]] Florence2Config

## Florence2Processor

[[autodoc]] Florence2Processor

## Florence2Model

[[autodoc]] Florence2Model
    - forward

## Florence2ForConditionalGeneration

[[autodoc]] Florence2ForConditionalGeneration
    - forward

## Florence2VisionBackbone

[[autodoc]] Florence2VisionBackbone
    - forward
1 change: 1 addition & 0 deletions src/transformers/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@
from .fastspeech2_conformer import *
from .flaubert import *
from .flava import *
from .florence2 import *
from .fnet import *
from .focalnet import *
from .fsmt import *
Expand Down
2 changes: 2 additions & 0 deletions src/transformers/models/auto/configuration_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@
("fastspeech2_conformer_with_hifigan", "FastSpeech2ConformerWithHifiGanConfig"),
("flaubert", "FlaubertConfig"),
("flava", "FlavaConfig"),
("florence2", "Florence2Config"),
("fnet", "FNetConfig"),
("focalnet", "FocalNetConfig"),
("fsmt", "FSMTConfig"),
Expand Down Expand Up @@ -563,6 +564,7 @@
("flan-ul2", "FLAN-UL2"),
("flaubert", "FlauBERT"),
("flava", "FLAVA"),
("florence2", "Florence2"),
("fnet", "FNet"),
("focalnet", "FocalNet"),
("fsmt", "FairSeq Machine-Translation"),
Expand Down
3 changes: 3 additions & 0 deletions src/transformers/models/auto/modeling_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
("fastspeech2_conformer_with_hifigan", "FastSpeech2ConformerWithHifiGan"),
("flaubert", "FlaubertModel"),
("flava", "FlavaModel"),
("florence2", "Florence2Model"),
("fnet", "FNetModel"),
("focalnet", "FocalNetModel"),
("fsmt", "FSMTModel"),
Expand Down Expand Up @@ -436,6 +437,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
("falcon_mamba", "FalconMambaForCausalLM"),
("flaubert", "FlaubertWithLMHeadModel"),
("flava", "FlavaForPreTraining"),
("florence2", "Florence2ForConditionalGeneration"),
("fnet", "FNetForPreTraining"),
("fsmt", "FSMTForConditionalGeneration"),
("funnel", "FunnelForPreTraining"),
Expand Down Expand Up @@ -978,6 +980,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
("deepseek_vl_hybrid", "DeepseekVLHybridForConditionalGeneration"),
("emu3", "Emu3ForConditionalGeneration"),
("evolla", "EvollaForProteinText2Text"),
("florence2", "Florence2ForConditionalGeneration"),
("fuyu", "FuyuForCausalLM"),
("gemma3", "Gemma3ForConditionalGeneration"),
("gemma3n", "Gemma3nForConditionalGeneration"),
Expand Down
1 change: 1 addition & 0 deletions src/transformers/models/auto/processing_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@
("emu3", "Emu3Processor"),
("evolla", "EvollaProcessor"),
("flava", "FlavaProcessor"),
("florence2", "Florence2Processor"),
("fuyu", "FuyuProcessor"),
("gemma3", "Gemma3Processor"),
("gemma3n", "Gemma3nProcessor"),
Expand Down
28 changes: 28 additions & 0 deletions src/transformers/models/florence2/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING

from ...utils import _LazyModule
from ...utils.import_utils import define_import_structure


if TYPE_CHECKING:
from .configuration_florence2 import *
from .modeling_florence2 import *
from .processing_florence2 import *
else:
import sys

_file = globals()["__file__"]
sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
Loading