Skip to content
Merged

update #36972

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ jobs:
path: ~/transformers/installed.txt
- run: python -c "from transformers import *" || (echo '🚨 import failed, this means you introduced unprotected imports! 🚨'; exit 1)
- run: ruff check examples tests src utils
- run: ruff format tests src utils --check
- run: ruff format examples tests src utils --check
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So that we won't run into such a situation again in the future.

- run: python utils/custom_init_isort.py --check_only
- run: python utils/sort_auto_mappings.py --check_only
- run: python utils/check_doc_toc.py
Expand Down
2 changes: 1 addition & 1 deletion examples/flax/language-modeling/run_bert_flax.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,4 +53,4 @@ def func():
func()
end = time.time()
print(end - start)
print(f"Throughput: {((nbenchmark * BS)/(end-start)):.3f} examples/sec")
print(f"Throughput: {((nbenchmark * BS) / (end - start)):.3f} examples/sec")
6 changes: 3 additions & 3 deletions examples/legacy/seq2seq/finetune_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,9 +231,9 @@ def main():

# set decoder_start_token_id for MBart
if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
assert (
data_args.tgt_lang is not None and data_args.src_lang is not None
), "mBart requires --tgt_lang and --src_lang"
assert data_args.tgt_lang is not None and data_args.src_lang is not None, (
"mBart requires --tgt_lang and --src_lang"
)
if isinstance(tokenizer, MBartTokenizer):
model.config.decoder_start_token_id = tokenizer.lang_code_to_id[data_args.tgt_lang]
else:
Expand Down
2 changes: 1 addition & 1 deletion examples/legacy/seq2seq/run_eval_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ def run_search():

results_sorted = sorted(results, key=operator.itemgetter(*task_score_names[task]), reverse=True)
print(" | ".join([f"{col:{col_widths[col]}}" for col in col_names]))
print(" | ".join([f"{'-'*col_widths[col]}" for col in col_names]))
print(" | ".join([f"{'-' * col_widths[col]}" for col in col_names]))
for row in results_sorted:
print(" | ".join([f"{row[col]:{col_widths[col]}}" for col in col_names]))

Expand Down
8 changes: 4 additions & 4 deletions examples/legacy/seq2seq/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,9 +282,9 @@ def __init__(self, tokenizer, data_args, decoder_start_token_id, tpu_num_cores=N
self.tokenizer = tokenizer
self.pad_token_id = tokenizer.pad_token_id
self.decoder_start_token_id = decoder_start_token_id
assert (
self.pad_token_id is not None
), f"pad_token_id is not defined for ({self.tokenizer.__class__.__name__}), it must be defined."
assert self.pad_token_id is not None, (
f"pad_token_id is not defined for ({self.tokenizer.__class__.__name__}), it must be defined."
)
self.data_args = data_args
self.tpu_num_cores = tpu_num_cores
self.dataset_kwargs = {"add_prefix_space": True} if isinstance(tokenizer, BartTokenizer) else {}
Expand Down Expand Up @@ -593,7 +593,7 @@ def assert_all_frozen(model):
model_grads: List[bool] = list(grad_status(model))
n_require_grad = sum(lmap(int, model_grads))
npars = len(model_grads)
assert not any(model_grads), f"{n_require_grad/npars:.1%} of {npars} weights require grad"
assert not any(model_grads), f"{n_require_grad / npars:.1%} of {npars} weights require grad"


def assert_not_all_frozen(model):
Expand Down
2 changes: 1 addition & 1 deletion examples/legacy/token-classification/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ def write_predictions_to_file(self, writer: TextIO, test_input_reader: TextIO, p
s_p = preds_list[example_id]
out = ""
for token in sentence:
out += f'{token["form"]} ({token["upos"]}|{s_p.pop(0)}) '
out += f"{token['form']} ({token['upos']}|{s_p.pop(0)}) "
out += "\n"
writer.write(out)
example_id += 1
Expand Down
2 changes: 1 addition & 1 deletion examples/modular-transformers/modeling_multimodal2.py
Original file line number Diff line number Diff line change
Expand Up @@ -534,7 +534,7 @@ def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=Fals
batch_size, _, height, width = pixel_values.shape
if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
raise ValueError(
f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})."
f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
)
target_dtype = self.patch_embedding.weight.dtype
patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid]
Expand Down
2 changes: 1 addition & 1 deletion examples/pytorch/language-modeling/run_clm.py
Original file line number Diff line number Diff line change
Expand Up @@ -438,7 +438,7 @@ def main():
else:
model = AutoModelForCausalLM.from_config(config, trust_remote_code=model_args.trust_remote_code)
n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values())
logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")
logger.info(f"Training new model from scratch - Total size={n_params / 2**20:.2f}M params")

# We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
# on a small vocab and want a smaller embedding size, remove this test.
Expand Down
5 changes: 2 additions & 3 deletions examples/pytorch/language-modeling/run_fim.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,8 +265,7 @@ class DataTrainingArguments:
default="<fim_pad>",
metadata={
"help": (
"Fill-in-Middle Pad token. Used only when 'truncate_or_pad' is set to True. "
"Defaults to '<fim_pad>'."
"Fill-in-Middle Pad token. Used only when 'truncate_or_pad' is set to True. Defaults to '<fim_pad>'."
)
},
)
Expand Down Expand Up @@ -514,7 +513,7 @@ def main():
attn_implementation=model_args.attn_implementation,
)
n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values())
logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")
logger.info(f"Training new model from scratch - Total size={n_params / 2**20:.2f}M params")

# Add the new FIM tokens to the tokenizer and resize model's vocab embeddings
special_tokens = [data_args.fim_prefix_token, data_args.fim_middle_token, data_args.fim_suffix_token]
Expand Down
4 changes: 1 addition & 3 deletions examples/pytorch/language-modeling/run_fim_no_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,9 +234,7 @@ def parse_args():
"--fim_pad_token",
type=str,
default="<fim_pad>",
help=(
"Fill-in-Middle Pad token. Used only when 'truncate_or_pad' is set to True." " Defaults to '<fim_pad>'."
),
help=("Fill-in-Middle Pad token. Used only when 'truncate_or_pad' is set to True. Defaults to '<fim_pad>'."),
)
parser.add_argument(
"--preprocessing_num_workers",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -491,7 +491,7 @@ def main():
# E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
# that could be easily picked up by the model
chars_to_ignore_regex = (
f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None
f"[{''.join(data_args.chars_to_ignore)}]" if data_args.chars_to_ignore is not None else None
)
text_column_name = data_args.text_column_name

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -471,7 +471,7 @@ def main():
# E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
# that could be easily picked up by the model
chars_to_ignore_regex = (
f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None
f"[{''.join(data_args.chars_to_ignore)}]" if data_args.chars_to_ignore is not None else None
)
text_column_name = data_args.text_column_name

Expand Down
6 changes: 3 additions & 3 deletions examples/pytorch/summarization/run_summarization.py
Original file line number Diff line number Diff line change
Expand Up @@ -505,9 +505,9 @@ def main():
return

if isinstance(tokenizer, tuple(MULTILINGUAL_TOKENIZERS)):
assert (
data_args.lang is not None
), f"{tokenizer.__class__.__name__} is a multilingual tokenizer which requires --lang argument"
assert data_args.lang is not None, (
f"{tokenizer.__class__.__name__} is a multilingual tokenizer which requires --lang argument"
)

tokenizer.src_lang = data_args.lang
tokenizer.tgt_lang = data_args.lang
Expand Down
12 changes: 6 additions & 6 deletions examples/pytorch/text-classification/run_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,9 +199,9 @@ def __post_init__(self):
train_extension = self.train_file.split(".")[-1]
assert train_extension in ["csv", "json"], "`train_file` should be a csv or a json file."
validation_extension = self.validation_file.split(".")[-1]
assert (
validation_extension == train_extension
), "`validation_file` should have the same extension (csv or json) as `train_file`."
assert validation_extension == train_extension, (
"`validation_file` should have the same extension (csv or json) as `train_file`."
)


@dataclass
Expand Down Expand Up @@ -357,9 +357,9 @@ def main():
if data_args.test_file is not None:
train_extension = data_args.train_file.split(".")[-1]
test_extension = data_args.test_file.split(".")[-1]
assert (
test_extension == train_extension
), "`test_file` should have the same extension (csv or json) as `train_file`."
assert test_extension == train_extension, (
"`test_file` should have the same extension (csv or json) as `train_file`."
)
data_files["test"] = data_args.test_file
else:
raise ValueError("Need either a dataset name or a test file for `do_predict`.")
Expand Down
12 changes: 6 additions & 6 deletions examples/pytorch/text-classification/run_glue.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,9 +156,9 @@ def __post_init__(self):
train_extension = self.train_file.split(".")[-1]
assert train_extension in ["csv", "json"], "`train_file` should be a csv or a json file."
validation_extension = self.validation_file.split(".")[-1]
assert (
validation_extension == train_extension
), "`validation_file` should have the same extension (csv or json) as `train_file`."
assert validation_extension == train_extension, (
"`validation_file` should have the same extension (csv or json) as `train_file`."
)


@dataclass
Expand Down Expand Up @@ -313,9 +313,9 @@ def main():
if data_args.test_file is not None:
train_extension = data_args.train_file.split(".")[-1]
test_extension = data_args.test_file.split(".")[-1]
assert (
test_extension == train_extension
), "`test_file` should have the same extension (csv or json) as `train_file`."
assert test_extension == train_extension, (
"`test_file` should have the same extension (csv or json) as `train_file`."
)
data_files["test"] = data_args.test_file
else:
raise ValueError("Need either a GLUE task or a test file for `do_predict`.")
Expand Down
2 changes: 1 addition & 1 deletion examples/pytorch/text-generation/run_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,7 +322,7 @@ def main():
parser.add_argument(
"--use_cpu",
action="store_true",
help="Whether or not to use cpu. If set to False, " "we will use gpu/npu or mps device if available",
help="Whether or not to use cpu. If set to False, we will use gpu/npu or mps device if available",
)
parser.add_argument("--num_return_sequences", type=int, default=1, help="The number of samples to generate.")
parser.add_argument(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def main():
parser.add_argument(
"--use_cpu",
action="store_true",
help="Whether or not to use cpu. If set to False, " "we will use gpu/npu or mps device if available",
help="Whether or not to use cpu. If set to False, we will use gpu/npu or mps device if available",
)
parser.add_argument(
"--fp16",
Expand Down
6 changes: 3 additions & 3 deletions examples/pytorch/translation/run_translation_no_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -436,9 +436,9 @@ def main():

# Set decoder_start_token_id
if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
assert (
args.target_lang is not None and args.source_lang is not None
), "mBart requires --target_lang and --source_lang"
assert args.target_lang is not None and args.source_lang is not None, (
"mBart requires --target_lang and --source_lang"
)
if isinstance(tokenizer, MBartTokenizer):
model.config.decoder_start_token_id = tokenizer.lang_code_to_id[args.target_lang]
else:
Expand Down
2 changes: 1 addition & 1 deletion examples/run_on_remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@
cluster.run(["pip install torch --upgrade --extra-index-url https://download.pytorch.org/whl/cu117"])

# Run example. You can bypass the CLI wrapper and paste your own code here.
cluster.run([f'python transformers/examples/{args.example} {" ".join(shlex.quote(arg) for arg in unknown)}'])
cluster.run([f"python transformers/examples/{args.example} {' '.join(shlex.quote(arg) for arg in unknown)}"])

# Alternatively, we can just import and run a training function (especially if there's no wrapper CLI):
# from my_script... import train
Expand Down
6 changes: 3 additions & 3 deletions examples/tensorflow/translation/run_translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -501,9 +501,9 @@ def preprocess_function(examples):

# region Set decoder_start_token_id
if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
assert (
data_args.target_lang is not None and data_args.source_lang is not None
), "mBart requires --target_lang and --source_lang"
assert data_args.target_lang is not None and data_args.source_lang is not None, (
"mBart requires --target_lang and --source_lang"
)
if isinstance(tokenizer, MBartTokenizer):
model.config.decoder_start_token_id = tokenizer.lang_code_to_id[data_args.target_lang]
else:
Expand Down