with tokenizer.as_target_tokenizer():
    print(tokenizer("Hello, this one sentence!"))
    model_input = tokenizer("Hello, this one sentence!")
    tokens = tokenizer.convert_ids_to_tokens(model_input['input_ids'])
    # print the tokens to take a look at the special tokens
    print('tokens: {}'.format(tokens))
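The as_target_tokenizer context switches the tokenizer into target-language mode, so labels get tokenized the way the decoder expects. preprocess_function below also reads several globals (prefix, source_lang, target_lang, max_input_length, max_target_length); if they were not already set earlier in the tutorial, a hypothetical setup might look like this (adjust to your own checkpoint and language pair):

source_lang = "en"       # hypothetical source language
target_lang = "ro"       # hypothetical target language
max_input_length = 128   # truncation length for source sentences
max_target_length = 128  # truncation length for target sentences
prefix = ""              # T5-style models need a task prefix; Marian-style models do not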
def preprocess_function(examples):
    inputs = [prefix + ex[source_lang] for ex in examples["translation"]]
    targets = [ex[target_lang] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
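With preprocess_function complete, it can be mapped over the whole dataset, and the helpers that compute_metrics below relies on can be defined. A minimal sketch, assuming raw_datasets is the DatasetDict loaded earlier and that BLEU comes from sacrebleu via datasets.load_metric (use evaluate.load("sacrebleu") with the newer evaluate package):

import numpy as np
from datasets import load_metric

# Tokenize every split in one pass; batched=True feeds the tokenizer lists of examples
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

metric = load_metric("sacrebleu")

def postprocess_text(preds, labels):
    # Strip whitespace and wrap each reference in a list, as sacrebleu expects
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels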
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Average generated length, counting non-padding tokens per prediction
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result
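compute_metrics reports BLEU (sacrebleu's score field) plus the mean generated length, and is passed to the trainer so both are computed at each evaluation. A sketch of the wiring, assuming model, args (a Seq2SeqTrainingArguments with predict_with_generate=True), and data_collator were created earlier in the tutorial:

from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)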