Implemented Roberta Model #65

Open · wants to merge 6 commits into base: main
4 changes: 4 additions & 0 deletions ivy_models/__init__.py
@@ -23,4 +23,8 @@

from . import bert
from .bert import *

from . import roberta
from .roberta import *

from .vit import *
1 change: 1 addition & 0 deletions ivy_models/roberta/__init__.py
@@ -0,0 +1 @@
from .roberta import RobertaModel, roberta_base
55 changes: 55 additions & 0 deletions ivy_models/roberta/layers.py
@@ -0,0 +1,55 @@
import ivy
from ivy_models.bert.layers import BertEmbedding


class RobertaEmbeddings(BertEmbedding):
"""Same as Bert Embedding with tiny change in the positional indexing."""

    def __init__(
        self,
        vocab_size,
        hidden_size,
        max_position_embeddings,
        type_vocab_size=1,
        pad_token_id=None,
        embd_drop_rate=0.1,
        layer_norm_eps=1e-5,
        position_embedding_type="absolute",
        v=None,
    ):
        super(RobertaEmbeddings, self).__init__(
            vocab_size,
            hidden_size,
            max_position_embeddings,
            type_vocab_size,
            pad_token_id,
            embd_drop_rate,
            layer_norm_eps,
            position_embedding_type,
            v,
        )
        self.padding_idx = 1

    def _forward(
        self,
        input_ids,
        token_type_ids=None,
        position_ids=None,
        past_key_values_length: int = 0,
    ):
        input_shape = input_ids.shape
        seq_length = input_shape[1]

        if position_ids is None:
            # RoBERTa reserves positions up to padding_idx for padding, so absolute
            # positions start at padding_idx + 1; the extra + 1 on the stop index is
            # needed so that exactly seq_length positions are generated
            position_ids = ivy.expand_dims(
                ivy.arange(self.padding_idx + 1, seq_length + self.padding_idx + 1),
                axis=0,
            )
            position_ids = position_ids[
                :, past_key_values_length : seq_length + past_key_values_length
            ]
        return super(RobertaEmbeddings, self)._forward(
            input_ids,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            past_key_values_length=past_key_values_length,
        )
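For context on the positional-indexing change: RoBERTa offsets absolute positions by the padding index, so a sequence of length seq_length is assigned positions padding_idx + 1 … padding_idx + seq_length rather than 0 … seq_length - 1 as in BERT. A minimal standalone sketch of the two conventions (illustrative only, not part of the diff; assumes the numpy backend is available):

import ivy

ivy.set_backend("numpy")

seq_length = 5
padding_idx = 1  # pad token id in the RoBERTa vocabulary

# BERT-style positions start at 0
bert_position_ids = ivy.expand_dims(ivy.arange(0, seq_length), axis=0)

# RoBERTa-style positions start right after the padding index
roberta_position_ids = ivy.expand_dims(
    ivy.arange(padding_idx + 1, seq_length + padding_idx + 1), axis=0
)

print(ivy.to_numpy(bert_position_ids))     # [[0 1 2 3 4]]
print(ivy.to_numpy(roberta_position_ids))  # [[2 3 4 5 6]]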
91 changes: 91 additions & 0 deletions ivy_models/roberta/roberta.py
@@ -0,0 +1,91 @@
from ivy_models.helpers import load_transformers_weights
from ivy_models.bert import BertConfig, BertModel
from .layers import RobertaEmbeddings


class RobertaModel(BertModel):
    def __init__(self, config: BertConfig, pooler_out=False):
        super(RobertaModel, self).__init__(config, pooler_out=pooler_out)

    @classmethod
    def get_spec_class(cls):
        return BertConfig

    def _build(self, *args, **kwargs):
        self.embeddings = RobertaEmbeddings(**self.config.get_embd_attrs())
        super(RobertaModel, self)._build(*args, **kwargs)

    def _forward(
        self,
        input_ids,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
    ):
        if input_ids[:, 0].sum().item() != 0:
Contributor: should be to_scalar instead of item
print("NOT ALLOWED")
return super(RobertaModel, self)._forward(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
past_key_values=past_key_values,
use_cache=use_cache,
output_attentions=output_attentions,
)


def _roberta_weights_mapping(name):
    key_map = [(f"__v{i}__", f"__{i}__") for i in range(12)]
    key_map = key_map + [
        ("attention__dense", "attention.output.dense"),
        ("attention__LayerNorm", "attention.output.LayerNorm"),
    ]
    key_map = key_map + [
        ("ffd__dense1", "intermediate.dense"),
        ("ffd__dense2", "output.dense"),
        ("ffd__LayerNorm", "output.LayerNorm"),
    ]
    name = name.replace("__w", ".weight").replace("__b", ".bias")
    name = (
        name.replace("biasias", "bias")
        .replace("weighteight", "weight")
        .replace(".weightord", ".word")
    )
    for ref, new in key_map:
        name = name.replace(ref, new)
    name = name.replace("__", ".")
    return name


def roberta_base(pretrained=True):
    # the hyperparameters are the same as bert-base;
    # dropout rates are set to 0.0 to avoid stochasticity in the output

    config = BertConfig(
        vocab_size=50265,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout=0.0,
        attn_drop_rate=0.0,
        max_position_embeddings=514,
        type_vocab_size=1,
    )

    model = RobertaModel(config, pooler_out=True)
    if pretrained:
        w_clean = load_transformers_weights(
            "roberta-base", model, _roberta_weights_mapping
        )
        model.v = w_clean
    return model
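A note on _roberta_weights_mapping: the "biasias", "weighteight" and ".weightord" fix-ups are needed because the blanket "__w"/"__b" replacements also hit substrings like "__word" and "__weight". A quick illustrative check (the parameter names below are hypothetical ivy-side keys, chosen only to exercise the mapping, not copied from the real model container):

from ivy_models.roberta.roberta import _roberta_weights_mapping

print(_roberta_weights_mapping("embeddings__word_embeddings__w"))
# -> embeddings.word_embeddings.weight
print(_roberta_weights_mapping("encoder__v0__attention__dense__b"))
# -> encoder.0.attention.output.dense.bias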
Binary file added ivy_models_tests/roberta/roberta_inputs.npy
Binary file not shown.
Binary file added ivy_models_tests/roberta/roberta_pooled_output.npy
Binary file not shown.
26 changes: 26 additions & 0 deletions ivy_models_tests/roberta/test_roberta.py
@@ -0,0 +1,26 @@
import os
import ivy
import pytest
import numpy as np
from ivy_models import roberta_base

Contributor: could you please quickly ref test_alexnet and make the model object init happen only once, to save CI resources? Thanks!
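A rough sketch of what this could look like, loosely following a module-level init pattern (the exact structure of test_alexnet is not shown in this diff, so treat the details as an assumption):

import random

# build the model once at module scope so every parametrized case reuses it,
# rather than calling roberta_base(...) inside the test body each time
load_weights = random.choice([False, True])  # assumed pattern: one weight mode per test run
model = roberta_base(load_weights)
# test_roberta below would then drop its load_weights parameter and use this shared model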


@pytest.mark.parametrize("batch_shape", [[1]])
@pytest.mark.parametrize("load_weights", [False, True])
def test_roberta(device, fw, batch_shape, load_weights):
    """Test the RoBERTa base model's pooled output."""

    num_dims = 768
    this_dir = os.path.dirname(os.path.realpath(__file__))
    input_path = os.path.join(this_dir, "roberta_inputs.npy")
    inputs = np.load(input_path, allow_pickle=True).tolist()
    model = roberta_base(load_weights)

    inputs = {k: ivy.asarray(v) for k, v in inputs.items()}
    logits = model(**inputs)["pooler_output"]
    assert logits.shape == tuple([ivy.to_scalar(batch_shape), num_dims])

    if load_weights:
        ref_logits_path = os.path.join(this_dir, "roberta_pooled_output.npy")
        ref_logits = np.load(ref_logits_path)
        assert np.allclose(ref_logits, ivy.to_numpy(logits), rtol=0.005, atol=0.005)
Contributor: the value tests are unfortunately not passing when I checked, could you please verify this again? Also, could you please do me a favour and update the load_transformers_weights helper to not use copy.deepcopy and do old_mapping = model.v directly? It's somewhere around line 174 in the weights_helpers.py file, thanks!
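For the second point, a minimal sketch of the suggested change inside load_transformers_weights in weights_helpers.py (the surrounding function body is assumed and not shown in this diff; only the replacement line comes from the comment above):

# before (assumed current form, around line 174):
# old_mapping = copy.deepcopy(model.v)

# after (as suggested): reference the variable container directly, no deep copy
old_mapping = model.v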