Merge pull request #2299 from huggingface/siglip_update
Add i18n variant of so400m model w/ weights. Add two in1k fine-tunes
rwightman authored Oct 9, 2024
2 parents 41a79e0 + d9321b0 commit 6ee638a
Showing 1 changed file with 85 additions and 0 deletions.
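
Usage note (not part of the diff): the new pretrained tags added here should be loadable through the standard timm.create_model API like any other config in vision_transformer.py. The snippet below is a minimal sketch; the model and tag names come from the diff, while the rest is ordinary timm usage, and the shape comments assume the defaults shown in the configs (embed_dim=1152, num_classes=0 for the SigLIP towers, 1000 classes for the in1k fine-tunes).

import timm
import torch

# i18n SigLIP image tower (config sets num_classes=0, so the model returns pooled embeddings)
encoder = timm.create_model('vit_so400m_patch16_siglip_256.webli_i18n', pretrained=True).eval()

# one of the two new ImageNet-1k fine-tunes (378x378 input, squash crop)
classifier = timm.create_model('vit_so400m_patch14_siglip_378.webli_ft_in1k', pretrained=True).eval()

with torch.inference_mode():
    emb = encoder(torch.randn(1, 3, 256, 256))        # -> (1, 1152) image embedding
    logits = classifier(torch.randn(1, 3, 378, 378))  # -> (1, 1000) ImageNet-1k logits
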
timm/models/vision_transformer.py: 85 additions and 0 deletions
@@ -1817,6 +1817,11 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
hf_hub_filename='open_clip_pytorch_model.bin',
input_size=(3, 256, 256),
num_classes=0),
'vit_base_patch16_siglip_256.webli_i18n': _cfg(
hf_hub_id='timm/ViT-B-16-SigLIP-i18n-256',
hf_hub_filename='open_clip_pytorch_model.bin',
input_size=(3, 256, 256),
num_classes=0),
'vit_base_patch16_siglip_384.webli': _cfg(
hf_hub_id='timm/ViT-B-16-SigLIP-384',
hf_hub_filename='open_clip_pytorch_model.bin',
@@ -1841,6 +1846,16 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
hf_hub_id='timm/ViT-SO400M-14-SigLIP',
hf_hub_filename='open_clip_pytorch_model.bin',
num_classes=0),
'vit_so400m_patch16_siglip_256.webli_i18n': _cfg(
hf_hub_id='timm/ViT-SO400M-16-SigLIP-i18n-256',
hf_hub_filename='open_clip_pytorch_model.bin',
input_size=(3, 256, 256),
num_classes=0),
'vit_so400m_patch14_siglip_378.webli': _cfg(
hf_hub_id='timm/ViT-SO400M-14-SigLIP-384',
hf_hub_filename='open_clip_pytorch_model.bin',
input_size=(3, 378, 378),
num_classes=0),
'vit_so400m_patch14_siglip_384.webli': _cfg(
hf_hub_id='timm/ViT-SO400M-14-SigLIP-384',
hf_hub_filename='open_clip_pytorch_model.bin',
@@ -1856,6 +1871,11 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
hf_hub_filename='open_clip_pytorch_model.bin',
input_size=(3, 256, 256),
num_classes=0),
'vit_base_patch16_siglip_gap_256.webli_i18n': _cfg(
hf_hub_id='timm/ViT-B-16-SigLIP-i18n-256',
hf_hub_filename='open_clip_pytorch_model.bin',
input_size=(3, 256, 256),
num_classes=0),
'vit_base_patch16_siglip_gap_384.webli': _cfg(
hf_hub_id='timm/ViT-B-16-SigLIP-384',
hf_hub_filename='open_clip_pytorch_model.bin',
@@ -1890,6 +1910,16 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
hf_hub_filename='paligemma-3b-pt-224.npz',
custom_load='hf',
num_classes=0),
'vit_so400m_patch16_siglip_gap_256.webli_i18n': _cfg(
hf_hub_id='timm/ViT-SO400M-16-SigLIP-i18n-256',
hf_hub_filename='open_clip_pytorch_model.bin',
input_size=(3, 256, 256),
num_classes=0),
'vit_so400m_patch14_siglip_gap_378.webli': _cfg(
hf_hub_id='timm/ViT-SO400M-14-SigLIP-384',
hf_hub_filename='open_clip_pytorch_model.bin',
input_size=(3, 378, 378), crop_pct=1.0,
num_classes=0),
'vit_so400m_patch14_siglip_gap_384.webli': _cfg(
hf_hub_id='timm/ViT-SO400M-14-SigLIP-384',
hf_hub_filename='open_clip_pytorch_model.bin',
@@ -1914,6 +1944,15 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
input_size=(3, 896, 896), crop_pct=1.0,
num_classes=0),

'vit_so400m_patch14_siglip_378.webli_ft_in1k': _cfg(
hf_hub_id='timm/',
input_size=(3, 378, 378), crop_pct=1.0, crop_mode='squash',
),
'vit_so400m_patch14_siglip_gap_378.webli_ft_in1k': _cfg(
hf_hub_id='timm/',
input_size=(3, 378, 378), crop_pct=1.0, crop_mode='squash',
),

'vit_xsmall_patch16_clip_224.tinyclip_yfcc15m': _cfg(
hf_hub_id='timm/',
hf_hub_filename='open_clip_pytorch_model.bin',
@@ -2935,6 +2974,28 @@ def vit_so400m_patch14_siglip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
return model


@register_model
def vit_so400m_patch16_siglip_256(pretrained: bool = False, **kwargs) -> VisionTransformer:
# this is a corrected variant of the 384 with a res properly divisible by patch size (no padding/truncation)
model_args = dict(
patch_size=16, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362, class_token=False, global_pool='map',
)
model = _create_vision_transformer(
'vit_so400m_patch16_siglip_256', pretrained=pretrained, **dict(model_args, **kwargs))
return model


@register_model
def vit_so400m_patch14_siglip_378(pretrained: bool = False, **kwargs) -> VisionTransformer:
# this is a corrected variant of the 384 with a res properly divisible by patch size (no padding/truncation)
model_args = dict(
patch_size=14, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362, class_token=False, global_pool='map',
)
model = _create_vision_transformer(
'vit_so400m_patch14_siglip_378', pretrained=pretrained, **dict(model_args, **kwargs))
return model


@register_model
def vit_so400m_patch14_siglip_384(pretrained: bool = False, **kwargs) -> VisionTransformer:
model_args = dict(
@@ -3023,6 +3084,30 @@ def vit_so400m_patch14_siglip_gap_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
return model


@register_model
def vit_so400m_patch16_siglip_gap_256(pretrained: bool = False, **kwargs) -> VisionTransformer:
""" A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
model_args = dict(
patch_size=16, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362,
class_token=False, global_pool='avg', fc_norm=False,
)
model = _create_vision_transformer(
'vit_so400m_patch16_siglip_gap_256', pretrained=pretrained, **dict(model_args, **kwargs))
return model


@register_model
def vit_so400m_patch14_siglip_gap_378(pretrained: bool = False, **kwargs) -> VisionTransformer:
""" A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
model_args = dict(
patch_size=14, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362,
class_token=False, global_pool='avg', fc_norm=False,
)
model = _create_vision_transformer(
'vit_so400m_patch14_siglip_gap_378', pretrained=pretrained, **dict(model_args, **kwargs))
return model


@register_model
def vit_so400m_patch14_siglip_gap_384(pretrained: bool = False, **kwargs) -> VisionTransformer:
""" A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""

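A brief arithmetic note on the "res properly divisible by patch size" comments in the new 256/378 registrations above: 378 = 14 x 27, so the patch-14 model tokenizes a 378x378 input into an exact 27x27 grid, while 384 / 14 leaves a remainder of 6 and needs padding or truncation; likewise 256 = 16 x 16 for the new patch-16 variant. A quick check (plain Python, nothing timm-specific):

# Sanity check of the resolution / patch-size divisibility mentioned in the
# comments of the new vit_so400m_*_siglip_256 / _378 registrations.
for img_size, patch in [(384, 14), (378, 14), (256, 16)]:
    grid, rem = divmod(img_size, patch)
    print(f"{img_size} / {patch}: {grid} x {grid} patch grid, remainder {rem}")
# 384 / 14: 27 x 27 patch grid, remainder 6   (needs padding/truncation)
# 378 / 14: 27 x 27 patch grid, remainder 0   (exact)
# 256 / 16: 16 x 16 patch grid, remainder 0   (exact)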