Skip to content

Commit

Permalink
validators/url: added TLD validation + cleanup + more thorough unit-tests
Browse files Browse the repository at this point in the history
  • Loading branch information
drkameleon committed Nov 7, 2024
1 parent 8508c54 commit 0883126
Showing 1 changed file with 131 additions and 4 deletions.
135 changes: 131 additions & 4 deletions src/validators/url.art
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,97 @@ define :urlValidator is :validator [
; built-in data
;------------------

; Structural URL pattern: an http:// or https:// scheme followed by one of
;   - "localhost" with an optional port in the range 1-65535,
;   - a dotted domain name whose last label (captured) is 2-63 ASCII letters;
;     labels may not start or end with "-" and the input may not contain "--",
;   - a dotted-quad IPv4 address with an optional port in the range 1-65535,
; and then an optional "/..." path that contains no whitespace.
; Whether the captured last label is a real TLD is NOT checked by this pattern.
isUrl: {/^(https?:\/\/)(localhost(?::(?:[1-9][0-9]{0,3}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5]))?|((?!-)(?!.*--)[a-zA-Z\-0-9]{1,63}(?<!-)\.)+([a-zA-Z]{2,63})|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?::(?:[1-9][0-9]{0,3}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5]))?)(\/[^\s]*)?$/}

; Allow-list of TLDs accepted by this validator (compiled in 2024).
; It covers only the most common TLDs, so a host whose TLD is not listed
; here is reported as invalid. All entries are lowercase.
validTlds: [
; Infrastructure TLDs
"arpa"

; Generic TLDs
"com" "net" "org" "edu" "gov" "mil" "int" "info" "biz" "name" "pro"

; New generic TLDs
"xyz" "online" "site" "club" "shop" "app" "blog" "dev" "store" "tech"
"cloud" "digital" "space" "live" "life" "world" "email" "software"
"media" "news" "games" "marketing" "agency" "solutions" "network"

; Country code TLDs
"ac" "ad" "ae" "af" "ag" "ai" "al" "am" "ao" "aq" "ar" "as" "at" "au"
"aw" "ax" "az" "ba" "bb" "bd" "be" "bf" "bg" "bh" "bi" "bj" "bm" "bn"
"bo" "br" "bs" "bt" "bv" "bw" "by" "bz" "ca" "cc" "cd" "cf" "cg" "ch"
"ci" "ck" "cl" "cm" "cn" "co" "cr" "cu" "cv" "cw" "cx" "cy" "cz" "de"
"dj" "dk" "dm" "do" "dz" "ec" "ee" "eg" "er" "es" "et" "eu" "fi" "fj"
"fk" "fm" "fo" "fr" "ga" "gb" "gd" "ge" "gf" "gg" "gh" "gi" "gl" "gm"
"gn" "gp" "gq" "gr" "gs" "gt" "gu" "gw" "gy" "hk" "hm" "hn" "hr" "ht"
"hu" "id" "ie" "il" "im" "in" "io" "iq" "ir" "is" "it" "je" "jm" "jo"
"jp" "ke" "kg" "kh" "ki" "km" "kn" "kp" "kr" "kw" "ky" "kz" "la" "lb"
"lc" "li" "lk" "lr" "ls" "lt" "lu" "lv" "ly" "ma" "mc" "md" "me" "mg"
"mh" "mk" "ml" "mm" "mn" "mo" "mp" "mq" "mr" "ms" "mt" "mu" "mv" "mw"
"mx" "my" "mz" "na" "nc" "ne" "nf" "ng" "ni" "nl" "no" "np" "nr" "nu"
"nz" "om" "pa" "pe" "pf" "pg" "ph" "pk" "pl" "pm" "pn" "pr" "ps" "pt"
"pw" "py" "qa" "re" "ro" "rs" "ru" "rw" "sa" "sb" "sc" "sd" "se" "sg"
"sh" "si" "sj" "sk" "sl" "sm" "sn" "so" "sr" "ss" "st" "su" "sv" "sx"
"sy" "sz" "tc" "td" "tf" "tg" "th" "tj" "tk" "tl" "tm" "tn" "to" "tr"
"tt" "tv" "tw" "tz" "ua" "ug" "uk" "us" "uy" "uz" "va" "vc" "ve" "vg"
"vi" "vn" "vu" "wf" "ws" "ye" "yt" "za" "zm" "zw"

; Two-label suffixes under country-code TLDs (e.g. "co.uk").
; Strictly these are registry suffixes rather than TLDs; they are kept in the
; same list so the last two labels of a host can be looked up as one entry.
"co.uk" "co.jp" "co.nz" "co.za" "com.au" "com.br" "com.cn" "com.mx"
"com.tr" "com.sg" "com.tw" "com.hk" "com.ph" "com.my" "com.ar"
"org.uk" "net.uk" "edu.au" "gov.uk" "gov.au"
]

;------------------
; helpers
;------------------

; Extract the TLD of an http(s) URL so it can be checked against validTlds.
;
; url: the URL string (expected to start with "http://" or "https://")
;
; Returns:
;   - null when the host is exactly "localhost" or a dotted-quad of digits
;     (optionally followed by ":port"), i.e. hosts that have no TLD to check,
;     or when no host with at least two labels can be extracted;
;   - the last two labels joined by "." when that pair is listed in
;     validTlds (e.g. "co.uk");
;   - otherwise the last label of the host.
; The returned TLD is always lowercase, because DNS names are
; case-insensitive and validTlds only holds lowercase entries.
getTld: function [url][
    ; The host must END right after localhost / the IP (optional port, then
    ; "/" or end of string). Without that anchor, hosts that merely START
    ; with "localhost" or four numeric labels (e.g. "localhost.foo",
    ; "1.2.3.4.foo") would skip TLD validation entirely.
    if or? match? url {/^https?:\/\/localhost(?::\d+)?(?:\/|$)/}
           match? url {/^https?:\/\/\d+\.\d+\.\d+\.\d+(?::\d+)?(?:\/|$)/} [
        return null
    ]

    ; "scheme://host/path" splits into ["scheme:" "" "host" "path" ...],
    ; so the host sits at index 2
    parts: split.by:"/" url
    if (size parts) < 3 [
        return null
    ]

    domain: parts\2
    if not? null? domain [
        ; normalise case before comparing against the lowercase list
        domainParts: split.by:"." lower domain
        labelCount: size domainParts
        if labelCount >= 2 [
            ; Try multi-part TLD first
            if labelCount >= 3 [
                possibleTld: join.with:"." @[domainParts\[labelCount-2] domainParts\[labelCount-1]]
                if contains? \validTlds possibleTld [
                    return possibleTld
                ]
            ]
            ; Try single-part TLD
            return last domainParts
        ]
    ]
    return null
]

;------------------
; methods
;------------------

; Validate that `str` is a well-formed http(s) URL with an accepted TLD.
;
; str:  the candidate URL
; opts: validator options (not used by this validator)
;
; Returns true when `str` matches the isUrl pattern and, when getTld yields
; a TLD for it, that TLD is listed in validTlds; false otherwise.
action: method [str, opts][
    ; First check basic URL format
    if not? match? str \isUrl [
        return false
    ]

    ; Then validate the TLD; getTld returns null when there is none to check
    ; (localhost or IP hosts), in which case the URL is accepted as-is
    tld: \getTld str
    if not? null? tld [
        if not? contains? \validTlds tld [
            return false
        ]
    ]

    return true
]

test: method [][
Expand Down Expand Up @@ -79,6 +161,35 @@ define :urlValidator is :validator [
"http://0.0.0.0:4000"
"http://8.8.8.8:65535" ; Maximum valid port
"http://255.255.255.255" ; Maximum valid IP

; Basic URLs with common TLDs
"https://arturo-lang.io"
"https://www.example.com/"
"http://example.net"
"https://my-site.org"

; URLs with modern TLDs
"https://my-app.dev"
"https://cool-project.xyz"
"https://my-store.shop"
"https://new-blog.blog"

; Country code TLDs
"https://website.uk"
"https://company.de"
"https://shop.jp"

; Multi-level TLDs
"https://example.co.uk"
"https://website.com.au"
"https://shop.co.jp"

; Localhost and IPs (bypass TLD check)
"http://localhost"
"http://localhost:8080"
"http://127.0.0.1"
"http://127.0.0.1:3000"
"http://192.168.1.1:8080"
]

invalid: [
Expand Down Expand Up @@ -126,7 +237,7 @@ define :urlValidator is :validator [
; Port number errors
"http://localhost:0" ; Invalid port 0
"http://localhost:65536" ; Port too high
"http://127.0.0.1:65536" ; Port too high
"http://127.0.0.1:65536" ; Port too high
"http://localhost:-80" ; Negative port
"http://localhost:abc" ; Non-numeric port
"https://127.0.0.1:" ; Port missing number
Expand All @@ -146,6 +257,22 @@ define :urlValidator is :validator [
"http://##"
"http://##/"
"http://foo.bar?q=Spaces should be encoded"

; Invalid TLDs
"https://example.invalid"
"https://website.wrongtld"
"https://test.notarealtld"
"http://something.thisisnotavalidtld"

; Malformed TLDs
"https://example."
"https://website.c"
"https://test.12"

; Invalid co.* combinations
"https://example.co.wrongcountry"
"https://website.com.invalidcountry"

]
]
]
Expand Down

0 comments on commit 0883126

Please sign in to comment.