library(readxl)
library(ggbeeswarm)
library(pdftools)
library(stringi)
library(hrbrthemes)
library(ggrepel)
library(tidyverse)
tibble(
year = 2014:2018,
complaints = c(269422, 288012, 298728, 301580, 351937),
losses = c(800.5, 1070.7, 1450.7, 1418.7, 2706.4)
) -> ic3_summary
write_csv(ic3_summary, here::here("data/2018-fbi-ic3-annual-summary.csv"))
ggplot(ic3_summary, aes(complaints, losses)) +
geom_path(
arrow = arrow(type = "closed", length = unit(12, "pt")),
color = "#31739C"
) +
geom_point(color = "#31739C") +
geom_label_repel(
aes(label = year), family = font_rc, size = c(rep(3, 4), 4),
color = c(rep("#3B454A", 4), "black"),
fontface = c(rep("plain", 4), "bold")
) +
scale_x_comma(limits = c(0, NA)) +
scale_y_continuous(label = scales::dollar, limits = c(0, NA)) +
labs(
x = "Number of Complaints", y = "Losses (USD, millions)",
title = "Both Incident Count and Total Losses Related to Cybercrime\nSkyrocketed in the 2018 Edition of the FBI IC3 Report",
subtitle = "Zero baseline; Point labels denote IC3 summary year data",
caption = "Source: 2018 FBI IC3; Page 5 'IC3 Complaint Statistics 2014-2018'"
) +
theme_ipsum_rc()
ic3 <- pdf_text(here::here("raw/2018_IC3Report.pdf"))
ic3[[16]] %>%
stri_split_lines() %>%
unlist() %>%
stri_trim_both() -> l
l[which(stri_detect_regex(l, "^Under")):which(stri_detect_regex(l, "^Over 6"))] %>%
stri_split_regex("[[:space:]]{3,}", simplify = TRUE) %>%
as.data.frame(stringsAsFactors = FALSE) %>%
set_names("age_group", "incidents", "losses") %>%
as_tibble() %>%
mutate(losses = stri_replace_first_fixed(losses, "$", "")) %>%
type_convert(
col_types = cols(
age_group = col_character(),
incidents = col_number(),
losses = col_number()
)
) -> loss
write_csv(loss, here::here("data/2018-fbi-ic3-loss-by-age.csv"))
mutate(loss, `Loss Ratio (USD, milions)` = losses/incidents) %>%
rename(
`Total Losses (USD, milions)` = losses,
`Total Incidents` = incidents
) %>%
mutate(age_group = stri_replace_first_fixed(age_group, " ", "\n")) %>%
mutate(age_group = factor(age_group, age_group)) %>%
gather(measure, value, -age_group) %>%
mutate(
measure = factor(measure, levels = c(
"Total Incidents", "Total Losses (USD, milions)", "Loss Ratio (USD, milions)"))
) %>%
ggplot(aes(age_group, value)) +
geom_col(width=0.45, fill = "#31739C") +
scale_x_discrete() +
scale_y_comma() +
facet_wrap(~measure, scales = "free") +
labs(
x = NULL, y = "Loss Ratio (total losses/victim count)",
title = "In 2018, Older Victims Generally Lost More Overall and Per-Incident Than Younger Victims",
subtitle = "Note that 40-49 age group had more incients than older groups but fewer overall losses.",
caption = "NOTE: Free Y Scale\nSource: 2018 FBI IC3; Page 16 '2018 Victims by Age Group'"
) +
theme_ipsum_rc(grid="Y")
ic3[[19]] %>%
stri_split_lines() %>%
unlist() %>%
keep(stri_detect_regex, "^[[:upper:]]") %>%
keep(stri_detect_regex, "[[:digit:]]") %>%
stri_replace_first_regex("([[:digit:]]) ([[:alpha:]\\*])", "$1\t$2") %>%
stri_split_fixed("\t") %>%
unlist() %>%
keep(stri_detect_regex, " [[:digit:]]") %>%
stri_match_first_regex("([^[:digit:]]+)([[:digit:],]+)$") %>%
.[,2:3] %>%
as.data.frame(stringsAsFactors=FALSE) %>%
as_tibble() %>%
mutate_all(.funs = stri_trim_both) %>%
type_convert(
col_types = cols(
V1 = col_character(),
V2 = col_number()
)
) %>%
mutate(V1 = case_when(
stri_detect_fixed(V1, "IPR/Copy") ~ "IPR/Copyright and Counterfeit",
TRUE ~ V1
)) %>%
set_names(c("crime", "victim_count")) %>%
head(-2) %>%
arrange(desc(victim_count)) -> victims
ic3[[20]] %>%
stri_split_lines() %>%
unlist() %>%
keep(stri_detect_regex, "^ [[:upper:]]") %>%
keep(stri_detect_regex, "[[:digit:]]") %>%
stri_trim_both() %>%
stri_replace_first_regex("([[:digit:]]) ([[:alpha:]\\*])", "$1\t$2") %>%
stri_split_fixed("\t") %>%
unlist() %>%
stri_replace_first_regex("([[:digit:]])[[:space:]]+([[:alpha:]\\*])", "$1\t$2") %>%
stri_split_fixed("\t") %>%
unlist() %>%
keep(stri_detect_regex, "\\$[[:digit:]]") %>%
stri_match_first_regex("([^\\$]+)([\\$[:digit:],\\.]+)$") %>%
.[,2:3] %>%
as.data.frame(stringsAsFactors=FALSE) %>%
as_tibble() %>%
mutate_all(.funs = stri_trim_both) %>%
mutate(V2 = stri_replace_first_fixed(V2, "$", "")) %>%
type_convert(
col_types = cols(
V1 = col_character(),
V2 = col_number()
)
) %>%
mutate(V1 = case_when(
stri_detect_fixed(V1, "IPR/Copy") ~ "IPR/Copyright and Counterfeit",
stri_detect_fixed(V1, "Malware/Sca") ~ "Malware/Scareware/Virus",
stri_detect_fixed(V1, "Harassment/T") ~ "Harassment/Threats of Violence",
stri_detect_fixed(V1, "Ransomware") ~ "Ransomware",
stri_detect_fixed(V1, "Denial of Service") ~ "Denial of Service/TDoS",
stri_detect_fixed(V1, "Re-Shipping") ~ "Re-shipping",
TRUE ~ V1
)) %>%
set_names(c("crime", "loss")) %>%
head(-2) %>%
left_join(victims, "crime") %>%
mutate(loss = loss / 1000000) -> crime_types
write_csv(crime_types, here::here("data/2018-fbi-ic3-loss-by-crime-type.csv"))
ggplot() +
geom_point(
data = mutate(crime_types, color = case_when(
(loss >= 100) | (victim_count >= 20000) ~ "#E85E26",
TRUE ~ "#31739C"
)),
aes(victim_count, loss, color = I(color))
) +
geom_label_repel(
data = filter(crime_types, (loss >= 100) | (victim_count >= 20000)),
aes(victim_count, loss, label = crime),
size = 3, family = font_rc
) +
scale_x_comma() +
scale_y_continuous(label = scales::dollar) +
labs(
x = "Victim count", y = "Loss (USD, millions)",
title = "[Business] E-mail Account Compromise was the Most Profitable\nIC3 Crime in 2018 with over $1.2 billion (USD) in Losses",
subtitle = "Markers only on IC3 crimes with ≥$100m (USD) losses or ≥20,000 victims ",
caption = "Source: 2018 FBI IC3; Pages 19-20 '2018 Crime Types'"
) +
theme_ipsum_rc(grid="XY")
arrange(crime_types, desc(loss)) %>%
select(`Crime` = 1, `Loss (USD, millions)` = 2, `Victim Count` = 3) %>%
gt::gt() %>%
gt::fmt_number("Victim Count", decimals = 0) %>%
gt::fmt_currency("Loss (USD, millions)", decimals = 2)
Crime |
Loss (USD, millions) |
Victim Count |
---|---|---|
BEC/EAC |
$1,297.80 |
20,373 |
Confidence Fraud/Romance |
$362.50 |
18,493 |
Investment |
$252.96 |
3,693 |
Non-Payment/Non-Delivery |
$183.83 |
65,116 |
Real Estate/Rental |
$149.46 |
11,300 |
Personal Data Breach |
$148.89 |
50,642 |
Corporate Data Breach |
$117.71 |
2,480 |
Identity Theft |
$100.43 |
16,128 |
Advanced Fee |
$92.27 |
16,362 |
Credit Card Fraud |
$88.99 |
15,210 |
Extortion |
$83.36 |
51,146 |
Spoofing |
$70.00 |
15,569 |
Government Impersonation |
$64.21 |
10,978 |
Other |
$63.13 |
10,826 |
Lottery/Sweepstakes |
$60.21 |
7,146 |
Overpayment |
$53.23 |
15,512 |
Phishing/Vishing/Smishing/Pharming |
$48.24 |
26,379 |
Employment |
$45.49 |
14,979 |
Tech Support |
$38.70 |
14,408 |
Harassment/Threats of Violence |
$21.90 |
18,415 |
Misrepresentation |
$20.00 |
5,959 |
IPR/Copyright and Counterfeit |
$15.80 |
2,249 |
Civil Matter |
$15.17 |
768 |
Malware/Scareware/Virus |
$7.41 |
2,811 |
Health Care Related |
$4.47 |
337 |
Ransomware |
$3.62 |
1,493 |
Denial of Service/TDoS |
$2.05 |
1,799 |
Re-shipping |
$1.68 |
907 |
Charity |
$1.01 |
493 |
Gambling |
$0.93 |
181 |
Crimes Against Children |
$0.27 |
1,394 |
Hacktivist |
$0.08 |
77 |
Terrorism |
$0.01 |
120 |
No Lead Value |
$0.00 |
36,936 |
ic3[[21]] %>%
stri_split_lines() %>%
unlist() %>%
keep(stri_detect_regex, "^[[:digit:]]") %>%
stri_replace_first_regex("([[:digit:],]+)[[:space:]]+([[:digit:]])", "$1\t$2") %>%
stri_split_fixed("\t") %>%
unlist() %>%
stri_replace_first_regex("^[[:digit:] ]+", "") %>%
stri_replace_first_regex("[[:space:]]+([[:digit:]])", "\t$1") %>%
stri_split_fixed("\t", simplify = TRUE) %>%
as.data.frame(stringsAsFactors=FALSE) %>%
as_tibble() %>%
mutate_all(.funs = stri_trim_both) %>%
mutate(V1 = case_when(
stri_detect_fixed(V1, "Northern Marina") ~ "Northern Mariana Islands",
TRUE ~ V1
)) %>%
type_convert(
col_types = cols(
V1 = col_character(),
V2 = col_number()
)
) %>%
set_names(c("state", "victim_count")) -> state_vics
ic3[[23]] %>%
stri_split_lines() %>%
unlist() %>%
keep(stri_detect_regex, "^[[:space:]]+[[:digit:]]") %>%
stri_replace_first_regex("([[:digit:],]+)[[:space:]]+([[:digit:]])", "$1\t$2") %>%
stri_split_fixed("\t") %>%
unlist() %>%
stri_replace_first_regex("^[[:digit:] ]+", "") %>%
stri_replace_first_regex("[[:space:]]+([[:digit:]])", "\t$1") %>%
stri_split_fixed("\t", simplify = TRUE) %>%
as.data.frame(stringsAsFactors=FALSE) %>%
as_tibble() %>%
mutate_all(.funs = stri_trim_both) %>%
mutate(V1 = case_when(
stri_detect_fixed(V1, "Northern Marina") ~ "Northern Mariana Islands",
TRUE ~ V1
)) %>%
type_convert(
col_types = cols(
V1 = col_character(),
V2 = col_number()
)
) %>%
set_names(c("state", "subject_earnings")) -> subj_earnings
ic3[[22]] %>%
stri_split_lines() %>%
unlist() %>%
keep(stri_detect_regex, "^ [[:digit:]]") %>%
stri_trim_both() %>%
stri_replace_first_regex("([[:digit:],]+)[[:space:]]+([[:digit:]])", "$1\t$2") %>%
stri_split_fixed("\t") %>%
unlist() %>%
stri_replace_first_regex("^[[:digit:] ]+", "") %>%
stri_replace_first_regex("[[:space:]]+\\$([[:digit:]])", "\t$1") %>%
stri_split_fixed("\t", simplify = TRUE) %>%
as.data.frame(stringsAsFactors=FALSE) %>%
as_tibble() %>%
mutate_all(.funs = stri_trim_both) %>%
type_convert(
col_types = cols(
V1 = col_character(),
V2 = col_number()
)
) %>%
set_names(c("state", "loss")) %>%
left_join(subj_earnings, "state") %>%
left_join(state_vics, "state") %>%
mutate(subject_earnings = subject_earnings / 1000000) %>%
mutate(loss = loss / 1000000) -> by_state
if (!file.exists(here::here("raw/2018-pop-est.xlsx"))) {
download.file(
url = "https://www2.census.gov/programs-surveys/popest/tables/2010-2018/national/totals/nst-est2018-01.xlsx",
destfile = here::here("raw/2018-pop-est.xlsx")
)
}
read_excel(here::here("raw/2018-pop-est.xlsx"), skip=9, col_names = FALSE) %>%
select(state = 1, pop_2018 = 12) %>%
mutate(state = stri_replace_first_fixed(state, ".", "")) %>%
filter(!is.na(state), !is.na(pop_2018)) %>%
add_row(state = "U.S. Virgin Islands", pop_2018 = 104914) %>%
add_row(state = "Guam", pop_2018 = 165718) %>%
add_row(state = "U.S. Minor Outlying Islands", pop_2018 = 270) %>%
add_row(state = "American Samoa", pop_2018 = 55679) %>%
add_row(state = "Northern Mariana Islands", pop_2018 = 55194) -> pops
left_join(by_state, pops, "state") %>%
mutate(
loss_per_vic = loss/victim_count,
frac = victim_count / pop_2018
) -> by_state
write_csv(by_state, here::here("data/2018-fbi-ic3-loss-by-state.csv"))
arrange(by_state, desc(frac)) %>%
mutate(loss_per_vic = loss_per_vic * 1000000) %>%
select(
`State` = 1,
`Victim Count` = 4,
`Loss (USD, millions)` = 3,
`Loss per Victim (USD)` = 6,
`% Population Impacted` = 7,
`Subject Earnings (USD, millions)` = 2
) %>%
gt::gt() %>%
gt::fmt_number("Victim Count", decimals = 0) %>%
gt::fmt_currency("Loss (USD, millions)", decimals = 2) %>%
gt::fmt_currency("Loss per Victim (USD)", decimals = 2) %>%
gt::fmt_percent("% Population Impacted", decimals = 4) %>%
gt::fmt_currency("Subject Earnings (USD, millions)", decimals = 2)
State |
Victim Count |
Loss (USD, millions) |
Loss per Victim (USD) |
% Population Impacted |
Subject Earnings (USD, millions) |
---|---|---|---|---|---|
U.S. Minor Outlying Islands |
47 |
$0.00 |
$2,049.91 |
17.4074% |
$0.10 |
Alaska |
1,603 |
$0.00 |
$2,256.30 |
0.2174% |
$3.62 |
District of Columbia |
1,364 |
$0.00 |
$6,524.80 |
0.1942% |
$8.90 |
Virginia |
14,800 |
$0.01 |
$2,958.95 |
0.1738% |
$43.79 |
Nevada |
5,228 |
$0.00 |
$5,531.93 |
0.1723% |
$28.92 |
Colorado |
9,328 |
$0.00 |
$3,653.82 |
0.1638% |
$34.08 |
Maryland |
8,777 |
$0.00 |
$5,375.44 |
0.1452% |
$47.18 |
Washington |
10,775 |
$0.00 |
$5,616.07 |
0.1430% |
$60.51 |
California |
49,031 |
$0.02 |
$9,187.70 |
0.1240% |
$450.48 |
Wisconsin |
6,621 |
$0.00 |
$3,722.89 |
0.1139% |
$24.65 |
Florida |
23,984 |
$0.01 |
$7,427.51 |
0.1126% |
$178.14 |
Arizona |
8,027 |
$0.00 |
$5,626.77 |
0.1119% |
$45.17 |
Oregon |
4,511 |
$0.00 |
$6,340.05 |
0.1076% |
$28.60 |
New Mexico |
2,127 |
$0.00 |
$4,051.61 |
0.1015% |
$8.62 |
Rhode Island |
1,028 |
$0.00 |
$6,740.27 |
0.0972% |
$6.93 |
Utah |
3,041 |
$0.00 |
$6,779.82 |
0.0962% |
$20.62 |
New Jersey |
8,440 |
$0.00 |
$9,444.52 |
0.0947% |
$79.71 |
Alabama |
4,585 |
$0.00 |
$3,688.35 |
0.0938% |
$16.91 |
Delaware |
897 |
$0.00 |
$3,502.11 |
0.0927% |
$3.14 |
New York |
18,124 |
$0.01 |
$11,095.24 |
0.0927% |
$201.09 |
Missouri |
5,508 |
$0.00 |
$4,643.74 |
0.0899% |
$25.58 |
Massachusetts |
6,173 |
$0.00 |
$11,054.95 |
0.0894% |
$68.24 |
Texas |
25,589 |
$0.01 |
$7,644.34 |
0.0892% |
$195.61 |
Connecticut |
3,134 |
$0.00 |
$12,080.38 |
0.0877% |
$37.86 |
Georgia |
9,095 |
$0.00 |
$6,758.33 |
0.0865% |
$61.47 |
Idaho |
1,513 |
$0.00 |
$4,529.54 |
0.0862% |
$6.85 |
Wyoming |
497 |
$0.00 |
$9,088.79 |
0.0860% |
$4.52 |
Vermont |
525 |
$0.00 |
$4,052.03 |
0.0838% |
$2.13 |
Tennessee |
5,584 |
$0.00 |
$5,120.06 |
0.0825% |
$28.59 |
Pennsylvania |
10,554 |
$0.00 |
$5,940.19 |
0.0824% |
$62.69 |
Illinois |
10,087 |
$0.00 |
$8,213.52 |
0.0792% |
$82.85 |
New Hampshire |
1,056 |
$0.00 |
$5,761.96 |
0.0778% |
$6.08 |
Hawaii |
1,100 |
$0.00 |
$5,873.44 |
0.0774% |
$6.46 |
Minnesota |
4,304 |
$0.00 |
$11,341.56 |
0.0767% |
$48.81 |
Michigan |
7,533 |
$0.00 |
$10,743.37 |
0.0754% |
$80.93 |
Louisiana |
3,469 |
$0.00 |
$4,726.51 |
0.0744% |
$16.40 |
Montana |
787 |
$0.00 |
$8,401.60 |
0.0741% |
$6.61 |
North Carolina |
7,523 |
$0.00 |
$18,241.52 |
0.0725% |
$137.23 |
Kansas |
2,098 |
$0.00 |
$8,329.25 |
0.0721% |
$17.47 |
South Carolina |
3,575 |
$0.00 |
$5,473.54 |
0.0703% |
$19.57 |
Indiana |
4,676 |
$0.00 |
$6,325.43 |
0.0699% |
$29.58 |
Oklahoma |
2,644 |
$0.00 |
$4,382.72 |
0.0671% |
$11.59 |
Ohio |
7,812 |
$0.00 |
$12,510.25 |
0.0668% |
$97.73 |
Mississippi |
1,882 |
$0.00 |
$3,041.99 |
0.0630% |
$5.73 |
Kentucky |
2,813 |
$0.00 |
$3,324.84 |
0.0630% |
$9.35 |
Iowa |
1,983 |
$0.00 |
$7,734.73 |
0.0628% |
$15.34 |
Nebraska |
1,205 |
$0.00 |
$7,822.97 |
0.0625% |
$9.43 |
Maine |
832 |
$0.00 |
$3,244.89 |
0.0622% |
$2.70 |
U.S. Virgin Islands |
65 |
$0.00 |
$41,904.46 |
0.0620% |
$2.72 |
West Virginia |
1,109 |
$0.00 |
$7,483.10 |
0.0614% |
$8.30 |
Arkansas |
1,849 |
$0.00 |
$3,770.43 |
0.0614% |
$6.97 |
North Dakota |
459 |
$0.00 |
$5,003.90 |
0.0604% |
$2.30 |
South Dakota |
465 |
$0.00 |
$3,728.66 |
0.0527% |
$1.73 |
Guam |
52 |
$0.00 |
$2,981.83 |
0.0314% |
$0.16 |
American Samoa |
16 |
$0.00 |
$1,158.56 |
0.0287% |
$0.02 |
Northern Mariana Islands |
15 |
$0.00 |
$924.33 |
0.0272% |
$0.01 |
Puerto Rico |
704 |
$0.00 |
$7,413.48 |
0.0220% |
$5.22 |
by_state <- mutate(by_state, loss_per_vic = loss_per_vic * 1000000)
ggplot(by_state) +
geom_quasirandom(aes(x="", loss_per_vic)) -> gg
gb <- ggplot_build(gg)
as_tibble(gb$data[[1]]) %>%
select(x, y) %>%
left_join(by_state, c("y" = "loss_per_vic")) %>%
rename(loss_per_vic = y) -> gd
ggplot() +
geom_blank(data = gd, aes(x, loss_per_vic)) +
geom_hline(
yintercept = round(median(gd$loss_per_vic)),
linetype = "dotted", color = "#3B454A"
) +
geom_label(
data = data.frame(),
aes(
x = 0.5, y = round(median(gd$loss_per_vic)),
label = sprintf(
"2018 IC3 Median\nPer In-State\nVictim Loss\n($%s USD)",
scales::comma(round(median(gd$loss_per_vic)))
)
), size = 3, family = font_rc, hjust = 0, vjust = 0,
label.size = 0, lineheight = 0.875
) +
geom_point(
data = mutate(gd, color = case_when(
(loss_per_vic >= 10000) ~ "#E85E26",
TRUE ~ "#31739C"
)),
aes(x=x, loss_per_vic, color = I(color))
) +
geom_label_repel(nudge_y = 2500,
data = filter(gd, loss_per_vic >= 10000),
aes(x=x, loss_per_vic, label = state),
size = 3, family = font_rc
) +
scale_x_continuous(expand = c(0,0.125)) +
scale_y_continuous(label = scales::dollar) +
labs(
x = "Victim count", y = "Loss (USD, millions)",
title = "U.S. Virgin Islands Residents Were Hit Hardest\nin IC3 2018 Catalogued Incidents",
subtitle = "Markers only on IC3 states with ≥$10K (USD) losses per in-state victim",
caption = "Source: 2018 FBI IC3; Pages 21-22 '2018 Overall State Statistics'"
) +
theme_ipsum_rc(grid="XY")