2018 FBI IC3 PDF Data Wrangling

library(readxl)
library(ggbeeswarm)
library(pdftools)
library(stringi)
library(hrbrthemes)
library(ggrepel)
library(tidyverse)

tibble(
  year = 2014:2018,
  complaints = c(269422, 288012, 298728, 301580, 351937),
  losses = c(800.5, 1070.7, 1450.7, 1418.7, 2706.4)
) -> ic3_summary

write_csv(ic3_summary, here::here("data/2018-fbi-ic3-annual-summary.csv"))

ggplot(ic3_summary, aes(complaints, losses)) +
  geom_path(
    arrow = arrow(type = "closed", length = unit(12, "pt")),
    color = "#31739C"
  ) +
  geom_point(color = "#31739C") +
  geom_label_repel(
    aes(label = year), family = font_rc, size = c(rep(3, 4), 4),
    color = c(rep("#3B454A", 4), "black"),
    fontface = c(rep("plain", 4), "bold")
  ) +
  scale_x_comma(limits = c(0, NA)) +
  scale_y_continuous(label = scales::dollar, limits = c(0, NA)) +
  labs(
    x = "Number of Complaints", y = "Losses (USD, millions)",
    title = "Both Incident Count and Total Losses Related to Cybercrime\nSkyrocketed in the 2018 Edition of the FBI IC3 Report",
    subtitle = "Zero baseline; Point labels denote IC3 summary year data",
    caption = "Source: 2018 FBI IC3; Page 5 'IC3 Complaint Statistics 2014-2018'"
  ) +
  theme_ipsum_rc()

ic3 <- pdf_text(here::here("raw/2018_IC3Report.pdf"))

ic3[[16]] %>% 
  stri_split_lines() %>% 
  unlist() %>% 
  stri_trim_both() -> l

l[which(stri_detect_regex(l, "^Under")):which(stri_detect_regex(l, "^Over 6"))] %>% 
  stri_split_regex("[[:space:]]{3,}", simplify = TRUE) %>% 
  as.data.frame(stringsAsFactors = FALSE) %>% 
  set_names("age_group", "incidents", "losses") %>% 
  as_tibble() %>% 
  mutate(losses = stri_replace_first_fixed(losses, "$", "")) %>% 
  type_convert(
    col_types = cols(
      age_group = col_character(),
      incidents = col_number(),
      losses = col_number()
    )
  ) -> loss

write_csv(loss, here::here("data/2018-fbi-ic3-loss-by-age.csv"))

mutate(loss, `Loss Ratio (USD, milions)` = losses/incidents) %>% 
  rename(
    `Total Losses (USD, milions)` = losses,
    `Total Incidents` = incidents
  ) %>% 
  mutate(age_group = stri_replace_first_fixed(age_group, " ", "\n")) %>% 
  mutate(age_group = factor(age_group, age_group)) %>% 
  gather(measure, value, -age_group) %>% 
  mutate(
    measure = factor(measure, levels = c(
      "Total Incidents", "Total Losses (USD, milions)", "Loss Ratio (USD, milions)"))
  ) %>% 
  ggplot(aes(age_group, value)) +
  geom_col(width=0.45, fill = "#31739C") +
  scale_x_discrete() +
  scale_y_comma() +
  facet_wrap(~measure, scales = "free") +
  labs(
    x = NULL, y = "Loss Ratio (total losses/victim count)",
    title = "In 2018, Older Victims Generally Lost More Overall and Per-Incident Than Younger Victims",
    subtitle = "Note that 40-49 age group had more incients than older groups but fewer overall losses.",
    caption = "NOTE: Free Y Scale\nSource: 2018 FBI IC3; Page 16 '2018 Victims by Age Group'"
  ) +
  theme_ipsum_rc(grid="Y")

ic3[[19]] %>% 
  stri_split_lines() %>% 
  unlist() %>% 
  keep(stri_detect_regex, "^[[:upper:]]") %>% 
  keep(stri_detect_regex, "[[:digit:]]") %>% 
  stri_replace_first_regex("([[:digit:]])   ([[:alpha:]\\*])", "$1\t$2") %>% 
  stri_split_fixed("\t") %>% 
  unlist() %>% 
  keep(stri_detect_regex, " [[:digit:]]") %>% 
  stri_match_first_regex("([^[:digit:]]+)([[:digit:],]+)$") %>% 
  .[,2:3] %>% 
  as.data.frame(stringsAsFactors=FALSE) %>% 
  as_tibble() %>% 
  mutate_all(.funs = stri_trim_both) %>% 
  type_convert(
    col_types = cols(
      V1 = col_character(),
      V2 = col_number()
    )
  ) %>% 
  mutate(V1 = case_when(
    stri_detect_fixed(V1, "IPR/Copy") ~ "IPR/Copyright and Counterfeit",
    TRUE ~ V1
  )) %>% 
  set_names(c("crime", "victim_count")) %>% 
  head(-2) %>% 
  arrange(desc(victim_count)) -> victims

ic3[[20]] %>% 
  stri_split_lines() %>% 
  unlist() %>% 
  keep(stri_detect_regex, "^    [[:upper:]]") %>% 
  keep(stri_detect_regex, "[[:digit:]]") %>% 
  stri_trim_both() %>% 
  stri_replace_first_regex("([[:digit:]])   ([[:alpha:]\\*])", "$1\t$2") %>% 
  stri_split_fixed("\t") %>% 
  unlist() %>% 
  stri_replace_first_regex("([[:digit:]])[[:space:]]+([[:alpha:]\\*])", "$1\t$2") %>% 
  stri_split_fixed("\t") %>%  
  unlist() %>% 
  keep(stri_detect_regex, "\\$[[:digit:]]") %>% 
  stri_match_first_regex("([^\\$]+)([\\$[:digit:],\\.]+)$") %>% 
  .[,2:3] %>% 
  as.data.frame(stringsAsFactors=FALSE) %>% 
  as_tibble() %>% 
  mutate_all(.funs = stri_trim_both) %>% 
  mutate(V2 = stri_replace_first_fixed(V2, "$", "")) %>% 
  type_convert(
    col_types = cols(
      V1 = col_character(),
      V2 = col_number()
    )
  ) %>% 
  mutate(V1 = case_when(
    stri_detect_fixed(V1, "IPR/Copy") ~ "IPR/Copyright and Counterfeit",
    stri_detect_fixed(V1, "Malware/Sca") ~ "Malware/Scareware/Virus",
    stri_detect_fixed(V1, "Harassment/T") ~ "Harassment/Threats of Violence",
    stri_detect_fixed(V1, "Ransomware") ~ "Ransomware",
    stri_detect_fixed(V1, "Denial of Service") ~ "Denial of Service/TDoS",
    stri_detect_fixed(V1, "Re-Shipping") ~ "Re-shipping",
    TRUE ~ V1
  )) %>% 
  set_names(c("crime", "loss")) %>% 
  head(-2) %>% 
  left_join(victims, "crime") %>% 
  mutate(loss = loss / 1000000) -> crime_types

write_csv(crime_types, here::here("data/2018-fbi-ic3-loss-by-crime-type.csv"))

ggplot() +
  geom_point(
    data = mutate(crime_types, color = case_when(
      (loss >= 100) | (victim_count >= 20000) ~ "#E85E26",
      TRUE ~ "#31739C"
    )), 
    aes(victim_count, loss, color = I(color))
  ) +
  geom_label_repel(
    data = filter(crime_types, (loss >= 100) | (victim_count >= 20000)),
    aes(victim_count, loss, label = crime),
    size = 3, family = font_rc
  ) +
  scale_x_comma() +
  scale_y_continuous(label = scales::dollar) +
  labs(
    x = "Victim count", y = "Loss (USD, millions)",
    title = "[Business] E-mail Account Compromise was the Most Profitable\nIC3 Crime in 2018 with over $1.2 billion (USD) in Losses",
    subtitle = "Markers only on IC3 crimes with ≥$100m (USD) losses or ≥20,000 victims ",
    caption = "Source: 2018 FBI IC3; Pages 19-20 '2018 Crime Types'"
  ) +
  theme_ipsum_rc(grid="XY")

arrange(crime_types, desc(loss)) %>% 
  select(`Crime` = 1, `Loss (USD, millions)` = 2, `Victim Count` = 3) %>% 
  gt::gt() %>% 
  gt::fmt_number("Victim Count", decimals = 0) %>% 
  gt::fmt_currency("Loss (USD, millions)", decimals = 2)

Crime	Loss (USD, millions)	Victim Count
BEC/EAC	$1,297.80	20,373
Confidence Fraud/Romance	$362.50	18,493
Investment	$252.96	3,693
Non-Payment/Non-Delivery	$183.83	65,116
Real Estate/Rental	$149.46	11,300
Personal Data Breach	$148.89	50,642
Corporate Data Breach	$117.71	2,480
Identity Theft	$100.43	16,128
Advanced Fee	$92.27	16,362
Credit Card Fraud	$88.99	15,210
Extortion	$83.36	51,146
Spoofing	$70.00	15,569
Government Impersonation	$64.21	10,978
Other	$63.13	10,826
Lottery/Sweepstakes	$60.21	7,146
Overpayment	$53.23	15,512
Phishing/Vishing/Smishing/Pharming	$48.24	26,379
Employment	$45.49	14,979
Tech Support	$38.70	14,408
Harassment/Threats of Violence	$21.90	18,415
Misrepresentation	$20.00	5,959
IPR/Copyright and Counterfeit	$15.80	2,249
Civil Matter	$15.17	768
Malware/Scareware/Virus	$7.41	2,811
Health Care Related	$4.47	337
Ransomware	$3.62	1,493
Denial of Service/TDoS	$2.05	1,799
Re-shipping	$1.68	907
Charity	$1.01	493
Gambling	$0.93	181
Crimes Against Children	$0.27	1,394
Hacktivist	$0.08	77
Terrorism	$0.01	120
No Lead Value	$0.00	36,936

ic3[[21]] %>%
  stri_split_lines() %>% 
  unlist() %>% 
  keep(stri_detect_regex, "^[[:digit:]]") %>% 
  stri_replace_first_regex("([[:digit:],]+)[[:space:]]+([[:digit:]])", "$1\t$2") %>% 
  stri_split_fixed("\t") %>% 
  unlist() %>% 
  stri_replace_first_regex("^[[:digit:] ]+", "") %>% 
  stri_replace_first_regex("[[:space:]]+([[:digit:]])", "\t$1") %>% 
  stri_split_fixed("\t", simplify = TRUE) %>% 
  as.data.frame(stringsAsFactors=FALSE) %>% 
  as_tibble() %>% 
  mutate_all(.funs = stri_trim_both) %>% 
  mutate(V1 = case_when(
    stri_detect_fixed(V1, "Northern Marina") ~ "Northern Mariana Islands",
    TRUE ~ V1
  )) %>% 
  type_convert(
    col_types = cols(
      V1 = col_character(),
      V2 = col_number()
    )
  ) %>% 
  set_names(c("state", "victim_count")) -> state_vics

ic3[[23]] %>%
  stri_split_lines() %>% 
  unlist() %>% 
  keep(stri_detect_regex, "^[[:space:]]+[[:digit:]]") %>% 
  stri_replace_first_regex("([[:digit:],]+)[[:space:]]+([[:digit:]])", "$1\t$2") %>% 
  stri_split_fixed("\t") %>% 
  unlist() %>% 
  stri_replace_first_regex("^[[:digit:] ]+", "") %>% 
  stri_replace_first_regex("[[:space:]]+([[:digit:]])", "\t$1") %>% 
  stri_split_fixed("\t", simplify = TRUE) %>% 
  as.data.frame(stringsAsFactors=FALSE) %>% 
  as_tibble() %>% 
  mutate_all(.funs = stri_trim_both) %>% 
  mutate(V1 = case_when(
    stri_detect_fixed(V1, "Northern Marina") ~ "Northern Mariana Islands",
    TRUE ~ V1
  )) %>% 
  type_convert(
    col_types = cols(
      V1 = col_character(),
      V2 = col_number()
    )
  ) %>% 
  set_names(c("state", "subject_earnings")) -> subj_earnings

ic3[[22]] %>%
  stri_split_lines() %>% 
  unlist() %>% 
  keep(stri_detect_regex, "^    [[:digit:]]") %>% 
  stri_trim_both() %>% 
  stri_replace_first_regex("([[:digit:],]+)[[:space:]]+([[:digit:]])", "$1\t$2") %>% 
  stri_split_fixed("\t") %>% 
  unlist() %>% 
  stri_replace_first_regex("^[[:digit:] ]+", "") %>% 
  stri_replace_first_regex("[[:space:]]+\\$([[:digit:]])", "\t$1") %>% 
  stri_split_fixed("\t", simplify = TRUE) %>% 
  as.data.frame(stringsAsFactors=FALSE) %>% 
  as_tibble() %>%
  mutate_all(.funs = stri_trim_both) %>% 
  type_convert(
    col_types = cols(
      V1 = col_character(),
      V2 = col_number()
    )
  ) %>%  
  set_names(c("state", "loss")) %>% 
  left_join(subj_earnings, "state") %>% 
  left_join(state_vics, "state") %>% 
  mutate(subject_earnings = subject_earnings / 1000000) %>% 
  mutate(loss = loss / 1000000) -> by_state

if (!file.exists(here::here("raw/2018-pop-est.xlsx"))) {
  download.file(
    url = "https://www2.census.gov/programs-surveys/popest/tables/2010-2018/national/totals/nst-est2018-01.xlsx", 
    destfile = here::here("raw/2018-pop-est.xlsx")
  )
}

read_excel(here::here("raw/2018-pop-est.xlsx"), skip=9, col_names = FALSE) %>% 
  select(state = 1, pop_2018 = 12) %>% 
  mutate(state = stri_replace_first_fixed(state, ".", "")) %>%
  filter(!is.na(state), !is.na(pop_2018)) %>% 
  add_row(state = "U.S. Virgin Islands", pop_2018 = 104914) %>% 
  add_row(state = "Guam", pop_2018 = 165718) %>% 
  add_row(state = "U.S. Minor Outlying Islands", pop_2018 = 270) %>% 
  add_row(state = "American Samoa", pop_2018 = 55679) %>% 
  add_row(state = "Northern Mariana Islands", pop_2018 = 55194) -> pops

left_join(by_state, pops, "state")  %>% 
  mutate(
    loss_per_vic = loss/victim_count,
    frac = victim_count / pop_2018
  ) -> by_state

write_csv(by_state, here::here("data/2018-fbi-ic3-loss-by-state.csv"))

arrange(by_state, desc(frac)) %>%
  mutate(loss_per_vic = loss_per_vic * 1000000) %>% 
  select(
    `State` = 1, 
    `Victim Count` = 4, 
    `Loss (USD, millions)` = 3, 
    `Loss per Victim (USD)` = 6, 
    `% Population Impacted` = 7,
    `Subject Earnings (USD, millions)` = 2
  ) %>% 
  gt::gt() %>% 
  gt::fmt_number("Victim Count", decimals = 0) %>% 
  gt::fmt_currency("Loss (USD, millions)", decimals = 2) %>% 
  gt::fmt_currency("Loss per Victim (USD)", decimals = 2) %>% 
  gt::fmt_percent("% Population Impacted", decimals = 4) %>% 
  gt::fmt_currency("Subject Earnings (USD, millions)", decimals = 2)

State	Victim Count	Loss (USD, millions)	Loss per Victim (USD)	% Population Impacted	Subject Earnings (USD, millions)
U.S. Minor Outlying Islands	47	$0.00	$2,049.91	17.4074%	$0.10
Alaska	1,603	$0.00	$2,256.30	0.2174%	$3.62
District of Columbia	1,364	$0.00	$6,524.80	0.1942%	$8.90
Virginia	14,800	$0.01	$2,958.95	0.1738%	$43.79
Nevada	5,228	$0.00	$5,531.93	0.1723%	$28.92
Colorado	9,328	$0.00	$3,653.82	0.1638%	$34.08
Maryland	8,777	$0.00	$5,375.44	0.1452%	$47.18
Washington	10,775	$0.00	$5,616.07	0.1430%	$60.51
California	49,031	$0.02	$9,187.70	0.1240%	$450.48
Wisconsin	6,621	$0.00	$3,722.89	0.1139%	$24.65
Florida	23,984	$0.01	$7,427.51	0.1126%	$178.14
Arizona	8,027	$0.00	$5,626.77	0.1119%	$45.17
Oregon	4,511	$0.00	$6,340.05	0.1076%	$28.60
New Mexico	2,127	$0.00	$4,051.61	0.1015%	$8.62
Rhode Island	1,028	$0.00	$6,740.27	0.0972%	$6.93
Utah	3,041	$0.00	$6,779.82	0.0962%	$20.62
New Jersey	8,440	$0.00	$9,444.52	0.0947%	$79.71
Alabama	4,585	$0.00	$3,688.35	0.0938%	$16.91
Delaware	897	$0.00	$3,502.11	0.0927%	$3.14
New York	18,124	$0.01	$11,095.24	0.0927%	$201.09
Missouri	5,508	$0.00	$4,643.74	0.0899%	$25.58
Massachusetts	6,173	$0.00	$11,054.95	0.0894%	$68.24
Texas	25,589	$0.01	$7,644.34	0.0892%	$195.61
Connecticut	3,134	$0.00	$12,080.38	0.0877%	$37.86
Georgia	9,095	$0.00	$6,758.33	0.0865%	$61.47
Idaho	1,513	$0.00	$4,529.54	0.0862%	$6.85
Wyoming	497	$0.00	$9,088.79	0.0860%	$4.52
Vermont	525	$0.00	$4,052.03	0.0838%	$2.13
Tennessee	5,584	$0.00	$5,120.06	0.0825%	$28.59
Pennsylvania	10,554	$0.00	$5,940.19	0.0824%	$62.69
Illinois	10,087	$0.00	$8,213.52	0.0792%	$82.85
New Hampshire	1,056	$0.00	$5,761.96	0.0778%	$6.08
Hawaii	1,100	$0.00	$5,873.44	0.0774%	$6.46
Minnesota	4,304	$0.00	$11,341.56	0.0767%	$48.81
Michigan	7,533	$0.00	$10,743.37	0.0754%	$80.93
Louisiana	3,469	$0.00	$4,726.51	0.0744%	$16.40
Montana	787	$0.00	$8,401.60	0.0741%	$6.61
North Carolina	7,523	$0.00	$18,241.52	0.0725%	$137.23
Kansas	2,098	$0.00	$8,329.25	0.0721%	$17.47
South Carolina	3,575	$0.00	$5,473.54	0.0703%	$19.57
Indiana	4,676	$0.00	$6,325.43	0.0699%	$29.58
Oklahoma	2,644	$0.00	$4,382.72	0.0671%	$11.59
Ohio	7,812	$0.00	$12,510.25	0.0668%	$97.73
Mississippi	1,882	$0.00	$3,041.99	0.0630%	$5.73
Kentucky	2,813	$0.00	$3,324.84	0.0630%	$9.35
Iowa	1,983	$0.00	$7,734.73	0.0628%	$15.34
Nebraska	1,205	$0.00	$7,822.97	0.0625%	$9.43
Maine	832	$0.00	$3,244.89	0.0622%	$2.70
U.S. Virgin Islands	65	$0.00	$41,904.46	0.0620%	$2.72
West Virginia	1,109	$0.00	$7,483.10	0.0614%	$8.30
Arkansas	1,849	$0.00	$3,770.43	0.0614%	$6.97
North Dakota	459	$0.00	$5,003.90	0.0604%	$2.30
South Dakota	465	$0.00	$3,728.66	0.0527%	$1.73
Guam	52	$0.00	$2,981.83	0.0314%	$0.16
American Samoa	16	$0.00	$1,158.56	0.0287%	$0.02
Northern Mariana Islands	15	$0.00	$924.33	0.0272%	$0.01
Puerto Rico	704	$0.00	$7,413.48	0.0220%	$5.22

by_state <- mutate(by_state, loss_per_vic = loss_per_vic * 1000000)

ggplot(by_state) +
  geom_quasirandom(aes(x="", loss_per_vic)) -> gg

gb <- ggplot_build(gg)

as_tibble(gb$data[[1]]) %>% 
  select(x, y) %>% 
  left_join(by_state, c("y" = "loss_per_vic")) %>% 
  rename(loss_per_vic = y) -> gd

ggplot() +
  geom_blank(data = gd, aes(x, loss_per_vic)) +
  geom_hline(
    yintercept = round(median(gd$loss_per_vic)),
    linetype = "dotted", color = "#3B454A"
  ) +
  geom_label(
    data = data.frame(),
    aes(
      x = 0.5, y = round(median(gd$loss_per_vic)),
      label = sprintf(
        "2018 IC3 Median\nPer In-State\nVictim Loss\n($%s USD)", 
        scales::comma(round(median(gd$loss_per_vic)))
      )
    ), size = 3, family = font_rc, hjust = 0, vjust = 0,
    label.size = 0, lineheight = 0.875
  ) +
  geom_point(
    data = mutate(gd, color = case_when(
      (loss_per_vic >= 10000) ~ "#E85E26",
      TRUE ~ "#31739C"
    )), 
    aes(x=x, loss_per_vic, color = I(color))
  ) +
  geom_label_repel(nudge_y = 2500,
                   data = filter(gd, loss_per_vic >= 10000),
                   aes(x=x, loss_per_vic, label = state),
                   size = 3, family = font_rc
  ) +
  scale_x_continuous(expand = c(0,0.125)) +
  scale_y_continuous(label = scales::dollar) +
  labs(
    x = "Victim count", y = "Loss (USD, millions)",
    title = "U.S. Virgin Islands Residents Were Hit Hardest\nin IC3 2018 Catalogued Incidents",
    subtitle = "Markers only on IC3 states with ≥$10K (USD) losses per in-state victim",
    caption = "Source: 2018 FBI IC3; Pages 21-22 '2018 Overall State Statistics'"
  ) +
  theme_ipsum_rc(grid="XY")

Name		Name	Last commit message	Last commit date
Latest commit History 3 Commits
data		data
raw		raw
wrangle_files/figure-gfm		wrangle_files/figure-gfm
.gitignore		.gitignore
2018-fbi-ic3.Rproj		2018-fbi-ic3.Rproj
README.md		README.md
wrangle.Rmd		wrangle.Rmd
wrangle.md		wrangle.md

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Repository files navigation

2018 FBI IC3 PDF Data Wrangling

About

Releases

Packages

hrbrmstr/fbi-2018-ic3

Folders and files

Latest commit

History

Repository files navigation

2018 FBI IC3 PDF Data Wrangling

About

Topics

Resources

Stars

Watchers

Forks

Releases

Packages 0

Packages