Skip to content

Commit

Permalink
Implemented trawling for nutritional facts from website
Browse files Browse the repository at this point in the history
  • Loading branch information
uglrl committed Nov 23, 2023
1 parent 1c92471 commit 87f8a3c
Show file tree
Hide file tree
Showing 9 changed files with 9,409 additions and 641 deletions.
21 changes: 21 additions & 0 deletions backend/src/interface/mensa_parser/model.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,25 @@ pub struct Dish {
pub meal_type: MealType,
/// The environmental score of the dish, which is an integer between 0 and 3. (Higher is better) 0 indicates that no score was present.
pub env_score: u32,
/// The nutritional information of the dish
pub nutrition_data: Option<NutritionData>,
}

/// The nutritional values of a dish.
///
/// Every value is stored as an unsigned whole number in the unit named on the field.
/// The type is a small plain-data record, so it can be copied and compared freely.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct NutritionData {
    /// Energy in kcal.
    pub energy: u32,
    /// Protein in grams.
    pub protein: u32,
    /// Carbohydrates in grams.
    pub carbohydrates: u32,
    /// Sugar in grams.
    pub sugar: u32,
    /// Fat in grams.
    pub fat: u32,
    /// Saturated fat in grams.
    pub saturated_fat: u32,
    /// Salt in grams.
    pub salt: u32,
}
47 changes: 45 additions & 2 deletions backend/src/layer/data/swka_parser/html_parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@
//! ```

use crate::interface::mensa_parser::{
model::{Dish, ParseCanteen, ParseLine},
model::{Dish, NutritionData, ParseCanteen, ParseLine},
ParseError,
};
use crate::util::{Additive, Allergen, Date, MealType, Price};
Expand Down Expand Up @@ -117,6 +117,12 @@ lazy_static! {
static ref ALLERGEN_REGEX: Regex = Regex::new(r"[A-Z]\w+").expect(REGEX_PARSE_E_MSG);
/// A regex for getting additives. An additive consists of one or two digits.
static ref ADDITIVE_REGEX: Regex = Regex::new(r"[0-9]{1,2}").expect(REGEX_PARSE_E_MSG);

static ref ENERGY_REGEX: Regex = Regex::new(r"([1-9][0-9]*) kcal").expect(REGEX_PARSE_E_MSG);

static ref WEIGHT_REGEX: Regex = Regex::new(r"([1-9][0-9]*) g").expect(REGEX_PARSE_E_MSG);

static ref ID_REGEX: Regex = Regex::new(r"[0-9]{18,}").expect(REGEX_PARSE_E_MSG);
}

const DISH_NODE_CLASS_SELECTOR_PREFIX: &str = "tr.mt-";
Expand Down Expand Up @@ -296,6 +302,7 @@ impl HTMLParser {
additives: Self::get_dish_additives(dish_node).unwrap_or_default(),
meal_type: Self::get_dish_type(dish_node).unwrap_or(MealType::Unknown),
env_score: Self::get_dish_env_score(dish_node).unwrap_or_default(),
nutrition_data: Self::get_dish_nutrition_data(dish_node),
})
}

Expand Down Expand Up @@ -385,6 +392,42 @@ impl HTMLParser {
.parse::<u32>()
.ok()
}

/// Collects the nutritional values that belong to the given dish.
///
/// The nutrition node of the dish is looked up first; each nutrient is then read from the
/// `div` carrying the matching class name. Returns `None` as soon as the node or any single
/// value cannot be obtained.
fn get_dish_nutrition_data(dish_node: &ElementRef) -> Option<NutritionData> {
    let facts = Self::get_nutrition_node(dish_node)?;
    // Every nutrient except the energy is a weight, so they all share one regex.
    let grams = |class: &str| Self::get_nutrients(&facts, class, &WEIGHT_REGEX);

    let energy = Self::get_nutrients(&facts, "energie", &ENERGY_REGEX)?;
    let protein = grams("proteine")?;
    let carbohydrates = grams("kohlenhydrate")?;
    let sugar = grams("zucker")?;
    let fat = grams("fett")?;
    let saturated_fat = grams("gesaettigt")?;
    let salt = grams("salz")?;

    Some(NutritionData {
        energy,
        protein,
        carbohydrates,
        sugar,
        fat,
        saturated_fat,
        salt,
    })
}

/// Finds the nutrition facts cell that belongs to the given dish.
///
/// The meal id extracted from the dish node is used to build a selector of the form
/// `td.nutrition_facts_row.co2_id-<id>`, which is then searched for below the parent of
/// the dish node. Returns `None` if there is no id, no parent element, or no matching cell.
fn get_nutrition_node<'a>(dish_node: &'a ElementRef<'a>) -> Option<ElementRef<'a>> {
    let meal_id = Self::get_meal_id(dish_node)?;
    let css = format!("td.nutrition_facts_row.co2_id-{meal_id}");
    let row_selector = Selector::parse(&css).ok()?;
    // The search starts at the parent so that rows next to the dish node are included.
    let parent = dish_node.parent().and_then(ElementRef::wrap)?;
    parent.select(&row_selector).next()
}

/// Extracts the meal id from the HTML of the dish node.
///
/// The id is the first match of `ID_REGEX` in the serialized node; `None` if nothing matches.
fn get_meal_id(dish_node: &ElementRef) -> Option<String> {
    let html = dish_node.html();
    ID_REGEX.find(&html).map(|id| id.as_str().to_string())
}

/// Reads a single nutrient value from the nutrition node.
///
/// Selects the first `div` with the class `name` below `nutrition_node`, applies `regex` to
/// its inner HTML and parses the first capture group as `u32`. Returns `None` if the element
/// is missing, the regex does not match, or the captured text is not a valid number.
fn get_nutrients(nutrition_node: &ElementRef, name: &str, regex: &Regex) -> Option<u32> {
    let css = format!("div.{name}");
    let selector = Selector::parse(&css).ok()?;
    let cell = nutrition_node.select(&selector).next()?;
    let html = cell.inner_html();
    // Group 1 holds the bare number without its unit.
    let amount = regex.captures(&html)?.get(1)?;
    amount.as_str().parse().ok()
}
}

#[cfg(test)]
Expand Down Expand Up @@ -458,7 +501,7 @@ mod tests {
let file_contents = read_from_file(path).unwrap();
let canteen_data = HTMLParser::new().transform(&file_contents, 42_u32).unwrap();

//write_output_to_file(path, &canteen_data);
let _ = write_output_to_file(path, &canteen_data);
let expected = read_from_file(&path.replace(".html", ".txt"))
.unwrap()
.replace("\r\n", "\n");
Expand Down
2 changes: 2 additions & 0 deletions backend/src/layer/data/swka_parser/test_data/test_1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
additives: [],
meal_type: Unknown,
env_score: 0,
nutrition_data: None,
},
],
pos: 0,
Expand Down Expand Up @@ -47,6 +48,7 @@
additives: [],
meal_type: Unknown,
env_score: 0,
nutrition_data: None,
},
],
pos: 0,
Expand Down
8,853 changes: 8,214 additions & 639 deletions backend/src/layer/data/swka_parser/test_data/test_mensa_moltke.html

Large diffs are not rendered by default.

Loading

0 comments on commit 87f8a3c

Please sign in to comment.