diff --git a/solidago/.gitignore b/solidago/.gitignore index 805b520ca3..137fac7124 100644 --- a/solidago/.gitignore +++ b/solidago/.gitignore @@ -1,3 +1,4 @@ +/devenv/ /dist/ **__pycache__/ .flake8/ diff --git a/solidago/src/solidago/pipeline/inputs.py b/solidago/src/solidago/pipeline/inputs.py index 48c93e4c00..9c86361856 100644 --- a/solidago/src/solidago/pipeline/inputs.py +++ b/solidago/src/solidago/pipeline/inputs.py @@ -11,6 +11,18 @@ class TournesolInput(ABC): + """ + An abstract base class for handling input data of Solidago pipeline. + + This class provides an interface for retrieving and processing comparison data, + user ratings, individual scores, and vouches. + + Notes + ----- + This is an abstract base class that must be subclassed and have its abstract + methods implemented to provide concrete data retrieval functionality. + """ + SCALING_CALIBRATION_MIN_TRUST_SCORE = 0.1 MAX_SCALING_CALIBRATION_USERS = 100 @@ -52,9 +64,18 @@ def ratings_properties(self) -> pd.DataFrame: @abstractmethod def get_individual_scores( self, - criteria: Optional[str] = None, user_id: Optional[int] = None, - ) -> Optional[pd.DataFrame]: + criteria: Optional[str] = None, + ) -> pd.DataFrame: + """Fetch data about previously computed individual scores + + Returns: + - DataFrame with columns + * `user_id`: int + * `entity_id`: int + * `criteria`: str + * `score`: float + """ raise NotImplementedError @abstractmethod @@ -70,7 +91,9 @@ def get_vouches(self): raise NotImplementedError def get_users(self): - raise NotImplementedError + users = self.ratings_properties.groupby("user_id").first()[["trust_score"]] + users["is_pretrusted"] = users["trust_score"] >= 0.8 + return users def get_pipeline_kwargs(self, criterion: str): ratings_properties = self.ratings_properties @@ -107,6 +130,26 @@ def get_pipeline_kwargs(self, criterion: str): "judgments": judgments, } + def get_comparisons_counts( + self, criteria: Optional[str] = None, user_id: Optional[int] = None + ): + comparisons = self.get_comparisons(criteria=criteria, user_id=user_id) + return ( + pd.concat( + [ + comparisons[["user_id", "entity_a", "criteria"]].rename( + columns={"entity_a": "entity_id"} + ), + comparisons[["user_id", "entity_b", "criteria"]].rename( + columns={"entity_b": "entity_id"} + ), + ] + ) + .groupby(["user_id", "entity_id", "criteria"]) + .size() + .reset_index(name="n_comparisons") + ) + class TournesolInputFromPublicDataset(TournesolInput): def __init__(self, dataset_zip: Union[str, BinaryIO]): @@ -116,26 +159,6 @@ def __init__(self, dataset_zip: Union[str, BinaryIO]): dataset_zip, _headers = urlretrieve(dataset_zip) # nosec B310 with zipfile.ZipFile(dataset_zip) as zip_file: - with (zipfile.Path(zip_file) / "comparisons.csv").open(mode="rb") as comparison_file: - # keep_default_na=False is required otherwise some public usernames - # such as "NA" are converted to float NaN. - self.comparisons = pd.read_csv(comparison_file, keep_default_na=False) - self.entity_id_to_video_id = pd.Series( - list(set(self.comparisons.video_a) | set(self.comparisons.video_b)), - name="video_id", - ) - video_id_to_entity_id = { - video_id: entity_id - for (entity_id, video_id) in self.entity_id_to_video_id.items() - } - self.comparisons["entity_a"] = self.comparisons["video_a"].map( - video_id_to_entity_id - ) - self.comparisons["entity_b"] = self.comparisons["video_b"].map( - video_id_to_entity_id - ) - self.comparisons.drop(columns=["video_a", "video_b"], inplace=True) - with (zipfile.Path(zip_file) / "users.csv").open(mode="rb") as users_file: # keep_default_na=False is required otherwise some public usernames # such as "NA" are converted to float NaN. @@ -144,25 +167,62 @@ def __init__(self, dataset_zip: Union[str, BinaryIO]): # Fill trust_score on newly created users for which it was not computed yet self.users.trust_score = pd.to_numeric(self.users.trust_score).fillna(0.0) - self.username_to_user_id = pd.Series( - data=self.users.index, index=self.users["public_username"] - ) - self.comparisons = self.comparisons.join(self.username_to_user_id, on="public_username") + with (zipfile.Path(zip_file) / "collective_criteria_scores.csv").open(mode="rb") as collective_scores_file: + # keep_default_na=False is required otherwise some public usernames + # such as "NA" are converted to float NaN. + collective_scores = pd.read_csv(collective_scores_file, keep_default_na=False) - with (zipfile.Path(zip_file) / "vouchers.csv").open(mode="rb") as vouchers_file: + with (zipfile.Path(zip_file) / "comparisons.csv").open(mode="rb") as comparison_file: # keep_default_na=False is required otherwise some public usernames # such as "NA" are converted to float NaN. - self.vouchers = pd.read_csv(vouchers_file, keep_default_na=False) + comparisons = pd.read_csv(comparison_file, keep_default_na=False) - with (zipfile.Path(zip_file) / "collective_criteria_scores.csv").open(mode="rb") as collective_scores_file: + with (zipfile.Path(zip_file) / "vouchers.csv").open(mode="rb") as vouchers_file: # keep_default_na=False is required otherwise some public usernames # such as "NA" are converted to float NaN. - self.collective_scores = pd.read_csv(collective_scores_file, keep_default_na=False) + self.vouchers = pd.read_csv(vouchers_file, keep_default_na=False) - with (zipfile.Path(zip_file) / "individual_criteria_scores.csv").open(mode="rb") as individual_scores_file: + self.username_to_user_id = pd.Series( + data=self.users.index, + index=self.users["public_username"], + ) + + self.entity_id_to_video_id = pd.Series( + sorted( + set(comparisons.video_a) + | set(comparisons.video_b) + | set(collective_scores.video) + ), + name="video_id", + ) + + self.video_id_to_entity_id = { + video_id: entity_id + for (entity_id, video_id) in self.entity_id_to_video_id.items() + } + + # Convert video ids (str) to entity ids (int) + self.collective_scores = collective_scores.assign( + entity_id=collective_scores["video"].map(self.video_id_to_entity_id) + ).drop(columns=["video"]) + + self.comparisons = comparisons.assign( + entity_a=comparisons["video_a"].map(self.video_id_to_entity_id), + entity_b=comparisons["video_b"].map(self.video_id_to_entity_id), + user_id=comparisons["public_username"].map(self.username_to_user_id), + ).drop(columns=["video_a", "video_b"]) + + with (zipfile.Path(zip_file) / "individual_criteria_scores.csv").open( + mode="rb" + ) as individual_scores_file: # keep_default_na=False is required otherwise some public usernames # such as "NA" are converted to float NaN. - self.individual_scores = pd.read_csv(individual_scores_file, keep_default_na=False) + individual_scores = pd.read_csv(individual_scores_file, keep_default_na=False) + # Convert usernames and video_id to user_id and entity_id + self.individual_scores = individual_scores.assign( + entity_id=individual_scores["video"].map(self.video_id_to_entity_id), + user_id=individual_scores["public_username"].map(self.username_to_user_id), + ).drop(columns=["public_username", "video"]) @classmethod def download(cls) -> "TournesolInputFromPublicDataset": @@ -178,7 +238,15 @@ def get_comparisons(self, criteria=None, user_id=None) -> pd.DataFrame: if "score_max" not in dtf: # For compatibility with older datasets dtf["score_max"] = 10 - return dtf[["user_id", "entity_a", "entity_b", "criteria", "score", "score_max", "weight"]] + return dtf[[ + "user_id", + "entity_a", + "entity_b", + "criteria", + "score", + "score_max", + "weight" + ]] @cached_property def ratings_properties(self): @@ -201,11 +269,63 @@ def ratings_properties(self): def get_individual_scores( self, - criteria: Optional[str] = None, user_id: Optional[int] = None, - ) -> Optional[pd.DataFrame]: - # TODO: read contributor scores from individual_scores.csv - return None + criteria: Optional[str] = None, + with_n_comparisons = False, + ) -> pd.DataFrame: + dtf = self.individual_scores + if criteria is not None: + dtf = dtf[dtf.criteria == criteria] + if user_id is not None: + dtf = dtf[dtf.user_id == user_id] + + dtf = dtf[[ + "user_id", + "entity_id", + "criteria", + "score", + "uncertainty", + "voting_right", + ]] + + if with_n_comparisons: + comparison_counts = self.get_comparisons_counts(user_id=user_id, criteria=criteria) + dtf = dtf.merge( + comparison_counts, + how="left", + on=["user_id", "entity_id", "criteria"] + ) + + return dtf + + def get_collective_scores( + self, + entity_id: Optional[str] = None, + criteria: Optional[str] = None, + ) -> pd.DataFrame: + dtf: pd.DataFrame = self.collective_scores + if criteria is not None: + dtf = dtf[dtf["criteria"] == criteria] + if entity_id is not None: + dtf = dtf[dtf["entity_id"] == entity_id] + + counts = ( + self.get_comparisons_counts(criteria=criteria) + .groupby(["criteria", "entity_id"]) + .agg( + n_comparisons=("n_comparisons", "sum"), + n_users=("user_id", "nunique"), + ) + ) + + return ( + dtf.join(counts, how="left", on=["criteria", "entity_id"]) + # Entities that have been compared privately only + # will not appear in comparisons.csv. That's why we need + # to fill for missing values here. + .fillna({"n_comparisons": 0, "n_users": 0}) + .astype({"n_comparisons": "int64", "n_users": "int64"}) + ) def get_vouches(self): vouchers = self.vouchers[ @@ -219,8 +339,3 @@ def get_vouches(self): "vouch": vouchers.value, } ) - - def get_users(self): - users = self.ratings_properties.groupby("user_id").first()[["trust_score"]] - users["is_pretrusted"] = users["trust_score"] >= 0.8 - return users diff --git a/solidago/tests/test_pipeline.py b/solidago/tests/test_pipeline.py index 860077cdc4..12132aafee 100644 --- a/solidago/tests/test_pipeline.py +++ b/solidago/tests/test_pipeline.py @@ -3,8 +3,95 @@ from solidago.pipeline import Pipeline from solidago.pipeline.inputs import TournesolInputFromPublicDataset + @pytest.mark.parametrize("test", range(5)) def test_pipeline_test_data(test): td = import_module(f"data.data_{test}") Pipeline()(td.users, td.vouches, td.entities, td.privacy, td.judgments) + +def test_tournesol_get_comparisons(): + dataset = TournesolInputFromPublicDataset("tests/data/tiny_tournesol.zip") + + # Test no filter + assert len(dataset.get_comparisons()) == 38387 + + # Test single filter + assert len(dataset.get_comparisons( + criteria="importance" + )) == 17143 + assert len(dataset.get_comparisons( + user_id=dataset.username_to_user_id["le_science4all"] + )) == 5604 + + # Test all filters + assert len(dataset.get_comparisons( + criteria="largely_recommended", + user_id=dataset.username_to_user_id["lpfaucon"] + )) == 8471 + + +def test_tournesol_get_individual_scores(): + dataset = TournesolInputFromPublicDataset("tests/data/tiny_tournesol.zip") + + # Test no filter + assert len(dataset.get_individual_scores()) == 17319 + + # Test single filter + assert len(dataset.get_individual_scores( + criteria="largely_recommended" + )) == 9176 + assert len(dataset.get_individual_scores( + user_id=dataset.username_to_user_id["aidjango"] + )) == 4379 + + # Test all filters + user_id = dataset.username_to_user_id["le_science4all"] + found = dataset.get_individual_scores( + criteria="importance", + user_id=user_id, + with_n_comparisons=True, + ) + assert len(found) == 1123 + as_dict = found.to_dict(orient="records")[0] + assert as_dict == { + 'user_id': user_id, + 'criteria': 'importance', + 'entity_id': dataset.video_id_to_entity_id["03dTJ4nXkXw"], + 'score': 82.81, + 'uncertainty': 24.37, + 'voting_right': 1.0, + 'n_comparisons': 10, + } + + +def test_tournesol_get_collective_scores(): + dataset = TournesolInputFromPublicDataset("tests/data/tiny_tournesol.zip") + + # Test no filter + assert len(dataset.get_collective_scores()) == 12184 + + # Test single filter + assert len(dataset.get_collective_scores( + criteria="largely_recommended" + )) == 6227 + assert len(dataset.get_collective_scores( + entity_id=dataset.video_id_to_entity_id["kX3JKg-H5qM"] + )) == 2 + + # Test all filters + entity_id = dataset.video_id_to_entity_id["OlhC6n9Hhac"] + found = dataset.get_collective_scores( + criteria="importance", + entity_id=entity_id + ) + assert len(found) == 1 + as_dict = found.to_dict(orient="records")[0] + assert as_dict == { + 'entity_id': entity_id, + 'criteria': 'importance', + 'score': 18.22, + 'uncertainty': 60.09, + 'n_users': 3, + 'n_comparisons': 12, + }