Commit
[solidago] Implement get_individual_scores & get_collective_scores (#1994)

* [solidago] Implemented get_individual_scores & get_collective_scores

* [solidago] revert adding new filter entity_id for get_individual_scores

* [solidago] Keep get_collective_scores() specific to the public dataset, abstract comparisons count (#2022)

* Update solidago/src/solidago/pipeline/inputs.py

Co-authored-by: Gresille & Siffle <39056254+GresilleSiffle@users.noreply.github.com>

* remove incomplete __all__ in trust_propagation

* add docstring in TournesolInput

---------

Co-authored-by: Adrien Matissart <amatissart@users.noreply.github.com>
Co-authored-by: Gresille & Siffle <39056254+GresilleSiffle@users.noreply.github.com>
3 people authored Oct 31, 2024
1 parent affdb59 commit 714c581
Showing 3 changed files with 246 additions and 43 deletions.
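
For orientation, a minimal usage sketch of the accessors this commit introduces. The zip path and criterion are borrowed from the test fixture exercised further down and are only illustrative, not requirements of the API.

```python
# Hedged sketch of the new TournesolInput accessors; the dataset path and
# criterion below come from this commit's test suite and are illustrative.
from solidago.pipeline.inputs import TournesolInputFromPublicDataset

dataset = TournesolInputFromPublicDataset("tests/data/tiny_tournesol.zip")

# Individual scores, optionally restricted to one criterion and enriched
# with the number of public comparisons per (user, entity, criterion).
individual = dataset.get_individual_scores(
    criteria="largely_recommended",
    with_n_comparisons=True,
)

# Collective scores per entity, with n_users and n_comparisons columns.
collective = dataset.get_collective_scores(criteria="largely_recommended")

print(individual.head())
print(collective.head())
```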
1 change: 1 addition & 0 deletions solidago/.gitignore
@@ -1,3 +1,4 @@
/devenv/
/dist/
**__pycache__/
.flake8/
201 changes: 158 additions & 43 deletions solidago/src/solidago/pipeline/inputs.py
@@ -11,6 +11,18 @@


class TournesolInput(ABC):
"""
An abstract base class for handling input data of Solidago pipeline.
This class provides an interface for retrieving and processing comparison data,
user ratings, individual scores, and vouches.
Notes
-----
This is an abstract base class that must be subclassed and have its abstract
methods implemented to provide concrete data retrieval functionality.
"""

SCALING_CALIBRATION_MIN_TRUST_SCORE = 0.1
MAX_SCALING_CALIBRATION_USERS = 100

@@ -52,9 +64,18 @@ def ratings_properties(self) -> pd.DataFrame:
@abstractmethod
def get_individual_scores(
self,
criteria: Optional[str] = None,
user_id: Optional[int] = None,
) -> Optional[pd.DataFrame]:
criteria: Optional[str] = None,
) -> pd.DataFrame:
"""Fetch data about previously computed individual scores
Returns:
- DataFrame with columns
* `user_id`: int
* `entity_id`: int
* `criteria`: str
* `score`: float
"""
raise NotImplementedError

@abstractmethod
@@ -70,7 +91,9 @@ def get_vouches(self):
raise NotImplementedError

def get_users(self):
raise NotImplementedError
users = self.ratings_properties.groupby("user_id").first()[["trust_score"]]
users["is_pretrusted"] = users["trust_score"] >= 0.8
return users

def get_pipeline_kwargs(self, criterion: str):
ratings_properties = self.ratings_properties
@@ -107,6 +130,26 @@ def get_pipeline_kwargs(self, criterion: str):
"judgments": judgments,
}

def get_comparisons_counts(
self, criteria: Optional[str] = None, user_id: Optional[int] = None
):
comparisons = self.get_comparisons(criteria=criteria, user_id=user_id)
return (
pd.concat(
[
comparisons[["user_id", "entity_a", "criteria"]].rename(
columns={"entity_a": "entity_id"}
),
comparisons[["user_id", "entity_b", "criteria"]].rename(
columns={"entity_b": "entity_id"}
),
]
)
.groupby(["user_id", "entity_id", "criteria"])
.size()
.reset_index(name="n_comparisons")
)


class TournesolInputFromPublicDataset(TournesolInput):
def __init__(self, dataset_zip: Union[str, BinaryIO]):
@@ -116,26 +159,6 @@ def __init__(self, dataset_zip: Union[str, BinaryIO]):
dataset_zip, _headers = urlretrieve(dataset_zip) # nosec B310

with zipfile.ZipFile(dataset_zip) as zip_file:
with (zipfile.Path(zip_file) / "comparisons.csv").open(mode="rb") as comparison_file:
# keep_default_na=False is required otherwise some public usernames
# such as "NA" are converted to float NaN.
self.comparisons = pd.read_csv(comparison_file, keep_default_na=False)
self.entity_id_to_video_id = pd.Series(
list(set(self.comparisons.video_a) | set(self.comparisons.video_b)),
name="video_id",
)
video_id_to_entity_id = {
video_id: entity_id
for (entity_id, video_id) in self.entity_id_to_video_id.items()
}
self.comparisons["entity_a"] = self.comparisons["video_a"].map(
video_id_to_entity_id
)
self.comparisons["entity_b"] = self.comparisons["video_b"].map(
video_id_to_entity_id
)
self.comparisons.drop(columns=["video_a", "video_b"], inplace=True)

with (zipfile.Path(zip_file) / "users.csv").open(mode="rb") as users_file:
# keep_default_na=False is required otherwise some public usernames
# such as "NA" are converted to float NaN.
@@ -144,25 +167,62 @@ def __init__(self, dataset_zip: Union[str, BinaryIO]):
# Fill trust_score on newly created users for which it was not computed yet
self.users.trust_score = pd.to_numeric(self.users.trust_score).fillna(0.0)

self.username_to_user_id = pd.Series(
data=self.users.index, index=self.users["public_username"]
)
self.comparisons = self.comparisons.join(self.username_to_user_id, on="public_username")
with (zipfile.Path(zip_file) / "collective_criteria_scores.csv").open(mode="rb") as collective_scores_file:
# keep_default_na=False is required otherwise some public usernames
# such as "NA" are converted to float NaN.
collective_scores = pd.read_csv(collective_scores_file, keep_default_na=False)

with (zipfile.Path(zip_file) / "vouchers.csv").open(mode="rb") as vouchers_file:
with (zipfile.Path(zip_file) / "comparisons.csv").open(mode="rb") as comparison_file:
# keep_default_na=False is required otherwise some public usernames
# such as "NA" are converted to float NaN.
self.vouchers = pd.read_csv(vouchers_file, keep_default_na=False)
comparisons = pd.read_csv(comparison_file, keep_default_na=False)

with (zipfile.Path(zip_file) / "collective_criteria_scores.csv").open(mode="rb") as collective_scores_file:
with (zipfile.Path(zip_file) / "vouchers.csv").open(mode="rb") as vouchers_file:
# keep_default_na=False is required otherwise some public usernames
# such as "NA" are converted to float NaN.
self.collective_scores = pd.read_csv(collective_scores_file, keep_default_na=False)
self.vouchers = pd.read_csv(vouchers_file, keep_default_na=False)

with (zipfile.Path(zip_file) / "individual_criteria_scores.csv").open(mode="rb") as individual_scores_file:
self.username_to_user_id = pd.Series(
data=self.users.index,
index=self.users["public_username"],
)

self.entity_id_to_video_id = pd.Series(
sorted(
set(comparisons.video_a)
| set(comparisons.video_b)
| set(collective_scores.video)
),
name="video_id",
)

self.video_id_to_entity_id = {
video_id: entity_id
for (entity_id, video_id) in self.entity_id_to_video_id.items()
}

# Convert video ids (str) to entity ids (int)
self.collective_scores = collective_scores.assign(
entity_id=collective_scores["video"].map(self.video_id_to_entity_id)
).drop(columns=["video"])

self.comparisons = comparisons.assign(
entity_a=comparisons["video_a"].map(self.video_id_to_entity_id),
entity_b=comparisons["video_b"].map(self.video_id_to_entity_id),
user_id=comparisons["public_username"].map(self.username_to_user_id),
).drop(columns=["video_a", "video_b"])

with (zipfile.Path(zip_file) / "individual_criteria_scores.csv").open(
mode="rb"
) as individual_scores_file:
# keep_default_na=False is required otherwise some public usernames
# such as "NA" are converted to float NaN.
self.individual_scores = pd.read_csv(individual_scores_file, keep_default_na=False)
individual_scores = pd.read_csv(individual_scores_file, keep_default_na=False)
# Convert usernames and video_id to user_id and entity_id
self.individual_scores = individual_scores.assign(
entity_id=individual_scores["video"].map(self.video_id_to_entity_id),
user_id=individual_scores["public_username"].map(self.username_to_user_id),
).drop(columns=["public_username", "video"])

@classmethod
def download(cls) -> "TournesolInputFromPublicDataset":
@@ -178,7 +238,15 @@ def get_comparisons(self, criteria=None, user_id=None) -> pd.DataFrame:
if "score_max" not in dtf:
# For compatibility with older datasets
dtf["score_max"] = 10
return dtf[["user_id", "entity_a", "entity_b", "criteria", "score", "score_max", "weight"]]
return dtf[[
"user_id",
"entity_a",
"entity_b",
"criteria",
"score",
"score_max",
"weight"
]]

@cached_property
def ratings_properties(self):
@@ -201,11 +269,63 @@ def ratings_properties(self):

def get_individual_scores(
self,
criteria: Optional[str] = None,
user_id: Optional[int] = None,
) -> Optional[pd.DataFrame]:
# TODO: read contributor scores from individual_scores.csv
return None
criteria: Optional[str] = None,
with_n_comparisons = False,
) -> pd.DataFrame:
dtf = self.individual_scores
if criteria is not None:
dtf = dtf[dtf.criteria == criteria]
if user_id is not None:
dtf = dtf[dtf.user_id == user_id]

dtf = dtf[[
"user_id",
"entity_id",
"criteria",
"score",
"uncertainty",
"voting_right",
]]

if with_n_comparisons:
comparison_counts = self.get_comparisons_counts(user_id=user_id, criteria=criteria)
dtf = dtf.merge(
comparison_counts,
how="left",
on=["user_id", "entity_id", "criteria"]
)

return dtf

def get_collective_scores(
self,
entity_id: Optional[str] = None,
criteria: Optional[str] = None,
) -> pd.DataFrame:
dtf: pd.DataFrame = self.collective_scores
if criteria is not None:
dtf = dtf[dtf["criteria"] == criteria]
if entity_id is not None:
dtf = dtf[dtf["entity_id"] == entity_id]

counts = (
self.get_comparisons_counts(criteria=criteria)
.groupby(["criteria", "entity_id"])
.agg(
n_comparisons=("n_comparisons", "sum"),
n_users=("user_id", "nunique"),
)
)

return (
dtf.join(counts, how="left", on=["criteria", "entity_id"])
# Entities that have been compared privately only
# will not appear in comparisons.csv. That's why we need
# to fill for missing values here.
.fillna({"n_comparisons": 0, "n_users": 0})
.astype({"n_comparisons": "int64", "n_users": "int64"})
)

def get_vouches(self):
vouchers = self.vouchers[
@@ -219,8 +339,3 @@ def get_vouches(self):
"vouch": vouchers.value,
}
)

def get_users(self):
users = self.ratings_properties.groupby("user_id").first()[["trust_score"]]
users["is_pretrusted"] = users["trust_score"] >= 0.8
return users
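
To illustrate the aggregation performed by the new get_comparisons_counts helper above, here is a small standalone sketch with invented sample data: each comparison contributes one row for entity_a and one for entity_b, and rows are then counted per (user_id, entity_id, criteria).

```python
# Standalone sketch of the aggregation used by get_comparisons_counts.
# The sample data below is invented for illustration only.
import pandas as pd

comparisons = pd.DataFrame({
    "user_id": [1, 1, 2],
    "entity_a": [10, 10, 20],
    "entity_b": [20, 30, 30],
    "criteria": ["largely_recommended"] * 3,
})

counts = (
    pd.concat([
        comparisons[["user_id", "entity_a", "criteria"]].rename(columns={"entity_a": "entity_id"}),
        comparisons[["user_id", "entity_b", "criteria"]].rename(columns={"entity_b": "entity_id"}),
    ])
    .groupby(["user_id", "entity_id", "criteria"])
    .size()
    .reset_index(name="n_comparisons")
)

# User 1 has compared entity 10 twice (against 20 and 30); all other
# (user, entity) pairs appear once.
print(counts)
```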
87 changes: 87 additions & 0 deletions solidago/tests/test_pipeline.py
@@ -3,8 +3,95 @@
from solidago.pipeline import Pipeline
from solidago.pipeline.inputs import TournesolInputFromPublicDataset


@pytest.mark.parametrize("test", range(5))
def test_pipeline_test_data(test):
td = import_module(f"data.data_{test}")
Pipeline()(td.users, td.vouches, td.entities, td.privacy, td.judgments)


def test_tournesol_get_comparisons():
dataset = TournesolInputFromPublicDataset("tests/data/tiny_tournesol.zip")

# Test no filter
assert len(dataset.get_comparisons()) == 38387

# Test single filter
assert len(dataset.get_comparisons(
criteria="importance"
)) == 17143
assert len(dataset.get_comparisons(
user_id=dataset.username_to_user_id["le_science4all"]
)) == 5604

# Test all filters
assert len(dataset.get_comparisons(
criteria="largely_recommended",
user_id=dataset.username_to_user_id["lpfaucon"]
)) == 8471


def test_tournesol_get_individual_scores():
dataset = TournesolInputFromPublicDataset("tests/data/tiny_tournesol.zip")

# Test no filter
assert len(dataset.get_individual_scores()) == 17319

# Test single filter
assert len(dataset.get_individual_scores(
criteria="largely_recommended"
)) == 9176
assert len(dataset.get_individual_scores(
user_id=dataset.username_to_user_id["aidjango"]
)) == 4379

# Test all filters
user_id = dataset.username_to_user_id["le_science4all"]
found = dataset.get_individual_scores(
criteria="importance",
user_id=user_id,
with_n_comparisons=True,
)
assert len(found) == 1123
as_dict = found.to_dict(orient="records")[0]
assert as_dict == {
'user_id': user_id,
'criteria': 'importance',
'entity_id': dataset.video_id_to_entity_id["03dTJ4nXkXw"],
'score': 82.81,
'uncertainty': 24.37,
'voting_right': 1.0,
'n_comparisons': 10,
}


def test_tournesol_get_collective_scores():
dataset = TournesolInputFromPublicDataset("tests/data/tiny_tournesol.zip")

# Test no filter
assert len(dataset.get_collective_scores()) == 12184

# Test single filter
assert len(dataset.get_collective_scores(
criteria="largely_recommended"
)) == 6227
assert len(dataset.get_collective_scores(
entity_id=dataset.video_id_to_entity_id["kX3JKg-H5qM"]
)) == 2

# Test all filters
entity_id = dataset.video_id_to_entity_id["OlhC6n9Hhac"]
found = dataset.get_collective_scores(
criteria="importance",
entity_id=entity_id
)
assert len(found) == 1
as_dict = found.to_dict(orient="records")[0]
assert as_dict == {
'entity_id': entity_id,
'criteria': 'importance',
'score': 18.22,
'uncertainty': 60.09,
'n_users': 3,
'n_comparisons': 12,
}
