Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[solidago] Implement get_individual_scores & get_collective_scores #1994

Merged
merged 25 commits into from
Oct 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
72af6d9
Added import for vouchers and scores in pipeline/inputs
lenhoanglnh May 12, 2024
df57fbf
Important change: Modified qr_quantile using asymmetric Huber rather …
lenhoanglnh May 15, 2024
9f0ddb4
cleanup docstrings in Solidago (wip)
amatissart May 2, 2024
fd1fb49
implement 'get_pipeline_kwargs' in TournesolInput
amatissart May 12, 2024
049d72e
fix experiments script
amatissart May 16, 2024
dde4c9f
read vouches in TournesolInput
amatissart May 16, 2024
82e9c4f
[solidago] gbt: estimate asymmetrical uncertainties based on increase…
amatissart Jun 1, 2024
c58e424
cleanup docstrings in Solidago (wip)
amatissart May 2, 2024
5e6d598
implement 'get_pipeline_kwargs' in TournesolInput
amatissart May 12, 2024
051f088
fix experiments script
amatissart May 16, 2024
3483609
read vouches in TournesolInput
amatissart May 16, 2024
498f4a3
Fixed experiments calls to Tournesol inputs API
lenhoanglnh Jun 1, 2024
fde2a83
Merge branch 'solidago-pipeline-docs-1' of github.com:tournesol-app/t…
lenhoanglnh Jun 1, 2024
afc32d4
fix docstring
amatissart Jun 1, 2024
0032c86
Merge pull request #1971 from tournesol-app/solidago-pipeline-docs-1
amatissart Jun 3, 2024
a2dfbaa
fix numerical issues in gbt implementations
amatissart Jul 4, 2024
39c5652
normalize weight per user in Standardize
amatissart Jul 4, 2024
fdd40f3
normalize weight per user in QuantileZeroShift
amatissart Jul 4, 2024
721ed6c
[solidago] Implemented get_individual_scores & get_collective_scores
NatNgs Jul 4, 2024
6dcea8f
Merge branch 'main' of https://github.com/tournesol-app/tournesol int…
NatNgs Oct 24, 2024
88e769e
[solidago] revert adding new filter entity_id for get_individual_scores
NatNgs Oct 24, 2024
7f3542d
[solidago] Keep get_collective_scores() specific to the public datase…
amatissart Oct 31, 2024
44d298b
Update solidago/src/solidago/pipeline/inputs.py
amatissart Oct 31, 2024
5f67fcd
remove incomplete __all__ in trust_propagation
amatissart Oct 31, 2024
06c0499
add docstring in TournesolInput
amatissart Oct 31, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions solidago/.gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
/devenv/
/dist/
**__pycache__/
.flake8/
Expand Down
201 changes: 158 additions & 43 deletions solidago/src/solidago/pipeline/inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,18 @@


class TournesolInput(ABC):
"""
An abstract base class for handling input data of Solidago pipeline.

This class provides an interface for retrieving and processing comparison data,
user ratings, individual scores, and vouches.

Notes
-----
This is an abstract base class that must be subclassed and have its abstract
methods implemented to provide concrete data retrieval functionality.
"""

SCALING_CALIBRATION_MIN_TRUST_SCORE = 0.1
MAX_SCALING_CALIBRATION_USERS = 100

Expand Down Expand Up @@ -52,9 +64,18 @@ def ratings_properties(self) -> pd.DataFrame:
@abstractmethod
def get_individual_scores(
    self,
    user_id: Optional[int] = None,
    criteria: Optional[str] = None,
) -> pd.DataFrame:
    """Fetch data about previously computed individual scores.

    Parameters
    ----------
    user_id:
        if given, restrict the result to scores computed for this user
    criteria:
        if given, restrict the result to scores for this criterion

    Returns:
    - DataFrame with columns
        * `user_id`: int
        * `entity_id`: int
        * `criteria`: str
        * `score`: float
    """
    # NOTE(review): the scraped diff fused the old signature
    # (user_id-first, Optional return) with the new one; this is the
    # merged post-PR signature — confirm against the repository.
    raise NotImplementedError

@abstractmethod
Expand All @@ -70,7 +91,9 @@ def get_vouches(self):
raise NotImplementedError

def get_users(self):
    """Build the users table from ratings properties.

    Returns a DataFrame indexed by `user_id` with columns:
        * `trust_score`: float
        * `is_pretrusted`: bool, True when trust_score >= 0.8
    """
    # The stale `raise NotImplementedError` left over from the old diff
    # side made the real body unreachable; it is removed here.
    users = self.ratings_properties.groupby("user_id").first()[["trust_score"]]
    # Users at or above the 0.8 trust threshold are considered pretrusted.
    users["is_pretrusted"] = users["trust_score"] >= 0.8
    return users

def get_pipeline_kwargs(self, criterion: str):
ratings_properties = self.ratings_properties
Expand Down Expand Up @@ -107,6 +130,26 @@ def get_pipeline_kwargs(self, criterion: str):
"judgments": judgments,
}

def get_comparisons_counts(
    self, criteria: Optional[str] = None, user_id: Optional[int] = None
):
    """Count comparisons per (user_id, entity_id, criteria).

    Each comparison involves two entities, so it contributes one count
    to each of them. Returns a DataFrame with columns `user_id`,
    `entity_id`, `criteria` and `n_comparisons`.
    """
    comparisons = self.get_comparisons(criteria=criteria, user_id=user_id)
    # Stack the two sides of each comparison under a single `entity_id` column.
    side_a = comparisons[["user_id", "entity_a", "criteria"]].rename(
        columns={"entity_a": "entity_id"}
    )
    side_b = comparisons[["user_id", "entity_b", "criteria"]].rename(
        columns={"entity_b": "entity_id"}
    )
    stacked = pd.concat([side_a, side_b])
    counts = stacked.groupby(["user_id", "entity_id", "criteria"]).size()
    return counts.reset_index(name="n_comparisons")


class TournesolInputFromPublicDataset(TournesolInput):
def __init__(self, dataset_zip: Union[str, BinaryIO]):
Expand All @@ -116,26 +159,6 @@ def __init__(self, dataset_zip: Union[str, BinaryIO]):
dataset_zip, _headers = urlretrieve(dataset_zip) # nosec B310

with zipfile.ZipFile(dataset_zip) as zip_file:
with (zipfile.Path(zip_file) / "comparisons.csv").open(mode="rb") as comparison_file:
# keep_default_na=False is required otherwise some public usernames
# such as "NA" are converted to float NaN.
self.comparisons = pd.read_csv(comparison_file, keep_default_na=False)
self.entity_id_to_video_id = pd.Series(
list(set(self.comparisons.video_a) | set(self.comparisons.video_b)),
name="video_id",
)
video_id_to_entity_id = {
video_id: entity_id
for (entity_id, video_id) in self.entity_id_to_video_id.items()
}
self.comparisons["entity_a"] = self.comparisons["video_a"].map(
video_id_to_entity_id
)
self.comparisons["entity_b"] = self.comparisons["video_b"].map(
video_id_to_entity_id
)
self.comparisons.drop(columns=["video_a", "video_b"], inplace=True)

with (zipfile.Path(zip_file) / "users.csv").open(mode="rb") as users_file:
# keep_default_na=False is required otherwise some public usernames
# such as "NA" are converted to float NaN.
Expand All @@ -144,25 +167,62 @@ def __init__(self, dataset_zip: Union[str, BinaryIO]):
# Fill trust_score on newly created users for which it was not computed yet
self.users.trust_score = pd.to_numeric(self.users.trust_score).fillna(0.0)

self.username_to_user_id = pd.Series(
data=self.users.index, index=self.users["public_username"]
)
self.comparisons = self.comparisons.join(self.username_to_user_id, on="public_username")
with (zipfile.Path(zip_file) / "collective_criteria_scores.csv").open(mode="rb") as collective_scores_file:
# keep_default_na=False is required otherwise some public usernames
# such as "NA" are converted to float NaN.
collective_scores = pd.read_csv(collective_scores_file, keep_default_na=False)

with (zipfile.Path(zip_file) / "vouchers.csv").open(mode="rb") as vouchers_file:
with (zipfile.Path(zip_file) / "comparisons.csv").open(mode="rb") as comparison_file:
# keep_default_na=False is required otherwise some public usernames
# such as "NA" are converted to float NaN.
self.vouchers = pd.read_csv(vouchers_file, keep_default_na=False)
comparisons = pd.read_csv(comparison_file, keep_default_na=False)

with (zipfile.Path(zip_file) / "collective_criteria_scores.csv").open(mode="rb") as collective_scores_file:
with (zipfile.Path(zip_file) / "vouchers.csv").open(mode="rb") as vouchers_file:
# keep_default_na=False is required otherwise some public usernames
# such as "NA" are converted to float NaN.
self.collective_scores = pd.read_csv(collective_scores_file, keep_default_na=False)
self.vouchers = pd.read_csv(vouchers_file, keep_default_na=False)

with (zipfile.Path(zip_file) / "individual_criteria_scores.csv").open(mode="rb") as individual_scores_file:
self.username_to_user_id = pd.Series(
data=self.users.index,
index=self.users["public_username"],
)

self.entity_id_to_video_id = pd.Series(
sorted(
set(comparisons.video_a)
| set(comparisons.video_b)
| set(collective_scores.video)
),
name="video_id",
)

self.video_id_to_entity_id = {
video_id: entity_id
for (entity_id, video_id) in self.entity_id_to_video_id.items()
}

# Convert video ids (str) to entity ids (int)
self.collective_scores = collective_scores.assign(
entity_id=collective_scores["video"].map(self.video_id_to_entity_id)
).drop(columns=["video"])

self.comparisons = comparisons.assign(
entity_a=comparisons["video_a"].map(self.video_id_to_entity_id),
entity_b=comparisons["video_b"].map(self.video_id_to_entity_id),
user_id=comparisons["public_username"].map(self.username_to_user_id),
).drop(columns=["video_a", "video_b"])

with (zipfile.Path(zip_file) / "individual_criteria_scores.csv").open(
mode="rb"
) as individual_scores_file:
# keep_default_na=False is required otherwise some public usernames
# such as "NA" are converted to float NaN.
self.individual_scores = pd.read_csv(individual_scores_file, keep_default_na=False)
individual_scores = pd.read_csv(individual_scores_file, keep_default_na=False)
# Convert usernames and video_id to user_id and entity_id
self.individual_scores = individual_scores.assign(
entity_id=individual_scores["video"].map(self.video_id_to_entity_id),
user_id=individual_scores["public_username"].map(self.username_to_user_id),
).drop(columns=["public_username", "video"])

@classmethod
def download(cls) -> "TournesolInputFromPublicDataset":
Expand All @@ -178,7 +238,15 @@ def get_comparisons(self, criteria=None, user_id=None) -> pd.DataFrame:
if "score_max" not in dtf:
# For compatibility with older datasets
dtf["score_max"] = 10
return dtf[["user_id", "entity_a", "entity_b", "criteria", "score", "score_max", "weight"]]
return dtf[[
"user_id",
"entity_a",
"entity_b",
"criteria",
"score",
"score_max",
"weight"
]]

@cached_property
def ratings_properties(self):
Expand All @@ -201,11 +269,63 @@ def ratings_properties(self):

def get_individual_scores(
    self,
    user_id: Optional[int] = None,
    criteria: Optional[str] = None,
    with_n_comparisons=False,
) -> pd.DataFrame:
    """Fetch individual scores from the public dataset.

    Parameters
    ----------
    user_id:
        if given, restrict the result to scores computed for this user
    criteria:
        if given, restrict the result to scores for this criterion
    with_n_comparisons:
        if True, add an `n_comparisons` column with the number of
        comparisons supporting each (user, entity, criteria) score

    Returns a DataFrame with columns `user_id`, `entity_id`, `criteria`,
    `score`, `uncertainty`, `voting_right` (and optionally `n_comparisons`).
    """
    # NOTE(review): the scraped diff fused the removed stub
    # (`return None`) with the new implementation; this is the merged
    # post-PR body — confirm against the repository.
    dtf = self.individual_scores
    if criteria is not None:
        dtf = dtf[dtf.criteria == criteria]
    if user_id is not None:
        dtf = dtf[dtf.user_id == user_id]

    dtf = dtf[[
        "user_id",
        "entity_id",
        "criteria",
        "score",
        "uncertainty",
        "voting_right",
    ]]

    if with_n_comparisons:
        comparison_counts = self.get_comparisons_counts(user_id=user_id, criteria=criteria)
        dtf = dtf.merge(
            comparison_counts,
            how="left",
            on=["user_id", "entity_id", "criteria"]
        )

    return dtf

def get_collective_scores(
    self,
    entity_id: Optional[str] = None,
    criteria: Optional[str] = None,
) -> pd.DataFrame:
    """Fetch collective scores from the public dataset.

    Optionally filtered by `entity_id` and/or `criteria`. The result is
    enriched with `n_comparisons` (total public comparisons) and
    `n_users` (distinct public contributors) per (criteria, entity).
    """
    scores: pd.DataFrame = self.collective_scores
    if criteria is not None:
        scores = scores[scores["criteria"] == criteria]
    if entity_id is not None:
        scores = scores[scores["entity_id"] == entity_id]

    stats = (
        self.get_comparisons_counts(criteria=criteria)
        .groupby(["criteria", "entity_id"])
        .agg(
            n_comparisons=("n_comparisons", "sum"),
            n_users=("user_id", "nunique"),
        )
    )

    joined = scores.join(stats, how="left", on=["criteria", "entity_id"])
    # Entities that have been compared privately only
    # will not appear in comparisons.csv. That's why we need
    # to fill for missing values here.
    filled = joined.fillna({"n_comparisons": 0, "n_users": 0})
    return filled.astype({"n_comparisons": "int64", "n_users": "int64"})

def get_vouches(self):
vouchers = self.vouchers[
Expand All @@ -219,8 +339,3 @@ def get_vouches(self):
"vouch": vouchers.value,
}
)

def get_users(self):
    """Return users indexed by user_id, with trust_score and is_pretrusted."""
    properties = self.ratings_properties
    users = properties.groupby("user_id").first()[["trust_score"]]
    # A trust score of at least 0.8 marks the user as pretrusted.
    users["is_pretrusted"] = users["trust_score"] >= 0.8
    return users
87 changes: 87 additions & 0 deletions solidago/tests/test_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,95 @@
from solidago.pipeline import Pipeline
from solidago.pipeline.inputs import TournesolInputFromPublicDataset


@pytest.mark.parametrize("test", range(5))
def test_pipeline_test_data(test):
    # Smoke-test: run the full pipeline on each bundled synthetic dataset
    # (data.data_0 .. data.data_4) and ensure it completes without raising.
    td = import_module(f"data.data_{test}")
    Pipeline()(td.users, td.vouches, td.entities, td.privacy, td.judgments)


def test_tournesol_get_comparisons():
    """Check comparison counts in the tiny public dataset fixture."""
    dataset = TournesolInputFromPublicDataset("tests/data/tiny_tournesol.zip")

    # No filter: every comparison is returned
    assert len(dataset.get_comparisons()) == 38387

    # Single filters
    by_criteria = dataset.get_comparisons(criteria="importance")
    assert len(by_criteria) == 17143
    science4all_id = dataset.username_to_user_id["le_science4all"]
    by_user = dataset.get_comparisons(user_id=science4all_id)
    assert len(by_user) == 5604

    # Both filters combined
    lpfaucon_id = dataset.username_to_user_id["lpfaucon"]
    filtered = dataset.get_comparisons(
        criteria="largely_recommended",
        user_id=lpfaucon_id,
    )
    assert len(filtered) == 8471


def test_tournesol_get_individual_scores():
    """Check individual-score retrieval on the tiny public dataset fixture."""
    dataset = TournesolInputFromPublicDataset("tests/data/tiny_tournesol.zip")

    # No filter: all individual scores are returned
    assert len(dataset.get_individual_scores()) == 17319

    # Single filters
    by_criteria = dataset.get_individual_scores(criteria="largely_recommended")
    assert len(by_criteria) == 9176
    aidjango_id = dataset.username_to_user_id["aidjango"]
    by_user = dataset.get_individual_scores(user_id=aidjango_id)
    assert len(by_user) == 4379

    # All filters, with comparison counts attached
    user_id = dataset.username_to_user_id["le_science4all"]
    found = dataset.get_individual_scores(
        criteria="importance",
        user_id=user_id,
        with_n_comparisons=True,
    )
    assert len(found) == 1123
    first_row = found.to_dict(orient="records")[0]
    assert first_row == {
        'user_id': user_id,
        'criteria': 'importance',
        'entity_id': dataset.video_id_to_entity_id["03dTJ4nXkXw"],
        'score': 82.81,
        'uncertainty': 24.37,
        'voting_right': 1.0,
        'n_comparisons': 10,
    }


def test_tournesol_get_collective_scores():
    """Check collective-score retrieval on the tiny public dataset fixture."""
    dataset = TournesolInputFromPublicDataset("tests/data/tiny_tournesol.zip")

    # No filter: all collective scores are returned
    assert len(dataset.get_collective_scores()) == 12184

    # Single filters
    by_criteria = dataset.get_collective_scores(criteria="largely_recommended")
    assert len(by_criteria) == 6227
    kx3_entity = dataset.video_id_to_entity_id["kX3JKg-H5qM"]
    by_entity = dataset.get_collective_scores(entity_id=kx3_entity)
    assert len(by_entity) == 2

    # Both filters combined
    entity_id = dataset.video_id_to_entity_id["OlhC6n9Hhac"]
    found = dataset.get_collective_scores(
        criteria="importance",
        entity_id=entity_id,
    )
    assert len(found) == 1
    first_row = found.to_dict(orient="records")[0]
    assert first_row == {
        'entity_id': entity_id,
        'criteria': 'importance',
        'score': 18.22,
        'uncertainty': 60.09,
        'n_users': 3,
        'n_comparisons': 12,
    }
Loading