Merge branch 'main' into 1940-replace_sliders_by_buttons
GresilleSiffle committed Nov 7, 2024
2 parents 07fe339 + 836493a commit 10870dc
Showing 23 changed files with 371 additions and 153 deletions.
37 changes: 19 additions & 18 deletions backend/ml/inputs.py
@@ -4,7 +4,7 @@
 import pandas as pd
 from django.db.models import Case, F, Q, QuerySet, When
 from django.db.models.expressions import RawSQL
-from solidago.pipeline import TournesolInput
+from solidago.pipeline import PipelineInput

 from core.models import User
 from tournesol.models import (
@@ -17,7 +17,7 @@
 from vouch.models import Voucher


-class MlInputFromDb(TournesolInput):
+class MlInputFromDb(PipelineInput):
     SCALING_CALIBRATION_MIN_ENTITIES_TO_COMPARE = 20

     def __init__(self, poll_name: str):
@@ -58,38 +58,38 @@ def get_scaling_calibration_users(self) -> QuerySet[User]:
             n_compared_entities__gte=self.SCALING_CALIBRATION_MIN_ENTITIES_TO_COMPARE,
         ).order_by("-n_compared_entities")[: self.MAX_SCALING_CALIBRATION_USERS]

-    def get_comparisons(self, criteria=None, user_id=None) -> pd.DataFrame:
+    def get_comparisons(self, criterion=None, user_id=None) -> pd.DataFrame:
         scores_queryset = ComparisonCriteriaScore.objects.filter(
             comparison__poll__name=self.poll_name,
             comparison__user__is_active=True,
         )
-        if criteria is not None:
-            scores_queryset = scores_queryset.filter(criteria=criteria)
+        if criterion is not None:
+            scores_queryset = scores_queryset.filter(criteria=criterion)

         if user_id is not None:
             scores_queryset = scores_queryset.filter(comparison__user_id=user_id)

         values = scores_queryset.values(
             "score",
             "score_max",
-            "criteria",
             "weight",
+            criterion=F("criteria"),
             entity_a=F("comparison__entity_1_id"),
             entity_b=F("comparison__entity_2_id"),
             user_id=F("comparison__user_id"),
         )
         if len(values) > 0:
             dtf = pd.DataFrame(values)
             return dtf[
-                ["user_id", "entity_a", "entity_b", "criteria", "score", "score_max", "weight"]
+                ["user_id", "entity_a", "entity_b", "criterion", "score", "score_max", "weight"]
             ]

         return pd.DataFrame(
             columns=[
                 "user_id",
                 "entity_a",
                 "entity_b",
-                "criteria",
+                "criterion",
                 "score",
                 "score_max",
                 "weight",
@@ -136,7 +136,7 @@ def get_user_scalings(self, user_id=None) -> pd.DataFrame:
         Returns:
         - ratings_df: DataFrame with columns
             * `user_id`: int
-            * `criteria`: str
+            * `criterion`: str
             * `scale`: float
             * `scale_uncertainty`: float
             * `translation`: float
@@ -148,17 +148,18 @@ def get_user_scalings(self, user_id=None) -> pd.DataFrame:
             scalings = scalings.filter(user_id=user_id)
         values = scalings.values(
             "user_id",
-            "criteria",
             "scale",
             "scale_uncertainty",
             "translation",
             "translation_uncertainty",
+            criterion=F("criteria"),
+
         )
         if len(values) == 0:
             return pd.DataFrame(
                 columns=[
                     "user_id",
-                    "criteria",
+                    "criterion",
                     "scale",
                     "scale_uncertainty",
                     "translation",
@@ -168,28 +168,28 @@ def get_user_scalings(self, user_id=None) -> pd.DataFrame:
         return pd.DataFrame(values)

     def get_individual_scores(
-        self, criteria: Optional[str] = None, user_id: Optional[int] = None
+        self, user_id: Optional[int] = None, criterion: Optional[str] = None,
     ) -> pd.DataFrame:
         scores_queryset = ContributorRatingCriteriaScore.objects.filter(
             contributor_rating__poll__name=self.poll_name,
             contributor_rating__user__is_active=True,
         )
-        if criteria is not None:
-            scores_queryset = scores_queryset.filter(criteria=criteria)
+        if criterion is not None:
+            scores_queryset = scores_queryset.filter(criteria=criterion)
         if user_id is not None:
             scores_queryset = scores_queryset.filter(contributor_rating__user_id=user_id)

         values = scores_queryset.values(
             "raw_score",
-            "criteria",
-            entity=F("contributor_rating__entity_id"),
+            criterion=F("criteria"),
+            entity_id=F("contributor_rating__entity_id"),
             user_id=F("contributor_rating__user_id"),
         )
         if len(values) == 0:
-            return pd.DataFrame(columns=["user_id", "entity", "criteria", "raw_score"])
+            return pd.DataFrame(columns=["user_id", "entity_id", "criterion", "raw_score"])

         dtf = pd.DataFrame(values)
-        return dtf[["user_id", "entity", "criteria", "raw_score"]]
+        return dtf[["user_id", "entity_id", "criterion", "raw_score"]]

     def get_vouches(self):
         values = Voucher.objects.filter(
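
Note on the pattern above: the database field is still named `criteria`; the rename to `criterion` happens at query time via `values(criterion=F("criteria"))`, which aliases the column in the rows Django returns, and the empty-result branches declare the same renamed columns so callers can index them unconditionally. A minimal sketch of that contract (the helper and sample row below are hypothetical, not part of this commit):

import pandas as pd

# Columns promised by get_comparisons() after this commit.
COMPARISON_COLUMNS = [
    "user_id", "entity_a", "entity_b", "criterion", "score", "score_max", "weight"
]

def as_comparisons_frame(rows: list[dict]) -> pd.DataFrame:
    """Mirror the guard used above: an empty result still carries the columns."""
    if not rows:
        return pd.DataFrame(columns=COMPARISON_COLUMNS)
    return pd.DataFrame(rows)[COMPARISON_COLUMNS]

# A row shaped like the aliased values() output (values are made up).
frame = as_comparisons_frame([{
    "user_id": 1, "entity_a": 10, "entity_b": 11,
    "criterion": "largely_recommended", "score": 3, "score_max": 10, "weight": 1.0,
}])
assert list(frame.columns) == COMPARISON_COLUMNS
assert list(as_comparisons_frame([]).columns) == COMPARISON_COLUMNS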
2 changes: 2 additions & 0 deletions backend/ml/management/commands/ml_train.py
@@ -1,5 +1,6 @@
 import os
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
+from functools import cache

 from django import db
 from django.conf import settings
@@ -18,6 +19,7 @@
 from tournesol.models.poll import ALGORITHM_MEHESTAN, DEFAULT_POLL_NAME


+@cache
 def get_solidago_pipeline(run_trust_propagation: bool = True):
     if run_trust_propagation:
         trust_algo = LipschiTrust()
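
The new `@cache` (from `functools`) memoizes `get_solidago_pipeline` on its (hashable) arguments, so repeated calls within one training run reuse a single pipeline object instead of rebuilding it each time. A stand-alone sketch of the behaviour (the returned tuple is a placeholder, not the real Solidago pipeline):

from functools import cache

@cache
def get_pipeline(run_trust_propagation: bool = True) -> tuple:
    # In ml_train.py this would assemble LipschiTrust, Mehestan, etc.
    print(f"building pipeline (trust propagation: {run_trust_propagation})")
    return ("pipeline", run_trust_propagation)

a = get_pipeline()
b = get_pipeline()
assert a is b          # second call is a cache hit: no rebuild, same object
get_pipeline(False)    # different arguments -> one more (cached) build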
12 changes: 6 additions & 6 deletions backend/ml/mehestan/run.py
@@ -4,7 +4,7 @@

 from django import db
 from django.conf import settings
-from solidago.pipeline import TournesolInput
+from solidago.pipeline import PipelineInput
 from solidago.pipeline.legacy2023.criterion_pipeline import run_pipeline_for_criterion
 from solidago.pipeline.legacy2023.individual_scores import get_individual_scores

@@ -21,15 +21,15 @@
 def update_user_scores(poll: Poll, user: User):
     params = MehestanParameters()
     ml_input = MlInputFromDb(poll_name=poll.name)
-    for criteria in poll.criterias_list:
-        output = TournesolPollOutput(poll_name=poll.name, criterion=criteria)
+    for criterion in poll.criterias_list:
+        output = TournesolPollOutput(poll_name=poll.name, criterion=criterion)
         scores = get_individual_scores(
             ml_input,
-            criteria,
+            criterion,
             parameters=params,
             single_user_id=user.pk,
         )
-        scores["criteria"] = criteria
+        scores["criterion"] = criterion
         scores.rename(
             columns={
                 "score": "raw_score",
@@ -45,7 +45,7 @@ def close_db_connection_callback():


 def run_mehestan(
-    ml_input: TournesolInput, poll: Poll, parameters: MehestanParameters, main_criterion_only=False
+    ml_input: PipelineInput, poll: Poll, parameters: MehestanParameters, main_criterion_only=False
 ):
     """
     This function use multiprocessing.
19 changes: 11 additions & 8 deletions backend/ml/outputs.py
@@ -166,7 +166,7 @@ def save_individual_scores(
                 raw_uncertainty=row.raw_uncertainty,
                 voting_right=row.voting_right,
             )
-            for _, row in scores.iterrows()
+            for row in scores.itertuples()
         ),
         batch_size=10000,
     )
@@ -257,7 +257,7 @@ def apply_score_scalings(poll: Poll, contributor_scores: pd.DataFrame):
         contributor_scores: DataFrame with columns:
             user_id: int
             entity_id: int
-            criteria: str
+            criterion: str
             raw_score: float
             raw_uncertainty: float
@@ -270,14 +270,17 @@ def apply_score_scalings(poll: Poll, contributor_scores: pd.DataFrame):
         return contributor_scores

     ml_input = MlInputFromDb(poll_name=poll.name)
-    scalings = ml_input.get_user_scalings().set_index(["user_id", "criteria"])
+    scalings = ml_input.get_user_scalings().set_index(["user_id", "criterion"])
     contributor_scores = contributor_scores.join(
-        scalings, on=["user_id", "criteria"], how="left"
-    )
-    contributor_scores["scale"].fillna(1, inplace=True)
-    contributor_scores["translation"].fillna(0, inplace=True)
-    contributor_scores["scale_uncertainty"].fillna(0, inplace=True)
-    contributor_scores["translation_uncertainty"].fillna(0, inplace=True)
+        scalings, on=["user_id", "criterion"], how="left"
+    ).fillna(
+        {
+            "scale": 1.0,
+            "translation": 0.0,
+            "scale_uncertainty": 0.0,
+            "translation_uncertainty": 0.0,
+        }
+    )

     # Apply individual scaling
     contributor_scores["uncertainty"] = (
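
Two pandas idioms change in this hunk, in line with the pandas 2.2 upgrade below: chained `fillna(..., inplace=True)` on a column selection targets a temporary (it is deprecated and stops updating the frame under copy-on-write), so the defaults move into one `fillna()` call with a per-column dict; and `itertuples()` yields lightweight namedtuples instead of the per-row `Series` that `iterrows()` builds, which is much faster for bulk inserts. A toy illustration (the frame's contents are made up):

import pandas as pd

df = pd.DataFrame({"scale": [1.2, None], "translation": [None, 0.4]})

# One fillna() with per-column defaults replaces the four chained
# `df[col].fillna(..., inplace=True)` calls that pandas 2.x warns about.
df = df.fillna({"scale": 1.0, "translation": 0.0})

# itertuples() rows are namedtuples: attribute access, no Series per row.
for row in df.itertuples(index=False):
    print(row.scale, row.translation)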
6 changes: 3 additions & 3 deletions backend/requirements.txt
@@ -37,14 +37,14 @@ PyYAML==6.0.1
 langdetect==1.0.9
 # Pandas is used extensively in the ML algorithms and for some data management
 # tasks such as building the public dataset
-pandas==2.1.2
+pandas==2.2.3
 # Numba provides just-in-time compilation to run optimized machine code
 # for performance-critical functions in Mehestan implementation.
-numba==0.58.1
+numba==0.60.0
 # Numpy is used extensively in the ML algorithms and in some other algorithms
 # such as computing comparison suggestions. See https://numpy.org/
 # Check the compatibility with Numba before upgrading.
-numpy==1.26.1
+numpy==1.26.4
 # Scipy is used in some ML algorithms
 scipy==1.11.3
 # API Youtube data
8 changes: 4 additions & 4 deletions backend/tournesol/management/commands/load_public_dataset.py
@@ -6,7 +6,7 @@
 from django.core.management import call_command
 from django.core.management.base import BaseCommand
 from django.db import transaction
-from solidago.pipeline.inputs import TournesolInputFromPublicDataset
+from solidago.pipeline.inputs import TournesolDataset

 from core.models import User
 from core.models.user import EmailDomain
@@ -27,7 +27,7 @@ def add_arguments(self, parser):
         parser.add_argument("--user-sampling", type=float, default=None)
         parser.add_argument("--dataset-url", type=str, default=PUBLIC_DATASET_URL)

-    def create_user(self, username: str, ml_input: TournesolInputFromPublicDataset):
+    def create_user(self, username: str, ml_input: TournesolDataset):
         user = ml_input.users.loc[ml_input.users.public_username == username].iloc[0]
         is_pretrusted = user.trust_score > 0.5
         email = f"{username}@trusted.example" if is_pretrusted else f"{username}@example.com"
@@ -66,7 +66,7 @@ def create_test_user(self):
         )

     def handle(self, *args, **options):
-        public_dataset = TournesolInputFromPublicDataset(options["dataset_url"])
+        public_dataset = TournesolDataset(options["dataset_url"])
         nb_comparisons = 0

         with transaction.atomic():
@@ -108,7 +108,7 @@ def handle(self, *args, **options):
             for values in rows.itertuples(index=False):
                 ComparisonCriteriaScore.objects.create(
                     comparison=comparison,
-                    criteria=values.criteria,
+                    criteria=values.criterion,
                     score=values.score,
                     score_max=values.score_max,
                 )
6 changes: 3 additions & 3 deletions backend/tournesol/tests/test_api_exports.py
@@ -17,7 +17,7 @@
 from django.test import TransactionTestCase, override_settings
 from rest_framework import status
 from rest_framework.test import APIClient
-from solidago.pipeline.inputs import TournesolInputFromPublicDataset
+from solidago.pipeline.inputs import TournesolDataset

 from core.models import User
 from core.tests.factories.user import UserFactory
@@ -542,14 +542,14 @@ def test_use_public_export_as_ml_input(self):
         self.assertEqual(response.status_code, status.HTTP_200_OK)
         zip_content = io.BytesIO(response.content)

-        ml_input = TournesolInputFromPublicDataset(zip_content)
+        ml_input = TournesolDataset(zip_content)
         comparisons_df = ml_input.get_comparisons()
         rating_properties = ml_input.ratings_properties

         self.assertEqual(len(comparisons_df), 1)
         self.assertEqual(
             list(comparisons_df.columns),
-            ["user_id", "entity_a", "entity_b", "criteria", "score", "score_max", "weight"],
+            ["user_id", "entity_a", "entity_b", "criterion", "score", "score_max", "weight"],
         )

         self.assertEqual(len(rating_properties), 2)
1 change: 1 addition & 0 deletions solidago/.gitignore
@@ -1,3 +1,4 @@
+/devenv/
 /dist/
 **__pycache__/
 .flake8/
4 changes: 2 additions & 2 deletions solidago/experiments/data_analysis.py
@@ -1,10 +1,10 @@
-from solidago.pipeline.inputs import TournesolInputFromPublicDataset
+from solidago.pipeline.inputs import TournesolDataset
 import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
 import scipy

-data = TournesolInputFromPublicDataset.download()
+data = TournesolDataset.download()

 criteria = {
     "reliability": "Reliable and not misleading",
4 changes: 2 additions & 2 deletions solidago/experiments/tournesol.py
@@ -5,7 +5,7 @@

 from threading import Thread

-from solidago.pipeline.inputs import TournesolInputFromPublicDataset
+from solidago.pipeline.inputs import TournesolDataset

 from solidago.trust_propagation import LipschiTrust
 from solidago.voting_rights import AffineOvertrust
@@ -32,7 +32,7 @@
 info_logger.addHandler(ch)

 logger.info("Retrieve public dataset")
-inputs = TournesolInputFromPublicDataset.download()
+inputs = TournesolDataset.download()
 video_id_to_entity_id = {
     video_id: entity_id
     for entity_id, video_id in enumerate(inputs.entity_id_to_video_id)
2 changes: 1 addition & 1 deletion solidago/pyproject.toml
@@ -20,7 +20,7 @@ keywords = ["tournesol", "collaborative recommendations", "judgement aggregation"
 dependencies = [
     "pandas>=1.5.3,<3.0",
     "numpy>=1.24.3,<1.27",
-    "numba==0.58.1",
+    "numba==0.60.0",
 ]
 dynamic = ["version"]
2 changes: 1 addition & 1 deletion solidago/src/solidago/__version__.py
@@ -1,4 +1,4 @@
 # Changing the version will automatically publish a new version on PyPI.
 # (see /.github/workflows/solidago-publish.yml)

-__version__ = "0.2.0"
+__version__ = "0.3.0"
4 changes: 2 additions & 2 deletions solidago/src/solidago/pipeline/__init__.py
@@ -1,5 +1,5 @@
-from .inputs import TournesolInput
+from .inputs import PipelineInput
 from .outputs import PipelineOutput
 from .pipeline import DefaultPipeline, Pipeline

-__all__ = ["TournesolInput", "DefaultPipeline", "Pipeline", "PipelineOutput"]
+__all__ = ["PipelineInput", "DefaultPipeline", "Pipeline", "PipelineOutput"]