Skip to content

Commit

Permalink
The full pipeline is now functional, and can be executed on generated data.
Browse files Browse the repository at this point in the history
I am however not extremely confident about correctness so far.
A lot more testing is needed.
  • Loading branch information
lenhoanglnh committed Jan 21, 2024
1 parent 3ecad63 commit e03e48c
Show file tree
Hide file tree
Showing 11 changed files with 99 additions and 101 deletions.
26 changes: 20 additions & 6 deletions solidago/src/solidago/generative_model/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,15 @@
import numpy as np
import pandas as pd

from solidago.pipeline.inputs import SimpleInput

from .user_model import UserModel, SvdUserModel
from .vouch_model import VouchModel, ErdosRenyiVouchModel
from .entity_model import EntityModel, SvdEntityModel
from .engagement_model import EngagementModel, SimpleEngagementModel
from .comparison_model import ComparisonModel, KnaryGBT

from solidago.privacy_settings import PrivacySettings
from solidago.judgments import DataFrameJudgments


logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -51,7 +52,7 @@ def __init__(
def __call__(
self, n_users: int, n_entities: int,
random_seed: Optional[int] = None
) -> SimpleInput:
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, PrivacySettings, DataFrameJudgments]:
""" Generates a random dataset
Parameters
Expand All @@ -65,9 +66,22 @@ def __call__(
Returns
-------
out: solidago.pipeline.SimpleInput
Generated data, with attributes users, vouches, entities, true_scores,
scores and comparisons, all of types DataFrame.
users: DataFrame with columns
* user_id
vouches: DataFrame with columns
* voucher
* vouchee
* vouch
entities: DataFrame with columns
* entity_id
privacy: PrivacySettings
privacy[user, entity] in { True, False, None }
judgments: DataFrameJudgments
judgments[user]["comparisons"] is user's DataFrame with columns
* entity_a
* entity_b
* comparison
* comparison_max
"""
if random_seed is not None:
assert type(random_seed) == int
Expand Down
6 changes: 4 additions & 2 deletions solidago/src/solidago/pipeline/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,9 +180,11 @@ def __call__(

if 3 not in skip_steps:
logger.info(f"Pipeline 3. Learning preference models with {self.preference_learning}")
init_user_models = dict() if init_user_models is None else init_user_models
user_models = dict() if init_user_models is None else init_user_models
for user, _ in users.iterrows():
init_model = init_user_models[user] if user in init_user_models else None
init_model = None
if init_user_models is not None and user in init_user_models:
init_model = init_user_models[user]
user_models[user] = self.preference_learning(judgments[user], entities, init_model)
else:
logger.info(f"Pipeline 3. Learning preference models is skipped")
Expand Down
64 changes: 0 additions & 64 deletions solidago/src/solidago/pipeline/inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,67 +136,3 @@ def get_user_index(self, public_username: str) -> Optional[int]:
if len(rows) == 0:
return None
return rows.index[0]

class SimpleInput(TournesolInput):
    """In-memory pipeline input backed by plain DataFrames.

    Intended for generated or hand-written test data. Every constructor
    argument left as None is replaced by an *empty* DataFrame that already
    carries the expected columns and dtypes, so downstream code can rely on
    the schema being present.
    """

    def __init__(
        self,
        users: Optional[pd.DataFrame] = None,
        vouches: Optional[pd.DataFrame] = None,
        entities: Optional[pd.DataFrame] = None,
        true_scores: Optional[pd.DataFrame] = None,
        privacy: Optional[pd.DataFrame] = None,
        comparisons: Optional[pd.DataFrame] = None
    ):
        def df(x, **kwargs):
            # Pass a provided frame through unchanged; otherwise build an
            # empty frame whose columns/dtypes are described by kwargs.
            if x is not None:
                return x
            dtypes = [(key, kwargs[key]) for key in kwargs]
            return pd.DataFrame(np.empty(0, np.dtype(list(dtypes))))

        self.users = df(users, user_id=int, public_username=str, trust_score=float)
        # NOTE(review): only the index *name* is set here; the frame is not
        # re-indexed by user_id. ratings_properties maps trust scores through
        # this index, so callers are presumably expected to pass `users`
        # indexed by user_id -- TODO confirm against the generative model.
        self.users.index.name = "user_id"
        self.vouches = df(vouches, voucher=int, vouchee=int, vouch=float)
        self.entities = entities
        self.true_scores = true_scores
        self.privacy = df(privacy, user_id=int, entity_id=int, is_public=bool)
        # FIX: include "criteria" in the default schema. get_comparisons
        # filters on dtf.criteria and selects the "criteria" column, which
        # previously raised even for the empty default frame.
        self.comparisons = df(comparisons,
            user_id=int, criteria=str, score=float, week_date=str,
            entity_a=int, entity_b=int)

    def get_comparisons(
        self,
        criteria: Optional[str] = None,
        user_id: Optional[int] = None,
    ) -> pd.DataFrame:
        """Return comparisons, optionally filtered by criteria and/or user.

        A constant "weight" column (always 1) is appended, and the result is
        restricted to the columns the pipeline consumes.
        """
        dtf = self.comparisons.copy(deep=False)
        if criteria is not None:
            dtf = dtf[dtf.criteria == criteria]
        if user_id is not None:
            dtf = dtf[dtf.user_id == user_id]
        dtf["weight"] = 1
        return dtf[["user_id", "entity_a", "entity_b", "criteria", "score", "weight"]]

    @cached_property
    def ratings_properties(self):
        """One row per (user_id, entity_id) pair appearing in any comparison.

        All ratings are marked public, trust scores are joined from
        self.users, and users whose trust score exceeds the calibration
        threshold (capped in count) are flagged as scaling-calibration users.
        Both thresholds are class constants inherited from TournesolInput.
        Cached: computed at most once per instance.
        """
        user_entities_pairs = pd.Series(
            iter(
                set(self.comparisons.groupby(["user_id", "entity_a"]).indices.keys())
                | set(self.comparisons.groupby(["user_id", "entity_b"]).indices.keys())
            )
        )
        dtf = pd.DataFrame([*user_entities_pairs], columns=["user_id", "entity_id"])
        dtf["is_public"] = True
        dtf["trust_score"] = dtf["user_id"].map(self.users["trust_score"])
        scaling_calibration_user_ids = (
            dtf[dtf.trust_score > self.SCALING_CALIBRATION_MIN_TRUST_SCORE]["user_id"]
            .value_counts(sort=True)[: self.MAX_SCALING_CALIBRATION_USERS]
            .index
        )
        dtf["is_scaling_calibration_user"] = dtf["user_id"].isin(scaling_calibration_user_ids)
        return dtf

    def get_individual_scores(
        self,
        criteria: Optional[str] = None,
        user_id: Optional[int] = None,
    ) -> Optional[pd.DataFrame]:
        # Individual scores are not needed for generated data; fail loudly
        # rather than silently returning nothing.
        raise NotImplementedError
3 changes: 2 additions & 1 deletion solidago/src/solidago/primitives.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,8 @@ def br_mean(
if isinstance(voting_rights, float):
voting_rights = np.full(values.shape, voting_rights)

if np.sum(voting_rights) == 0:
total_voting_rights = np.sum(voting_rights)
if total_voting_rights == 0:
return default_value

return clip_mean(
Expand Down
6 changes: 4 additions & 2 deletions solidago/src/solidago/scaling/mehestan.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,7 @@ def compute_multiplicators(
multiplicators[user][1] is the uncertainty on the multiplicator
"""
return {
u: _aggregate(self.lipschitz / (8 / model_norms[u]), ratios[u], self.error)
u: _aggregate(self.lipschitz / (8 / model_norms[u]), ratios[u], 1, self.error)
for u in ratios
}

Expand Down Expand Up @@ -399,7 +399,7 @@ def compute_translations(
translations[user][1] is the uncertainty on the translation
"""
return {
u: _aggregate(self.lipschitz / 8, diffs[u], self.error, br_mean)
u: _aggregate(self.lipschitz / 8, diffs[u], 0, self.error, br_mean)
for u in diffs
}

Expand Down Expand Up @@ -613,6 +613,7 @@ def _aggregate_user_comparisons(
def _aggregate(
lipschitz: float,
values: tuple[list[float], list[float], list[float]],
default_value: float,
error: float=1e-5,
aggregator: callable=qr_median
) -> dict[int, tuple[float, float]]:
Expand All @@ -639,6 +640,7 @@ def _aggregate(
voting_rights=np.array(values[1]),
left_uncertainties=np.array(values[2]),
right_uncertainties=np.array(values[2]),
default_value=default_value,
error=error
)
uncertainty = qr_uncertainty(
Expand Down
2 changes: 1 addition & 1 deletion solidago/src/solidago/scaling/quantile_zero_shift.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def __call__(
lefts.append(output[1])
rights.append(output[2])

shift = qr_quantile(self.lipschitz, self.zero_quantile, np.array(scores),
shift = - qr_quantile(self.lipschitz, self.zero_quantile, np.array(scores),
np.array(votes), np.array(lefts), np.array(rights), error=self.error)

return {
Expand Down
81 changes: 62 additions & 19 deletions solidago/src/solidago/scoring_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,13 +63,13 @@ def scored_entities(self, entities=None) -> set[int]:
return set(self._dict.keys())
return set(entities.index).intersection(set(self._dict.keys()))

def __str__(self, indent=""):
    """Return a human-readable listing of scores, one entity per line.

    Each line reads "entity: score [-left_uncertainty, +right_uncertainty]",
    values rounded to 2 decimals. `indent` is prepended to inner lines so
    that composed models can pretty-print recursively.
    (Reconstructed post-commit version: the scraped diff interleaved the
    removed pre-commit definition with this one.)
    """
    return "{" + f"\n{indent} " + f",\n{indent} ".join([
        f"{entity}: {np.round(self[entity][0], 2)} "
        + f"[-{np.round(self[entity][1], 2)}, "
        + f"+{np.round(self[entity][2], 2)}]"
        for entity in self.scored_entities()
    ]) + f"\n{indent}" + "}"

def get_scaling_parameters(self):
return 1, 0, 0, 0, 0, 0
Expand All @@ -86,31 +86,62 @@ def __init__(
translation_left_uncertainty: float=0,
translation_right_uncertainty: float=0
):
self.base_model = base_model
self.multiplicator = multiplicator
self.translation = translation
self.multiplicator_left_uncertainty = multiplicator_left_uncertainty
self.multiplicator_right_uncertainty = multiplicator_right_uncertainty
self.translation_left_uncertainty = translation_left_uncertainty
self.translation_right_uncertainty = translation_right_uncertainty
""" When base_model is itself a scaled scoring model,
the scalings are aggregated, so that the base model is actually
the scaled scoring model's base model.
Note that this requires aggregating the uncertainties in a heuristic manner.
At the core, this is because the uncertainties should grow quadratically
with the size of the scores. Put differently, because of uncertainties,
the composition of scaled scoring models is not an internal composition law
(but it is if uncertainties are not accounted for).
"""
if isinstance(base_model, ScaledScoringModel):
self.base_model = base_model.base_model
self.multiplicator = multiplicator * base_model.multiplicator
self.translation = translation + multiplicator * base_model.translation
self.multiplicator_left_uncertainty = multiplicator_left_uncertainty \
+ multiplicator * base_model.multiplicator_left_uncertainty
self.multiplicator_right_uncertainty = multiplicator_right_uncertainty \
+ multiplicator * base_model.multiplicator_right_uncertainty
self.translation_left_uncertainty = translation_left_uncertainty \
+ multiplicator * base_model.translation_left_uncertainty
self.translation_right_uncertainty = translation_right_uncertainty \
+ multiplicator * base_model.translation_right_uncertainty
else:
self.base_model = base_model
self.multiplicator = multiplicator
self.translation = translation
self.multiplicator_left_uncertainty = multiplicator_left_uncertainty
self.multiplicator_right_uncertainty = multiplicator_right_uncertainty
self.translation_left_uncertainty = translation_left_uncertainty
self.translation_right_uncertainty = translation_right_uncertainty

def __call__(self, entity_id, entity_features):
    """Return the scaled (score, left_uncertainty, right_uncertainty).

    The base score is mapped through score -> multiplicator * score
    + translation. Uncertainties combine three sources: the base model's own
    uncertainty (scaled by the multiplicator), the translation uncertainty,
    and the multiplicator uncertainty applied to the extremal plausible base
    scores. Returns None when the base model has no score for the entity.
    (Reconstructed post-commit version: the scraped diff interleaved removed
    pre-commit lines with these.)
    """
    base_output = self.base_model(entity_id, entity_features)
    if base_output is None:
        return None

    base_score, base_left_uncertainty, base_right_uncertainty = base_output
    # Extremal plausible base scores, used to propagate the multiplicator
    # uncertainty on the correct side of zero.
    base_left = base_score - base_left_uncertainty
    base_right = base_score + base_right_uncertainty

    score = self.multiplicator * base_score + self.translation

    left_uncertainty = self.multiplicator * base_left_uncertainty
    left_uncertainty += self.translation_left_uncertainty
    if base_left > 0:
        left_uncertainty += base_left * self.multiplicator_left_uncertainty
    else:
        left_uncertainty += (- base_left) * self.multiplicator_right_uncertainty

    right_uncertainty = self.multiplicator * base_right_uncertainty
    right_uncertainty += self.translation_right_uncertainty
    if base_right > 0:
        right_uncertainty += base_right * self.multiplicator_right_uncertainty
    else:
        right_uncertainty += (- base_right) * self.multiplicator_left_uncertainty

    return score, left_uncertainty, right_uncertainty

def scored_entities(self, entities=None) -> set[int]:
return self.base_model.scored_entities(entities)
Expand All @@ -127,6 +158,18 @@ def get_scaling_parameters(self):
parameters.append(model._direct_scaling_parameters())
return ScaledScoringModel.compose_scaling_parameters(parameters)

def __str__(self, indent=""):
    """Return a multi-line description of the scaling and its base model.

    Shows the multiplicator and translation, each with their
    [left, right] uncertainties rounded to 2 decimals, then recursively
    pretty-prints the base model with an increased indent.
    `indent` is the prefix applied to this block's closing lines.
    """
    result = indent + "{\n"
    result += f"{indent} multiplicator = {np.round(self.multiplicator, 2)}"
    result += f" [{np.round(self.multiplicator_left_uncertainty, 2)}, "
    result += f"{np.round(self.multiplicator_right_uncertainty, 2)}]\n{indent}"
    result += f" translation = {np.round(self.translation, 2)}"
    result += f" [{np.round(self.translation_left_uncertainty, 2)}, "
    result += f"{np.round(self.translation_right_uncertainty, 2)}]\n{indent}"
    # Delegate to the base model's __str__, passing a deeper indent so
    # nested scaled/base models align visually.
    result += " base_model = " + self.base_model.__str__(f" {indent}")
    result += "\n" + indent + "}"
    return result

@classmethod
def compose_scaling_parameters(parameters):
result = 1, 0, 0, 0, 0, 0
Expand Down
2 changes: 1 addition & 1 deletion solidago/src/solidago/test/data_1.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@
),
scaling=ScalingCompose(
Mehestan(
lipschitz=0.1,
lipschitz=10,
min_activity=0.1,
n_scalers_max=100,
privacy_penalty=0.5,
Expand Down
2 changes: 1 addition & 1 deletion solidago/src/solidago/test/data_2.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@
),
scaling=ScalingCompose(
Mehestan(
lipschitz=0.1,
lipschitz=100,
min_activity=0.1,
n_scalers_max=100,
privacy_penalty=0.5,
Expand Down
4 changes: 2 additions & 2 deletions solidago/src/solidago/test/data_3.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,8 @@
),
scaling=ScalingCompose(
Mehestan(
lipschitz=0.1,
min_activity=0.2,
lipschitz=10,
min_activity=0.1,
n_scalers_max=100,
privacy_penalty=0.5,
p_norm_for_multiplicative_resilience=4.0,
Expand Down
4 changes: 2 additions & 2 deletions solidago/src/solidago/test/data_4.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,8 +252,8 @@
),
scaling=ScalingCompose(
Mehestan(
lipschitz=0.1,
min_activity=1,
lipschitz=1,
min_activity=0.1,
n_scalers_max=100,
privacy_penalty=0.5,
p_norm_for_multiplicative_resilience=4.0,
Expand Down

0 comments on commit e03e48c

Please sign in to comment.