Update launch and eval scripts for the eval phase of the contest
PiperOrigin-RevId: 700127292
Change-Id: I0eadabe74fbfed9f8a74a98703ed05beee9dd5e4
jzleibo authored and copybara-github committed Nov 26, 2024
1 parent af718c8 commit 7cbf802
Showing 5 changed files with 94 additions and 101 deletions.
2 changes: 1 addition & 1 deletion concordia/language_model/call_limit_wrapper.py
@@ -33,7 +33,7 @@ class CallLimitLanguageModel(language_model.LanguageModel):
def __init__(
self,
model: language_model.LanguageModel,
max_calls: int = 1000,
max_calls: int = 1200,
) -> None:
"""Wrap the underlying language model with a call limit.
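The only change in this file raises the wrapper's default call limit from 1000 to 1200. For context, a minimal sketch of how the wrapper is applied, assuming a base model object that implements language_model.LanguageModel (the NoLanguageModel stand-in used here is an assumption about what is available in concordia.language_model):

    from concordia.language_model import call_limit_wrapper
    from concordia.language_model import no_language_model

    # Assumed stand-in; any object implementing language_model.LanguageModel
    # could be passed as the wrapped model.
    base_model = no_language_model.NoLanguageModel()

    # Cap how many times the evaluation can call the underlying model.
    limited_model = call_limit_wrapper.CallLimitLanguageModel(
        model=base_model,
        max_calls=1200,  # the new default set by this commit
    )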
31 changes: 23 additions & 8 deletions examples/modular/calculate_ratings.py
@@ -59,13 +59,17 @@
# Parse command line arguments
args = parser.parse_args()

sanitized_model_name = args.model_name.replace('/', '_')

# Load data
included = {}
included_agent_idx = 0
sorted_agent_names = sorted(args.agents)
max_repetition_idx = -1
for agent_name in sorted_agent_names:
print(f'loading data from: {agent_name}')
json_filename = f'{agent_name}__{args.model_name}__{args.embedder_name}.json'
json_filename = (
f'{agent_name}__{sanitized_model_name}__{args.embedder_name}.json')

loaded = file_utils.load_from_json_file(json_filename)
scenario_results_to_include = {}
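Sanitizing the model name up front keeps the per-agent results filename valid even when the model identifier contains a '/', which would otherwise be treated as a path separator. A tiny illustration with hypothetical model names:

    # Hypothetical model identifiers, for illustration only.
    for model_name in ('codestral-latest', 'some-provider/some-model'):
        print(model_name.replace('/', '_'))
    # codestral-latest
    # some-provider_some-model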
@@ -93,14 +97,23 @@
f' {expected_background_agent}'
)

if result.scenario in scenario_results_to_include:
raise RuntimeError(f'Duplicate scenario: {result.scenario}')
repetition_idx = int(result.repetition_idx)
max_repetition_idx = max(max_repetition_idx, repetition_idx)
scenario_with_repetition = f'{result.scenario}_{repetition_idx}'

if scenario_with_repetition in scenario_results_to_include:
raise RuntimeError(f'Duplicate scenario: {scenario_with_repetition}')

scenario_results_to_include[result.scenario] = result
scenario_results_to_include[scenario_with_repetition] = result

# Check there are results for all scenarios.
expected_scenarios = []
for expected_scenario in set(scenarios_lib.SCENARIO_CONFIGS.keys()):
for repetition_idx in range(max_repetition_idx + 1):
expected_scenarios.append(f'{expected_scenario}_{repetition_idx}')
expected_scenarios = set(expected_scenarios)
scenarios_found = set(scenario_results_to_include.keys())
if scenarios_found == set(scenarios_lib.SCENARIO_CONFIGS.keys()):
if scenarios_found == expected_scenarios:
included[agent_name] = dict(
agent_idx=included_agent_idx, results=scenario_results_to_include
)
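Results are now keyed by scenario and repetition, and duplicates are detected on that combined key. A minimal sketch of the keying and the completeness check, using placeholder scenario names in place of scenarios_lib.SCENARIO_CONFIGS and placeholder objects in place of ScenarioResult:

    # Placeholder scenario names and results, for illustration only.
    scenario_names = {'scenario_a', 'scenario_b'}
    max_repetition_idx = 1  # repetitions 0 and 1 were found in the file

    scenario_results_to_include = {}
    for name in scenario_names:
        for repetition_idx in range(max_repetition_idx + 1):
            scenario_with_repetition = f'{name}_{repetition_idx}'
            if scenario_with_repetition in scenario_results_to_include:
                raise RuntimeError(
                    f'Duplicate scenario: {scenario_with_repetition}')
            scenario_results_to_include[scenario_with_repetition] = object()

    expected_scenarios = {
        f'{name}_{repetition_idx}'
        for name in scenario_names
        for repetition_idx in range(max_repetition_idx + 1)
    }
    assert set(scenario_results_to_include.keys()) == expected_scenarios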
@@ -112,16 +125,18 @@
# the data from the previous runs with other agent submissions.
# We need to form a score matrix with shape [num_scenarios X num_agents]
num_scenarios = len(scenarios_lib.SCENARIO_CONFIGS)
num_scenarios_and_repetitions = num_scenarios * (max_repetition_idx + 1)
agents_to_evaluate = list(included.keys())
num_agents_to_evaluate = len(agents_to_evaluate)
score_matrix = np.zeros((num_scenarios, num_agents_to_evaluate))
score_matrix = np.zeros((num_scenarios_and_repetitions, num_agents_to_evaluate))
for agent_name in agents_to_evaluate:
results_per_scenario = included[agent_name]['results']

num_scenarios_found = len(results_per_scenario)
assert (
num_scenarios_found == num_scenarios
), f'Wrong number of scenarios: {num_scenarios_found} != {num_scenarios}'
num_scenarios_found == num_scenarios_and_repetitions
), ('Wrong number of scenarios: '
f'{num_scenarios_found} != {num_scenarios_and_repetitions}')

names_by_scenario_vector = np.array(
[result.scenario for result in results_per_scenario.values()]
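Because every agent now contributes one score per (scenario, repetition) pair, the score matrix has num_scenarios * num_repetitions rows rather than num_scenarios. A small shape check with illustrative counts:

    import numpy as np

    num_scenarios = 10          # illustrative; really len(scenarios_lib.SCENARIO_CONFIGS)
    max_repetition_idx = 2      # highest repetition index seen in the loaded results
    num_agents_to_evaluate = 4  # illustrative

    num_scenarios_and_repetitions = num_scenarios * (max_repetition_idx + 1)
    score_matrix = np.zeros((num_scenarios_and_repetitions, num_agents_to_evaluate))
    print(score_matrix.shape)  # (30, 4)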
80 changes: 36 additions & 44 deletions examples/modular/launch_concordia_challenge_evaluation.py
@@ -243,11 +243,6 @@ def _evaluate_all_repetitions_on_one_scenario(
"""
print(f'Running scenario: {scenario_name}')
# Run several simulations per scenario
simulation_outcomes = []
focal_per_capita_scores_to_average = []
background_per_capita_scores_to_average = []
ungrouped_per_capita_scores_to_average = []

tasks_this_scenario = {
str(i): functools.partial(
_evaluate_one_repetition,
@@ -267,6 +262,7 @@
'Raised errors', list(exceptions_per_repetition.values())
)

scenario_results = []
for repetition_idx, outcome in outputs_per_repetition.items():
if scenario_config.focal_is_resident:
focal_scores = list(outcome.resident_scores.values())
@@ -279,45 +275,38 @@
# Calculate per capita scores.
print(f'\nScores for repetition {repetition_idx}:')
focal_per_capita_score = np.mean(focal_scores)
focal_per_capita_scores_to_average.append(focal_per_capita_score)
print(f' Focal per capita score: {focal_per_capita_score}')
background_per_capita_score = np.mean(background_scores)
background_per_capita_scores_to_average.append(background_per_capita_score)
print(f' Background per capita score: {background_per_capita_score}')
ungrouped_per_capita_score = np.mean(ungrouped_scores)
ungrouped_per_capita_scores_to_average.append(ungrouped_per_capita_score)
print(f' Ungrouped per capita score: {ungrouped_per_capita_score}')

# Average scores over repetitions and save results for all repetitions in a
# json-serializable format.
scenario_result_ = logging_lib.ScenarioResult(
scenario=scenario_name,
focal_agent=args.agent_name,
background_agent=scenario_config.background_agent_module,
focal_per_capita_score=np.mean(focal_per_capita_scores_to_average),
background_per_capita_score=np.mean(
background_per_capita_scores_to_average
),
ungrouped_per_capita_score=np.mean(
ungrouped_per_capita_scores_to_average
),
simulation_outcomes=tuple(simulation_outcomes),
focal_is_resident=scenario_config.focal_is_resident,
api_type=args.api_type,
model=args.model_name,
embedder=args.embedder_name,
disable_language_model=args.disable_language_model,
exclude_from_elo_calculation=args.exclude_from_elo_calculation,
)
scenario_json_filename = (
f'{args.agent_name}__{args.model_name}__'
f'{args.embedder_name}__only_{scenario_name}.json'
).replace('/', '_')
scenario_json_filename = os.path.join(results_dir, scenario_json_filename)
json_str_ = scenario_result_.to_json()
with open(scenario_json_filename, 'a', encoding='utf-8') as f:
f.write(json_str_)
return scenario_result_
scenario_result_ = logging_lib.ScenarioResult(
scenario=scenario_name,
repetition_idx=repetition_idx,
focal_agent=args.agent_name,
background_agent=scenario_config.background_agent_module,
focal_per_capita_score=focal_per_capita_score,
background_per_capita_score=background_per_capita_score,
ungrouped_per_capita_score=ungrouped_per_capita_score,
simulation_outcome=outcome,
focal_is_resident=scenario_config.focal_is_resident,
api_type=args.api_type,
model=args.model_name,
embedder=args.embedder_name,
disable_language_model=args.disable_language_model,
exclude_from_elo_calculation=args.exclude_from_elo_calculation,
)
scenario_json_filename = (
f'{args.agent_name}__{args.model_name}__'
f'{args.embedder_name}__only__{scenario_name}__{repetition_idx}.json'
).replace('/', '_')
scenario_json_filename = os.path.join(results_dir, scenario_json_filename)
json_str_ = scenario_result_.to_json()
with open(scenario_json_filename, 'a', encoding='utf-8') as f:
f.write(json_str_)
scenario_results.append(scenario_result_)
return scenario_results
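_evaluate_all_repetitions_on_one_scenario now returns a list with one ScenarioResult per repetition and writes each repetition to its own file, so the per-scenario filename gains a repetition suffix. A sketch of the naming convention alone, with illustrative agent, model, embedder, and scenario names:

    # Illustrative values; in the script these come from command-line arguments.
    agent_name = 'basic'
    model_name = 'codestral-latest'
    embedder_name = 'all-mpnet-base-v2'
    scenario_name = 'some_scenario'

    for repetition_idx in range(2):
        scenario_json_filename = (
            f'{agent_name}__{model_name}__'
            f'{embedder_name}__only__{scenario_name}__{repetition_idx}.json'
        ).replace('/', '_')
        print(scenario_json_filename)
    # basic__codestral-latest__all-mpnet-base-v2__only__some_scenario__0.json
    # basic__codestral-latest__all-mpnet-base-v2__only__some_scenario__1.json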

tasks = {
name: functools.partial(
@@ -330,16 +319,19 @@
evaluation_results = concurrency.run_tasks(tasks)

# Save evaluation results for all scenarios with this agent to one json file.
num_expected_results = (len(scenarios_lib.SCENARIO_CONFIGS) *
args.num_repetitions_per_scenario)
json_filename = (
f'{args.agent_name}__{args.model_name}__{args.embedder_name}.json'
).replace('/', '_')
idx = 0
with open(json_filename, 'a', encoding='utf-8') as file_handle:
file_handle.write('[\n')
for scenario_name_, scenario_result in evaluation_results.items():
json_str = evaluation_results[scenario_name_].to_json()
if idx < len(scenarios_lib.SCENARIO_CONFIGS) - 1:
json_str += ',\n'
file_handle.write(json_str)
idx += 1
for scenario_name_, _ in evaluation_results.items():
for scenario_result in evaluation_results[scenario_name_]:
json_str = scenario_result.to_json()
if idx < num_expected_results - 1:
json_str += ',\n'
file_handle.write(json_str)
idx += 1
file_handle.write('\n]')
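With one entry per repetition, the combined file holds len(SCENARIO_CONFIGS) * num_repetitions_per_scenario results, and the comma logic keys off that count so the output stays a valid JSON array. A simplified, self-contained sketch of the same write pattern, using plain dicts in place of ScenarioResult.to_json():

    import json

    # Placeholder per-scenario result lists, standing in for evaluation_results.
    evaluation_results = {
        'scenario_a': [{'repetition_idx': 0}, {'repetition_idx': 1}],
        'scenario_b': [{'repetition_idx': 0}, {'repetition_idx': 1}],
    }
    num_expected_results = sum(len(v) for v in evaluation_results.values())

    idx = 0
    with open('combined_results.json', 'w', encoding='utf-8') as file_handle:
        file_handle.write('[\n')
        for results_for_scenario in evaluation_results.values():
            for result in results_for_scenario:
                json_str = json.dumps(result, indent=2)
                if idx < num_expected_results - 1:
                    json_str += ',\n'
                file_handle.write(json_str)
                idx += 1
        file_handle.write('\n]')

    # The file parses back as one list entry per (scenario, repetition) pair.
    with open('combined_results.json', encoding='utf-8') as file_handle:
        assert len(json.load(file_handle)) == num_expected_results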
58 changes: 24 additions & 34 deletions examples/modular/launch_one_scenario.py
@@ -188,10 +188,6 @@
print(f'Running scenario: {args.scenario_name}')
scenario_config = scenarios_lib.SCENARIO_CONFIGS[args.scenario_name]
# Run several simulations per scenario
simulation_outcomes = []
focal_per_capita_scores_to_average = []
background_per_capita_scores_to_average = []
ungrouped_per_capita_scores_to_average = []
for repetition_idx in range(args.num_repetitions_per_scenario):
measurements = measurements_lib.Measurements()
runnable_simulation = scenarios_lib.build_simulation(
@@ -205,7 +201,6 @@
)
# Run the simulation
outcome, text_results_log = runnable_simulation()
simulation_outcomes.append(outcome)
if scenario_config.focal_is_resident:
focal_scores = list(outcome.resident_scores.values())
background_scores = list(outcome.visitor_scores.values())
@@ -217,13 +212,10 @@
# Calculate per capita scores.
print('\nScores:')
focal_per_capita_score = np.mean(focal_scores)
focal_per_capita_scores_to_average.append(focal_per_capita_score)
print(f' Focal per capita score: {focal_per_capita_score}')
background_per_capita_score = np.mean(background_scores)
background_per_capita_scores_to_average.append(background_per_capita_score)
print(f' Background per capita score: {background_per_capita_score}')
ungrouped_per_capita_score = np.mean(ungrouped_scores)
ungrouped_per_capita_scores_to_average.append(ungrouped_per_capita_score)
print(f' Ungrouped per capita score: {ungrouped_per_capita_score}')
# Write the full text log as an HTML file in the current working directory.
html_filename = (
@@ -234,29 +226,27 @@
with open(html_filename, 'a', encoding='utf-8') as f:
f.write(text_results_log)

# Average scores over repetitions and save results for all repetitions in a
# json-serializable format.
scenario_result = logging_lib.ScenarioResult(
scenario=args.scenario_name,
focal_agent=args.agent_name,
background_agent=scenario_config.background_agent_module,
focal_per_capita_score=np.mean(focal_per_capita_scores_to_average),
background_per_capita_score=np.mean(
background_per_capita_scores_to_average
),
ungrouped_per_capita_score=np.mean(ungrouped_per_capita_scores_to_average),
simulation_outcomes=tuple(simulation_outcomes),
focal_is_resident=scenario_config.focal_is_resident,
api_type=args.api_type,
model=args.model_name,
embedder=args.embedder_name,
disable_language_model=args.disable_language_model,
exclude_from_elo_calculation=args.exclude_from_elo_calculation,
)
scenario_json_filename = (
f'{args.agent_name}__{args.model_name}__'
f'{args.embedder_name}__only_{args.scenario_name}.json'
).replace('/', '_')
json_str_ = scenario_result.to_json()
with open(scenario_json_filename, 'a', encoding='utf-8') as f:
f.write(json_str_)
scenario_result = logging_lib.ScenarioResult(
scenario=args.scenario_name,
repetition_idx=repetition_idx,
focal_agent=args.agent_name,
background_agent=scenario_config.background_agent_module,
focal_per_capita_score=focal_per_capita_score,
background_per_capita_score=background_per_capita_score,
ungrouped_per_capita_score=ungrouped_per_capita_score,
simulation_outcome=outcome,
focal_is_resident=scenario_config.focal_is_resident,
api_type=args.api_type,
model=args.model_name,
embedder=args.embedder_name,
disable_language_model=args.disable_language_model,
exclude_from_elo_calculation=args.exclude_from_elo_calculation,
)
scenario_json_filename = (
f'{args.agent_name}__{args.model_name}__'
f'{args.embedder_name}__only__{args.scenario_name}__{repetition_idx}'
'.json'
).replace('/', '_')
json_str_ = scenario_result.to_json()
with open(scenario_json_filename, 'a', encoding='utf-8') as f:
f.write(json_str_)
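For reference, the per-capita scores stored in each ScenarioResult are plain means over the focal and background player groups, with the focal group taken from the resident or visitor side depending on the scenario, and the ungrouped score averaged over everyone. A standalone sketch with invented score mappings (combining the two groups for the ungrouped mean is an assumption based on the ScenarioResult docstring below):

    import numpy as np

    # Invented scores, standing in for outcome.resident_scores / visitor_scores.
    resident_scores = {'alice': 3.0, 'bob': 5.0}
    visitor_scores = {'carol': 2.0}
    focal_is_resident = True

    if focal_is_resident:
        focal_scores = list(resident_scores.values())
        background_scores = list(visitor_scores.values())
    else:
        focal_scores = list(visitor_scores.values())
        background_scores = list(resident_scores.values())
    ungrouped_scores = focal_scores + background_scores  # assumed combination

    print(np.mean(focal_scores))       # 4.0  -> focal_per_capita_score
    print(np.mean(background_scores))  # 2.0  -> background_per_capita_score
    print(np.mean(ungrouped_scores))   # ~3.33 -> ungrouped_per_capita_score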
24 changes: 10 additions & 14 deletions examples/modular/utils/logging_types.py
@@ -39,17 +39,17 @@ class SimulationOutcome:
@dataclasses.dataclass(frozen=True, kw_only=True)
class ScenarioResult:
"""Result from testing a single agent on several repetitions of a scenario.
Attributes:
scenario: The name of the scenario.
repetition_idx: The index of the repetition (i.e. the seed).
focal_agent: The name of the agent that is being tested in the focal slots.
background_agent: The name of the agent used in the background player slots.
focal_per_capita_score: The per capita score of the focal agent.
background_per_capita_score: The per capita score of the background agent.
ungrouped_per_capita_score: The per capita score of the focal agent,
averaged over all players (both residents and visitors).
simulation_outcomes: A tuple of SimulationOutcomes, one for each repetition
of the scenario.
simulation_outcome: A SimulationOutcome object.
focal_is_resident: Whether the focal agent is a resident or a visitor.
api_type: The API type used for the simulation
(e.g. `google_aistudio_model`, `mistral`, `openai`, etc).
@@ -64,6 +64,7 @@ class ScenarioResult:
"""

scenario: str
repetition_idx: int

focal_agent: str
background_agent: str
@@ -72,9 +73,7 @@ class ScenarioResult:
background_per_capita_score: float
ungrouped_per_capita_score: float

simulation_outcomes: tuple[SimulationOutcome, ...] = dataclasses.field(
repr=False
)
simulation_outcome: SimulationOutcome = dataclasses.field(repr=False)

focal_is_resident: bool

@@ -87,16 +86,13 @@

def to_json(self) -> str:
"""Encode this dataclass as a string to serialize as a json file."""
simulation_outcome_dicts = []
for outcome in self.simulation_outcomes:
outcome_dict = dataclasses.asdict(outcome)
outcome_dict['resident_scores'] = dict(outcome_dict['resident_scores'])
outcome_dict['visitor_scores'] = dict(outcome_dict['visitor_scores'])
outcome_dict['metadata'] = dict(outcome_dict['metadata'])
simulation_outcome_dicts.append(outcome_dict)
outcome_dict = dataclasses.asdict(self.simulation_outcome)
outcome_dict['resident_scores'] = dict(outcome_dict['resident_scores'])
outcome_dict['visitor_scores'] = dict(outcome_dict['visitor_scores'])
outcome_dict['metadata'] = dict(outcome_dict['metadata'])

self_as_dict = dataclasses.asdict(self)
self_as_dict['simulation_outcomes'] = tuple(simulation_outcome_dicts)
self_as_dict['simulation_outcome'] = outcome_dict

return json.dumps(self_as_dict, indent=2)
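Because to_json flattens the nested SimulationOutcome into plain dicts, the per-scenario files written by the launch scripts can be read back with the standard json module alone. A small sketch of consuming one such file, assuming it contains a single serialized ScenarioResult:

    import json

    def summarize_scenario_result(json_path: str) -> None:
        """Print a few fields from a per-scenario ScenarioResult json file."""
        with open(json_path, encoding='utf-8') as f:
            data = json.load(f)
        print(data['scenario'], data['repetition_idx'])
        print('focal per capita:', data['focal_per_capita_score'])
        # The nested outcome was converted to plain dicts by to_json().
        print('resident scores:', data['simulation_outcome']['resident_scores'])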

Expand Down
