Add case test_metric_longhorn_backup #2145
@@ -6,7 +6,7 @@
 from kubernetes.stream import stream
 from prometheus_client.parser import text_string_to_metric_families

-from common import client, core_api, pod, volume_name  # NOQA
+from common import client, core_api, pod, volume_name, batch_v1_api  # NOQA

 from common import crash_engine_process_with_sigkill
 from common import delete_replica_processes
@@ -35,6 +35,25 @@
 from common import DEFAULT_DISK_PATH
 from common import Gi

+from backupstore import set_random_backupstore  # NOQA
+from common import create_recurring_jobs
+from common import check_recurring_jobs
+from common import wait_for_cron_job_count
+from common import create_backup
+from common import wait_for_backup_count
+from common import delete_backup_volume
+
+RECURRING_JOB_NAME = "recurring-test"
+TASK = "task"
+GROUPS = "groups"
+CRON = "cron"
+RETAIN = "retain"
+BACKUP = "backup"
+CONCURRENCY = "concurrency"
+LABELS = "labels"
+DEFAULT = "default"
+SCHEDULE_1MIN = "* * * * *"
+
 # The dictionaries use float type of value because the value obtained from
 # prometheus_client is in float type.
 # https://github.com/longhorn/longhorn-tests/pull/1531#issuecomment-1833349994
@@ -138,6 +157,21 @@ def examine_metric_value(found_metric, metric_labels, expected_value=None):
     assert found_metric.value >= 0.0


+def wait_for_metric_sum_on_all_nodes(client, core_api, metric_name, metric_labels, expected_value):  # NOQA
+    for _ in range(RETRY_COUNTS):
+        time.sleep(RETRY_INTERVAL)
+
+        try:
+            check_metric_sum_on_all_nodes(client, core_api, metric_name,
+                                          metric_labels, expected_value)
+            return
+        except AssertionError:
+            continue
+
+    check_metric_sum_on_all_nodes(client, core_api, metric_name,
+                                  metric_labels, expected_value)
+
+
 def check_metric_sum_on_all_nodes(client, core_api, metric_name, expected_labels, expected_value=None):  # NOQA
     # Initialize total_metrics to store the sum of the metric values.
     total_metrics = {"labels": defaultdict(None), "value": 0.0}
@@ -440,12 +474,12 @@ def test_metric_longhorn_snapshot_actual_size_bytes(client, core_api, volume_nam

     When 1 snapshot is created by user
     And 1 snapshot is created by system
-    Then has a metric longhorn_snapshot_actual_size_bytes value equals to the
-        size of the user created snapshot,
+    Then has a metric longhorn_snapshot_actual_size_bytes value
+        equals to the size of the user created snapshot,
         and volume label is the volume name
         and user_created label is true
-    And has a metric longhorn_snapshot_actual_size_bytes value equals to the
-        size of the system created snapshot,
+    And has a metric longhorn_snapshot_actual_size_bytes value
+        equals to the size of the system created snapshot,
         and volume label is the volume name
         and user_created label is false

@@ -615,3 +649,126 @@ def test_node_metrics(client, core_api):  # NOQA
     wait_for_node_update(client, lht_hostId, "allowScheduling", False)
     check_metric_with_condition(core_api, "longhorn_node_status",
                                 metric_labels, 0.0)
+
+
+def test_metric_longhorn_backup(set_random_backupstore, client, core_api, batch_v1_api, volume_name):  # NOQA
+    """
+    Scenario: test metric longhorn_backup_actual_size_bytes and
+              longhorn_backup_state
+
+    Issue: https://github.com/longhorn/longhorn/issues/9429
+
+    Given a volume
+
+    When a backup is created by user
+    Then has a metric longhorn_backup_actual_size_bytes value
+        equals to the size of the backup,
+        and volume label is the volume name
+        and recurring_job label is empty
+    And has a metric longhorn_backup_state value equals to 3 (Completed),
+        and volume label is the volume name
+        and recurring_job label is empty
+
+    When a recurring backup job is created
+    Then should have a metric longhorn_backup_actual_size_bytes value
+        equals to the size of the backup,
+        and volume label is the volume name
+        and recurring_job label is the job name
+    And should have a metric longhorn_backup_state
+        value equals to 3 (Completed),
+        and volume label is the volume name
+        and recurring_job label is the job name
+    """
+    self_hostId = get_self_host_id()
+
+    # create a volume and attach it to a node.
+    volume_size = 50 * Mi
+    client.create_volume(name=volume_name,
+                         numberOfReplicas=1,
+                         size=str(volume_size))
+    volume = wait_for_volume_detached(client, volume_name)
+    volume.attach(hostId=self_hostId)
+    volume = wait_for_volume_healthy(client, volume_name)
Comment on lines +684 to +691

🛠️ Refactor suggestion

Consider parameterizing volume creation

The volume creation process is hardcoded. Consider parameterizing the volume size and number of replicas to make the test more flexible and reusable. Example:

def create_test_volume(client, name, size=50*Mi, replicas=1):
    client.create_volume(name=name, numberOfReplicas=replicas, size=str(size))
    volume = wait_for_volume_detached(client, name)
    volume.attach(hostId=get_self_host_id())
    return wait_for_volume_healthy(client, name)

volume = create_test_volume(client, volume_name)
+
+    # create the user backup.
+    data_size = 10 * Mi
+    backup_data = {'pos': 0,
+                   'len': data_size,
+                   'content': generate_random_data(data_size)}
+    write_volume_data(volume, backup_data)
+    create_backup(client, volume_name)
+    bv = client.by_id_backupVolume(volume_name)
+    wait_for_backup_count(bv, 1)
+
+    # get the backup size.
+    backup_size = 0
+    backups = bv.backupList().data
+    for backup in backups:
+        if backup['snapshotName'] == "volume-head":
+            continue
+
+        backup_size = int(backup['size'])
+    assert backup_size > 0
Comment on lines +706 to +711

Potential issue with backup size assignment

In the loop iterating over backups, the variable backup_size is reassigned on every iteration, so if more than one non-volume-head backup exists, only the size of the last one iterated ends up being asserted.
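A minimal sketch of one way to make the intent explicit, assuming the test expects exactly one real backup at this point (an illustration, not part of the PR):

backup_size = 0
for backup in bv.backupList().data:
    if backup['snapshotName'] == "volume-head":
        continue
    # take the first real backup's size and stop, instead of
    # silently keeping whichever backup happens to be iterated last
    backup_size = int(backup['size'])
    break
assert backup_size > 0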
Comment on lines +704 to +711

🛠️ Refactor suggestion

Refactor duplicate code for getting backup size

The code for obtaining the backup size is duplicated for user backups and recurring backups. Consider refactoring this into a helper function to improve maintainability and reduce redundancy. Here's a suggested helper function:

def get_backup_size(backup_volume):
    backups = backup_volume.backupList().data
    for backup in backups:
        if backup['snapshotName'] == "volume-head":
            continue
        return int(backup['size'])
    return 0

# Then use it like this:
backup_size = get_backup_size(bv)
assert backup_size > 0, "Backup size should be greater than 0"

Also applies to: 751-758
+
+    # assert the metric values for the user backup.
+    user_backup_metric_labels = {
+        "volume": volume_name,
+        "recurring_job": "",
+    }
+    wait_for_metric_sum_on_all_nodes(client, core_api,
+                                     "longhorn_backup_actual_size_bytes",
+                                     user_backup_metric_labels,
+                                     backup_size)
+
+    wait_for_metric_sum_on_all_nodes(client, core_api,
+                                     "longhorn_backup_state",
+                                     user_backup_metric_labels,
+                                     3)
+
+    # delete the existing backup before creating a recurring backup job.
+    delete_backup_volume(client, volume_name)
Comment on lines +728 to +729

🛠️ Refactor suggestion

Consider adding error handling for backup volume deletion

The delete_backup_volume call has no error handling, so a failure here would only surface later as a confusing assertion failure. Example:

try:
    delete_backup_volume(client, volume_name)
except Exception as e:
    pytest.fail(f"Failed to delete backup volume: {str(e)}")

Hi @c3y1huang In my test case ... Thanks

After longhorn/longhorn-manager#3216 was merged, my test case passed, and I did not observe any potential data caching issues when using the same volume to apply a backup recurring job to check the Longhorn backup metrics.
+
+    # create a recurring backup job.
+    recurring_jobs = {
+        RECURRING_JOB_NAME: {
+            TASK: BACKUP,
+            GROUPS: [DEFAULT],
+            CRON: SCHEDULE_1MIN,
+            RETAIN: 1,
+            CONCURRENCY: 1,
+            LABELS: {},
+        },
+    }
+    create_recurring_jobs(client, recurring_jobs)
+    check_recurring_jobs(client, recurring_jobs)
+    wait_for_cron_job_count(batch_v1_api, 1)
+
+    # wait for the recurring backup job to run.
+    time.sleep(60)
+    bv = client.by_id_backupVolume(volume_name)
+    wait_for_backup_count(bv, 1)
Comment on lines +746 to +749

🛠️ Refactor suggestion

Consider replacing sleep with a more robust waiting mechanism

Using a fixed time.sleep(60) makes the test slower than necessary and can still race with the cron schedule. Polling for the backup instead is more reliable. Here's a suggested approach:

def wait_for_backup_completion(client, volume_name, timeout=300, interval=2):
    start_time = time.time()
    while time.time() - start_time < timeout:
        bv = client.by_id_backupVolume(volume_name)
        if len(bv.backupList().data) > 0:
            return True
        time.sleep(interval)
    raise TimeoutError(f"Backup for volume {volume_name} did not complete within {timeout} seconds")

# Replace the sleep and subsequent lines with:
wait_for_backup_completion(client, volume_name)
bv = client.by_id_backupVolume(volume_name)
+
+    # get the recurring backup size.
+    recurring_backup_size = 0
+    backups = bv.backupList().data
+    for backup in backups:
+        if backup['snapshotName'] == "volume-head":
+            continue
+
+        recurring_backup_size = int(backup['size'])
+    assert recurring_backup_size > 0
Comment on lines +754 to +759

Potential issue with recurring backup size assignment

Similar to the user backup size, in the loop iterating over backups, recurring_backup_size is reassigned on every iteration, so only the last backup's size is asserted. The same fix (or the get_backup_size helper suggested above) applies here.
+
+    # assert the metric values for the recurring backup.
+    recurring_backup_metric_labels = {
+        "volume": volume_name,
+        "recurring_job": RECURRING_JOB_NAME,
+    }
+    wait_for_metric_sum_on_all_nodes(client, core_api,
+                                     "longhorn_backup_actual_size_bytes",
+                                     recurring_backup_metric_labels,
+                                     recurring_backup_size)
+
+    wait_for_metric_sum_on_all_nodes(client, core_api,
+                                     "longhorn_backup_state",
+                                     recurring_backup_metric_labels,
+                                     3)
🛠️ Refactor suggestion

Consider enhancing the wait_for_metric_sum_on_all_nodes function

The function is well-implemented, but consider accepting RETRY_COUNTS and RETRY_INTERVAL as parameters to make the function more flexible. Example implementation:
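A minimal sketch of what that parameterized variant could look like; the keyword names retry_counts and retry_interval are illustrative, defaulting to the existing module constants:

def wait_for_metric_sum_on_all_nodes(client, core_api, metric_name,  # NOQA
                                     metric_labels, expected_value,
                                     retry_counts=RETRY_COUNTS,
                                     retry_interval=RETRY_INTERVAL):
    # Poll until the summed metric matches, retrying on assertion failures.
    for _ in range(retry_counts):
        time.sleep(retry_interval)
        try:
            check_metric_sum_on_all_nodes(client, core_api, metric_name,
                                          metric_labels, expected_value)
            return
        except AssertionError:
            continue
    # Final attempt outside the loop so a timeout raises the real
    # assertion error to the caller.
    check_metric_sum_on_all_nodes(client, core_api, metric_name,
                                  metric_labels, expected_value)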
This implementation provides more flexibility and better handles long-running waits.