diff --git a/search_gov_crawler/requirements.txt b/search_gov_crawler/requirements.txt index a4c4f75..32df7f6 100644 --- a/search_gov_crawler/requirements.txt +++ b/search_gov_crawler/requirements.txt @@ -10,3 +10,5 @@ scrapy-playwright==0.0.41 scrapyd==1.5.0 scrapyd-client==2.0.0 scrapydweb @ git+https://github.com/GSA/searchgov-scrapydweb +spidermon [monitoring] == 1.22.0 + diff --git a/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/base.jinja b/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/base.jinja new file mode 100644 index 0000000..656c94b --- /dev/null +++ b/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/base.jinja @@ -0,0 +1,27 @@ + + + + + + + + + + +
+
+ {% block page_content %}{% endblock %} +
+ +
+ + + + diff --git a/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/email.css b/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/email.css new file mode 100644 index 0000000..2f13050 --- /dev/null +++ b/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/email.css @@ -0,0 +1,87 @@ +/*-------------------------- + Reset +--------------------------*/ +html{font-size:100%} +button,input,select,textarea{margin:0;font-size:100%;vertical-align:middle} +button,input{line-height:normal} +textarea{overflow:auto;vertical-align:top} +body{margin:0;font-family:"Segoe UI","Helvetica Neue",Helvetica,Arial,sans-serif;font-size:13px;line-height:20px;color:#333;background-color:#fff} +a{color:#0063ca;text-decoration:none} +a:hover{color:#003e7e;text-decoration:underline} +p{margin:0 0 10px} +small{font-size:85%} +strong{font-weight:bold} +em{font-style:italic} +cite{font-style:normal} +h1,h2,h3,h4,h5,h6{margin:10px 0;font-family:inherit;font-weight:bold;line-height:1;color:inherit;} +h1 small,h2 small,h3 small,h4 small,h5 small,h6 small{font-weight:normal;line-height:1;color:#999} +h1{font-size:36px;line-height:40px} +h2{font-size:30px;line-height:40px} +h3{font-size:24px;line-height:40px} +h4{font-size:18px;line-height:20px} +h5{font-size:14px;line-height:20px} +h6{font-size:12px;line-height:20px} +h1 small{font-size:24px} +h2 small{font-size:18px} +h3 small{font-size:14px} +h4 small{font-size:14px} +ul,ol{padding:0;margin:0 0 10px 25px} +ul ul,ul ol,ol ol,ol ul{margin-bottom:0} +li{line-height:20px} +hr{margin:20px 0;border:0;border-top:1px solid #eee;border-bottom:1px solid #fff} +td{margin:0;padding:0;} +code,pre{padding:0 3px 2px;font-family:Monaco,Menlo,Consolas,"Courier New",monospace;font-size:11px;color:#333;border-radius:3px} +code{padding:2px 4px;color:#d14;background-color:#f7f7f9;border:1px solid #e1e1e8} +pre{display:block;padding:9.5px;margin:0 0 
10px;font-size:12px;line-height:20px;white-space:pre;white-space:pre-wrap;background-color:#f5f5f5;border:1px solid #ccc;border:1px solid rgba(0,0,0,0.15);border-radius:4px} +pre code{padding:0;color:inherit;background-color:transparent;border:0} +table{max-width:100%;background-color:transparent;border-collapse:collapse;border-spacing:0} + +/*-------------------------- + Buttons +--------------------------*/ +.btn{background:#999;border-radius:6px;border:0;color:#fff;cursor:pointer;display:inline-block;margin:2px 0;padding:12px 30px 14px;} +.btn{font-size: 16px;} +.btn:hover{background:#888;text-decoration: none;color:#fff;} +.btn:active, +.btn:focus{background:#777;text-decoration: none;color:#fff;} +.btn-blue{background-color:#09d} +.btn-blue:hover{background-color:#0ae} +.btn-blue:active, +.btn-blue:focus{background-color:#09d} +.btn-green{background-color:#2b4} +.btn-green:hover{background-color:#3c5} +.btn-green:active, +.btn-green:focus{background-color:#3c5} +.btn-red{background-color:#c22} +.btn-red:hover{background-color:#d33} +.btn-red:active, +.btn-red:focus{background-color:#d33} +.btn-sm{border-radius:4px;padding:8px 10px;font-size: 14px;} + +/*-------------------------- + Alerts +--------------------------*/ +.alert {padding: 15px;margin-bottom: 20px;border: 1px solid transparent;border-radius: 4px;} +.alert-warning {color: #8a6d3b;background-color: #fcf8e3;border-color: #faebcc;} + +/*-------------------------- + Labels +--------------------------*/ +.label {border-radius: 3px;font-size: 12px;font-weight: bold;color: #ffffff;padding: 2px 6px 2px 6px;line-height: 20px;height: 20px;} +.label-gray {background-color: #999;} +.label-blue {background-color: #3a87ad;} + +/*-------------------------- + Badge +--------------------------*/ +.badge {padding: 2px 9px 2px 9px;border-radius: 9px;font-size: 12px;font-weight: 700;line-height: 14px;color: #fff;vertical-align: baseline;white-space: nowrap;background-color: #999;text-align: center;} +.badge-green 
{background-color: #468847} +.badge-red {background-color: #b94a48} +/*-------------------------- + Icons +--------------------------*/ +.icon,.icon-big {display:inline-block;} +.icon {width:34px;height:34px;} +.icon-big {width:140px;height:140px;} + + diff --git a/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/medium.jinja b/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/medium.jinja new file mode 100644 index 0000000..2e692d3 --- /dev/null +++ b/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/medium.jinja @@ -0,0 +1,3 @@ +{% extends 'reports/email/bases/report/base.jinja' %} + +{% block page_class %}report-medium{% endblock %} diff --git a/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/report.css b/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/report.css new file mode 100644 index 0000000..bd1e39d --- /dev/null +++ b/search_gov_crawler/search_gov_spiders/actions/reports/email/bases/report/report.css @@ -0,0 +1,27 @@ +/*-------------------------- + Page +--------------------------*/ +body {background-color: #eee;font-size: 14px;} +hr {margin: 0;padding: 0;} +tr.odd td {background-color: #f9f9f9;} + +/*-------------------------- + Report +--------------------------*/ +table.report-container {width: 100%;} +table.report-container td {padding: 40px 20px;} +.report {background-color: #ffffff;margin: 0 auto;border-radius: 6px;box-shadow: 0 2px 10px 0 rgba(0,0,0,0.3);text-align: left;} +.report-big {width:1000px;} +.report-medium {width:730px;} +.report-small {width:500px;} +.report-title {background-color: #444;color: #fff;border-top-left-radius: 6px;border-top-right-radius: 6px;} +.report-title td {margin: 0;vertical-align: top;} +.report-title td.title {padding: 16px 30px;} +.report-title td.title h1 {margin: 0;padding: 0;} +.report-title td.button {padding: 18px 16px 0 0;} +.report-title td.button {text-align: right;width: 130px;} 
+.report-section {margin: 0;padding: 25px 30px;} +.report-section h2 {margin: 0 0 20px 0;padding: 0 0 12px 0;line-height: 20px;border-bottom: 1px solid #f4f4f4;} +.report-section h3 {margin: 25px 0 5px 0;line-height: 24px;} +.report-section h4 {margin: 0 0 2px 0;} +.report-footer {text-align: center;padding: 20px 10px 5px 10px;color: #cdcdcd;font-size: 14px;} \ No newline at end of file diff --git a/search_gov_crawler/search_gov_spiders/actions/results.css b/search_gov_crawler/search_gov_spiders/actions/results.css new file mode 100644 index 0000000..0e44d0b --- /dev/null +++ b/search_gov_crawler/search_gov_spiders/actions/results.css @@ -0,0 +1,154 @@ +/*-------------------------- + Summary +--------------------------*/ +table.table-summary td { + margin: 0; + padding: 0; + vertical-align: top; +} +table.table-summary td.values { + width: 470px; + /*background-color: #0ff;/**/ + padding: 18px 0; +} +table.table-summary td.banner { + width: 200px; + text-align: center; + font-size: 30px; + font-weight: bold; + /*background-color: #ff0;/**/ + padding-bottom: 20px; +} +table.table-summary-values td.name { + padding: 2px 0; + vertical-align: middle; + text-align: right; + width: 160px; + font-weight: bold; +} +table.table-summary-values td.value { + padding: 1px 0 0 17px; + width: 310px; + vertical-align: middle; +} +table.table-summary-values td.value .label { + font-size: 14px; +} +table.table-summary-values td.separator { + padding: 10px 0; +} + +/*-------------------------- + Monitors +--------------------------*/ +table.table-monitor { + margin: 10px 0 30px 0; + border-bottom: 1px solid #ddd; +} +table.table-monitor tr { + height: 42px; +} +table.table-monitor td { + padding: 10px 0px; + margin: 0; + border-top: 1px solid #ddd; +} +table.table-monitor td.name { + font-size: 18px; + line-height: 20px; + font-weight: 700; + color: #666; + width: 544px; + padding-left:15px +} +table.table-monitor td.name a { + text-decoration: underline; +} +table.table-monitor 
td.status { + text-align: center; + width: 80px; +} +table.table-monitor td.icon { + padding: 4px 5px 4px 5px; + text-align: center; + width: 46px; +} + +/*-------------------------- + Labels +--------------------------*/ +.label-FAIL { + background-color: #b94a48; +} +.label-ERROR { + background-color: #ff0000; +} + +/*-------------------------- + Icons +--------------------------*/ +.icon-OK { + background-image:url(http://email-templates.scrapinghub.com.s3.amazonaws.com/spidermon/icon_ok_34.png); +} +.icon-FAIL { + background-image:url(http://email-templates.scrapinghub.com.s3.amazonaws.com/spidermon/icon_fail_34.png); +} +.icon-ERROR { + background-image:url(http://email-templates.scrapinghub.com.s3.amazonaws.com/spidermon/icon_error_34.png); +} +.icon-big-OK { + background-image:url(http://email-templates.scrapinghub.com.s3.amazonaws.com/spidermon/icon_ok_140.png); +} +.icon-big-FAIL { + background-image:url(http://email-templates.scrapinghub.com.s3.amazonaws.com/spidermon/icon_fail_140.png); +} + +/*-------------------------- + OK / FAIL / ERROR +--------------------------*/ +.text-OK { + color: #5cb85c; +} +a.link-FAIL:hover, +a.link-FAIL:visited, +a.link-FAIL:active, +a.link-FAIL, +.text-FAIL { + color: #b94a48; +} +a.link-ERROR:hover, +a.link-ERROR:visited, +a.link-ERROR:active, +a.link-ERROR, +.text-ERROR { + color: #ff0000; +} + +/*-------------------------- + Monitor Failures +--------------------------*/ +table.report-failure { + margin: 30px 0 30px 0; +} +table.report-failure td { + margin: 0; + padding: 0; +} +table.report-failure td.label { + text-align: right; +} +.description { + color: #888; + font-size: 13px; + margin: 0 0 2px 0; +} +table.report-failure pre { + margin-top: 8px; +} +.highlighted { + display: inline-block; + background-color: #fff5b9; + padding: 1px 6px; + font-style: italic; + margin: 5px 0 2px 0; +} diff --git a/search_gov_crawler/search_gov_spiders/actions/results.jinja 
b/search_gov_crawler/search_gov_spiders/actions/results.jinja new file mode 100644 index 0000000..7299b3b --- /dev/null +++ b/search_gov_crawler/search_gov_spiders/actions/results.jinja @@ -0,0 +1,303 @@ +{% extends 'reports/email/bases/report/medium.jinja' %} + +{#=============================================== + render_dash_link +===============================================#} +{% macro render_dash_link() %} + + See job + +{% endmacro %} + +{#=============================================== + render_header_data_value +===============================================#} +{% macro render_header_data_value(value, classes=None) %} + {% if value %} + {% if classes %} + {{ value }} + {% else %} + {{ value }} + {% endif %} + {% endif %} +{% endmacro %} + +{#=============================================== + render_header_data +===============================================#} +{% macro render_header_data(name, value, classes=None) %} + + {{ name }}: + + {% if value %} + {% if 'Job' == name and show_job_button %} + {{ render_header_data_job_link(value, classes) }} + {% else %} + {{ render_header_data_value(value, classes) }} + {% endif %} + {% else %} + - + {% endif %} + + +{% endmacro %} + +{#=============================================== + render_header_data_job_link +===============================================#} +{% macro render_header_data_job_link(value, classes=None) %} + {% if classes %} + + + {{ value }} + + + {% else %} + + {{ value }} + + {% endif %} +{% endmacro %} + +{#=============================================== + render_header_data_list +===============================================#} +{% macro render_header_data_list(name, values, classes=None) %} + + {{ name }}: + + {% if values %} + {% for value in values %}{{ render_header_data_value(value, classes) }}{% endfor %} + {% else %} + - + {% endif %} + + +{% endmacro %} + +{#=============================================== + render_header_data_separator 
+===============================================#} +{% macro render_header_data_separator() %} + +
+ +{% endmacro %} + +{#=============================================== + page_styles +===============================================#} +{% block page_styles %} + {% include 'results.css' %} +{% endblock %} + +{#=============================================== + page_content +===============================================#} +{% block page_content %} + {#----------------------------------------------- + TITLE + ------------------------------------------------#} + + + + {% if data.job and show_job_button %} + + {% endif %} + +
+

{{ report_title or 'Report Title' }}

+
+ {{ render_dash_link() }} +
+ {#----------------------------------------------- + SUMMARY + ------------------------------------------------#} +
+ + + {#------------------------ + VALUES + ------------------------#} + + + +
+ + {#------------------------ + JOB OR SPIDER + ------------------------#} + {% set items_count = data.stats.get('item_scraped_count', 0) %} + {% set requests_count = data.stats.get('downloader/request_count', 0) %} + {% set stats_count = data.stats|length %} + {% set passed_monitors_count = result.monitors_passed_results|length %} + {% set failed_monitors_count = result.monitors_failed_results|length %} + {#------------------------ + JOB + ------------------------#} + {% if data.job %} + {% set is_script = data.job.metadata.get('spider').startswith('py:') %} + {% set job_finished_time = data.job.metadata.get('finished_time', 0) %} + {% if not job_finished_time %} + {% set job_finished_time = datetime.datetime.utcnow().strftime('%s')|int*1000 %} + {% endif %} + {% set job_running_time = data.job.metadata.get('running_time', 0) %} + {% set running_time = job_finished_time - job_running_time %} + + {% if is_script %} + {{ render_header_data('Script', data.job.metadata.get('spider')[3:], 'label label-blue') }} + {% else %} + {{ render_header_data('Spider', data.job.metadata.get('spider'), 'label label-blue') }} + {{ render_header_data('Version', data.job.metadata.get('version')) }} + {{ render_header_data('Items', items_count, "badge badge-green") }} + {{ render_header_data('Requests', requests_count, "badge") }} + {% endif %} + {% if show_log_count %} + {% set logs_count = data.job.logs.list()|list|length %} + {% set log_errors = data.job.logs|get_log_errors %} + {% set log_errors_count = log_errors|length %} + {{ render_header_data('Errors', log_errors_count, "badge badge-red") }} + {{ render_header_data('Logs', logs_count, "badge") }} + {% else %} + {% set log_errors_count = data.stats.get('log_count/ERROR', 0) %} + {{ render_header_data('Errors', log_errors_count, "badge badge-red") }} + {% endif %} + {{ render_header_data('Stats', stats_count, "badge") }} + {{ render_header_data('Running Time', running_time|format_time) }} + {#------------------------ + SPIDER + 
------------------------#} + {% elif data.spider %} + {% set log_errors_count = data.stats.get('log_count/ERROR', 0) %} + {% set running_time = data.stats.get('finish_time',datetime.datetime.now(tz=datetime.UTC)) - data.stats.get('start_time') %} + + {{ render_header_data('Spider', data.spider.name, 'label label-blue') }} + {{ render_header_data('Items', items_count, "badge badge-green") }} + {{ render_header_data('Requests', requests_count, "badge") }} + {{ render_header_data('Errors', log_errors_count, "badge badge-red") }} + {{ render_header_data('Stats', stats_count, "badge") }} + {{ render_header_data('Running Time', running_time|format_time) }} + {% endif %} + {#------------------------ + BOTH + ------------------------#} + {% if passed_monitors_count or failed_monitors_count %} + {% set monitors = [render_header_data_value(passed_monitors_count, "badge"), render_header_data_value(failed_monitors_count, "badge badge-red")] %} + {{ render_header_data_list('Monitors', monitors) }} + {% else %} + {{ render_header_data('Monitors', None) }} + {% endif %} + {#------------------------ + JOB ONLY + ------------------------#} + {% if data.job %} + {{ render_header_data_separator() }} + {{ render_header_data('Job', data.job.key, classes='label label-blue') }} + {{ render_header_data('State', data.job.metadata.get('state')) }} + {{ render_header_data('Outcome', data.job.metadata.get('close_reason')) }} + {{ render_header_data('Priority', data.job.metadata.get('priority')) }} + {{ render_header_data('Bot Group', data.job.metadata.get('botgroup')) }} + {{ render_header_data_list('Tags', data.job.metadata.get('tags'), 'label label-gray') }} + {% endif %} +
+
+
+ {#----------------------------------------------- + MONITORS + ------------------------------------------------#} +
+
+

Monitors

+ {% if result.monitor_results %} + {% for group in result.monitor_results|groupby('monitor.monitor_name') %} +

{{ group.grouper }}

+ + {% for result in group.list %} + {% if loop.first and result.monitor.monitor_description %} +
{{ result.monitor.monitor_description }}
+ {% endif %} + + + + + + {% endfor %} +
+ {% if result.status != 'OK' %} + {{ result.monitor.method_name }} + {% else %} + {{ result.monitor.method_name }} + {% endif %} + + {% if result.status != 'OK' %} + {{ result.status }} + {% endif %} + + +
+ {% endfor %} + {% else %} +
+ No monitors defined... +
+ {% endif %} +
+ {#----------------------------------------------- + MONITOR FAILURES + ------------------------------------------------#} + {% if result.monitors_failed_results %} +
+
+

Monitor Failures

+ {% for group in result.monitors_failed_results|groupby('monitor.monitor_name') %} + {% for result in group.list %} + + + + + + + + + +
+

{{ result.monitor.name }}

+
+ {{ result.status }} +
+ {% if result.monitor.method_description %} +
{{ result.monitor.method_description }}
+ {% endif %} + {% if result.reason %} +
{{ result.reason }}
+ {% endif %} +
{{ result.error|indent }}
+
+ + {% endfor %} + {% endfor %} +
+ {% endif %} + {#----------------------------------------------- + STATS + ------------------------------------------------#} +
+
+

Stats

+ {% if data.stats %} +
{{ data.stats|pprint }}
+ {% else %} +
+ No stats available... +
+ {% endif %} +
+{% endblock %}
diff --git a/search_gov_crawler/search_gov_spiders/monitors.py b/search_gov_crawler/search_gov_spiders/monitors.py
new file mode 100644
index 0000000..260dd94
--- /dev/null
+++ b/search_gov_crawler/search_gov_spiders/monitors.py
@@ -0,0 +1,37 @@
+"""Spidermon monitor suite and report action for the search.gov spiders."""
+
+from pathlib import Path
+
+from spidermon import MonitorSuite
+from spidermon.contrib.actions.email.smtp import SendSmtpEmail
+from spidermon.contrib.actions.reports.files import CreateFileReport
+from spidermon.contrib.scrapy.monitors.monitors import (
+    ItemCountMonitor,
+    PeriodicExecutionTimeMonitor,
+    PeriodicItemCountMonitor,
+    UnwantedHTTPCodesMonitor,
+)
+
+
+class CreateCustomFileReport(CreateFileReport):
+    """File report action whose template search path is the sibling ``actions`` directory."""
+
+    template_paths = [Path(__file__).parent / "actions"]
+
+
+class PeriodicMonitorSuite(MonitorSuite):
+    """Monitor suite referenced by ``SPIDERMON_PERIODIC_MONITORS`` in settings.py."""
+
+    # Monitors evaluated each time the suite runs.
+    monitors = [
+        ItemCountMonitor,
+        UnwantedHTTPCodesMonitor,
+        PeriodicItemCountMonitor,
+        PeriodicExecutionTimeMonitor,
+    ]
+
+    # Actions for failed monitors: create the file report and send an SMTP e-mail.
+    monitors_failed_actions = [
+        CreateCustomFileReport,
+        SendSmtpEmail,
+    ]
diff --git a/search_gov_crawler/search_gov_spiders/settings.py b/search_gov_crawler/search_gov_spiders/settings.py
index 4e95642..3b91404 100644
--- a/search_gov_crawler/search_gov_spiders/settings.py
+++ b/search_gov_crawler/search_gov_spiders/settings.py
@@ -7,6 +7,9 @@
 # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
 # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
 
+import os
+from datetime import datetime
+
 # Settings for json logging
 LOG_ENABLED = False
 JSON_LOGGING_ENABLED = True
@@ -69,6 +72,7 @@
 EXTENSIONS = {
     "search_gov_spiders.extensions.json_logging.JsonLogging": -1,
     "scrapy.extensions.closespider.CloseSpider": 500,
+    "spidermon.contrib.scrapy.extensions.Spidermon": 600,
 }
 
 CLOSESPIDER_TIMEOUT_NO_ITEM = 50
@@ -111,3 +115,40 @@
     "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
     "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
 }
+
+# Helper values used to build the Spidermon settings below.
+now = datetime.now()
+# NOTE(review): isoformat() output contains ':' characters, which are not
+# valid in Windows filenames -- confirm the platforms this runs on.
+date_time = now.isoformat()
+dirname = os.path.dirname(__file__)
+body_html_template = os.path.join(dirname, 'actions', 'results.jinja')
+
+# Spidermon: the enabled flag and e-mail/SMTP values come from the environment.
+SPIDERMON_ENABLED = os.environ.get('SPIDERMON_ENABLED', 'False')
+SPIDERMON_MIN_ITEMS = 1000
+# NOTE(review): this value is the interval of the periodic suite registered
+# at the bottom of this file; 1 second looks very frequent -- confirm intent.
+SPIDERMON_TIME_INTERVAL = 1  # time is in seconds
+SPIDERMON_ITEM_COUNT_INCREASE = 100
+SPIDERMON_MAX_EXECUTION_TIME = 86400
+SPIDERMON_UNWANTED_HTTP_CODES_MAX_COUNT = 10
+SPIDERMON_UNWANTED_HTTP_CODES = [400, 407, 429, 500, 502, 503, 504, 523, 540, 541]
+SPIDERMON_REPORT_TEMPLATE = "results.jinja"
+SPIDERMON_BODY_HTML_TEMPLATE = body_html_template
+SPIDERMON_REPORT_CONTEXT = {"report_title": "Spidermon File Report"}
+SPIDERMON_REPORT_FILENAME = f"{date_time}_spidermon_file_report.html"
+SPIDERMON_EMAIL_SUBJECT = "Spidermon report"
+SPIDERMON_EMAIL_SENDER = os.environ.get('SPIDERMON_EMAIL_SENDER')
+SPIDERMON_EMAIL_TO = os.environ.get('SPIDERMON_EMAIL_TO')
+SPIDERMON_SMTP_HOST = os.environ.get('SPIDERMON_SMTP_HOST')
+# NOTE(review): os.environ values are strings, so the port is a str (or None) here.
+SPIDERMON_SMTP_PORT = os.environ.get('SPIDERMON_SMTP_PORT')
+SPIDERMON_SMTP_USER = os.environ.get('SPIDERMON_SMTP_USER')
+SPIDERMON_SMTP_PASSWORD = os.environ.get('SPIDERMON_SMTP_PASSWORD')
+SPIDERMON_SMTP_ENFORCE_SSL = False
+SPIDERMON_SMTP_ENFORCE_TLS = True
+
+SPIDERMON_PERIODIC_MONITORS = {
+    'search_gov_spiders.monitors.PeriodicMonitorSuite': SPIDERMON_TIME_INTERVAL,
+}