forked from hirowatari-s/ExploreSearchSystem
-
Notifications
You must be signed in to change notification settings - Fork 0
/
fetch_arxiv.py
48 lines (41 loc) · 1.33 KB
/
fetch_arxiv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import arxiv
import pandas as pd
import datetime
def fetch_search_result(search_query, within_5years=False):
    """Search arXiv for ``search_query`` and return the hits as a DataFrame.

    Args:
        search_query: Query string handed to ``arxiv.Search``; it is also
            repeated in the ``keyword`` column of every returned row.
        within_5years: When true, skip results whose ``published`` timestamp
            is older than 5 * 365 days before the current UTC time. Three
            times as many results are requested from arXiv in this mode to
            leave room for the ones that get filtered out.

    Returns:
        A ``pandas.DataFrame`` with at most 50 rows and the columns
        ``keyword``, ``site_name`` (result title), ``URL`` (result entry id),
        ``snippet`` (result summary with newlines replaced by spaces),
        ``ranking`` (1-based position among the kept results) and ``year``
        (year of the ``published`` timestamp).
    """
    max_results = 50
    # "Five years" is approximated as 5 * 365 days. The cutoff is built as a
    # timezone-aware UTC datetime, presumably because result.published is
    # timezone-aware too (comparing naive with aware datetimes would raise).
    five_years_ago = (
        datetime.datetime.now(datetime.timezone.utc)
        - datetime.timedelta(days=5 * 365)
    )

    # Over-fetch when filtering by date so that enough recent papers remain.
    search = arxiv.Search(
        query=search_query,
        max_results=max_results * (3 if within_5years else 1),
    )

    titles = []
    absts = []
    urls = []
    years = []
    # NOTE(review): Search.results() appears to be deprecated in newer
    # releases of the arxiv package in favour of Client().results(search) --
    # confirm against the installed version before switching.
    for result in search.results():
        if within_5years and result.published < five_years_ago:
            continue
        titles.append(result.title)
        absts.append(result.summary.replace('\n', ' '))
        urls.append(result.entry_id)
        years.append(result.published.year)
        # Stop as soon as enough rows are collected: anything beyond
        # max_results would be discarded anyway, and pulling further items
        # from the result iterator can cost additional API requests.
        if len(titles) >= max_results:
            break

    num_results = len(titles)
    return pd.DataFrame(data=dict(
        keyword=[search_query] * num_results,
        site_name=titles,
        URL=urls,
        snippet=absts,
        ranking=list(range(1, num_results + 1)),
        year=years,
    ))
if __name__ == '__main__':
    # Manual smoke test: read a query from stdin, fetch the date-filtered
    # results, report how long the fetch took and dump them to "<query>.csv".
    import time

    query = input("> ")

    start = time.time()
    results = fetch_search_result(query, True)
    duration = time.time() - start

    print(f"duration: {duration}s")
    results.to_csv(f"{query}.csv")