This repository has been archived by the owner on Nov 12, 2024. It is now read-only.

Commit

ruff
hsaeed3 committed Oct 1, 2024
1 parent 5d0228d commit d1764d7
Showing 4 changed files with 124 additions and 53 deletions.
60 changes: 32 additions & 28 deletions poetry.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "zyx"
version = "1.0.3"
version = "1.0.4"
description = "A hyper-fast, fun, quality-of-life focused & genuinely useful LLM toolkit. Inspired by Marvin-AI. Built on LiteLLM, Instructor & Qdrant."
authors = ["Hammad Saeed <hvmmad@gmail.com>"]
readme = "readme.md"
2 changes: 2 additions & 0 deletions zyx/client.py
@@ -463,6 +463,8 @@ def __init_client__(self):
Initializes the specified client library.
"""

+        print(self.provider)

if self.provider == "openai":
from openai import OpenAI

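The added print(self.provider) reads like leftover debug output. A minimal sketch of routing the same check through the package logger instead, assuming the get_logger helper that scrape.py imports below returns a standard logging.Logger:

from zyx.lib.utils.logger import get_logger

logger = get_logger("client")

# Inside Client.__init_client__, in place of the bare print (sketch only):
#     logger.debug("Initializing client for provider: %s", self.provider)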
113 changes: 89 additions & 24 deletions zyx/resources/completions/agents/scrape.py
@@ -5,7 +5,12 @@
from enum import Enum

from ....client import Client, InstructorMode, ToolType
+from ....resources.data.chunk import chunk
+from ....resources.completions.base.generate import generate
+from ....lib.types.document import Document
+from ....lib.utils.logger import get_logger

+logger = get_logger("scrape")


def web_search(query: str, max_results: Optional[int] = 5) -> List[Dict[str, Any]]:
@@ -54,15 +59,21 @@ class ScrapeWorkflow(BaseModel):
evaluation: Optional[StepResult] = None


+class QueryList(BaseModel):
+    queries: List[str]
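QueryList is the structured-output target for the generate() call added below; illustratively, it validates nothing more than a list of query strings:

# Illustrative values, not from the source:
ql = QueryList(queries=["solar efficiency records 2024", "grid-scale battery storage news"])
assert len(ql.queries) == 2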


def scrape(
-    query: str,
+    query: str,  # Single query input
+    num_queries: int = 5,  # Number of queries to generate
max_results: Optional[int] = 5,
workers: int = 5,
model: str = "gpt-4o-mini",
client: Literal["openai", "litellm"] = "openai",
api_key: Optional[str] = None,
base_url: Optional[str] = None,
mode: InstructorMode = "tool_call",
+    response_model: Optional[BaseModel] = None,
max_retries: int = 3,
temperature: float = 0.5,
run_tools: Optional[bool] = False,
@@ -73,10 +84,11 @@ def scrape(
**kwargs,
) -> Document:
"""
-    Scrapes the web for topics & content about a query, generates a well-written summary, and returns a Document object.
+    Scrapes the web for topics & content about multiple queries, generates a well-written summary, and returns a Document object.
Parameters:
-        query: The search query.
+        query: The initial search query.
+        num_queries: Number of queries to generate based on the initial query.
max_results: Maximum number of search results to process.
workers: Number of worker threads to use.
model: The model to use for completion.
@@ -89,30 +101,48 @@ def scrape(
run_tools: Whether to run tools for completion.
tools: The tools to use for completion.
Returns:
A Document object containing the summary and metadata.
"""
import threading
from bs4 import BeautifulSoup

-    client = Client(
+    completion_client = Client(
api_key=api_key, base_url=base_url, provider=client, verbose=verbose
)
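(Renaming the local Client instance to completion_client stops it from shadowing the client parameter, which is the provider string "openai" or "litellm" and is still passed to generate() and recorded in the Document metadata below.)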

-    workflow = ScrapeWorkflow(query=query)
+    # Generate multiple queries based on the initial query
+    query_list = generate(
+        target=QueryList,
+        instructions=f"Generate {num_queries} related search queries based on the initial query: '{query}'",
+        n=1,
+        model=model,
+        api_key=api_key,
+        base_url=base_url,
+        client=client,
+        verbose=verbose,
+    ).queries
+
+    workflow = ScrapeWorkflow(query=query)  # Use the initial query for workflow

if verbose:
print(f"Starting scrape for query: {query}")
print(f"Starting scrape for queries: {query_list}")

-    # Step 1: Use web_search() to get search results
-    workflow.current_step = ScrapingStep.SEARCH
-    search_results = web_search(query, max_results=max_results)
-    workflow.search_results = search_results
-    urls = [result["href"] for result in search_results if "href" in result]
-
-    if verbose:
-        print(f"Found {len(urls)} URLs")
+    all_search_results = []
+    all_urls = []
+
+    for query in query_list:
+        # Step 1: Use web_search() to get search results
+        workflow.current_step = ScrapingStep.SEARCH
+        search_results = web_search(query, max_results=max_results)
+        all_search_results.extend(search_results)
+        urls = [result["href"] for result in search_results if "href" in result]
+        all_urls.extend(urls)
+
+        if verbose:
+            print(f"Found {len(urls)} URLs for query: {query}")
+
+    workflow.search_results = all_search_results
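Two things are worth noting here. The loop rebinds query, so every later use of query, including the summary prompt and the Document metadata, sees the last generated query rather than the original. And related queries often return overlapping results, so a dedup pass before fetching would avoid redundant requests; a sketch, not in this commit:

# Order-preserving URL dedup (sketch; seen.add() returns None, so the
# condition keeps a URL only on its first appearance).
seen = set()
all_urls = [u for u in all_urls if not (u in seen or seen.add(u))]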

# Step 2: Define a function to fetch and parse content from a URL
def fetch_content(url: str) -> str:
@@ -121,8 +151,6 @@ def fetch_content(url: str) -> str:
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
-            # Extract text content from the page
-            # You might want to refine this to get more meaningful content
texts = soup.find_all(text=True)
visible_texts = filter(tag_visible, texts)
content = " ".join(t.strip() for t in visible_texts)
@@ -153,7 +181,7 @@ def tag_visible(element):
workflow.current_step = ScrapingStep.FETCH
contents = []
with ThreadPoolExecutor(max_workers=workers) as executor:
-        future_to_url = {executor.submit(fetch_content, url): url for url in urls}
+        future_to_url = {executor.submit(fetch_content, url): url for url in all_urls}
for future in future_to_url:
content = future.result()
if content:
@@ -168,13 +196,35 @@ def tag_visible(element):
workflow.current_step = ScrapingStep.SUMMARIZE
combined_content = "\n\n".join(contents)

-    # Optionally, you can chunk the content if it's too large
-    # For now, we'll assume it's manageable
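The commit imports chunk from resources.data.chunk but never calls it, and the comment deleted above had suggested chunking oversized content. Since chunk's signature isn't visible in this diff, a purely illustrative guard might look like:

# Illustrative stand-in for the imported-but-unused chunk() helper; the
# 48_000-character budget is an assumption, not from the source.
if len(combined_content) > 48_000:
    pieces = [combined_content[i:i + 48_000] for i in range(0, len(combined_content), 48_000)]
    combined_content = pieces[0]  # or summarize each piece and merge the summaries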
+    # Step 4.5: If Response Model is provided, return straight away
+    if response_model:
+        return completion_client.completion(
+            messages=[
+                {"role": "user", "content": "What is our current scraped content?"},
+                {"role": "assistant", "content": combined_content},
+                {
+                    "role": "user",
+                    "content": "Only extract the proper content from the response & append into the response model.",
+                },
+            ],
+            model=model,
+            response_model=response_model,
+            mode=mode,
+            max_retries=max_retries,
+            temperature=temperature,
+            run_tools=run_tools,
+            tools=tools,
+            parallel_tool_calls=parallel_tool_calls,
+            tool_choice=tool_choice,
+            verbose=verbose,
+            **kwargs,
+        )

# Step 5: Use the completion function to generate a summary
# Prepare the prompt
system_prompt = (
"You are an AI assistant that summarizes information gathered from multiple web pages. "
"Ensure that all links parsed are from reputable sources and do not infringe any issues. "
"Provide a comprehensive, well-written summary of the key points related to the following query."
)
user_prompt = f"Query: {query}\n\nContent:\n{combined_content}\n\nSummary:"
@@ -185,7 +235,7 @@ def tag_visible(element):
]

# Call the completion function
-    response = client.completion(
+    response = completion_client.completion(
messages=messages,
model=model,
mode=mode,
@@ -208,10 +258,11 @@
evaluation_prompt = (
f"Evaluate the quality and relevance of the following summary for the query: '{query}'\n\n"
f"Summary:\n{summary}\n\n"
"Ensure that all links parsed are from reputable sources and do not infringe any issues. "
"Provide an explanation of your evaluation and determine if the summary is successful or needs refinement."
)

-    evaluation_response = client.completion(
+    evaluation_response = completion_client.completion(
messages=[
{"role": "system", "content": "You are an expert evaluator of summaries."},
{"role": "user", "content": evaluation_prompt},
@@ -232,10 +283,11 @@
f"The previous summary for the query '{query}' needs improvement.\n\n"
f"Original summary:\n{summary}\n\n"
f"Evaluation feedback:\n{evaluation_response.explanation}\n\n"
"Ensure that all links parsed are from reputable sources and do not infringe any issues. "
"Please provide an improved and refined summary addressing the feedback."
)

-    refined_response = client.completion(
+    refined_response = completion_client.completion(
messages=[
{
"role": "system",
@@ -260,10 +312,10 @@
content=summary,
metadata={
"query": query,
"urls": urls,
"urls": all_urls,
"model": model,
"client": client,
"workflow": workflow.dict(),
"workflow": workflow.model_dump(),
},
)
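(workflow.model_dump() is the Pydantic v2 replacement for workflow.dict(), which v2 deprecates.)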

@@ -274,9 +326,22 @@
if __name__ == "__main__":
result_document = scrape(
query="Latest advancements in renewable energy",
+        num_queries=5,
max_results=5,
workers=5,
verbose=True,
)
print("Final Document:")
print(result_document.content)

+    class YoutubeLinks(BaseModel):
+        links: list[str] = Field(description="A list of youtube links")
+
+    result_document = scrape(
+        query="Latest advancements in renewable energy",
+        num_queries=5,
+        max_results=5,
+        workers=5,
+        verbose=True,
+        response_model=YoutubeLinks,
+    )
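One caveat with the new __main__ example: YoutubeLinks uses Field, which this diff never shows being imported. Assuming the module header only imports BaseModel, the example would need:

# Assumption: the file currently imports only BaseModel; without Field the
# YoutubeLinks class raises NameError at definition time.
from pydantic import BaseModel, Field

class YoutubeLinks(BaseModel):
    links: list[str] = Field(description="A list of YouTube links")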
