This repository has been archived by the owner on Nov 12, 2024. It is now read-only.

Commit

ruff
hsaeed3 committed Oct 1, 2024
1 parent 5d0228d commit d1764d7
Showing 4 changed files with 124 additions and 53 deletions.
60 changes: 32 additions & 28 deletions poetry.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "zyx"
version = "1.0.3"
version = "1.0.4"
description = "A hyper-fast, fun, quality-of-life focused & genuinely useful LLM toolkit. Inspired by Marvin-AI. Built on LiteLLM, Instructor & Qdrant."
authors = ["Hammad Saeed <hvmmad@gmail.com>"]
readme = "readme.md"
2 changes: 2 additions & 0 deletions zyx/client.py
@@ -463,6 +463,8 @@ def __init_client__(self):
Initializes the specified client library.
"""

+        print(self.provider)

if self.provider == "openai":
from openai import OpenAI

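The added print(self.provider) reads like leftover debug output. A minimal sketch of routing the same check through the package logger instead, assuming the get_logger helper that scrape.py imports below returns a standard logging.Logger:

from zyx.lib.utils.logger import get_logger

logger = get_logger("client")

# Inside Client.__init_client__, in place of the bare print (sketch only):
#     logger.debug("Initializing client for provider: %s", self.provider)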
113 changes: 89 additions & 24 deletions zyx/resources/completions/agents/scrape.py
@@ -5,7 +5,12 @@
from enum import Enum

from ....client import Client, InstructorMode, ToolType
+from ....resources.data.chunk import chunk
+from ....resources.completions.base.generate import generate
+from ....lib.types.document import Document
+from ....lib.utils.logger import get_logger

+logger = get_logger("scrape")


def web_search(query: str, max_results: Optional[int] = 5) -> List[Dict[str, Any]]:
@@ -54,15 +59,21 @@ class ScrapeWorkflow(BaseModel):
evaluation: Optional[StepResult] = None


+class QueryList(BaseModel):
+    queries: List[str]
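QueryList is the structured-output target for the generate() call added below; illustratively, it validates nothing more than a list of query strings:

# Illustrative values, not from the source:
ql = QueryList(queries=["solar efficiency records 2024", "grid-scale battery storage news"])
assert len(ql.queries) == 2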


def scrape(
-    query: str,
+    query: str,  # Single query input
+    num_queries: int = 5,  # Number of queries to generate
max_results: Optional[int] = 5,
workers: int = 5,
model: str = "gpt-4o-mini",
client: Literal["openai", "litellm"] = "openai",
api_key: Optional[str] = None,
base_url: Optional[str] = None,
mode: InstructorMode = "tool_call",
+    response_model: Optional[BaseModel] = None,
max_retries: int = 3,
temperature: float = 0.5,
run_tools: Optional[bool] = False,
@@ -73,10 +84,11 @@ def scrape(
**kwargs,
) -> Document:
"""
-    Scrapes the web for topics & content about a query, generates a well-written summary, and returns a Document object.
+    Scrapes the web for topics & content about multiple queries, generates a well-written summary, and returns a Document object.
Parameters:
-        query: The search query.
+        query: The initial search query.
+        num_queries: Number of queries to generate based on the initial query.
max_results: Maximum number of search results to process.
workers: Number of worker threads to use.
model: The model to use for completion.
@@ -89,30 +101,48 @@ def scrape(
run_tools: Whether to run tools for completion.
tools: The tools to use for completion.
Returns:
A Document object containing the summary and metadata.
"""
import threading
from bs4 import BeautifulSoup

-    client = Client(
+    completion_client = Client(
api_key=api_key, base_url=base_url, provider=client, verbose=verbose
)
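(Renaming the local Client instance to completion_client stops it from shadowing the client parameter, which is the provider string "openai" or "litellm" and is still passed to generate() and recorded in the Document metadata below.)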

-    workflow = ScrapeWorkflow(query=query)
+    # Generate multiple queries based on the initial query
+    query_list = generate(
+        target=QueryList,
+        instructions=f"Generate {num_queries} related search queries based on the initial query: '{query}'",
+        n=1,
+        model=model,
+        api_key=api_key,
+        base_url=base_url,
+        client=client,
+        verbose=verbose,
+    ).queries
+
+    workflow = ScrapeWorkflow(query=query)  # Use the initial query for workflow

if verbose:
print(f"Starting scrape for query: {query}")
print(f"Starting scrape for queries: {query_list}")

-    # Step 1: Use web_search() to get search results
-    workflow.current_step = ScrapingStep.SEARCH
-    search_results = web_search(query, max_results=max_results)
-    workflow.search_results = search_results
-    urls = [result["href"] for result in search_results if "href" in result]
-
-    if verbose:
-        print(f"Found {len(urls)} URLs")
+    all_search_results = []
+    all_urls = []
+
+    for query in query_list:
+        # Step 1: Use web_search() to get search results
+        workflow.current_step = ScrapingStep.SEARCH
+        search_results = web_search(query, max_results=max_results)
+        all_search_results.extend(search_results)
+        urls = [result["href"] for result in search_results if "href" in result]
+        all_urls.extend(urls)
+
+        if verbose:
+            print(f"Found {len(urls)} URLs for query: {query}")
+
+    workflow.search_results = all_search_results
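Two things are worth noting here. The loop rebinds query, so every later use of query, including the summary prompt and the Document metadata, sees the last generated query rather than the original. And related queries often return overlapping results, so a dedup pass before fetching would avoid redundant requests; a sketch, not in this commit:

# Order-preserving URL dedup (sketch; seen.add() returns None, so the
# condition keeps a URL only on its first appearance).
seen = set()
all_urls = [u for u in all_urls if not (u in seen or seen.add(u))]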

# Step 2: Define a function to fetch and parse content from a URL
def fetch_content(url: str) -> str:
@@ -121,8 +151,6 @@ def fetch_content(url: str) -> str:
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
-            # Extract text content from the page
-            # You might want to refine this to get more meaningful content
texts = soup.find_all(text=True)
visible_texts = filter(tag_visible, texts)
content = " ".join(t.strip() for t in visible_texts)
@@ -153,7 +181,7 @@ def tag_visible(element):
workflow.current_step = ScrapingStep.FETCH
contents = []
with ThreadPoolExecutor(max_workers=workers) as executor:
-        future_to_url = {executor.submit(fetch_content, url): url for url in urls}
+        future_to_url = {executor.submit(fetch_content, url): url for url in all_urls}
for future in future_to_url:
content = future.result()
if content:
@@ -168,13 +196,35 @@ def tag_visible(element):
workflow.current_step = ScrapingStep.SUMMARIZE
combined_content = "\n\n".join(contents)

-    # Optionally, you can chunk the content if it's too large
-    # For now, we'll assume it's manageable
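The commit imports chunk from resources.data.chunk but never calls it, and the comment deleted above had suggested chunking oversized content. Since chunk's signature isn't visible in this diff, a purely illustrative guard might look like:

# Illustrative stand-in for the imported-but-unused chunk() helper; the
# 48_000-character budget is an assumption, not from the source.
if len(combined_content) > 48_000:
    pieces = [combined_content[i:i + 48_000] for i in range(0, len(combined_content), 48_000)]
    combined_content = pieces[0]  # or summarize each piece and merge the summaries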
+    # Step 4.5: If Response Model is provided, return straight away
+    if response_model:
+        return completion_client.completion(
+            messages=[
+                {"role": "user", "content": "What is our current scraped content?"},
+                {"role": "assistant", "content": combined_content},
+                {
+                    "role": "user",
+                    "content": "Only extract the proper content from the response & append into the response model.",
+                },
+            ],
+            model=model,
+            response_model=response_model,
+            mode=mode,
+            max_retries=max_retries,
+            temperature=temperature,
+            run_tools=run_tools,
+            tools=tools,
+            parallel_tool_calls=parallel_tool_calls,
+            tool_choice=tool_choice,
+            verbose=verbose,
+            **kwargs,
+        )

# Step 5: Use the completion function to generate a summary
# Prepare the prompt
system_prompt = (
"You are an AI assistant that summarizes information gathered from multiple web pages. "
"Ensure that all links parsed are from reputable sources and do not infringe any issues. "
"Provide a comprehensive, well-written summary of the key points related to the following query."
)
user_prompt = f"Query: {query}\n\nContent:\n{combined_content}\n\nSummary:"
@@ -185,7 +235,7 @@ def tag_visible(element):
]

# Call the completion function
-    response = client.completion(
+    response = completion_client.completion(
messages=messages,
model=model,
mode=mode,
@@ -208,10 +258,11 @@
evaluation_prompt = (
f"Evaluate the quality and relevance of the following summary for the query: '{query}'\n\n"
f"Summary:\n{summary}\n\n"
"Ensure that all links parsed are from reputable sources and do not infringe any issues. "
"Provide an explanation of your evaluation and determine if the summary is successful or needs refinement."
)

-    evaluation_response = client.completion(
+    evaluation_response = completion_client.completion(
messages=[
{"role": "system", "content": "You are an expert evaluator of summaries."},
{"role": "user", "content": evaluation_prompt},
@@ -232,10 +283,11 @@
f"The previous summary for the query '{query}' needs improvement.\n\n"
f"Original summary:\n{summary}\n\n"
f"Evaluation feedback:\n{evaluation_response.explanation}\n\n"
"Ensure that all links parsed are from reputable sources and do not infringe any issues. "
"Please provide an improved and refined summary addressing the feedback."
)

-    refined_response = client.completion(
+    refined_response = completion_client.completion(
messages=[
{
"role": "system",
@@ -260,10 +312,10 @@
content=summary,
metadata={
"query": query,
"urls": urls,
"urls": all_urls,
"model": model,
"client": client,
"workflow": workflow.dict(),
"workflow": workflow.model_dump(),
},
)
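(workflow.model_dump() is the Pydantic v2 replacement for workflow.dict(), which v2 deprecates.)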

@@ -274,9 +326,22 @@
if __name__ == "__main__":
result_document = scrape(
query="Latest advancements in renewable energy",
+        num_queries=5,
max_results=5,
workers=5,
verbose=True,
)
print("Final Document:")
print(result_document.content)

+    class YoutubeLinks(BaseModel):
+        links: list[str] = Field(description="A list of youtube links")
+
+    result_document = scrape(
+        query="Latest advancements in renewable energy",
+        num_queries=5,
+        max_results=5,
+        workers=5,
+        verbose=True,
+        response_model=YoutubeLinks,
+    )
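One caveat with the new __main__ example: YoutubeLinks uses Field, which this diff never shows being imported. Assuming the module header only imports BaseModel, the example would need:

# Assumption: the file currently imports only BaseModel; without Field the
# YoutubeLinks class raises NameError at definition time.
from pydantic import BaseModel, Field

class YoutubeLinks(BaseModel):
    links: list[str] = Field(description="A list of YouTube links")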
