Commit 50cd40c: Merge pull request #9 from D4Vinci/dev (v0.2.2)

D4Vinci authored Nov 16, 2024
2 parents 105ec5b + 0c6e770

Showing 10 changed files with 41 additions and 18 deletions.
15 changes: 12 additions & 3 deletions README.md
@@ -6,9 +6,9 @@ Dealing with failing web scrapers due to anti-bot protections or website changes
Scrapling is a high-performance, intelligent web scraping library for Python that automatically adapts to website changes while significantly outperforming popular alternatives. For both beginners and experts, Scrapling provides powerful features while maintaining simplicity.

```python
->> from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
+>> from scrapling.defaults import Fetcher, StealthyFetcher, PlayWrightFetcher
# Fetch websites' source under the radar!
->> page = StealthyFetcher().fetch('https://example.com', headless=True, network_idle=True)
+>> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
>> print(page.status)
200
>> products = page.css('.product', auto_save=True) # Scrape data that survives website design changes!
@@ -211,12 +211,21 @@ python -m browserforge update
```
## Fetching Websites Features
-All fetcher-type classes are imported in the same way
+You might be a little bit confused by now, so let me clear things up: all fetcher-type classes are imported in the same way
```python
from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
```
And all of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `storage`, `storage_args`, and `debug` which are the same ones you give to the `Adaptor` class.
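For illustration, here is one way those arguments might be passed. This sketch is not part of the diff; it assumes the `get` method signature shown in `scrapling/engines/static.py` further down:
```python
# Illustrative sketch (not from this commit): passing Adaptor arguments to a fetcher
from scrapling import Fetcher

fetcher = Fetcher(auto_match=False, keep_comments=True, debug=True)
page = fetcher.get('https://example.com', stealthy_headers=True)
print(page.status)
```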
If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
```python
from scrapling.defaults import Fetcher, StealthyFetcher, PlayWrightFetcher
```
then use it right away without initializing, like:
```python
page = StealthyFetcher.fetch('https://example.com')
```

Also, the `Response` object returned from all fetchers is the same as the `Adaptor` object except that it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All of `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
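A quick sketch of what that looks like in practice (illustrative, using the defaults import above and only methods shown in this README):
```python
# Illustrative: the Response keeps Adaptor's parsing API plus HTTP metadata
page = StealthyFetcher.fetch('https://example.com', headless=True)
print(page.status, page.reason)       # e.g. 200 OK
print(page.headers)                   # headers/cookies are plain dictionaries
products = page.css('.product')       # the usual Adaptor selection API still works
```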
> [!NOTE]
> The `auto_match` argument is enabled by default which is the one you should care about the most as you will see later.
2 changes: 1 addition & 1 deletion scrapling/__init__.py
@@ -4,7 +4,7 @@
from scrapling.core.custom_types import TextHandler, AttributesHandler

__author__ = "Karim Shoair (karim.shoair@pm.me)"
__version__ = "0.2.1"
__version__ = "0.2.2"
__copyright__ = "Copyright (c) 2024 Karim Shoair"


6 changes: 6 additions & 0 deletions scrapling/defaults.py
@@ -0,0 +1,6 @@
+from .fetchers import Fetcher, StealthyFetcher, PlayWrightFetcher
+
+# If you are going to use fetchers with the default settings, import them from this file instead for cleaner-looking code
+Fetcher = Fetcher()
+StealthyFetcher = StealthyFetcher()
+PlayWrightFetcher = PlayWrightFetcher()
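Because this module rebinds each class name to an instance at import time, callers can invoke methods on the imported names directly. A minimal sketch of the intended usage (assuming the `get` and `fetch` methods referenced elsewhere in this diff):
```python
# Illustrative usage of the pre-initialized fetchers above (not from this commit)
from scrapling.defaults import Fetcher, StealthyFetcher

page = Fetcher.get('https://example.com')                 # no Fetcher() call needed
stealth_page = StealthyFetcher.fetch('https://example.com', headless=True)
print(page.status, stealth_page.status)
```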
4 changes: 2 additions & 2 deletions scrapling/engines/camo.py
@@ -114,14 +114,14 @@ def fetch(self, url: str) -> Response:
        response = Response(
            url=res.url,
            text=page.content(),
-           content=res.body(),
+           body=res.body(),
            status=res.status,
            reason=res.status_text,
            encoding=encoding,
            cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
            headers=res.all_headers(),
            request_headers=res.request.all_headers(),
-           adaptor_arguments=self.adaptor_arguments
+           **self.adaptor_arguments
        )
        page.close()

4 changes: 2 additions & 2 deletions scrapling/engines/pw.py
@@ -224,14 +224,14 @@ def fetch(self, url: str) -> Response:
        response = Response(
            url=res.url,
            text=page.content(),
-           content=res.body(),
+           body=res.body(),
            status=res.status,
            reason=res.status_text,
            encoding=encoding,
            cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
            headers=res.all_headers(),
            request_headers=res.request.all_headers(),
-           adaptor_arguments=self.adaptor_arguments
+           **self.adaptor_arguments
        )
        page.close()
        return response
4 changes: 2 additions & 2 deletions scrapling/engines/static.py
@@ -53,14 +53,14 @@ def _prepare_response(self, response: httpxResponse) -> Response:
        return Response(
            url=str(response.url),
            text=response.text,
-           content=response.content,
+           body=response.content,
            status=response.status_code,
            reason=response.reason_phrase,
            encoding=response.encoding or 'utf-8',
            cookies=dict(response.cookies),
            headers=dict(response.headers),
            request_headers=dict(response.request.headers),
-           adaptor_arguments=self.adaptor_arguments
+           **self.adaptor_arguments
        )

def get(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
7 changes: 3 additions & 4 deletions scrapling/engines/toolbelt/custom.py
@@ -12,15 +12,14 @@
class Response(Adaptor):
    """This class is returned by all engines as a way to unify response type between different libraries."""

-   def __init__(self, url: str, text: str, content: bytes, status: int, reason: str, cookies: Dict, headers: Dict, request_headers: Dict, adaptor_arguments: Dict, encoding: str = 'utf-8'):
+   def __init__(self, url: str, text: str, body: bytes, status: int, reason: str, cookies: Dict, headers: Dict, request_headers: Dict, encoding: str = 'utf-8', **adaptor_arguments: Dict):
        automatch_domain = adaptor_arguments.pop('automatch_domain', None)
-       super().__init__(text=text, body=content, url=automatch_domain or url, encoding=encoding, **adaptor_arguments)
-
        self.status = status
        self.reason = reason
        self.cookies = cookies
        self.headers = headers
        self.request_headers = request_headers
+       super().__init__(text=text, body=body, url=automatch_domain or url, encoding=encoding, **adaptor_arguments)
        # For backward compatibility
        self.adaptor = self

@@ -31,7 +30,7 @@ def __init__(self, url: str, text: str, content: bytes, status: int, reason: str
class BaseFetcher:
    def __init__(
            self, huge_tree: bool = True, keep_comments: Optional[bool] = False, auto_match: Optional[bool] = True,
-           storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None, debug: Optional[bool] = True,
+           storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None, debug: Optional[bool] = False,
            automatch_domain: Optional[str] = None,
    ):
        """Arguments below are the same as in the Adaptor class, so you can pass them directly; the rest of Adaptor's arguments
13 changes: 11 additions & 2 deletions scrapling/parser.py
@@ -32,6 +32,7 @@ def __init__(
            storage: Any = SQLiteStorageSystem,
            storage_args: Optional[Dict] = None,
            debug: Optional[bool] = True,
+           **kwargs
        ):
"""The main class that works as a wrapper for the HTML input data. Using this class, you can search for elements
with expressions in CSS, XPath, or with simply text. Check the docs for more info.
@@ -117,6 +118,10 @@ def __init__(
        self.__attributes = None
        self.__tag = None
        self.__debug = debug
+       # No need to check whether all response attributes exist, because if `status` exists then the rest exist too (saves some CPU cycles)
+       self.__response_data = {
+           key: getattr(self, key) for key in ('status', 'reason', 'cookies', 'headers', 'request_headers',)
+       } if hasattr(self, 'status') else {}

    # Node functionalities. I wanted to move these to a separate Mixin class, but that had a slight impact on performance
@staticmethod
@@ -138,10 +143,14 @@ def __get_correct_result(
                return TextHandler(str(element))
            else:
                if issubclass(type(element), html.HtmlMixin):

                    return self.__class__(
-                       root=element, url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
+                       root=element,
+                       text='', body=b'',  # since the `root` argument is provided, both `text` and `body` will be ignored, so these are just fillers
+                       url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
                        keep_comments=True,  # if the comments were already removed in initialization, there is no need to delete them again in sub-elements
-                       huge_tree=self.__huge_tree_enabled, debug=self.__debug
+                       huge_tree=self.__huge_tree_enabled, debug=self.__debug,
+                       **self.__response_data
                    )
                return element
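The practical payoff of passing `**self.__response_data` here is that response metadata survives element navigation. A hedged sketch of the expected behavior (assuming selections are wrapped through `__get_correct_result`, as this hunk suggests):
```python
# Expected behavior after this change (illustrative, not a test from the repo)
from scrapling.defaults import StealthyFetcher

page = StealthyFetcher.fetch('https://example.com', headless=True)
product = page.css('.product')[0]        # sub-elements are built via __get_correct_result
print(product.status, product.headers)   # response attributes now ride along
```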

2 changes: 1 addition & 1 deletion setup.cfg
@@ -1,6 +1,6 @@
[metadata]
name = scrapling
-version = 0.2.1
+version = 0.2.2
author = Karim Shoair
author_email = karim.shoair@pm.me
description = Scrapling is an undetectable, powerful, flexible, adaptive, and high-performance web scraping library for Python.
2 changes: 1 addition & 1 deletion setup.py
@@ -6,7 +6,7 @@

setup(
name="scrapling",
version="0.2.1",
version="0.2.2",
description="""Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
simplifies the process of extracting data from websites, even when they undergo structural changes, and offers
impressive speed improvements over many popular scraping tools.""",
