Skip to content

Commit

Permalink
Improving scraper (adding title)
Browse files Browse the repository at this point in the history
  • Loading branch information
polterguy committed Feb 8, 2024
1 parent ada155b commit f75bed1
Show file tree
Hide file tree
Showing 5 changed files with 32 additions and 56 deletions.
2 changes: 1 addition & 1 deletion backend/backend.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@

<ItemGroup>
<PackageReference Include="magic.lambda.system" Version="17.2.0" />
<PackageReference Include="magic.library" Version="17.3.2" />
<PackageReference Include="magic.library" Version="17.3.3" />
<PackageReference Include="Microsoft.AspNetCore.Mvc.NewtonsoftJson" Version="8.0.0" />
</ItemGroup>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,13 @@
*
* - [html] - Mandatory. Being the actual HTML we're traversing for training data.
* - [url] - Mandatory. Being the URL from where the document was fetched.
* - [semantic] - Optional. If true, will try to start scraping from semantic tags, such as article and main.
*/
slots.create:magic.ai.html.extract

// Sanity checking invocation.
validators.mandatory:x:@.arguments/*/html
validators.mandatory:x:@.arguments/*/url
validators.url:x:@.arguments/*/url
validators.default:x:@.arguments
semantic:bool:false

// Used to return meta information to caller.
.meta
Expand All @@ -36,45 +33,11 @@ slots.create:magic.ai.html.extract
// Buffer used for URLs from HTML document.
.urls

// Checking if caller wants to convert HTML semantically or not.
if:x:@.arguments/*/semantic

// Converting HTML to lambda such that we can semantically inspect HTML for semantic tags.
html2lambda:x:@.arguments/*/html

// Trying to retrieve HTML from main or article tag.
if
exists:x:@html2lambda/**/article/[0,1]
.lambda

// HTML contains "article" HTML element.
lambda2html:x:@html2lambda/**/article/[0,1]
html2markdown:x:@lambda2html
url:x:@.arguments/*/url
set-value:x:@.markdown
get-value:x:@html2markdown

else-if
exists:x:@html2lambda/**/main/[0,1]
.lambda

// HTML contains "main" HTML element.
lambda2html:x:@html2lambda/**/main/[0,1]
html2markdown:x:@lambda2html
url:x:@.arguments/*/url
set-value:x:@.markdown
get-value:x:@html2markdown

// Checking if we've got some Markdown, and if not, resorting to using entire HTML document.
if
null:x:@.markdown
.lambda

// Using raw HTML.
html2markdown:x:@.arguments/*/html
url:x:@.arguments/*/url
set-value:x:@.markdown
get-value:x:@html2markdown
// Converting HTML to Markdown.
html2markdown:x:@.arguments/*/html
url:x:@.arguments/*/url
set-value:x:@.markdown
get-value:x:@html2markdown

// Checking if site is SPA, at which point we return early.
if
Expand All @@ -91,17 +54,22 @@ slots.create:magic.ai.html.extract
meta
main:int:0

// Converting raw HTML to lambda to allow us to extract title, description, hyperlinks, etc.
.html-lambda
add:x:@.html-lambda
html2lambda:x:@.arguments/*/html

/*
* Finding URLs from document.
*
* Notice, for simplicity reasons we do this by round tripping HTML
* through markdown, converting it to HTML, converting HTML to lambda,
* and iterate through each anchor HTML element in lambda.
* Notice, for simplicity reasons we do this by round tripping through HTML,
* then converting the HTML to lambda, and iterating through each anchor HTML
* element in lambda.
*
* This is not optimal, and could be optimised, but it keeps the code
* DRY at least ...
* DRY at least, since our [html2markdown] slot at this point has resolved
* our relative URLs ...
*/
html2markdown:x:@.arguments/*/html
url:x:@.arguments/*/url
markdown2html:x:@html2markdown
html2lambda:x:@markdown2html
for-each:x:@html2lambda/**/a/*/\@href
Expand Down Expand Up @@ -164,9 +132,9 @@ slots.create:magic.ai.html.extract

// Setting title and description from document.
set-value:x:@.title
get-value:x:@html2lambda/**/meta/**/title/*/\#text
get-value:x:@.html-lambda/**/head/**/title/*/\#text
set-value:x:@.description
get-value:x:@html2lambda/**/meta/**/meta/*/\@name/=description/./*/\@content
get-value:x:@.html-lambda/**/head/**/meta/*/\@name/=description/./*/\@content

// Creating our prompt.
.prompt
Expand Down Expand Up @@ -213,11 +181,15 @@ slots.create:magic.ai.html.extract
lambda2html:x:@.dp/#
html2markdown:x:@lambda2html
url:x:@.arguments/*/url
strings.concat
get-value:x:@.title
.:" | "
get-value:x:@.tmp-prompt
unwrap:x:+/*/*/*
add:x:@.snippets
.
.
prompt:x:@.tmp-prompt
prompt:x:@strings.concat
completion:x:@html2markdown

default
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,8 @@ select vss.distance, vss.rowid as id, ts.prompt, ts.completion, ts.uri, ts.cache
*
* This allows you to have responses that are statically cached, yet still
* dependent upon semantic AI search towards your training snippets.
*
* Notice, cached snippets are executed as mixins if they are the first match.
*/
if
and
Expand All @@ -147,6 +149,10 @@ select vss.distance, vss.rowid as id, ts.prompt, ts.completion, ts.uri, ts.cache
.lambda

// First matching snippet has been cached.
unwrap:x:+/*/*
set-value:x:@.scan/0/*/completion
strings.mixin:x:@.scan/0/*/completion
prompt:x:@.arguments/*/prompt
unwrap:x:+/*
return
cached:x:@.scan/0/*/completion
Expand All @@ -167,8 +173,7 @@ select vss.distance, vss.rowid as id, ts.prompt, ts.completion, ts.uri, ts.cache

/*
* To avoid mixin snippets from being executed unless they're matched at the top,
* we pre-process results, executing mixins for the first snippet, for then to
* remove all mixin snippets matched further down.
* we pre-process results, and remove all mixin snippets not matched as first match.
*/
if
exists:x:@.scan/0/*/completion
Expand Down
1 change: 0 additions & 1 deletion backend/files/system/openai/query.get.hl
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,6 @@ if
code:bool:true
lists:bool:true
main:bool:true
semantic:bool:true
html:x:@signal/@signal

// Building our context making sure we don't overflow 10,000 tokens.
Expand Down
2 changes: 1 addition & 1 deletion backend/slots/Version.cs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ public class Version : ISlot
/// <param name="input">Parameters passed from signaler</param>
public void Signal(ISignaler signaler, Node input)
{
input.Value = "v17.3.2";
input.Value = "v17.3.3";
}
}
}

0 comments on commit f75bed1

Please sign in to comment.