Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Native code blocks #377

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 32 additions & 14 deletions server/formatter.js
Original file line number Diff line number Diff line change
Expand Up @@ -106,20 +106,21 @@ function normalizeHtml(html) {
function formatCode(html) {
// Expand code blocks
html = html.replace(/```(.*?)```/ig, (match, content) => {
// strip interior <p> tags added by google
content = content.replace(/(?:<\/p><p>|<br\/?>)/g, '\n').replace(/<\/?p>/g, '').trim()
// try to find language hint within text block
const [, lang] = content.match(/^(.+?)\n/) || []

if (lang && hljs.getLanguage(lang)) {
// if the language hint exists and contains a valid language, remove it from the code block
content = content.replace(`${lang}\n`, '')

const textOnlyContent = cheerio.load(content).text()
const highlighted = hljs.highlight(lang, textOnlyContent, true)
return `<pre><code data-lang="${highlighted.language}">${formatCodeContent(highlighted.value)}</code></pre>`
}
return `<pre><code>${formatCodeContent(content)}</code></pre>`
return formatCodeBlock(content)
})

// Preformat native code blocks
// Unnest native code block start and end markers
html = html.replace(/<span[^>]*>(&#xEC0[23];)<\/span>/ig, (match, marker) => {
return marker
})

// Expand native code blocks
// Google docs interleaves the end-of-code marker with the following tag. eg:
// <p>&#xEC03;my code block</p><h2>&#xEC02;my heading</h2>
// Make sure we match and retain the following tag
html = html.replace(/<p[^>]*>&#xEC03;(.*?)<\/p>(<[^>]*>)&#xEC02;/ig, (match, content, followingTag) => {
return `${formatCodeBlock(content)}${followingTag}`
})

// Replace double backticks with <code>, for supporting backticks in inline code blocks
Expand Down Expand Up @@ -152,6 +153,23 @@ function formatCode(html) {
return html
}

/**
 * Turns the raw inner HTML of a code block into a `<pre><code>` element string.
 *
 * Paragraph boundaries and `<br>` tags become newlines, leftover bare
 * `<p>`/`</p>` tags are dropped, and the result is trimmed. If the first line
 * names a language that highlight.js recognises, that line is removed and the
 * remaining text is syntax highlighted; otherwise the content is emitted
 * without highlighting.
 *
 * @param {string} content - inner HTML captured between the code block markers
 * @returns {string} a `<pre><code>…</code></pre>` HTML string
 */
function formatCodeBlock(content) {
  // Google emits each line as its own <p>; collapse those boundaries into newlines
  const paragraphBreaks = /(?:<\/p><p[^>]*>|<br\/?>)/g
  const bareParagraphTags = /<\/?p>/g
  const code = content.replace(paragraphBreaks, '\n').replace(bareParagraphTags, '').trim()

  // The first line of the block is treated as a candidate language hint
  const hintMatch = code.match(/^(.+?)\n/)
  const lang = hintMatch ? hintMatch[1] : undefined

  // No hint, or a hint highlight.js does not know: emit the block unhighlighted
  if (!lang || !hljs.getLanguage(lang)) {
    return `<pre><code>${formatCodeContent(code)}</code></pre>`
  }

  // Drop the hint line so it does not show up in the rendered code
  const codeWithoutHint = code.replace(`${lang}\n`, '')

  // Highlight the text content only, without any remaining markup
  const textOnlyContent = cheerio.load(codeWithoutHint).text()
  const highlighted = hljs.highlight(lang, textOnlyContent, true)
  return `<pre><code data-lang="${highlighted.language}">${formatCodeContent(highlighted.value)}</code></pre>`
}

function formatCodeContent(content) {
content = content.replace(/[‘’]|&#x201[89];/g, "'").replace(/[“”]|&#x201[CD];/g, '"') // remove smart quotes
content = content.replace(/`/g, '&#96;') // remove internal cases of backticks
Expand Down
1 change: 1 addition & 0 deletions test/fixtures/supportedFormats.nativeCode.html
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<html><head><meta content="text/html; charset=UTF-8" http-equiv="content-type"><style type="text/css"> @import url(https://themes.googleusercontent.com/fonts/css?kit=XGMkxXUZTA64h2imyzu79g);</style></head><body class="doc-content" style="background-color:#ffffff;padding:72pt 72pt 72pt 72pt;max-width:468pt"><p style="padding:0;margin:0;color:#000000;font-size:11pt;font-family:&quot;Arial&quot;;line-height:1.15;orphans:2;widows:2;text-align:left"><span style="color:#000000;font-weight:400;text-decoration:none;vertical-align:baseline;font-size:11pt;font-family:&quot;Arial&quot;;font-style:normal">Intro sentence.</span></p><p style="padding:0;margin:0;color:#000000;font-size:11pt;font-family:&quot;Arial&quot;;line-height:1.15;orphans:2;widows:2;height:11pt;text-align:left"><span style="color:#000000;font-weight:400;text-decoration:none;vertical-align:baseline;font-size:11pt;font-family:&quot;Arial&quot;;font-style:normal"></span></p><p style="padding:0;margin:0;color:#000000;font-size:11pt;font-family:&quot;Arial&quot;;line-height:1.15;orphans:2;widows:2;text-align:left"><span>&#60419;</span><span style="color:#b80672;font-weight:400;text-decoration:none;vertical-align:baseline;font-size:9pt;font-family:&quot;Roboto Mono&quot;;font-style:normal">// This is a code block</span></p><p style="padding:0;margin:0;color:#000000;font-size:11pt;font-family:&quot;Arial&quot;;line-height:1.15;orphans:2;widows:2;height:11pt;text-align:left"><span style="color:#b80672;font-weight:400;text-decoration:none;vertical-align:baseline;font-size:9pt;font-family:&quot;Roboto Mono&quot;;font-style:normal"></span></p><p style="padding:0;margin:0;color:#000000;font-size:11pt;font-family:&quot;Arial&quot;;line-height:1.15;orphans:2;widows:2;text-align:left"><span style="color:#b80672;font-weight:400;text-decoration:none;vertical-align:baseline;font-size:9pt;font-family:&quot;Roboto Mono&quot;;font-style:normal">// Here&#39;s another line</span></p><p 
style="padding:0;margin:0;color:#000000;font-size:11pt;font-family:&quot;Arial&quot;;line-height:1.15;orphans:2;widows:2;text-align:left"><span style="color:#000000;font-weight:400;text-decoration:none;vertical-align:baseline;font-size:11pt;font-family:&quot;Arial&quot;;font-style:normal">&#60418;</span></p><p style="padding:0;margin:0;color:#000000;font-size:11pt;font-family:&quot;Arial&quot;;line-height:1.15;orphans:2;widows:2;text-align:left"><span style="color:#000000;font-weight:400;text-decoration:none;vertical-align:baseline;font-size:11pt;font-family:&quot;Arial&quot;;font-style:normal">Middle sentence.</span></p><p style="padding:0;margin:0;color:#000000;font-size:11pt;font-family:&quot;Arial&quot;;line-height:1.15;orphans:2;widows:2;height:11pt;text-align:left"><span style="color:#000000;font-weight:400;text-decoration:none;vertical-align:baseline;font-size:11pt;font-family:&quot;Arial&quot;;font-style:normal"></span></p><p style="padding:0;margin:0;color:#000000;font-size:11pt;font-family:&quot;Arial&quot;;line-height:1.15;orphans:2;widows:2;text-align:left"><span>&#60419;</span><span style="color:#b80672;font-weight:400;text-decoration:none;vertical-align:baseline;font-size:9pt;font-family:&quot;Roboto Mono&quot;;font-style:normal"># A second code block</span></p><p style="padding:0;margin:0;color:#000000;font-size:11pt;font-family:&quot;Arial&quot;;line-height:1.15;orphans:2;widows:2;text-align:left"><span style="color:#000000;font-weight:400;text-decoration:none;vertical-align:baseline;font-size:11pt;font-family:&quot;Arial&quot;;font-style:normal">&#60418;Outro sentence.</span></p><p style="padding:0;margin:0;color:#000000;font-size:11pt;font-family:&quot;Arial&quot;;line-height:1.15;orphans:2;widows:2;height:11pt;text-align:left"><span style="color:#000000;font-weight:400;text-decoration:none;vertical-align:baseline;font-size:11pt;font-family:&quot;Arial&quot;;font-style:normal"></span></p><p 
style="padding:0;margin:0;color:#000000;font-size:11pt;font-family:&quot;Arial&quot;;line-height:1.15;orphans:2;widows:2;height:11pt;text-align:left"><span style="color:#000000;font-weight:400;text-decoration:none;vertical-align:baseline;font-size:11pt;font-family:&quot;Arial&quot;;font-style:normal"></span></p><p style="padding:0;margin:0;color:#000000;font-size:11pt;font-family:&quot;Arial&quot;;line-height:1.15;orphans:2;widows:2;text-align:left"><span>&#60419;</span><span style="font-size:9pt;font-family:&quot;Roboto Mono&quot;;color:#37474f;font-weight:400">/* Another code block */</span></p><h2 id="h.w1cnitpw5bz6" style="padding-top:18pt;margin:0;color:#000000;padding-left:0;font-size:16pt;padding-bottom:6pt;line-height:1.15;page-break-after:avoid;font-family:&quot;Arial&quot;;orphans:2;widows:2;text-align:left;padding-right:0"><span style="color:#000000;font-weight:400;text-decoration:none;vertical-align:baseline;font-size:16pt;font-family:&quot;Arial&quot;;font-style:normal">&#60418;Heading following a code block</span></h2><p style="padding:0;margin:0;color:#000000;font-size:11pt;font-family:&quot;Arial&quot;;line-height:1.15;orphans:2;widows:2;height:11pt;text-align:left"><span style="color:#000000;font-weight:400;text-decoration:none;vertical-align:baseline;font-size:11pt;font-family:&quot;Arial&quot;;font-style:normal"></span></p></body></html>
141 changes: 96 additions & 45 deletions test/unit/htmlProcessing.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ const {assert} = require('chai')
let {getProcessedDocAttributes} = require('../../server/formatter')

const docPath = path.join(__dirname, '../fixtures/supportedFormats.html')
const docPathNativeCode = path.join(__dirname, '../fixtures/supportedFormats.nativeCode.html')

// helper function to stub the doc and get a section of the returned document
function stubbedProcessedDoc(unprocessedHtml, editorName) {
Expand All @@ -13,15 +14,33 @@ function stubbedProcessedDoc(unprocessedHtml, editorName) {
}

describe('HTML processing', () => {
const testGlobal = {
rawHTML: null,
output: () => {},
processedHTML: null
}
const testGlobal = {}
const condenseHtml = (html) => html.replace(/\n/g, '').replace(/>\s+</g, '><')

beforeAll(() => {
testGlobal.rawHTML = fs.readFileSync(docPath, {encoding: 'utf8'})
testGlobal.processedHTML = stubbedProcessedDoc(testGlobal.rawHTML).html
testGlobal.output = cheerio.load(testGlobal.processedHTML)
// General supported formats
testGlobal.general = {}
testGlobal.general.rawHTML = fs.readFileSync(docPath, {encoding: 'utf8'})
testGlobal.general.processedHTML = stubbedProcessedDoc(testGlobal.general.rawHTML).html
testGlobal.general.output = cheerio.load(testGlobal.general.processedHTML)

// Native code
testGlobal.native = {}
testGlobal.native.rawHTML = fs.readFileSync(docPathNativeCode, {encoding: 'utf8'})
testGlobal.native.processedHTML = stubbedProcessedDoc(testGlobal.native.rawHTML).html
testGlobal.native.output = cheerio.load(testGlobal.native.processedHTML)

// Supported formats with inline code enabled
jest.resetModules()
process.env.ALLOW_INLINE_CODE = 'true'
// remove formatter from require cache to recognize changed env variable
delete require.cache[require.resolve('../../server/formatter')]
getProcessedDocAttributes = require('../../server/formatter').getProcessedDocAttributes

testGlobal.inlineCode = {}
testGlobal.inlineCode.rawHTML = fs.readFileSync(docPath, {encoding: 'utf8'})
testGlobal.inlineCode.processedHTML = stubbedProcessedDoc(testGlobal.inlineCode.rawHTML).html
testGlobal.inlineCode.output = cheerio.load(testGlobal.inlineCode.processedHTML)
})

it('does not throw when revision data is unavailable', () => {
Expand All @@ -30,110 +49,153 @@ describe('HTML processing', () => {
})

it('strips unnecessary styles', () => {
const header = testGlobal.output('h2')
const header = testGlobal.general.output('h2')
assert.equal(null, header.attr('style'))
})

it('strips unnecessary &nbsp;s', () => {
const introHTML = testGlobal.output("p:contains('Basic text format')").html()
const introHTML = testGlobal.general.output("p:contains('Basic text format')").html()
assert.match(introHTML, /Text color and highlighting/)
})

describe('inline formats', () => {
it('preserves bolds', () => {
const boldSpan = testGlobal.output("span:contains('bold')").first()
const boldSpan = testGlobal.general.output("span:contains('bold')").first()
assert.equal('font-weight:700', boldSpan.attr('style'))
})

it('preserves italics', () => {
const italicSpan = testGlobal.output("span:contains('italic')").first()
const italicSpan = testGlobal.general.output("span:contains('italic')").first()
assert.equal('font-style:italic', italicSpan.attr('style'))
})

it('preserves underlines', () => {
const underlinedSpan = testGlobal.output("span:contains('underline')").first()
const underlinedSpan = testGlobal.general.output("span:contains('underline')").first()
assert.equal('text-decoration:underline', underlinedSpan.attr('style'))
})

it('preserves combined formats', () => {
const combinedSpan = testGlobal.output("span:contains('combined')").first()
const combinedSpan = testGlobal.general.output("span:contains('combined')").first()
assert.equal('font-style:italic;font-weight:700;text-decoration:underline', combinedSpan.attr('style'))
})

it('preserves image widths', () => {
const imageWidth = testGlobal.output('img').first()
const imageWidth = testGlobal.general.output('img').first()
const widthMatch = imageWidth.attr('style').match('width')
assert.isNotNull(widthMatch)
})
})

describe('list handling', () => {
it('preserves classing on lists', () => {
const ol = testGlobal.output('ol').first()
const ol = testGlobal.general.output('ol').first()
assert.match(ol.attr('class'), /lst-/)
})

it('presrves the associated style block for lists', () => {
const olClass = testGlobal.output('ol').first().attr('class').split(' ')[0]
assert.match(testGlobal.processedHTML, new RegExp(`ol.${olClass} {`))
const olClass = testGlobal.general.output('ol').first().attr('class').split(' ')[0]
assert.match(testGlobal.general.processedHTML, new RegExp(`ol.${olClass} {`))
})

it('applies a level- class on lists to support indentation', () => {
const topLevelList = testGlobal.output("ul:contains('Item 1')").first()
const topLevelList = testGlobal.general.output("ul:contains('Item 1')").first()
assert.match(topLevelList.attr('class'), / level-0/)

const nestedList = testGlobal.output("ul:contains('Item 1.1')").first()
const nestedList = testGlobal.general.output("ul:contains('Item 1.1')").first()
assert.match(nestedList.attr('class'), / level-1/)
})
})

describe('code block handling', () => {
it('highlights registered languages', () => {
const codeBlock = testGlobal.output('pre > code[data-lang="javascript"]')
const codeBlock = testGlobal.general.output('pre > code[data-lang="javascript"]')
assert.exists(codeBlock.html())
})

it('allows &nbsp; as part of a code block', () => {
const codeBlock = testGlobal.output('pre > code[data-lang="javascript"]')
const codeBlock = testGlobal.general.output('pre > code[data-lang="javascript"]')
assert.match(codeBlock.html(), /&amp;nbsp/)
})

it('preserves whitespace at the start of a line', () => {
const codeBlock = testGlobal.output('pre > code[data-lang="javascript"]')
const codeBlock = testGlobal.general.output('pre > code[data-lang="javascript"]')
assert.match(codeBlock.html(), / +jQuery.fn.calcSubWidth/)
})

it('scrubs smart quotes', () => {
const codeBlock = testGlobal.output('pre > code[data-lang="javascript"]')
const codeBlock = testGlobal.general.output('pre > code[data-lang="javascript"]')
assert.match(codeBlock.html(), /singleQuotedStr = .*&apos;str&apos;/)
assert.match(codeBlock.html(), /doubleQuotedStr = .*&quot;str&quot;/)
})

it('allows unregistered languages', () => {
const codeBlock = testGlobal.output('pre')
const codeBlock = testGlobal.general.output('pre')
assert.match(codeBlock.html(), /1 \+ 1 == 5/)
})

it('retains code block backticks', () => {
const codeBlock = testGlobal.output('pre > code[data-lang="javascript"]')
const codeBlock = testGlobal.general.output('pre > code[data-lang="javascript"]')
assert.match(codeBlock.html(), /`/)
})

it('retains inline code backticks', () => {
const codeBlock = testGlobal.output("code:contains('backtick')")
const codeBlock = testGlobal.general.output("code:contains('backtick')")
assert.match(codeBlock.html(), /`backtick`/)
})
})

describe('native code block handling', () => {
it('formats the code blocks', () => {
const codeBlock = testGlobal.native.output('pre > code')
assert.exists(codeBlock.html())
assert.equal(codeBlock.length, 3)
})

it('leaves the trailing heading intact', () => {
const heading = testGlobal.native.output('h2')
assert.equal(heading.html(), 'Heading following a code block')
})

it('removes code block marker unicode characters', () => {
assert.notInclude(testGlobal.native.processedHTML, '&#xEC03;')
assert.notInclude(testGlobal.native.processedHTML, '&#xEC02;')
})

it('unnests start and end markers', () => {
const html = condenseHtml(`
<p>
<span style="font-weight:700">\uEC03</span>
<span style="font-weight:700">my code</span>
</p>
<p>
<span style="font-weight:700">\uEC02</span>
</p>
`)
const processedHtml = stubbedProcessedDoc(html).html
assert.equal(processedHtml, '<pre><code><span style="font-weight:700">my code</span></code></pre>\n<p></p>')
})

it('removes interior <p> tags with attributes', () => {
  // Every paragraph carries a style attribute, so the paragraph boundaries inside
  // the code block must be matched even when the opening <p> tag has attributes.
  // The template literal lines are kept flush left: condenseHtml only strips
  // newlines and whitespace between tags, so added indentation would change the input.
  const html = condenseHtml(`
<p style="font-style:italic">\uEC03my code</p>
<p style="font-style:italic">more code</p>
<p style="font-style:italic">\uEC02</p>
`)
  const processedHtml = stubbedProcessedDoc(html).html
  assert.equal(processedHtml, '<pre><code>my code\nmore code</code></pre>\n<p style="font-style:italic"></p>')
})
})

describe('inline code handling', () => {
describe('with inline code disabled', () => {
it('does not modify code block content', () => {
const codeBlock = testGlobal.output("pre:contains('codeblocks will not')")
const codeBlock = testGlobal.general.output("pre:contains('codeblocks will not')")
assert.match(codeBlock.html(), /&lt;.*%-.*%&gt;/)
})

it('does not unescape delimited code', () => {
const className = testGlobal.output("p:contains('.purplePapyrus')")
const className = testGlobal.general.output("p:contains('.purplePapyrus')")
const styleTag = className.prev()
const openingTag = styleTag.prev()

Expand All @@ -143,25 +205,14 @@ describe('HTML processing', () => {
})

describe('with inline code enabled', () => {
beforeAll(() => {
jest.resetModules()
process.env.ALLOW_INLINE_CODE = 'true'
// remove formatter from require cache to recognize changed env variable
delete require.cache[require.resolve('../../server/formatter')]
getProcessedDocAttributes = require('../../server/formatter').getProcessedDocAttributes
const rawHTML = fs.readFileSync(docPath, {encoding: 'utf8'})
const processedHTML = stubbedProcessedDoc(rawHTML).html
testGlobal.codeEnabledOut = cheerio.load(processedHTML)
})

it('does not modify code block content', () => {
const codeBlock = testGlobal.codeEnabledOut("pre:contains('codeblocks will not')")
const codeBlock = testGlobal.inlineCode.output("pre:contains('codeblocks will not')")
assert.match(codeBlock.html(), /&lt;.*%-.*%&gt;/)
})

it('properly unescapes delimited code', () => {
const style = testGlobal.codeEnabledOut("style:contains('.purplePapyrus')")
const styledDiv = testGlobal.codeEnabledOut('div.purplePapyrus')
const style = testGlobal.inlineCode.output("style:contains('.purplePapyrus')")
const styledDiv = testGlobal.inlineCode.output('div.purplePapyrus')

assert.exists(style)
assert.exists(styledDiv)
Expand All @@ -173,11 +224,11 @@ describe('HTML processing', () => {

describe('comment handling', () => {
it('strips comments', () => {
assert.notMatch(testGlobal.processedHTML, /This comment text will not appear/)
assert.notMatch(testGlobal.general.processedHTML, /This comment text will not appear/)
})

it('strips inline comment anchors', () => {
const commentAnchorParent = testGlobal.output("p:contains('will be stripped from the')")
const commentAnchorParent = testGlobal.general.output("p:contains('will be stripped from the')")
assert.notMatch(commentAnchorParent, /\[a\]/)
})
})
Expand Down