Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .changeset/fix-html-entity-escaping-markdown.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
'@tiptap/core': patch
'@tiptap/markdown': patch
---

Fix HTML character escaping in markdown roundtrip. HTML entities (`&lt;`, `&gt;`, `&amp;`, `&quot;`) are now decoded to literal characters when parsing markdown into the editor. `<`, `>`, and `&` are re-encoded when serializing back to markdown, while `"` is preserved as a literal character since double quotes are ordinary in markdown. Code detection for skipping encoding now uses the `code: true` extension spec instead of hardcoded type names. Literal characters inside code blocks and inline code are always preserved.
55 changes: 55 additions & 0 deletions packages/core/src/__tests__/htmlEntities.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import { describe, expect, it } from 'vitest'

import { decodeHtmlEntities, encodeHtmlEntities } from '../utilities/htmlEntities.js'

describe('decodeHtmlEntities', () => {
  // One row per test: [test title, entity-encoded input, expected decoded output].
  const cases: [title: string, input: string, expected: string][] = [
    ['decodes &lt; to <', '&lt;div&gt;', '<div>'],
    ['decodes &amp; to &', 'a &amp; b', 'a & b'],
    ['decodes &quot; to "', '&quot;hello&quot;', '"hello"'],
    // A doubly-encoded entity must lose exactly one level of encoding.
    ['handles doubly-encoded sequences like &amp;lt;', '&amp;lt;', '&lt;'],
    ['returns plain text unchanged', 'hello world', 'hello world'],
  ]

  cases.forEach(([title, input, expected]) => {
    it(title, () => {
      expect(decodeHtmlEntities(input)).toBe(expected)
    })
  })
})

describe('encodeHtmlEntities', () => {
  // One row per test: [test title, literal input, expected encoded output].
  const cases: [title: string, input: string, expected: string][] = [
    ['encodes < to &lt;', '<div>', '&lt;div&gt;'],
    ['encodes & to &amp;', 'a & b', 'a &amp; b'],
    // Double quotes are expected to pass through untouched.
    ['does not encode " (quotes are valid in markdown)', '"hello"', '"hello"'],
    ['returns plain text unchanged', 'hello world', 'hello world'],
  ]

  cases.forEach(([title, input, expected]) => {
    it(title, () => {
      expect(encodeHtmlEntities(input)).toBe(expected)
    })
  })
})

describe('roundtrip', () => {
  // Strings made only of characters that encodeHtmlEntities rewrites or leaves alone.
  const samples = ['<div>', 'a & b', 'x < y & y > z']

  it.each(samples)('encode then decode roundtrips: %s', sample => {
    const encoded = encodeHtmlEntities(sample)

    expect(decodeHtmlEntities(encoded)).toBe(sample)
  })

  it('decode is a superset of encode – &quot; decodes but " is not encoded', () => {
    const quoted = '"hello"'

    // A literal quote is left alone by encode, while &quot; still decodes to it.
    expect(encodeHtmlEntities(quoted)).toBe(quoted)
    expect(decodeHtmlEntities('&quot;hello&quot;')).toBe(quoted)
  })
})
26 changes: 26 additions & 0 deletions packages/core/src/utilities/htmlEntities.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
/** Entity name (without `&` and `;`) mapped to the literal character it represents. */
const DECODABLE_HTML_ENTITIES: Readonly<Record<string, string>> = {
  lt: '<',
  gt: '>',
  quot: '"',
  amp: '&',
}

/**
 * Turn the HTML entities `&lt;`, `&gt;`, `&quot;` and `&amp;` into the literal
 * characters they stand for, so they display as plain characters in the editor.
 *
 * The text is scanned once from left to right and replaced output is never
 * scanned again, so every entity loses exactly one level of encoding: a
 * doubly-encoded sequence such as `&amp;lt;` becomes `&lt;`, not `<`.
 * Any other entity (for example `&nbsp;`) is left untouched.
 *
 * @param text The text that may contain HTML entities.
 * @returns The text with the four supported entities decoded.
 */
export function decodeHtmlEntities(text: string): string {
  return text.replace(/&(lt|gt|quot|amp);/g, (entity, name: string) => DECODABLE_HTML_ENTITIES[name] ?? entity)
}

/** Literal character mapped to the HTML entity that replaces it when encoding. */
const ENCODABLE_HTML_CHARACTERS: Readonly<Record<string, string>> = {
  '&': '&amp;',
  '<': '&lt;',
  '>': '&gt;',
}

/**
 * Replace `&`, `<` and `>` with `&amp;`, `&lt;` and `&gt;` so they roundtrip
 * safely through markdown.
 *
 * Each character of the input is visited once, so the ampersand of a freshly
 * produced entity is never encoded a second time (`<` becomes `&lt;`, not
 * `&amp;lt;`).
 *
 * Note: `"` is deliberately left as is, because double quotes are ordinary
 * characters in markdown and need no escaping. `decodeHtmlEntities` still
 * accepts `&quot;` because the markdown tokenizer may emit it.
 *
 * @param text The literal text to encode.
 * @returns The text with `&`, `<` and `>` replaced by their entities.
 */
export function encodeHtmlEntities(text: string): string {
  return text.replace(/[&<>]/g, character => ENCODABLE_HTML_CHARACTERS[character] ?? character)
}
1 change: 1 addition & 0 deletions packages/core/src/utilities/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ export * from './elementFromString.js'
export * from './escapeForRegEx.js'
export * from './findDuplicates.js'
export * from './fromString.js'
export * from './htmlEntities.js'
export * from './isAndroid.js'
export * from './isEmptyObject.js'
export * from './isFirefox.js'
Expand Down
182 changes: 182 additions & 0 deletions packages/markdown/__tests__/conversion.spec.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import type { Extension } from '@tiptap/core'
import { Bold } from '@tiptap/extension-bold'
import { Code } from '@tiptap/extension-code'
import { CodeBlock } from '@tiptap/extension-code-block'
import { Document } from '@tiptap/extension-document'
import { HardBreak } from '@tiptap/extension-hard-break'
Expand All @@ -23,6 +24,7 @@ describe('Markdown Conversion Tests', () => {
Text,
Bold,
Italic,
Code,
Link,
Heading,
HardBreak,
Expand Down Expand Up @@ -390,4 +392,184 @@ describe('Markdown Conversion Tests', () => {
expect(tildeJSON).toEqual(backtickJSON)
})
})

describe('HTML character escaping', () => {
  /** Builds a document holding one paragraph with one unmarked text node. */
  const paragraphDoc = (text: string) => ({
    type: 'doc',
    content: [
      {
        type: 'paragraph',
        content: [{ type: 'text', text }],
      },
    ],
  })

  it('should decode &lt; and &gt; entities to literal < and > when parsing', () => {
    const parsed = markdownManager.parse('foo &lt;bar&gt; baz')

    expect(parsed.content).toHaveLength(1)
    expect(parsed.content[0].type).toBe('paragraph')
    expect(parsed.content[0].content).toHaveLength(1)
    expect(parsed.content[0].content[0].text).toBe('foo <bar> baz')
  })

  it('should decode &amp; entity to literal & when parsing', () => {
    const parsed = markdownManager.parse('foo &amp; bar')

    expect(parsed.content[0].content[0].text).toBe('foo & bar')
  })

  it('should encode < and > back to entities when serializing', () => {
    const output = markdownManager.serialize(paragraphDoc('foo <bar> baz'))

    expect(output).toBe('foo &lt;bar&gt; baz')
  })

  it('should encode & back to &amp; when serializing', () => {
    const output = markdownManager.serialize(paragraphDoc('foo & bar'))

    expect(output).toBe('foo &amp; bar')
  })

  it('should roundtrip &lt;bar&gt; correctly', () => {
    const source = 'foo &lt;bar&gt; baz'
    const parsed = markdownManager.parse(source)

    // The editor-side text holds the literal characters.
    expect(parsed.content[0].content[0].text).toBe('foo <bar> baz')

    // Serializing brings back the entity form of the input.
    expect(markdownManager.serialize(parsed)).toBe('foo &lt;bar&gt; baz')
  })

  it('should roundtrip &amp; correctly', () => {
    const parsed = markdownManager.parse('foo &amp; bar')

    expect(parsed.content[0].content[0].text).toBe('foo & bar')
    expect(markdownManager.serialize(parsed)).toBe('foo &amp; bar')
  })

  it('should decode &quot; entity to literal " when parsing', () => {
    const parsed = markdownManager.parse('foo &quot;bar&quot; baz')

    expect(parsed.content[0].content[0].text).toBe('foo "bar" baz')
  })

  it('should not encode " when serializing (quotes are valid markdown)', () => {
    const output = markdownManager.serialize(paragraphDoc('foo "bar" baz'))

    expect(output).toBe('foo "bar" baz')
  })

  it('should decode &quot; when parsing but serialize as literal "', () => {
    const parsed = markdownManager.parse('foo &quot;bar&quot; baz')

    expect(parsed.content[0].content[0].text).toBe('foo "bar" baz')
    expect(markdownManager.serialize(parsed)).toBe('foo "bar" baz')
  })

  it('should not encode entities inside code blocks', () => {
    const codeBlockDoc = {
      type: 'doc',
      content: [
        {
          type: 'codeBlock',
          attrs: { language: null },
          content: [{ type: 'text', text: 'foo <bar> & baz' }],
        },
      ],
    }

    expect(markdownManager.serialize(codeBlockDoc)).toBe('```\nfoo <bar> & baz\n```')
  })

  it('should not encode entities inside inline code marks', () => {
    const inlineCodeText = {
      type: 'text',
      text: '<tag>',
      marks: [{ type: 'code' }],
    }
    const inlineCodeDoc = {
      type: 'doc',
      content: [{ type: 'paragraph', content: [inlineCodeText] }],
    }

    expect(markdownManager.serialize(inlineCodeDoc)).toBe('`<tag>`')
  })

  it('should handle doubly-encoded entities correctly', () => {
    // &amp;lt; loses one level of encoding: it becomes &lt;, never <.
    const parsed = markdownManager.parse('foo &amp;lt; bar')

    expect(parsed.content[0].content[0].text).toBe('foo &lt; bar')

    // On the way out the & of the literal "&lt;" text is encoded again.
    expect(markdownManager.serialize(parsed)).toBe('foo &amp;lt; bar')
  })

  it('should preserve &nbsp; empty paragraph behavior', () => {
    const parsed = markdownManager.parse('Line1\n\n&nbsp;\n\nLine2')

    // The standalone &nbsp; line must still parse as an empty paragraph.
    expect(parsed.content).toHaveLength(3)
    expect(parsed.content[1].type).toBe('paragraph')
    expect(parsed.content[1].content).toEqual([])

    // One empty paragraph between two content paragraphs is written as blank
    // lines only (the first empty paragraph needs no &nbsp; marker).
    expect(markdownManager.serialize(parsed)).toBe('Line1\n\n\n\nLine2')
  })

  it('should roundtrip literal &amp;nbsp; without it being treated as an empty paragraph marker', () => {
    // Writing &amp;nbsp; in markdown means the visible text "&nbsp;" is wanted.
    // Parsing decodes &amp; to &, so the text node holds "&nbsp;".
    // Serializing encodes that & again, which restores &amp;nbsp;.
    // The intermediate "&nbsp;" text must NOT be read as the empty-paragraph marker.
    const parsed = markdownManager.parse('before &amp;nbsp; after')

    expect(parsed.content).toHaveLength(1)
    expect(parsed.content[0].type).toBe('paragraph')
    expect(parsed.content[0].content[0].text).toBe('before &nbsp; after')

    expect(markdownManager.serialize(parsed)).toBe('before &amp;nbsp; after')
  })
})
})
34 changes: 30 additions & 4 deletions packages/markdown/src/MarkdownManager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ import {
type MarkdownToken,
type MarkdownTokenizer,
type RenderContext,
callOrReturn,
decodeHtmlEntities,
encodeHtmlEntities,
flattenExtensions,
generateJSON,
getExtensionField,
Expand All @@ -34,6 +37,8 @@ export class MarkdownManager {
private indentSize: number
private baseExtensions: AnyExtension[] = []
private extensions: AnyExtension[] = []
/** Set of extension names whose `code` spec property is truthy (nodes and marks). */
private codeTypes: Set<string> = new Set()

/**
* Create a MarkdownManager.
Expand Down Expand Up @@ -99,7 +104,15 @@ export class MarkdownManager {
// Keep track of all extensions for HTML parsing
this.extensions.push(extension)

// Track extensions that declare `code: true` so we can skip HTML entity
// encoding inside code contexts without hardcoding specific type names.
const isCode = callOrReturn(getExtensionField(extension, 'code'))

const name = extension.name

if (isCode) {
this.codeTypes.add(name)
}
const tokenName =
(getExtensionField(extension, 'markdownTokenName') as ExtendableConfig['markdownTokenName']) || name
const parseMarkdown = getExtensionField(extension, 'parseMarkdown') as ExtendableConfig['parseMarkdown'] | undefined
Expand Down Expand Up @@ -631,9 +644,10 @@ export class MarkdownManager {
const token = tokens[i]

if (token.type === 'text') {
// Create text node – decode HTML entities so that e.g. `&lt;` displays as `<` in the editor
result.push({
type: 'text',
text: token.text || '',
text: decodeHtmlEntities(token.text || ''),
})
} else if (token.type === 'html') {
// Handle possible split inline HTML by attempting to detect an
Expand Down Expand Up @@ -797,7 +811,7 @@ export class MarkdownManager {
case 'text':
return {
type: 'text',
text: token.text || '',
text: decodeHtmlEntities(token.text || ''),
}

case 'html':
Expand Down Expand Up @@ -875,6 +889,18 @@ export class MarkdownManager {
}
}

/**
 * Prepare the text of a text node for markdown output.
 *
 * The text is returned untouched when it sits in a code context, i.e. when
 * the parent node's type or any of the node's marks is registered in
 * `codeTypes`. Everywhere else it is passed through `encodeHtmlEntities`.
 *
 * @param text The raw text content of the node.
 * @param node The text node, consulted for its marks.
 * @param parentNode The node containing the text node, if any.
 * @returns The text to emit into the markdown output.
 */
private encodeTextForMarkdown(text: string, node: JSONContent, parentNode?: JSONContent): string {
  // A parent registered as a code type keeps its content literal.
  const parentType = parentNode?.type

  if (parentType != null && this.codeTypes.has(parentType)) {
    return text
  }

  // A mark may be given as a bare name or as an object carrying a `type`.
  for (const mark of node.marks || []) {
    const markName = typeof mark === 'string' ? mark : mark.type

    if (this.codeTypes.has(markName)) {
      return text
    }
  }

  return encodeHtmlEntities(text)
}

renderNodeToMarkdown(
node: JSONContent,
parentNode?: JSONContent,
Expand All @@ -885,7 +911,7 @@ export class MarkdownManager {
// if node is a text node, we return its text content (entity-encoded outside code contexts)
// marks are handled at the array level in renderNodesWithMarkBoundaries
if (node.type === 'text') {
return node.text || ''
return this.encodeTextForMarkdown(node.text || '', node, parentNode)
}

if (!node.type) {
Expand Down Expand Up @@ -982,7 +1008,7 @@ export class MarkdownManager {
}

if (node.type === 'text') {
let textContent = node.text || ''
let textContent = this.encodeTextForMarkdown(node.text || '', node, parentNode)
const currentMarks = new Map((node.marks || []).map(mark => [mark.type, mark]))

// Find marks that need to be closed and opened
Expand Down
Loading