ueberdosis · bdbch · Mar 5, 2026 · Mar 5, 2026 · Mar 5, 2026 · Mar 5, 2026
diff --git a/.changeset/fix-html-entity-escaping-markdown.md b/.changeset/fix-html-entity-escaping-markdown.md
@@ -0,0 +1,6 @@
+---
+'@tiptap/core': patch
+'@tiptap/markdown': patch
+---
+
+Fix HTML character escaping in markdown roundtrip. HTML entities (`&lt;`, `&gt;`, `&amp;`, `&quot;`) are now decoded to literal characters when parsing markdown into the editor. `<`, `>`, and `&` are re-encoded when serializing back to markdown, while `"` is preserved as a literal character since double quotes are ordinary in markdown. Code detection for skipping encoding now uses the `code: true` extension spec instead of hardcoded type names. Literal characters inside code blocks and inline code are always preserved.
diff --git a/packages/core/src/__tests__/htmlEntities.test.ts b/packages/core/src/__tests__/htmlEntities.test.ts
@@ -0,0 +1,55 @@
+import { describe, expect, it } from 'vitest'
+
+import { decodeHtmlEntities, encodeHtmlEntities } from '../utilities/htmlEntities.js'
+
+// Table-driven suite for decodeHtmlEntities.
+// Each row is [test title, input, expected output].
+// The doubly-encoded row pins that exactly one level of encoding is removed.
+describe('decodeHtmlEntities', () => {
+  type Case = [title: string, input: string, expected: string]
+
+  const cases: Case[] = [
+    ['decodes &lt; to <', '&lt;div&gt;', '<div>'],
+    ['decodes &amp; to &', 'a &amp; b', 'a & b'],
+    ['decodes &quot; to "', '&quot;hello&quot;', '"hello"'],
+    ['handles doubly-encoded sequences like &amp;lt;', '&amp;lt;', '&lt;'],
+    ['returns plain text unchanged', 'hello world', 'hello world'],
+  ]
+
+  // Register one test per row, in table order, under its original title.
+  for (const [title, input, expected] of cases) {
+    it(title, () => {
+      expect(decodeHtmlEntities(input)).toBe(expected)
+    })
+  }
+})
+
+// Table-driven suite for encodeHtmlEntities: each row is [test title, input, expected output].
+describe('encodeHtmlEntities', () => {
+  type Case = [title: string, input: string, expected: string]
+
+  const cases: Case[] = [
+    ['encodes < to &lt;', '<div>', '&lt;div&gt;'],
+    ['encodes & to &amp;', 'a & b', 'a &amp; b'],
+    ['does not encode " (quotes are valid in markdown)', '"hello"', '"hello"'],
+    ['returns plain text unchanged', 'hello world', 'hello world'],
+  ]
+
+  for (const [title, input, expected] of cases) {
+    it(title, () => {
+      expect(encodeHtmlEntities(input)).toBe(expected)
+    })
+  }
+})
+
+describe('roundtrip', () => {
+  const samples = ['<div>', 'a & b', 'x < y & y > z']
+  it.each(samples)('encode then decode roundtrips: %s', sample => {
+    expect(decodeHtmlEntities(encodeHtmlEntities(sample))).toBe(sample)
+  })
+
+  it('decode is a superset of encode – &quot; decodes but " is not encoded', () => {
+    expect(encodeHtmlEntities('"hello"')).toBe('"hello"') // `"` passes through encode unchanged
+    expect(decodeHtmlEntities('&quot;hello&quot;')).toBe('"hello"') // `&quot;` still decodes to `"`
+  })
+})
diff --git a/packages/core/src/utilities/htmlEntities.ts b/packages/core/src/utilities/htmlEntities.ts
@@ -0,0 +1,26 @@
+/**
+ * Turn the entities `&lt;`, `&gt;`, `&quot;` and `&amp;` back into the literal
+ * characters shown in the editor. A single left-to-right scan never re-reads
+ * its own output, so a doubly-encoded `&amp;lt;` yields `&lt;` rather than `<`.
+ * @param text - Text that may contain the four supported entities.
+ * @returns The text with every supported entity replaced by its character.
+ */
+export function decodeHtmlEntities(text: string): string {
+  const literalByName: Record<string, string> = { lt: '<', gt: '>', quot: '"', amp: '&' }
+
+  // Anything other than these four names is left exactly as written.
+  return text.replace(/&(lt|gt|quot|amp);/g, (entity, name: string) => literalByName[name] ?? entity)
+}
+
+/**
+ * Escape `&`, `<` and `>` so the text survives a trip through markdown.
+ * Each character is visited once, so an ampersand produced by an escape is
+ * never escaped again (`<` becomes `&lt;`, never `&amp;lt;`).
+ *
+ * `"` is deliberately left untouched: it is an ordinary markdown character.
+ * @param text - Literal text.
+ * @returns The text with `&`, `<`, `>` replaced by `&amp;`, `&lt;`, `&gt;`.
+ */
+export function encodeHtmlEntities(text: string): string {
+  return text.replace(/[&<>]/g, ch => (ch === '&' ? '&amp;' : ch === '<' ? '&lt;' : '&gt;'))
+}
diff --git a/packages/core/src/utilities/index.ts b/packages/core/src/utilities/index.ts
@@ -6,6 +6,7 @@ export * from './elementFromString.js'
 export * from './escapeForRegEx.js'
 export * from './findDuplicates.js'
 export * from './fromString.js'
+export * from './htmlEntities.js'
 export * from './isAndroid.js'
 export * from './isEmptyObject.js'
 export * from './isFirefox.js'

diff --git a/packages/markdown/__tests__/conversion.spec.ts b/packages/markdown/__tests__/conversion.spec.ts
@@ -1,5 +1,6 @@
 import type { Extension } from '@tiptap/core'
 import { Bold } from '@tiptap/extension-bold'
+import { Code } from '@tiptap/extension-code'
 import { CodeBlock } from '@tiptap/extension-code-block'
 import { Document } from '@tiptap/extension-document'
 import { HardBreak } from '@tiptap/extension-hard-break'
@@ -23,6 +24,7 @@ describe('Markdown Conversion Tests', () => {
     Text,
     Bold,
     Italic,
+    Code,
     Link,
     Heading,
     HardBreak,
@@ -390,4 +392,184 @@ describe('Markdown Conversion Tests', () => {
       expect(tildeJSON).toEqual(backtickJSON)
     })
   })
+
+  describe('HTML character escaping', () => {
+    it('should decode &lt; and &gt; entities to literal < and > when parsing', () => {
+      const markdown = 'foo &lt;bar&gt; baz'
+      const json = markdownManager.parse(markdown)
+
+      expect(json.content).toHaveLength(1)
+      expect(json.content[0].type).toBe('paragraph')
+      expect(json.content[0].content).toHaveLength(1)
+      expect(json.content[0].content[0].text).toBe('foo <bar> baz')
+    })
+
+    it('should decode &amp; entity to literal & when parsing', () => {
+      const markdown = 'foo &amp; bar'
+      const json = markdownManager.parse(markdown)
+
+      expect(json.content[0].content[0].text).toBe('foo & bar')
+    })
+
+    it('should encode < and > back to entities when serializing', () => {
+      const json = {
+        type: 'doc',
+        content: [
+          {
+            type: 'paragraph',
+            content: [{ type: 'text', text: 'foo <bar> baz' }],
+          },
+        ],
+      }
+
+      const markdown = markdownManager.serialize(json)
+      expect(markdown).toBe('foo &lt;bar&gt; baz')
+    })
+
+    it('should encode & back to &amp; when serializing', () => {
+      const json = {
+        type: 'doc',
+        content: [
+          {
+            type: 'paragraph',
+            content: [{ type: 'text', text: 'foo & bar' }],
+          },
+        ],
+      }
+
+      const markdown = markdownManager.serialize(json)
+      expect(markdown).toBe('foo &amp; bar')
+    })
+
+    it('should roundtrip &lt;bar&gt; correctly', () => {
+      const markdown = 'foo &lt;bar&gt; baz'
+      const json = markdownManager.parse(markdown)
+
+      // Editor should show literal <bar>
+      expect(json.content[0].content[0].text).toBe('foo <bar> baz')
+
+      // Serialize back should produce the entity form
+      const serialized = markdownManager.serialize(json)
+      expect(serialized).toBe('foo &lt;bar&gt; baz')
+    })
+
+    it('should roundtrip &amp; correctly', () => {
+      const markdown = 'foo &amp; bar'
+      const json = markdownManager.parse(markdown)
+      expect(json.content[0].content[0].text).toBe('foo & bar')
+
+      const serialized = markdownManager.serialize(json)
+      expect(serialized).toBe('foo &amp; bar')
+    })
+
+    it('should decode &quot; entity to literal " when parsing', () => {
+      const markdown = 'foo &quot;bar&quot; baz'
+      const json = markdownManager.parse(markdown)
+
+      expect(json.content[0].content[0].text).toBe('foo "bar" baz')
+    })
+
+    it('should not encode " when serializing (quotes are valid markdown)', () => {
+      const json = {
+        type: 'doc',
+        content: [
+          {
+            type: 'paragraph',
+            content: [{ type: 'text', text: 'foo "bar" baz' }],
+          },
+        ],
+      }
+
+      const markdown = markdownManager.serialize(json)
+      expect(markdown).toBe('foo "bar" baz')
+    })
+
+    it('should decode &quot; when parsing but serialize as literal "', () => {
+      const markdown = 'foo &quot;bar&quot; baz'
+      const json = markdownManager.parse(markdown)
+      expect(json.content[0].content[0].text).toBe('foo "bar" baz')
+
+      const serialized = markdownManager.serialize(json)
+      expect(serialized).toBe('foo "bar" baz')
+    })
+
+    it('should not encode entities inside code blocks', () => {
+      const json = {
+        type: 'doc',
+        content: [
+          {
+            type: 'codeBlock',
+            attrs: { language: null },
+            content: [{ type: 'text', text: 'foo <bar> & baz' }],
+          },
+        ],
+      }
+
+      const markdown = markdownManager.serialize(json)
+      expect(markdown).toBe('```\nfoo <bar> & baz\n```')
+    })
+
+    it('should not encode entities inside inline code marks', () => {
+      const json = {
+        type: 'doc',
+        content: [
+          {
+            type: 'paragraph',
+            content: [
+              {
+                type: 'text',
+                text: '<tag>',
+                marks: [{ type: 'code' }],
+              },
+            ],
+          },
+        ],
+      }
+
+      const markdown = markdownManager.serialize(json)
+      expect(markdown).toBe('`<tag>`')
+    })
+
+    it('should handle doubly-encoded entities correctly', () => {
+      // &amp;lt; should decode to &lt; (not to <)
+      const markdown = 'foo &amp;lt; bar'
+      const json = markdownManager.parse(markdown)
+      expect(json.content[0].content[0].text).toBe('foo &lt; bar')
+
+      // Serializing should re-encode the & in &lt;
+      const serialized = markdownManager.serialize(json)
+      expect(serialized).toBe('foo &amp;lt; bar')
+    })
+
+    it('should preserve &nbsp; empty paragraph behavior', () => {
+      const markdown = 'Line1\n\n&nbsp;\n\nLine2'
+      const json = markdownManager.parse(markdown)
+
+      // Empty paragraph check should still work
+      expect(json.content).toHaveLength(3)
+      expect(json.content[1].type).toBe('paragraph')
+      expect(json.content[1].content).toEqual([])
+
+      // A single empty paragraph between content paragraphs uses blank-line
+      // spacing (the first empty paragraph doesn't need an &nbsp; marker).
+      const serialized = markdownManager.serialize(json)
+      expect(serialized).toBe('Line1\n\n\n\nLine2')
+    })
+
+    it('should roundtrip literal &amp;nbsp; without it being treated as an empty paragraph marker', () => {
+      // A user writing &amp;nbsp; in markdown intends for the text "&nbsp;" to display.
+      // decodeHtmlEntities decodes &amp; → &, producing text content "&nbsp;".
+      // On serialization, encodeHtmlEntities re-encodes & → &amp;, restoring &amp;nbsp;.
+      // The intermediate "&nbsp;" text must NOT be confused with the empty-paragraph marker.
+      const markdown = 'before &amp;nbsp; after'
+      const json = markdownManager.parse(markdown)
+
+      expect(json.content).toHaveLength(1)
+      expect(json.content[0].type).toBe('paragraph')
+      expect(json.content[0].content[0].text).toBe('before &nbsp; after')
+
+      const serialized = markdownManager.serialize(json)
+      expect(serialized).toBe('before &amp;nbsp; after')
+    })
+  })
 })
diff --git a/packages/markdown/src/MarkdownManager.ts b/packages/markdown/src/MarkdownManager.ts
@@ -9,6 +9,9 @@ import {
   type MarkdownToken,
   type MarkdownTokenizer,
   type RenderContext,
+  callOrReturn,
+  decodeHtmlEntities,
+  encodeHtmlEntities,
   flattenExtensions,
   generateJSON,
   getExtensionField,
@@ -34,6 +37,8 @@ export class MarkdownManager {
   private indentSize: number
   private baseExtensions: AnyExtension[] = []
   private extensions: AnyExtension[] = []
+  /** Set of extension names whose `code` spec property is truthy (nodes and marks). */
+  private codeTypes: Set<string> = new Set()
 
   /**
    * Create a MarkdownManager.
@@ -99,7 +104,15 @@ export class MarkdownManager {
     // Keep track of all extensions for HTML parsing
     this.extensions.push(extension)
 
+    // Track extensions that declare `code: true` so we can skip HTML entity
+    // encoding inside code contexts without hardcoding specific type names.
+    const isCode = callOrReturn(getExtensionField(extension, 'code'))
+
     const name = extension.name
+
+    if (isCode) {
+      this.codeTypes.add(name)
+    }
     const tokenName =
       (getExtensionField(extension, 'markdownTokenName') as ExtendableConfig['markdownTokenName']) || name
     const parseMarkdown = getExtensionField(extension, 'parseMarkdown') as ExtendableConfig['parseMarkdown'] | undefined
@@ -631,9 +644,10 @@ export class MarkdownManager {
       const token = tokens[i]
 
       if (token.type === 'text') {
+        // Create text node – decode HTML entities so that e.g. `&lt;` displays as `<` in the editor
         result.push({
           type: 'text',
-          text: token.text || '',
+          text: decodeHtmlEntities(token.text || ''),
         })
       } else if (token.type === 'html') {
         // Handle possible split inline HTML by attempting to detect an
@@ -797,7 +811,7 @@ export class MarkdownManager {
       case 'text':
         return {
           type: 'text',
-          text: token.text || '',
+          text: decodeHtmlEntities(token.text || ''),
         }
 
       case 'html':
@@ -875,6 +889,18 @@ export class MarkdownManager {
     }
   }
 
+  /** Escape `text` for markdown output unless a parent node or mark listed in `codeTypes` wraps it. */
+  private encodeTextForMarkdown(text: string, node: JSONContent, parentNode?: JSONContent): string {
+    const parentType = parentNode?.type
+    if (parentType != null && this.codeTypes.has(parentType)) {
+      return text // text directly inside a code node is emitted verbatim
+    }
+    const marks = node.marks || []
+    const hasCodeMark = marks.some(mark => this.codeTypes.has(typeof mark === 'string' ? mark : mark.type))
+
+    return hasCodeMark ? text : encodeHtmlEntities(text)
+  }
+
   renderNodeToMarkdown(
     node: JSONContent,
     parentNode?: JSONContent,
@@ -885,7 +911,7 @@ export class MarkdownManager {
     // if node is a text node, we simply return it's text content
     // marks are handled at the array level in renderNodesWithMarkBoundaries
     if (node.type === 'text') {
-      return node.text || ''
+      return this.encodeTextForMarkdown(node.text || '', node, parentNode)
     }
 
     if (!node.type) {
@@ -982,7 +1008,7 @@ export class MarkdownManager {
       }
 
       if (node.type === 'text') {
-        let textContent = node.text || ''
+        let textContent = this.encodeTextForMarkdown(node.text || '', node, parentNode)
         const currentMarks = new Map((node.marks || []).map(mark => [mark.type, mark]))
 
         // Find marks that need to be closed and opened