Skip to content

Commit 73eb202

Browse files
committed
Two fixes:
1. When there was a split of an mmultiscripts for chemistry, it didn't get put back together properly. Need to be careful at the start of a line. 2. For atomic numbers, give more credit for the mass number if it is within a plausible range (from searches, "nuclear drip line" estimated values based on atomic number).
1 parent 26891bb commit 73eb202

File tree

1 file changed

+67
-57
lines changed

1 file changed

+67
-57
lines changed

src/chemistry.rs

Lines changed: 67 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -458,7 +458,7 @@ pub fn scan_and_mark_chemistry(mathml: Element) -> bool {
458458
}
459459
}
460460
}
461-
debug!("...after marking:\n{}", mml_to_string(child));
461+
// debug!("...after marking:\n{}", mml_to_string(child));
462462

463463
if child.attribute(CHEM_FORMULA).is_none() && child.attribute(CHEM_EQUATION).is_none() {
464464
if !has_maybe_chemistry(mathml) {
@@ -590,48 +590,52 @@ fn is_changed_after_unmarking_chemistry(mathml: Element) -> bool {
590590
// could be no preceding children due to canonicalization creating mrows (see issue #303), so might need to use parent, etc
591591
while preceding_children.is_empty() {
592592
preceding_children = parent.preceding_siblings();
593-
parent = get_parent(parent);
594593
if name(parent) == "math" {
595-
panic!("is_changed_after_unmarking_chemistry: error no preceding children to merge. mathml=\n{}", mml_to_string(mathml));
594+
break; // consider {SIN}^{-1} -- no preceding child
596595
}
596+
parent = get_parent(parent);
597597
}
598598

599-
// deal with the first element (if it needs unwrapping, it has only prescripts)
600-
let first_element_of_split = as_element(preceding_children[preceding_children.len()-1]);
601-
// debug!("first_element_of_split: \n{}", mml_to_string(first_element_of_split));
602-
if name(first_element_of_split) == "mmultiscripts" {
603-
// take the base and make it the first child of preceding_children (what will get merged)
604-
// put the rest of the elements (the prescripts) at the end of the parent last element (mathml) which must be an mmultiscripts
605-
let first_element_children = first_element_of_split.children();
606-
assert_eq!(name(mathml), "mmultiscripts");
607-
let mut script_children = mathml.children();
608-
assert_eq!(name(as_element(script_children[0])), "mi");
609-
assert!(!script_children.len().is_multiple_of(2)); // doesn't have <mprescripts/>
610-
script_children.push(first_element_children[1]); // mprescripts
611-
script_children.push(first_element_children[2]); // prescripts subscript
612-
script_children.push(first_element_children[3]); // prescripts superscript
613-
614-
let base_of_first_element = first_element_children[0]; // base
615-
assert_eq!(name(as_element(base_of_first_element)), "mi");
616-
let script_base = as_element(script_children[0]);
617-
let mut merged_base_text = as_text( as_element(base_of_first_element)).to_string();
618-
merged_base_text.push_str(as_text(script_base));
619-
script_base.set_text(&merged_base_text);
620-
script_base.remove_attribute("mathvariant");
621-
script_base.remove_attribute(ADDED_ATTR_VALUE);
622-
script_base.remove_attribute(MAYBE_CHEMISTRY);
623-
script_base.remove_attribute(SPLIT_TOKEN);
624-
mathml.replace_children(script_children);
625-
626-
first_element_of_split.remove_from_parent();
627-
return true;
599+
let mut new_script_children = vec![];
600+
if !preceding_children.is_empty() {
601+
// deal with the first element (if it needs unwrapping, it has only prescripts)
602+
let first_element_of_split = as_element(preceding_children[preceding_children.len()-1]);
603+
// debug!("first_element_of_split: \n{}", mml_to_string(first_element_of_split));
604+
if name(first_element_of_split) == "mmultiscripts" {
605+
// take the base and make it the first child of preceding_children (what will get merged)
606+
// put the rest of the elements (the prescripts) at the end of the parent last element (mathml) which must be an mmultiscripts
607+
let first_element_children = first_element_of_split.children();
608+
assert_eq!(name(mathml), "mmultiscripts");
609+
let mut script_children = mathml.children();
610+
assert_eq!(name(as_element(script_children[0])), "mi");
611+
assert!(!script_children.len().is_multiple_of(2)); // doesn't have <mprescripts/>
612+
script_children.push(first_element_children[1]); // mprescripts
613+
script_children.push(first_element_children[2]); // prescripts subscript
614+
script_children.push(first_element_children[3]); // prescripts superscript
615+
616+
let base_of_first_element = first_element_children[0]; // base
617+
assert_eq!(name(as_element(base_of_first_element)), "mi");
618+
let script_base = as_element(script_children[0]);
619+
let mut merged_base_text = as_text( as_element(base_of_first_element)).to_string();
620+
merged_base_text.push_str(as_text(script_base));
621+
script_base.set_text(&merged_base_text);
622+
script_base.remove_attribute("mathvariant");
623+
script_base.remove_attribute(ADDED_ATTR_VALUE);
624+
script_base.remove_attribute(MAYBE_CHEMISTRY);
625+
script_base.remove_attribute(SPLIT_TOKEN);
626+
mathml.replace_children(script_children);
627+
628+
first_element_of_split.remove_from_parent();
629+
return true;
630+
}
631+
new_script_children.push(ChildOfElement::Element(first_element_of_split));
628632
}
633+
debug!("mathml after handling preceding children:\n{}", mml_to_string(mathml));
629634
let mut children_of_script = mathml.children();
630635
let split_child = as_element(children_of_script[0]);
631-
let mut new_script_children = vec![ChildOfElement::Element(first_element_of_split)];
632636
new_script_children.append(&mut children_of_script);
633637
mathml.replace_children(new_script_children); // temporarily has bad number of children
634-
// debug!("After making bad script:\n{}", mml_to_string(mathml));
638+
debug!("After making bad script:\n{}", mml_to_string(mathml));
635639
if let Err(err) = merge_element(split_child) {
636640
panic!("{}", err);
637641
}
@@ -1370,7 +1374,7 @@ pub fn likely_adorned_chem_formula(mathml: Element) -> isize {
13701374

13711375
let mut empty_superscript = false;
13721376
if tag_name == "msup" || tag_name == "msubsup" {
1373-
// debug!("likely_adorned_chem_formula: mathml\n{}", mml_to_string(mathml));
1377+
debug!("likely_adorned_chem_formula: mathml\n{}", mml_to_string(mathml));
13741378
let superscript = as_element(children[if tag_name == "msup" {1} else {2}]);
13751379
empty_superscript = name(superscript) == "mtext" && as_text(superscript).trim().is_empty();
13761380
if !empty_superscript {
@@ -1411,25 +1415,30 @@ pub fn likely_adorned_chem_formula(mathml: Element) -> isize {
14111415
if is_adorned_electron(children[0], prescripts) {
14121416
return 100; // very likely chemistry
14131417
}
1414-
1418+
let base = as_element(children[0]);
1419+
let base_name = name(base);
1420+
let atomic_number = if matches!(base_name, "mi" | "mtext") &&
1421+
let Some(atomic_number) = CHEMICAL_ELEMENT_ATOMIC_NUMBER.get(as_text(base)) {
1422+
*atomic_number
1423+
} else {
1424+
return NOT_CHEMISTRY;
1425+
};
14151426
if pre_superscript_name == "mo" {
14161427
// Lewis dot prescript case
14171428
if pre_subscript_name != "none" {
14181429
return NOT_CHEMISTRY;
14191430
}
14201431
likelihood += likely_chem_superscript(pre_superscript);
14211432
} else if pre_superscript_name == "mn" { // must have a pre-superscript (neutrons + protons)
1422-
// fix could make sure they are integers
1423-
likelihood += 1; // looking like an atomic number
1424-
if pre_subscript_name == "mn" {
1425-
// make sure the atomic number matches the base
1426-
let base = as_element(children[0]);
1427-
let base_name = name(base);
1428-
if (base_name == "mi" || base_name == "mtext") &&
1429-
let Some(atomic_number) = CHEMICAL_ELEMENT_ATOMIC_NUMBER.get(as_text(base)) &&
1430-
as_text(pre_subscript) == atomic_number.to_string() {
1431-
likelihood = CHEMISTRY_THRESHOLD;
1432-
}
1433+
if let Some(mass) = as_text(pre_superscript).parse::<u32>().ok() {
1434+
// "drip line" range is 1.5 * atomic_number < mass < 3.5 * atomic_number -- it is possible to be outside of this range, but VERY unlikely
1435+
// to avoid floating point, we multiply by 2 and compare to 3 and 7
1436+
if 3*atomic_number < 2*mass && 2*mass < 7*atomic_number {
1437+
likelihood += 3;
1438+
}
1439+
}
1440+
if pre_subscript_name == "mn" && as_text(pre_subscript) == atomic_number.to_string() {
1441+
likelihood = CHEMISTRY_THRESHOLD;
14331442
}
14341443
} else {
14351444
return NOT_CHEMISTRY;
@@ -1476,6 +1485,7 @@ pub fn likely_adorned_chem_formula(mathml: Element) -> isize {
14761485
likelihood += likely_chem_formula(base);
14771486
}
14781487

1488+
debug!("returning from likely_adorned_chem_formula: likelihood={}, mathml\n{}", likelihood, mml_to_string(mathml));
14791489
return likelihood;
14801490

14811491

@@ -2803,14 +2813,14 @@ mod chem_tests {
28032813
</mrow>
28042814
</mrow>
28052815
</math>";
2806-
let target = "<math>
2807-
<mmultiscripts data-previous-space-width='-0.083'>
2808-
<mi mathvariant='normal'>U</mi>
2809-
<mprescripts></mprescripts>
2810-
<none/>
2811-
<mn>238</mn>
2816+
let target = " <math>
2817+
<mmultiscripts data-previous-space-width='-0.083' data-chem-formula='5'>
2818+
<mi mathvariant='normal' data-chem-element='2'>U</mi>
2819+
<mprescripts></mprescripts>
2820+
<none></none>
2821+
<mn>238</mn>
28122822
</mmultiscripts>
2813-
</math>";
2823+
</math>";
28142824
assert!(are_strs_canonically_equal(test, target, &[]));
28152825
}
28162826

@@ -3024,15 +3034,15 @@ mod chem_tests {
30243034
</mrow>
30253035
</math>";
30263036
let target = "<math>
3027-
<mrow data-chem-formula='7'>
3028-
<mmultiscripts data-previous-space-width='-0.083' data-chem-formula='3'>
3037+
<mrow data-chem-formula='11'>
3038+
<mmultiscripts data-previous-space-width='-0.083' data-chem-formula='5'>
30293039
<mi mathvariant='normal' data-chem-element='2'>O</mi>
30303040
<mprescripts></mprescripts>
30313041
<none></none>
30323042
<mn>18</mn>
30333043
</mmultiscripts>
30343044
<mo data-changed='added' data-chem-formula-op='0'>&#x2063;</mo>
3035-
<mmultiscripts data-previous-space-width='0.027999999999999997' data-chem-formula='3'>
3045+
<mmultiscripts data-previous-space-width='0.027999999999999997' data-chem-formula='5'>
30363046
<mi mathvariant='normal' data-chem-element='2'>O</mi>
30373047
<mprescripts></mprescripts>
30383048
<none></none>

0 commit comments

Comments
 (0)