You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

71 lines
4.0 KiB
R

pdf2df <- function(pdffile) {
# Function for extracting information from ICDD PDF XML-files
# For example the PDF files produced by the PDF database at Angstrom's X-ray lab
# NOTE: sometimes intensity values are specified as less than some value.
# In those cases, the lt sign will be preserved in the column int.Tex.
# The intensity column, on the other hand, is numeric and so strips off the lt sign.
# ARGS: pdffile (complete path and filename to PDF file)
# VALUE: dataframe with 9 columns:
# thth angles (numeric),
# d (numeric),
# h index (numeric),
# k index (numeric),
# l index (numeric),
# hkl indices (string),
# hkl.TeX indices formatted for LaTeX (string),
# intensity (numeric),
# int.TeX intensity formatted for LaTeX (string),
# pdfNumber (string)
# attr: This function sets the following attributes:
# ApplicationName,
# ApplicationVersion,
# chemicalformula,
# empiricalformula,
# wavelength
#
require(XML)
doc <- xmlTreeParse(pdffile)
pdf <- xmlRoot(doc)
rmchar <- "[^0-9]"
#
angles <- data.frame(NULL)
for (i in 1:length(pdf[["graphs"]][["stick_series"]])) {
angles <- rbind(angles, data.frame(stringsAsFactors = FALSE,#
thth = as.numeric(xmlValue(pdf[["graphs"]][["stick_series"]][[i]][["theta"]])),
d = as.numeric(xmlValue(pdf[["graphs"]][["stick_series"]][[i]][["da"]])),
h = as.numeric(xmlValue(pdf[["graphs"]][["stick_series"]][[i]][["h"]])),
k = as.numeric(xmlValue(pdf[["graphs"]][["stick_series"]][[i]][["k"]])),
l = as.numeric(xmlValue(pdf[["graphs"]][["stick_series"]][[i]][["l"]])),
hkl = paste(xmlValue(pdf[["graphs"]][["stick_series"]][[i]][["h"]]),
xmlValue(pdf[["graphs"]][["stick_series"]][[i]][["k"]]),
xmlValue(pdf[["graphs"]][["stick_series"]][[i]][["l"]]), sep = ""),
hkl.TeX = paste("\\mbox{$", ifelse(as.numeric(xmlValue(pdf[["graphs"]][["stick_series"]][[i]][["h"]])) < 0,
paste("\\bar{", abs(as.numeric(xmlValue(pdf[["graphs"]][["stick_series"]][[i]][["h"]]))),
"}", sep = ""),
xmlValue(pdf[["graphs"]][["stick_series"]][[i]][["h"]])),
"\\,", ifelse(as.numeric(xmlValue(pdf[["graphs"]][["stick_series"]][[i]][["k"]])) < 0,
paste("\\bar{", abs(as.numeric(xmlValue(pdf[["graphs"]][["stick_series"]][[i]][["k"]]))),
"}", sep = ""),
xmlValue(pdf[["graphs"]][["stick_series"]][[i]][["k"]])),
"\\,", ifelse(as.numeric(xmlValue(pdf[["graphs"]][["stick_series"]][[i]][["l"]])) < 0,
paste("\\bar{", abs(as.numeric(xmlValue(pdf[["graphs"]][["stick_series"]][[i]][["l"]]))),
"}", sep = ""),
xmlValue(pdf[["graphs"]][["stick_series"]][[i]][["l"]])),
"$}", sep = "", collapse = ""),
intensity = as.numeric(gsub(rmchar, "", xmlValue(pdf[["graphs"]][["stick_series"]][[i]][["intensity"]]))),
int.TeX = paste("{", xmlValue(pdf[["graphs"]][["stick_series"]][[i]][["intensity"]]), "}", sep = ""),
pdfNumber = xmlValue(pdf[["pdf_data"]][["pdf_number"]]),
formula = gsub("[ ]", "", xmlValue(pdf[["pdf_data"]][["empirical_formula"]]))
))
}
#
attr(angles, "ApplicationName") <- xmlAttrs(pdf)[[1]]
attr(angles, "ApplicationVersion") <- xmlAttrs(pdf)[[2]]
#attr(angles, "pdfNumber") <- xmlValue(pdf[["pdf_data"]][["pdf_number"]])
attr(angles, "chemicalformula") <- gsub("[ ]", "", xmlValue(pdf[["pdf_data"]][["chemical_formula"]]))
attr(angles, "empiricalformula") <- gsub("[ ]", "", xmlValue(pdf[["pdf_data"]][["empirical_formula"]]))
attr(angles, "wavelength") <- as.numeric(xmlValue(pdf[["graphs"]][["wave_length"]]))
# Caution: Do not subset. Subsetting causes all attributes to be lost.
return(angles)
}