返回信息流怎么用Java高效地实现下面Reuters21578DocumentProvider的功能?
那是reuters21578测试集中的一个SGML文档(简单地转成了XML)。xml文件里面有1000个<REUTERS>标签。每个结构类似:
<REUTERS>
<TOPICS>
<D>主题1</D><D>主题2</D>
</TOPICS>
<TEXT>
....
<BODY>
这里是正文部分。
</BODY>
</TEXT>
</REUTERS>
要做的是提取主题和正文部分。
直接用DOM操作有点繁琐。如果用XPath从<REUTERS>里选取TOPICS/D和TEXT/BODY/text(),速度就会慢下来,每秒钟只能处理30个<REUTERS>标签,不知道问题出在哪里。
处理整个文件,下面的Scala程序只要2-5秒钟执行时间,但Java+XPath需要将近半分钟。
package xml
import scala.xml._
import scala.reflect._
import java.io._
import scala.collection.JavaConversions._
/** Mutable holder for one article: its body text and its topic labels. */
class Document {
  /** Topic labels of the article; starts out empty. */
  val topics = new java.util.LinkedList[String]()

  /** Body text of the article; starts out as the empty string. */
  var text = ""
}
/**
 * Parses the fixed Reuters XML file when the object is initialised and exposes
 * the resulting articles through `docList`.
 */
object Reuters21578DocumentProvider {
  /** One Document per REUTERS element, in the order they were iterated. */
  @BeanProperty
  val docList = new java.util.LinkedList[Document]();

  // Part of the object's initialiser: runs once, after docList has been created.
  populate();

  /** Loads the XML file and appends one Document per REUTERS element to docList. */
  private def populate(): Unit = {
    val root = XML.loadFile(new File("reuters21578/reut2-000.xml"))
    for (reuters <- root \ "REUTERS") {
      val article = new Document
      // Topic labels are the text of each TOPICS/D child.
      for (topicNode <- reuters \ "TOPICS" \ "D") {
        article.topics.add(topicNode.text)
      }
      // Body is the text of TEXT/BODY; an empty selection yields "".
      article.text = (reuters \ "TEXT" \ "BODY").text
      docList.add(article)
    }
  }
}
/** Command-line entry point that prints a short preview and the topics of every article. */
object ReutersPrinter {
  /**
   * Prints one line per article: at most the first 30 characters of the body
   * (newlines replaced by spaces), then ": ", then the comma-separated topics.
   */
  def main(args: Array[String]): Unit = {
    for (article <- Reuters21578DocumentProvider.docList) {
      val previewEnd = Math.min(article.text.length, 30)
      val preview = article.text.substring(0, previewEnd).replaceAll("\\n", " ")
      val topicList = article.topics.mkString(",")
      println(preview + ": " + topicList)
    }
  }
}
这是一条镜像帖。来源:北邮人论坛 / java / #14396同步于 2010/4/30
该镜像源已超过 30 天没有更新,可能在源站已被删除。
Java机器人发帖
Java & XML
wks
2010/4/30镜像同步5 回复
订阅后,新回复会通过你的通知中心匿名送达。
5 条回复
编译了。
下面这个代码,FastVersion(被注释)部分和Scala一样快;但是SlowVersion部分非常慢。
package xml;
import java.util.*;
import java.io.*;
import javax.xml.parsers.*;
import javax.xml.xpath.*;
import org.xml.sax.SAXException;
import org.w3c.dom.*;
/**
 * Reads the Reuters XML file into {@link Document} objects by evaluating
 * {@code javax.xml.xpath} expressions against a W3C DOM tree.
 *
 * <p>This is the variant marked "slow" in the code; the plain-DOM lookups marked
 * "fast" are kept as commented-out code next to the XPath calls they replace.
 */
public class JavaSlowVersion {
    private List<Document> docList = new LinkedList<Document>();

    /** Returns the list of articles collected by the constructor. */
    public List<Document> getDocList() {
        return docList;
    }

    /** Replaces the list of articles. */
    public void setDocList(List<Document> docList) {
        this.docList = docList;
    }

    /**
     * Parses {@code reuters21578/reut2-000.xml} and stores one {@link Document}
     * per {@code REUTERS} element in the article list.
     *
     * @throws ParserConfigurationException if the DOM parser cannot be created
     * @throws SAXException if the file is not well-formed XML
     * @throws IOException if the file cannot be read
     * @throws XPathExpressionException if an XPath expression fails to compile or evaluate
     */
    public JavaSlowVersion() throws ParserConfigurationException,
            SAXException, IOException, XPathExpressionException {
        DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        org.w3c.dom.Document dom = builder.parse(new File("reuters21578/reut2-000.xml"));

        // Both expressions are relative and are evaluated with a REUTERS element as context.
        XPath xpath = XPathFactory.newInstance().newXPath();
        XPathExpression topicsExpr = xpath.compile("TOPICS/D");
        XPathExpression bodyTextExpr = xpath.compile("TEXT/BODY/text()");

        dom.getDocumentElement().normalize();
        NodeList reutersNodes = dom.getElementsByTagName("REUTERS");
        for (int index = 0; index < reutersNodes.getLength(); index++) {
            Element reuters = (Element) reutersNodes.item(index);
            docList.add(toDocument(reuters, bodyTextExpr, topicsExpr));
        }
    }

    /** Builds one article from a REUTERS element using the two compiled XPath expressions. */
    private static Document toDocument(Element reuters, XPathExpression bodyTextExpr,
            XPathExpression topicsExpr) throws XPathExpressionException {
        Document article = new Document();

        // Fast version
        // Element textElem = (Element) reuters.getElementsByTagName("TEXT").item(0);
        // Element bodyElem = (Element) textElem.getElementsByTagName("BODY").item(0);
        // if (bodyElem == null) {
        //     article.setText("");
        // } else {
        //     article.setText(bodyElem.getTextContent());
        // }
        // Slow version
        String body = (String) bodyTextExpr.evaluate(reuters, XPathConstants.STRING);
        article.setText(body);

        // Fast version
        // Element topicsElem = (Element) reuters.getElementsByTagName("TOPICS").item(0);
        // NodeList topicNodes = topicsElem.getElementsByTagName("D");
        // Slow version
        NodeList topicNodes = (NodeList) topicsExpr.evaluate(reuters, XPathConstants.NODESET);
        for (int t = 0; t < topicNodes.getLength(); t++) {
            Element topicElem = (Element) topicNodes.item(t);
            article.getTopics().add(topicElem.getTextContent());
        }
        return article;
    }

    /**
     * Loads the articles and prints, for each one, at most the first 30 characters
     * of its text (newlines replaced by spaces), then ": ", then every topic
     * followed by a comma.
     */
    public static void main(String[] args) throws ParserConfigurationException,
            SAXException, IOException, XPathExpressionException {
        JavaSlowVersion provider = new JavaSlowVersion();
        for (Document article : provider.getDocList()) {
            String text = article.getText();
            int previewEnd = Math.min(30, text.length());
            System.out.print(text.substring(0, previewEnd).replaceAll("\\n", " ") + ": ");
            for (String topic : (List<String>) article.getTopics()) {
                System.out.print(topic + ",");
            }
            System.out.println();
        }
    }
}
// dom4j的XPath API足够优雅。而且速度也快。
package xml;
import java.util.*;
import java.io.*;
import org.dom4j.DocumentException;
import org.dom4j.io.SAXReader;
import org.dom4j.Element;
/**
 * Reads the Reuters XML file into {@link Document} objects using dom4j's
 * {@code selectNodes} / {@code selectSingleNode} XPath calls.
 */
public class Dom4jVersion {
    private List<Document> docList = new LinkedList<Document>();

    /** Returns the list of articles collected by the constructor. */
    public List<Document> getDocList() {
        return docList;
    }

    /** Replaces the list of articles. */
    public void setDocList(List<Document> docList) {
        this.docList = docList;
    }

    /**
     * Parses {@code reuters21578/reut2-000.xml} and stores one {@link Document}
     * per {@code /LEWIS/REUTERS} element in the article list.
     *
     * <p>The text of every article is always set: it is the text of
     * {@code TEXT/BODY}, or the empty string when the article has no such element.
     *
     * @throws DocumentException if the file cannot be read or parsed
     */
    @SuppressWarnings("unchecked")
    public Dom4jVersion() throws DocumentException {
        File file = new File("reuters21578/reut2-000.xml");
        SAXReader saxReader = new SAXReader();
        org.dom4j.Document xmlDoc = saxReader.read(file);
        xmlDoc.normalize();
        for (Element reutersElem : (List<Element>) xmlDoc.selectNodes("/LEWIS/REUTERS")) {
            Document doc = new Document();
            Element bodyElem = (Element) reutersElem.selectSingleNode("TEXT/BODY");
            // Articles without a BODY get "" rather than an unset text, so that
            // main() can call getText() safely and the result matches the
            // JDK-XPath variant, which yields "" for a missing body.
            doc.setText(bodyElem == null ? "" : bodyElem.getText());
            for (Element topicElem : (List<Element>) reutersElem.selectNodes("TOPICS/D")) {
                doc.getTopics().add(topicElem.getText());
            }
            docList.add(doc);
        }
    }

    /**
     * Loads the articles and prints, for each one, at most the first 30 characters
     * of its text (newlines replaced by spaces), then ": ", then every topic
     * followed by a comma.
     */
    public static void main(String[] args) throws DocumentException {
        Dom4jVersion ds = new Dom4jVersion();
        for (Document d : ds.getDocList()) {
            System.out.print(d.getText().substring(0, Math.min(30, d.getText().length()))
                    .replaceAll("\\n", " ")
                    + ": ");
            for (String topic : (List<String>) d.getTopics()) {
                System.out.print(topic + ",");
            }
            System.out.println();
        }
    }
}
主要原因是dom4j用的是jaxen的XPath实现,速度很快。下面直接在W3C DOM上使用jaxen:
package xml;
import java.util.*;
import java.io.*;
import javax.xml.parsers.*;
import org.jaxen.dom.*;
import org.jaxen.*;
import org.w3c.dom.*;
import org.xml.sax.SAXException;
/**
 * Reads the Reuters XML file into {@link Document} objects by evaluating Jaxen
 * {@code DOMXPath} expressions against a W3C DOM tree.
 */
public class JaxenVersion {
    private List<Document> docList = new LinkedList<Document>();

    /** Returns the list of articles collected by the constructor. */
    public List<Document> getDocList() {
        return docList;
    }

    /** Replaces the list of articles. */
    public void setDocList(List<Document> docList) {
        this.docList = docList;
    }

    /**
     * Parses {@code reuters21578/reut2-000.xml} and stores one {@link Document}
     * per {@code /LEWIS/REUTERS} element in the article list.
     *
     * <p>The text of every article is always set: it is the text content of the
     * first {@code TEXT/BODY} element, or the empty string when there is none.
     *
     * @throws ParserConfigurationException if the DOM parser cannot be created
     * @throws IOException if the file cannot be read
     * @throws SAXException if the file is not well-formed XML
     * @throws JaxenException if an XPath expression fails to compile or evaluate
     */
    @SuppressWarnings("unchecked") // the location paths below are written to select elements only
    public JaxenVersion() throws ParserConfigurationException,
            IOException, SAXException, JaxenException {
        File file = new File("reuters21578/reut2-000.xml");
        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        DocumentBuilder db = dbf.newDocumentBuilder();
        org.w3c.dom.Document d = db.parse(file);
        XPath xpExprReuters = new DOMXPath("/LEWIS/REUTERS");
        XPath xpExprTopics = new DOMXPath("TOPICS/D");
        XPath xpExprText = new DOMXPath("TEXT/BODY");
        d.getDocumentElement().normalize();
        for (Element reutersElem : (List<Element>) xpExprReuters.evaluate(d)) {
            Document doc = new Document();
            List<Element> bodyElems = (List<Element>) xpExprText.evaluate(reutersElem);
            // Articles without a BODY get "" rather than an unset text, so that
            // main() can call getText() safely and the result matches the
            // JDK-XPath variant, which yields "" for a missing body.
            doc.setText(bodyElems.isEmpty() ? "" : bodyElems.get(0).getTextContent());
            for (Element dElem : (List<Element>) xpExprTopics.evaluate(reutersElem)) {
                doc.getTopics().add(dElem.getTextContent());
            }
            docList.add(doc);
        }
    }

    /**
     * Loads the articles and prints, for each one, at most the first 30 characters
     * of its text (newlines replaced by spaces), then ": ", then every topic
     * followed by a comma.
     */
    public static void main(String[] args) throws ParserConfigurationException,
            SAXException, IOException, JaxenException {
        JaxenVersion ds = new JaxenVersion();
        for (Document d : ds.getDocList()) {
            System.out.print(d.getText().substring(0, Math.min(30, d.getText().length()))
                    .replaceAll("\\n", " ")
                    + ": ");
            for (String topic : (List<String>) d.getTopics()) {
                System.out.print(topic + ",");
            }
            System.out.println();
        }
    }
}
【 在 wks 的大作中提到: 】
: 主要问题是dom4j用的是jaxen的xpath实现,速度很快。
: package xml;
: import java.util.*;
: ...................
多么邪恶的签名档~