副标题#e#
上篇文章:http://www.voidcn.com/article/p-nsbrwwsu-zv.html (挖掘DBLP作者合作关系,FP-Growth算法实践(1):从DBLP数据集中提取目标信息(会议、作者等))
大家反映上篇文章的代码不能用,主要是太慢了。好吧,我也承认慢:DOM 方式要先在内存中构造整棵树,处理 DBLP 这样的大文件肯定快不了!
这次给出另外两种解析方式:SAX,以及按行把 XML 当作字符串处理。
为了完整,先给出dom:
#do not use this code!
def DomParser():
    """Extract conference papers from an XML file using a DOM tree.

    The whole document is loaded into memory first, which is why this
    variant is slow on large inputs.

    Reads these module-level names, all defined outside this function:
    ``fileName`` (path of the XML file), ``fromYear`` (lower bound that is
    compared to the year text as a string), ``confNameDict`` (container of
    accepted conference names), ``allList`` (list that receives one
    "confName \\t year \\t title \\t author1|author2|..|authorn" string per
    accepted paper) and ``authorDict`` (maps author ->
    [frequence, yearStart, yearEnd]).  Both ``allList`` and ``authorDict``
    are mutated in place.

    Writes ``allDB.txt`` (the paper lines) and ``authorDB.txt`` (one
    "author \\t frequence \\t yearStart \\t yearEnd" line per author, sorted
    by the [frequence, yearStart, yearEnd] list in descending order).
    """
    domTree = parse(fileName)
    dblp = domTree.documentElement
    for inproceedings in dblp.getElementsByTagName("inproceedings"):
        year = inproceedings.getElementsByTagName("year")[0]
        yearStr = str(year.childNodes[0].data)
        if yearStr < fromYear:
            continue
        print("yearStr %s %s" % (yearStr, "==" * 20))

        booktitle = inproceedings.getElementsByTagName("booktitle")[0]
        booktitleStr = str(booktitle.childNodes[0].data)
        # Keep only the first word, e.g. for
        # "<booktitle>ICML Unsupervised and Transfer Learning</booktitle>"
        booktitleStr = booktitleStr.split(" ")[0]
        if booktitleStr not in confNameDict:
            continue
        print("booktitleStr %s %s" % (booktitleStr, "^^" * 20))

        title = inproceedings.getElementsByTagName("title")[0]
        titleStr = str(title.childNodes[0].data)

        authorNames = []
        for author in inproceedings.getElementsByTagName("author"):
            authorStr = str(author.childNodes[0].data)
            authorNames.append(authorStr)
            stats = authorDict.get(authorStr)
            if stats is None:
                # First sighting: the first and last active year are the same.
                authorDict[authorStr] = [1, yearStr, yearStr]
            else:
                stats[0] += 1
                if yearStr < stats[1]:
                    stats[1] = yearStr
                elif yearStr > stats[2]:
                    stats[2] = yearStr

        # Authors are joined with "|" and carry no trailing separator, the
        # same layout the SAX and line-based parsers produce.
        allList.append("\t".join(
            [booktitleStr, yearStr, titleStr, "|".join(authorNames)]))

    with open("allDB.txt", "w") as wf:
        wf.write("\n".join(allList))

    authorList = sorted(authorDict.items(),
                        key=lambda item: item[1], reverse=True)
    with open("authorDB.txt", "w") as wf:
        wf.write("\n".join(
            author + "\t" + str(frequence) + "\t" + yearStart + "\t" + yearEnd
            for author, (frequence, yearStart, yearEnd) in authorList))
再给出sax:
class SAX_PARSER(xml.sax.ContentHandler):
    """SAX handler that appends accepted <inproceedings> records to allDB.txt.

    Callback overview:

    * startDocument() is called when the document starts.
    * endDocument() is called when the parser reaches the end of the document.
    * startElement(name, attrs) is called for every opening tag; ``name`` is
      the tag name and ``attrs`` holds the tag's attributes.
    * endElement(name) is called for every closing tag.
    * characters(content) is called for the text found between two tags
      (opening or closing) or between a tag and a line end.  The text of a
      single element may arrive in several calls, so it is buffered and only
      assembled when the element closes.

    A record is written as
    "confName \\t year \\t title \\t author1|author2|..|authorn\\n" when its
    conference name is in the module-level ``confNameDict`` and its year is
    not smaller (string comparison) than the module-level ``fromYear``; both
    names are defined outside this class.
    """

    # Sub-elements of <inproceedings> whose text is collected.
    _FIELDS = ("author", "title", "year", "booktitle")

    def __init__(self):
        xml.sax.ContentHandler.__init__(self)
        self._reset()

    def _reset(self):
        """Forget everything collected for the current record."""
        self.authorList = ""   # "author1|author2|...|" (trailing "|")
        self.title = ""
        self.year = ""
        self.booktitle = ""
        self.flag = 0          # 1 while inside an <inproceedings> element
        self.tag = ""          # field currently being collected, "" if none
        self._chunks = []      # text pieces of the field being collected

    def startDocument(self):
        print("Document start %s" % ("==" * 20))

    def endDocument(self):
        print("Document end %s" % ("==" * 20))

    def startElement(self, tag, attributes):
        print("startElement %s %s" % ("ss" * 20, tag))
        if tag == "inproceedings":
            self.flag = 1
        elif self.flag == 1 and self.tag == "" and tag in self._FIELDS:
            # Entering a field of interest.  Markup nested inside a field
            # (e.g. <i> within <title>) does not replace self.tag, so its
            # text still ends up in the enclosing field.
            self.tag = tag
            self._chunks = []

    def endElement(self, tag):
        print("endElement %s %s" % ("ee" * 20, tag))
        if self.flag != 1:
            return
        if tag == "inproceedings":
            if self.booktitle in confNameDict and self.year >= fromYear:
                # [:-1] drops the "|" appended after the last author.
                allContent = (self.booktitle + "\t" + self.year + "\t" +
                              self.title + "\t" + self.authorList[:-1] + "\n")
                with open("allDB.txt", "a") as wf:
                    wf.write(allContent)
            self._reset()
        elif tag == self.tag:
            # The field is complete: assemble its chunks and store the value.
            text = "".join(self._chunks)
            if tag == "author":
                self.authorList += text + "|"
            elif tag == "title":
                self.title = text
            elif tag == "year":
                self.year = text
            else:
                # Keep only the first word, e.g. for
                # "<booktitle>ICML Unsupervised and Transfer Learning</booktitle>"
                self.booktitle = text.split(" ")[0]
            # Clearing the tag keeps the whitespace between elements from
            # being attributed to the field that has just ended.
            self.tag = ""
            self._chunks = []

    def characters(self, content):
        print("characters %s %s" % ("cc" * 20, content))
        if self.flag == 1:  # we are now in "inproceedings" tag
            print(self.tag)
            if self.tag:
                self._chunks.append(content)
最后给出string,把每行看成字符串来处理的方式:
#p#副标题#e##p#分页标题#e#
def XmlLineParser(fileName):
    """Extract conference papers by scanning the XML file line by line.

    The file is treated as plain text and is expected to hold one tag per
    line, each starting at column 0.  For every <inproceedings> record whose
    year is not smaller (string comparison) than the module-level
    ``fromYear`` and whose conference name (first word of <booktitle>) is in
    the module-level ``confNameDict``, one line
    "confName \\t year \\t title \\t author1|author2|..|authorn" is appended
    to ``tranDB.txt``.  ``fromYear`` and ``confNameDict`` are defined outside
    this function.

    ``tranDB.txt`` is opened once in append mode for the whole run, so it is
    created even when no record is accepted.

    Args:
        fileName: path of the XML file to scan.
    """
    # Accepts both "<author>" and "<author attr=...>"; non-greedy so that
    # several authors on one line are captured separately.
    authorPattern = re.compile(r'<author(?:\s[^>]*)?>(.*?)</author>', re.S)

    with open(fileName, "r") as rf, open("tranDB.txt", "a") as wf:
        for line in rf:
            if not line.startswith("<inproceedings"):
                continue
            print("line [1] %s" % line)
            booktitle = ""
            year = ""
            title = ""
            authors = []
            # The inner loop consumes the same file iterator: it reads the
            # lines of the current record and hands control back to the outer
            # loop as soon as the record is rejected or completed.
            for line in rf:
                print("line [2] %s" % line)
                if line.startswith("<author"):
                    authors.extend(authorPattern.findall(line))
                elif line.startswith("<title"):
                    # Cut at the closing </title> so inline markup such as
                    # <i>..</i> does not truncate the title.
                    title = line[7:].partition("</title>")[0]
                elif line.startswith("<year"):
                    year = line[6:10]
                    if year < fromYear:
                        break
                elif line.startswith("<booktitle"):
                    booktitle = line[11:].split("</")[0].split(" ")[0]
                    if booktitle not in confNameDict:
                        break
                elif line.startswith("</inproceedings"):
                    # "|".join keeps the tab before the author field even
                    # when the record has no authors.
                    wf.write(booktitle + "\t" + year + "\t" + title + "\t" +
                             "|".join(authors) + "\n")
                    break
直接调用即可:
# Run the line-based parser. fileName is defined elsewhere (presumably the path of the DBLP XML dump; confirm).
XmlLineParser(fileName)
拿走不谢,代码写得有点水,见谅。