Python - Trouble getting data from an HTML file into a JSON dict
I'm trying to scrape a website, and so far my code has gotten me the HTML copied below. However, I want to access the 'defjson:' part of the variable tablelist and parse the data in there. It looks like .split('\n') is not working, and splitting on spaces will not work either, because the data in defjson contains random spaces. Is there another way to access defjson?
<script type="text/javascript"> function _dozoom(t){ setcookie(fontcookiename,h,9999); var a=document.getelementsbytagname("dl"); var k=[],c=[],g=[]; var b=a.length; for(var f=0;f<b;f++){ if(a[f].getelementsbytagname("span").length>0){ g.push(a[f]);k.push(a[f].getelementsbytagname("span")[0]); c.push(a[f].getelementsbytagname("span")[1]) } } b=g.length; var e=document.getelementbyid("combinationscontainer"); switch(parseint(h)){ case 0:e.style.fontsize="14px"; for(var d=0;d<b;d++){ k[d].classname="at";c[d].classname="" }break; case 1:e.style.fontsize="12px"; for(var d=0;d<b;d++){k[d].classname="";c[d].classname="at" }break } } var tablelist = new loadtable({ id: "dt_1", sort: { id: "12", desc: true }, cells: [{ "n": "股票<font class=\"ssp\" style=\"text-decoration: underline;\">代码</font>", "s": "5" }, { "n": "股票<font class=\"ssp2\">简称</font>", "w": "58" }, { "n": "相关资料", "w": "120" }, { "n": "申购<br />代码", "w": "44" }, { "n": "发行<br />总数<br />(万股)", "s": "7" }, { "n": "网上<br />发行<br />(万股)", "s": "8" }, { "n": "顶格申购<br />需配市值<br />(万元)<i title=\"黑色斜体部分为预估值,顶格申购需配市值的预估不涉及公司基本面分析,请谨慎参考。\"> </i>" }, dataurl: "http://datainterface.eastmoney.com/em_datacenter/js.aspx?type=ns&sty=nsst&st={sorttype}&sr={sortrule}&p={page}&ps={pagesize}&js=var {jsname}={pages:(pc),data:[(x)]}{param}", ... defjson:{pages:18,data:[",23772,80199130,万里石,002785,002785,50000000.00,20000000.00,,,,2015-07-08,2015-07-13,,,,,,,zxb,500.00,,,,,,http://topic.eastmoney.com/xmwlsipo/,中高端石材综合服务商‚一直专注于建筑装饰石材及景观石材的研发设计、生产和销售。,2.29,2.00,4.59,,0,,2.29,2,4.58,20.00,待上市,,,,,an201506250010008785,22.9,41.15"
def scrape():
    """Print the third-from-last <script> element of test.html.

    Reads 'test.html' from the current working directory, parses it with
    BeautifulSoup, selects the third-from-last <script> tag and prints it.
    Returns None.

    Raises:
        IOError/OSError: if 'test.html' cannot be opened.
        IndexError: if the document has fewer than three <script> tags.
    """
    # A context manager closes the file handle even if read() fails.
    with open('test.html', 'r') as html_file:
        htmltext = html_file.read()

    # NOTE(review): the class name appeared lower-cased in the post; assumed
    # to be bs4's BeautifulSoup (find_all is a bs4 method) -- confirm against
    # the script's import line.
    bs = BeautifulSoup(htmltext)

    # find_all returns matches in document order; [-3] is third from the end.
    scripts = bs.find_all('script')[-3]

    # Parenthesised form prints the same under Python 2 and Python 3.
    print(scripts)
So far, the code gets the above part of the HTML file. I then tried adding in:
# Attempted extraction of the defjson payload from the selected <script> tag.
# Only the syntax is repaired here (the `for` keyword and the line breaks);
# the logic is the attempt exactly as posted.
# NOTE(review): `scripts` comes from bs.find_all(...)[-3], so it is presumably
# a BeautifulSoup Tag rather than a str -- a likely source of the NoneType
# error reported for this snippet; confirm.
for line in scripts.split('\n'):
    # Keep only the line that carries the defjson payload.
    if line.startswith('defjson: '):
        jsonstr = line.replace('defjson: ', '')
        datadict = json.loads(jsonstr)
But this gives me a NoneType error. Is there a better way to access the 'defjson:' part and load it into a dictionary?
Comments
Post a Comment