Python implementation of the Piwik HTTP API
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

tracker.py 17KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381
  1. # -*- coding: utf-8 -*-
  2. from hashlib import md5
  3. import json
  4. import logging
  5. import os
  6. import time
  7. import datetime
  8. import requests
  9. try:
  10. from urllib.parse import urljoin, urlencode
  11. except ImportError:
  12. from urlparse import urljoin
  13. from urllib import urlencode
  14. PARAMETERS = {
  15. # required parameters
  16. 'url': 'url', # The full URL for the current action.
  17. # recommended parameters
  18. 'action_name': 'action_name', # The title of the action being tracked.
  19. 'referer': 'urlref', # The full HTTP Referrer URL.
  20. 'visit_custom_vars': '_cvar', # Visit scope custom variables.
  21. 'visit_count': '_idvc', # The current count of visits for this visitor.
  22. 'view_timestamp': '_viewts', # The UNIX timestamp of this visitor's previous visit.
  23. 'first_visit_timestamp': '_idts', # The UNIX timestamp of this visitor's first visit.
  24. 'campaign_name': '_rcn', # The Campaign name (see Tracking Campaigns).
  25. 'campaign_keywords': '_rck', # The Campaign Keyword (see Tracking Campaigns).
  26. 'resolution': 'res', # The resolution of the device the visitor is using, eg 1280x1024.
  27. 'hour': 'h', # The current hour (local time).
  28. 'minute': 'm', # The current minute (local time).
  29. 'second': 's', # The current second (local time).
  30. 'flash': 'fla', # Flash,
  31. 'java': 'java', # Java
  32. 'director': 'dir', # Director,
  33. 'quicktime': 'qt', # Quicktime,
  34. 'real_player': 'realp', # Real Player,
  35. 'pdf': 'pdf', # PDF
  36. 'wma': 'wma', # Windows Media
  37. 'gears': 'gears', # Gears
  38. 'silverlight': 'ag', # Silverlight
  39. 'cookie': 'cookie', # when set to 1, the visitor's client is known to support cookies.
  40. 'user_agent': 'ua', # An override value for the User-Agent HTTP header field.
  41. 'lang': 'lang', # An override value for the Accept-Language HTTP header field. This value is used to detect the visitor's country if GeoIP is not enabled.
  42. 'user_id': 'uid', # defines the User ID for this request. User ID is any non empty unique string identifying the user (such as an email address or a username).
  43. 'visitor_id': 'cid', # defines the visitor ID for this request.
  44. 'new_visit': 'new_visit', # If set to 1, will force a new visit to be created for this action. This feature is also available in Javascript.
  45. # 'Optional Action info (measure Page view, Outlink, Download, Site search)',
  46. 'page_custom_vars': 'cvar', # Page scope custom variables.
  47. 'link': 'link', # An external URL the user has opened. Used for tracking outlink clicks. We recommend to also set the url parameter to this same value.
  48. 'download': 'download', # URL of a file the user has downloaded. Used for tracking downloads. We recommend to also set the url parameter to this same value.
  49. 'search_keyword': 'search', # The Site Search keyword. When specified, the request will not be tracked as a normal pageview but will instead be tracked as a Site Search request.
  50. 'search_category': 'search_cat', # when search is specified, you can optionally specify a search category with this parameter.
  51. 'search_count': 'search_count', # when search is specified, we also recommend to set the search_count to the number of search results displayed on the results page.
  52. 'goal_id': 'idgoal', # If specified, the tracking request will trigger a conversion for the goal of the website being tracked with this ID.
  53. 'revenue': 'revenue', # A monetary value that was generated as revenue by this goal conversion. Only used if idgoal is specified in the request.
  54. 'gt_ms': 'gt_ms', # The amount of time it took the server to generate this action, in milliseconds.
  55. 'charset': 'cs', # The charset of the page being tracked. Specify the charset if the data you send to Piwik is encoded in a different character set than the default utf-8.
  56. # Optional Event Tracking info
  57. 'event_category': 'e_c', # The event category. Must not be empty. (eg. Videos, Music, Games...)
  58. 'event_action': 'e_a', # The event action. Must not be empty. (eg. Play, Pause, Duration, Add Playlist, Downloaded, Clicked...)
  59. 'event_name': 'e_n', # The event name. (eg. a Movie name, or Song name, or File name...)
  60. 'event_value': 'e_v', # The event value. Must be a float or integer value (numeric), not a string.
  61. # Optional Content Tracking info
  62. 'content_name': 'c_n', # The name of the content. For instance 'Ad Foo Bar'
  63. 'content_piece': 'c_p', # The actual content piece. For instance the path to an image, video, audio, any text
  64. 'content_target': 'c_t', # The target of the content. For instance the URL of a landing page
  65. 'content_interaction': 'c_i', # The name of the interaction with the content. For instance a 'click'
  66. # Other parameters (require authentication via token_auth)
  67. 'token_auth': 'token_auth', # 32 character authorization key used to authenticate the API request.
  68. 'client_ip': 'cip', # Override value for the visitor IP (both IPv4 and IPv6 notations supported).
  69. 'client_dt': 'cdt', # Override for the datetime of the request (normally the current time is used).
  70. 'country': 'country', # An override value for the country. Should be set to the two letter country code of the visitor (lowercase), eg fr, de, us.
  71. 'region': 'region', # An override value for the region. Should be set to the two letter region code as defined by MaxMind's GeoIP databases.
  72. 'city': 'city', # An override value for the city. The name of the city the visitor is located in, eg, Tokyo.
  73. 'lat': 'lat', # An override value for the visitor's latitude, eg 22.456.
  74. 'long': 'long', # An override value for the visitor's longitude, eg 22.456.
  75. 'track_bots': 'bots', # Set to true to track bots
  76. 'heartbeat_timer': None, # Set to a positive integer to enable the heartbeat timer
  77. }
  78. AUTH_RESTRICTED_PARAMS = ('token_auth', 'client_ip', 'client_dt', 'country', 'region', 'city', 'lat', 'long')
  79. class PiwikTracker(object):
  80. """The PiwikTracker class is the base client for tracking visits."""
  81. API_VERSION = 1
  82. def __init__(self, piwik_url, site_id, request=None, values=None, **kwargs):
  83. """Creates a new PiwikTracker instance
  84. :param piwik_url The full qualified url to the tracking script (e.g. http://example.com/piwik/piwik.php)
  85. :param site_id The Piwik site id
  86. :param request (optional) a request object to copy values from
  87. :param values (optional) a dictionary with default tracking variables to use with this tracker instance
  88. :param kwargs (optiona) kwd arguments with default tracking variables to use with this tracker instance
  89. :rtype: A PiwikTracker instance
  90. """
  91. super(PiwikTracker, self).__init__()
  92. self.piwik_url = piwik_url
  93. self.idsite = site_id
  94. # initialize all tracking variables on this instance
  95. values = values or {}
  96. values.update(kwargs)
  97. self.update(dict((p, values.get(p, None)) for p in PARAMETERS.keys()))
  98. self.visit_custom_vars = {}
  99. self.page_custom_vars = {}
  100. self.spoof_request = True
  101. # defaults for the requests module
  102. self.request_headers = {}
  103. self.requests_arguments = {
  104. 'timeout': 3
  105. }
  106. # default filenames for the tracker file and the js file
  107. self.piwik_php_file = 'piwik.php'
  108. self.piwik_js_file = 'piwik.js'
  109. self.update_from_request(request)
  110. def update(self, values):
  111. for property_name in PARAMETERS.keys():
  112. if property_name in values:
  113. setattr(self, property_name, values[property_name])
  114. @property
  115. def php_url(self):
  116. return urljoin(self.piwik_url, self.piwik_php_file)
  117. @property
  118. def js_url(self):
  119. return urljoin(self.piwik_url, self.piwik_js_file)
  120. def update_from_request(self, request):
  121. """
  122. Initializes the current tracker instance from a Django-like requests object.
  123. If the request argument is None or does not have a dict as the META attribute, this function does nothing.
  124. """
  125. if not request:
  126. return
  127. meta = getattr(request, 'META', {})
  128. if not isinstance(meta, dict):
  129. return
  130. self.user_agent = meta.get('HTTP_USER_AGENT', None)
  131. self.referer = meta.get('HTTP_REFERER', None)
  132. self.lang = meta.get('HTTP_ACCEPT_LANGUAGE', None)
  133. if hasattr(request, 'build_absolute_uri'):
  134. bau = request.build_absolute_uri
  135. if callable(bau):
  136. self.url = bau()
  137. def _get_client_ip():
  138. if 'HTTP_X_FORWARDED_FOR' in meta:
  139. return meta['HTTP_X_FORWARDED_FOR'].split(",")[0]
  140. else:
  141. return meta.get('REMOTE_ADDR', None)
  142. self.client_ip = _get_client_ip()
  143. def _build_cvars(self, value):
  144. """
  145. Converts a custom vars dictionary to it's JSON representation usable for the Piwik API.
  146. """
  147. if not value:
  148. return None
  149. d = {}
  150. for i, item in enumerate(value.items(), start=1):
  151. d[i] = list(item)
  152. return json.dumps(d)
  153. def _build_parameters(self, **kwargs):
  154. d = {
  155. 'idsite': self.idsite,
  156. 'rec': '1',
  157. 'apiv': PiwikTracker.API_VERSION,
  158. }
  159. for property_name, parameter_name in PARAMETERS.items():
  160. if not parameter_name:
  161. continue
  162. value = kwargs.get(property_name, None) or getattr(self, property_name, None)
  163. token_auth = kwargs.get('token_auth', None) or getattr(self, 'token_auth', None)
  164. if value and property_name in AUTH_RESTRICTED_PARAMS and not token_auth:
  165. logging.info("Skipping %s because token_auth not set" % property_name)
  166. continue
  167. if value is None:
  168. continue
  169. if isinstance(value, bool):
  170. value = 1 if value else 0
  171. elif isinstance(value, datetime.datetime):
  172. if not value.tzinfo:
  173. logging.warning("Passing a naive datetime may result in wrong data. Make sure you pass a datetime object with UTC timezone")
  174. value = value.strftime('%Y-%m-%d %H:%M:%S')
  175. if property_name in ('page_custom_vars', 'visit_custom_vars'):
  176. value = self._build_cvars(value)
  177. if not value:
  178. continue
  179. d[parameter_name] = value
  180. return d
  181. def build_request_headers(self, params):
  182. headers = {
  183. 'Accept': '*/*',
  184. 'Accept-Encoding': 'gzip, deflate',
  185. }
  186. headers.update(self.request_headers)
  187. if self.spoof_request:
  188. # this is only used for server-to-server calls. By putting the values into the HTTP headers and dropping
  189. # them from the payload we will transfer less to the server while carrying the same information.
  190. for p, h in (('ua', 'User-Agent'), ('lang', 'Accept-Language'), ('urlref', 'Referer')):
  191. if p in params:
  192. headers[h] = params[p]
  193. del params[p]
  194. return headers
  195. def track_page_view(self, **kwargs):
  196. """Tracks a single page view with Piwik. The tracking variables are built from the the current instance values
  197. dictionary and the kwargs (if any)"""
  198. params = self._build_parameters(**kwargs)
  199. if '_id' not in params:
  200. params['_id'] = md5(os.urandom(16)).hexdigest()[:15]
  201. if '_idts' not in params:
  202. params['_idts'] = int(time.time())
  203. headers = self.build_request_headers(params)
  204. logging.debug("Tracking variables: %s" % params)
  205. logging.debug("Tracking headers: %s" % headers)
  206. try:
  207. response = requests.post(self.php_url, data=params, headers=headers, **self.requests_arguments)
  208. logging.debug("Tracking response: %s" % response)
  209. except:
  210. logging.exception("Tracking request failed")
  211. def track_page_view_bulk(self, tracking_vars, **kwargs):
  212. """Tracks multiple page views at once using Piwik bulk tracking API. The tracking variables for each single
  213. page view to track are built from the corresponding dictionary in tracking_vars, the current instance values
  214. dictionary and the given kwargs (if any).
  215. The caller is responsible to provide the tracking variables in tracking_vars in chronologically order (oldest
  216. first).
  217. :param tracking_vars a list of dicts with the tracking variables
  218. """
  219. bulk_data = []
  220. for vars in tracking_vars:
  221. d = kwargs.copy()
  222. d.update(vars)
  223. params = self._build_parameters(**d)
  224. if '_id' not in params:
  225. params['_id'] = md5(os.urandom(16)).hexdigest()[:15]
  226. if '_idts' not in params:
  227. params['_idts'] = int(time.time())
  228. bulk_data.append("?" + urlencode(params))
  229. try:
  230. data = {'requests': bulk_data}
  231. logging.debug("Tracking variables: %s" % bulk_data)
  232. response = requests.post(self.php_url,
  233. data=json.dumps(data),
  234. headers=self.build_request_headers({}),
  235. **self.requests_arguments)
  236. logging.debug("Bulk tracking response: %s" % response)
  237. except:
  238. logging.exception("Bulk tracking request failed")
  239. def tracking_code(self, **kwargs):
  240. return TrackingCodeBuilder(self).render(self._build_parameters(**kwargs))
  241. class TrackingCodeBuilder(object):
  242. template = """<script type="text/javascript">
  243. var _paq = _paq || [];
  244. {custom_vars}
  245. {event_tracking}
  246. {js_vars}
  247. _paq.push(['trackPageView']);
  248. _paq.push(['enableLinkTracking']);
  249. (function() {{
  250. _paq.push(['setTrackerUrl', '{tracker_url}']);
  251. _paq.push(['setSiteId', {idsite}]);
  252. var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
  253. g.type='text/javascript'; g.async=true; g.defer=true; g.src='{javascript_url}'; s.parentNode.insertBefore(g,s);
  254. }})();
  255. </script>
  256. <noscript><p><img src="{tracker_url}?{tracking_args}" style="border:0;" alt="" /></p></noscript>"""
  257. def __init__(self, tracker):
  258. self.tracker = tracker
  259. def _paq_push(self, l):
  260. # python3's filter() does not return a list
  261. return "_paq.push(%s);" % json.dumps(list(l))
  262. def _event_tracker(self):
  263. if not (self.tracker.event_category and self.tracker.event_action):
  264. return ""
  265. l = ['trackEvent', self.tracker.event_category, self.tracker.event_action, self.tracker.event_name, self.tracker.event_value]
  266. return self._paq_push(filter(lambda x: x, l))
  267. def _custom_vars(self):
  268. def _inner():
  269. for d, scope in (self.tracker.page_custom_vars, 'page'), (self.tracker.visit_custom_vars, 'visit'):
  270. for i, item in enumerate(d.items(), start=1):
  271. if i > 5:
  272. break
  273. k, v = item
  274. l = ['setCustomVariable', i, k, v, scope]
  275. yield self._paq_push(l)
  276. return '\n'.join(_inner())
  277. def _common_vars(self, params):
  278. def _inner():
  279. extra_tracking_params = {}
  280. if 'url' in params:
  281. yield self._paq_push(['setCustomUrl', params['url']])
  282. if 'urlref' in params:
  283. yield self._paq_push(['setReferrerUrl', params['urlref']])
  284. if 'action_name' in params:
  285. yield self._paq_push(['setDocumentTitle', params['action_name']])
  286. if 'new_visit' in params and params['new_visit']: # http://piwik.org/faq/how-to/#faq_187
  287. extra_tracking_params['new_visit'] = 1
  288. yield self._paq_push(["deleteCookies"])
  289. if self.tracker.heartbeat_timer and int(self.tracker.heartbeat_timer) > 0:
  290. yield self._paq_push(['enableHeartBeatTimer', self.tracker.heartbeat_timer])
  291. if 'bots' in params and params['bots']:
  292. extra_tracking_params['bots'] = 1
  293. if extra_tracking_params:
  294. yield self._paq_push(['appendToTrackingUrl', urlencode(extra_tracking_params)])
  295. return '\n'.join(_inner())
  296. def render(self, params):
  297. # remove all tracking variables which doesn't do any good when used with the image or javascript
  298. # tracking api
  299. for x in ['url', 'ua', 'lang'] + [PARAMETERS[x] for x in AUTH_RESTRICTED_PARAMS]:
  300. if x in params:
  301. del params[x]
  302. return TrackingCodeBuilder.template.format(tracker_url=self.tracker.php_url,
  303. javascript_url=self.tracker.js_url,
  304. idsite=self.tracker.idsite,
  305. tracking_args=urlencode(params),
  306. event_tracking=self._event_tracker(),
  307. custom_vars=self._custom_vars(),
  308. js_vars=self._common_vars(params),
  309. )