Legendre prime counting function
 
=={{header|Chapel}}==
 
There seems to be little point in contributing memoized (hash table) versions, as they appear to stem from the task author not realizing that there is a simple way of preventing the computational complexity from growing exponentially with the counting range: limit the "splitting" of the "phi" calculation once the result must be exactly one. Accordingly, the following Chapel versions are [https://rosettacode.org/wiki/Legendre_prime_counting_function#Non-Memoized_Versions translated and improved from the non-memoizing Nim versions], whose explanations can be referenced for an understanding of how they work.
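For reference, the recursion whose growth is being limited is the standard Legendre one (general background rather than anything specific to the code below): with <math>a = \pi(\sqrt{n})</math>,
: <math>\pi(n) = \phi(n, a) + a - 1</math>
: <math>\phi(x, a) = \phi(x, a - 1) - \phi(\lfloor x / p_a \rfloor, a - 1), \qquad \phi(x, 0) = \lfloor x \rfloor</math>
The key observation is that whenever <math>x \le p_a</math> the entire sub-tree collapses to <math>\phi(x, a) = 1</math> (only the number 1 survives the sieving), so the recursion can return immediately instead of splitting all the way down.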
 
'''The Basic Recursive Legendre Algorithm'''
{{trans|Nim}}
<syntaxhighlight lang="chapel">// compile with --fast for maximum speed...
 
use Time;
 
proc countPrimes(lmt: uint(64)): int(64) {
if lmt < 9 { // when there are no odd primes less than square root...
if lmt < 3 { if lmt < 2 { return 0; } else { return 1; } }
return (lmt - (lmt >> 1)): int(64);
}
 
// Chapel doesn't have closures, so emulate them with a class...
class LegendrePi {
var n: uint(64);
var dom: domain(1);
var oprms: [dom] uint(32);
proc init(n: uint(64)) {
// first, an array of odd primes to the square root of n is generated...
this.n = n;
const sqrtn = sqrt(n: real(64)): int(64);
const rtlmt = (sqrtn - 3) / 2; this.dom = {0 .. rtlmt};
this.oprms = 0;
for i in 0 .. rtlmt do this.oprms[i] = (i + i + 3): uint(32);
var i = 0;
for i in (0 ..) { // cull the array
var ci = (i + i) * (i + 3) + 3; if ci > rtlmt { break; }
const bp = i + i + 3;
while (ci <= rtlmt) { this.oprms[ci] = 0; ci += bp; }
}
var psz = 0;
for ti in 0 .. rtlmt { // compress the odd primes array...
const tv = this.oprms[ti];
if tv != 0 { this.oprms[psz] = tv; psz += 1; }
}
this.dom = { 0 ..< psz };
}
proc phi(x: uint(64), a: int): int(64) {
if a <= 0 { return (x - (x >> 1)): int(64); } // take care of prime of 2
const na = a - 1; const p = this.oprms[na]: uint(64);
if x <= p { return 1: int(64); }
return phi(x, na) - phi(x / p, na);
}
proc this(): int(64) {
return phi(n, this.oprms.size) + this.oprms.size: int(64);
}
}
return (new LegendrePi(lmt))();
}
 
proc main() {
var timer: Timer;
timer.start();
 
for i in 0 .. 9 {
writeln("π(10**", i, ") = ", countPrimesx(10: uint(64) ** i));
}
 
timer.stop();
 
writeln("This took ", timer.elapsed(TimeUnits.milliseconds), " milliseconds.");
}</syntaxhighlight>
{{out}}
<pre>π(10**0) = 0
π(10**1) = 4
π(10**2) = 25
π(10**3) = 168
π(10**4) = 1229
π(10**5) = 9592
π(10**6) = 78498
π(10**7) = 664579
π(10**8) = 5761455
π(10**9) = 50847534
This took 276.431 milliseconds.</pre>
The timing is as run on an Intel i5-6500 (3.6 GHz single-threaded boost).
 
Although this is about ten times faster than a memoized version (and uses much less memory), it is really only usable for trivial ranges such as this one (trivial in terms of prime counting functions).
 
'''The Less Recursive "Bottom-Up" Algorithm with a TinyPhi LUT'''
 
Just substitute the following Chapel code for the <code>countPrimes</code> function above to enjoy the gain in speed:
{{trans|Nim}}
<syntaxhighlight lang="chapel">// tiny Phi Look Up for `a` of small degree...
const tinyPhiPrimes = [ 2, 3, 5, 7, 11, 13 ]; // degree six
const cC = tinyPhiPrimes.size - 1;
proc product(a: [] int): int {
var acc = 1; for v in a { acc *= v; }; return acc >> 1; }
const tinyPhiOddCirc = product(tinyPhiPrimes);
proc tot(a: [] int): int {
var acc = 1; for v in a { acc *= v - 1; }; return acc; }
const tinyPhiOddTot = tot(tinyPhiPrimes);
proc makeTinyLUT(ps: [] int, sz: int): [] uint(32) {
var arr: [0 .. sz - 1] uint(32) = 1;
for p in ps {
if p <= 2 { continue; }
arr[p >> 1] = 0;
for c in ((p * p) >> 1) ..< sz by p { arr[c] = 0; }
}
var acc = 0: uint(32);
for i in 0 ..< sz { acc += arr[i]; arr[i] = acc; }
return arr;
}
const tinyPhiLUT = makeTinyLUT(tinyPhiPrimes, tinyPhiOddCirc);
inline proc tinyPhi(x: uint(64)): int(64) {
const ndx = (x - 1) >> 1; const numtot = ndx / tinyPhiOddCirc: uint(64);
return (numtot * tinyPhiOddTot +
tinyPhiLUT[(ndx - numtot * tinyPhiOddCirc): int]): int(64);
}
 
proc countPrimes(lmt: uint(64)): int(64) {
if lmt < 169 { // below 169 whose sqrt is 13 is where TinyPhi doesn't work...
if lmt < 3 { if lmt < 2 { return 0; } else { return 1; } }
// adjust for the missing "degree" base primes
if lmt <= 13 {
return ((lmt - 1): int(64) >> 1) + (if (lmt < 9) then 1 else 0); }
return 5 + tinyPhiLUT[(lmt - 1): int >> 1]: int(64);
}
 
// Chapel doesn't have closures, so emulate them with a class...
class LegendrePi {
var n: uint(64);
var dom: domain(1);
var oprms: [dom] uint(32);
proc init(n: uint(64)) {
// first, an array of odd primes to the square root of n is generated...
this.n = n;
const sqrtn = sqrt(n: real(64)): int(64);
const rtlmt = (sqrtn - 3) / 2; this.dom = {0 .. rtlmt};
this.oprms = 0;
for i in 0 .. rtlmt do this.oprms[i] = (i + i + 3): uint(32);
var i = 0;
for i in (0 ..) { // cull the array
var ci = (i + i) * (i + 3) + 3; if ci > rtlmt { break; }
const bp = i + i + 3;
while (ci <= rtlmt) { this.oprms[ci] = 0; ci += bp; }
}
var psz = 0;
for ti in 0 .. rtlmt { // compress the odd primes array...
const tv = this.oprms[ti];
if tv != 0 { this.oprms[psz] = tv; psz += 1; }
}
this.dom = { 0 ..< psz };
}
proc lvl(pilmt: int, m: uint(64)): int(64) {
var acc = 0: int(64);
for pi in cC ..< pilmt {
const p = this.oprms[pi]: uint(64); const nm = m * p;
if this.n <= nm * p { return acc + (pilmt - pi); }
if pi > cC { acc -= this.lvl(pi, nm); }
const q = this.n / nm; acc += tinyPhi(q);
}
return acc;
}
proc this(): int(64) {
return tinyPhi(this.n) - this.lvl(this.oprms.size, 1)
+ this.oprms.size: int(64);
}
}
return (new LegendrePi(lmt))();
}</syntaxhighlight>
 
The above function is about a further ten times faster than the previous version due to less recursion and the use of the "TinyPhi" Look Up Table (LUT) for the smaller "degrees" of primes, which greatly reduces the number of integer divisions. This version of the prime counting function might be considered reasonably useful for counting primes up to a trillion (1e12) in a few tens of seconds.
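As a small hand check of how the LUT is used (an illustrative example, not part of the code): for <math>x = 100</math> the odd-number index is <math>(100 - 1) / 2 = 49</math>, and since 49 is less than the wheel size of 15015 the result is just <code>tinyPhiLUT[49]</code>, the count of numbers up to 100 with no prime factor of 13 or less — the number 1 plus the nineteen primes from 17 to 97, giving <math>\phi(100, 6) = 20</math>; for larger arguments, a whole-wheel count of 5760 per 15015 odd numbers is added in first, avoiding any division by the six smallest primes.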
 
'''The Non-Recursive Partial Sieving Algorithm'''
 
Just substitute the following Chapel code for the <code>countPrimes</code> function above to enjoy the further gain in speed:
{{trans|Nim}}
<syntaxhighlight lang="chapel">const masks = for i in 0 .. 7 do (1 << i): uint(8); // faster bit twiddling
 
proc countPrimes(lmt: uint(64)): int(64) {
if lmt < 3 { if lmt < 2 { return 0; } else { return 1; } } // odds only!
inline proc half(x: int): int { return (x - 1) >> 1; } // convenience function
inline proc divide(nm: uint(64), d: uint(64)): int {
return (nm: real(64) / d: real(64)): int; } // floating point div faster
const sqrtn = sqrt(lmt: real(64)): uint(64);
const mxndx = (sqrtn - 1): int / 2;
const dom = {0 .. mxndx}; const csz = (mxndx + 8) / 8;
var smalls = for i in dom do i: uint(32);
var roughs = for i in dom do (i + i + 1): uint(32);
var larges = for i in dom do ((lmt / (i + i + 1)) - 1) >> 1;
var cullbuf: [0 ..< csz] uint(8);
 
// partial sieve loop, adjusting larges/smalls, compressing larges/roughs...
var nobps = 0; var rilmt = mxndx;
for bp in 3: uint(64) .. by 2 {
const i = (bp >> 1): int; const sqri = (i + i) * (i + 1);
if sqri > mxndx { break; } // up to quad root of counting range
if (cullbuf[i >> 3] & masks[i & 7]) != 0 { continue; } // loop not prime
cullbuf[i >> 3] |= masks[i & 7]; // cull bp itself as not a rough
for ci in sqri .. mxndx by bp { // do partial sieving pass for `bp`...
cullbuf[ci >> 3] |= masks[ci & 7]; } // cull all multiples of `bp`
 
// now adjust `larges` for latest partial sieve pass...
var ori = 0; // compress input rough index to output one
for iri in 0 .. rilmt {
const r = roughs[iri]: uint(64); const rci = (r >> 1): int;
if (cullbuf[rci >> 3] & masks[rci & 7]) != 0 {
continue; } // skip culled roughs in last partial sieving pass
const d = bp: uint(64) * r;
larges[ori] = larges[iri] -
(if d <= sqrtn then
larges[smalls[(d >> 1): int] - nobps]
else smalls[half(divide(lmt, d))]: uint(64)) + nobps;
roughs[ori] = r: uint(32); ori += 1;
}
 
var si = mxndx; // and adjust `smalls` for latest partial sieve pass...
for bpm in bp .. (sqrtn / bp - 1) | 1 by -2 {
const c = smalls[(bpm >> 1): int] - nobps: uint(32);
const e = ((bpm * bp) >> 1): int;
while si >= e { smalls[si] -= c; si -= 1; }
}
 
nobps += 1; rilmt = ori - 1;
}
 
var ans = larges[0]; // answer from larges, adjusting for over subtraction...
for i in 1 .. rilmt { ans -= larges[i]; } // combine!
ans += (rilmt + 1 + 2 * (nobps - 1)) * rilmt / 2; // adjust!
 
// add final adjustment for pairs of current roughs to cube root of range...
for ri in (1 ..) { // break when reaches cube root of counting range...
const p = roughs[ri]: uint(64); const q = lmt / p;
const ei = smalls[half(divide(q, p))]: int - nobps;
if ei <= ri { break; } // break here when no more pairs!
for ori in ri + 1 .. ei { // for all pairs never the same prime!
ans += smalls[half(divide(q, roughs[ori]))]: int(64); }
// adjust for over subtractions above...
ans -= (ei - ri): uint(64) * (nobps: uint(64) + ri: uint(64) - 1);
}
 
return ans: int(64) + 1; // add one for only even prime of two!
}</syntaxhighlight>
 
The above function enjoys quite a large further gain in speed, about ten times faster than the immediately preceding version, because it uses no recursion at all and has a greatly reduced computational complexity of O(n**(3/4)/((log n)**2)) rather than the almost-linear-for-large-counting-ranges O(n/((log n)**2)) of the previous two versions (which themselves differ in performance only by a constant factor due to the overheads of recursion and division); for instance, this version can count the primes to 1e11 in about 22 milliseconds. This version of the prime counting function might be considered reasonably useful for counting primes up to 1e16 in under a minute, as long as the computer used has the required free memory of about 800 Megabytes. This Chapel version is about the same speed as the Nim version from which it was translated, except that the LLVM back-end used here produces code a few percent slower than that produced by the GCC compiler used as a back-end by Nim, at least for this algorithm.
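The roughly 800 Megabytes quoted above can be sanity checked against the arrays in the code (a rough estimate only, ignoring allocator and runtime overheads): for a range of 1e16 the square root is 1e8, so <code>mxndx</code> is about 5e7; <code>smalls</code> and <code>roughs</code> have four-byte elements and <code>larges</code> has eight-byte elements, so the three arrays take about 5e7 × (4 + 4 + 8) = 8e8 bytes, with the bit-packed <code>cullbuf</code> adding only a further six Megabytes or so.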
 
=={{header|CoffeeScript}}==
 
=={{header|Crystal}}==
 