12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
75278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581558255835584558555865587558855895590559155925593559455955596559755985599560056015602560356045605560656075608560956105611561256135614561556165617561856195620562156225623562456255626562756285629563056315632563356345635563656375638563956405641564256435644564556465647564856495650565156525653565456555656565756585659566056615662566356645665566656675668566956705671567256735674567556765677567856795680568156825683568456855686568756885689569056915692569356945695569656975698569957005701570257035704570557065707570857095710571157125713571457155716571757185719572057215722572357245725572657275728572957305731573257335734573557365737573857395740574157425743574457455746574757485749575057515752575357545755575657575758575957605761576257635764576557665767576857695770577157725773577457755776577
75778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006600760086009601060116012601360146015601660176018601960206021602260236024602560266027602860296030603160326033603460356036603760386039604060416042604360446045604660476048604960506051605260536054605560566057605860596060606160626063606460656066606760686069607060716072607360746075607660776078607960806081608260836084608560866087608860896090609160926093609460956096609760986099610061016102610361046105610661076108610961106111611261136114611561166117611861196120612161226123612461256126612761286129613061316132613361346135613661376138613961406141614261436144614561466147614861496150615161526153615461556156615761586159616061616162616361646165616661676168616961706171617261736174617561766177617861796180618161826183618461856186618761886189619061916192619361946195619661976198619962006201620262036204620562066207620862096210621162126213621462156216621762186219622062216222622362246225622662276228622962306231623262336234623562366237623862396240624162426243624462456246624762486249625062516252625362546255625662576258625962606261626262636264626562666267626862696270627162726273627462756276627
76278627962806281628262836284628562866287628862896290629162926293629462956296629762986299630063016302630363046305630663076308630963106311631263136314631563166317631863196320632163226323632463256326632763286329633063316332633363346335633663376338633963406341634263436344634563466347634863496350635163526353635463556356635763586359636063616362636363646365636663676368636963706371637263736374637563766377637863796380638163826383638463856386638763886389639063916392639363946395639663976398639964006401640264036404640564066407640864096410641164126413641464156416641764186419642064216422642364246425642664276428642964306431643264336434643564366437643864396440644164426443644464456446644764486449645064516452645364546455645664576458645964606461646264636464646564666467646864696470647164726473647464756476647764786479648064816482648364846485648664876488648964906491649264936494649564966497649864996500650165026503650465056506650765086509651065116512651365146515651665176518651965206521652265236524652565266527652865296530653165326533653465356536653765386539654065416542654365446545654665476548654965506551655265536554655565566557655865596560656165626563656465656566656765686569657065716572657365746575657665776578657965806581658265836584658565866587658865896590659165926593659465956596659765986599660066016602660366046605660666076608660966106611661266136614661566166617661866196620662166226623662466256626662766286629663066316632663366346635663666376638663966406641664266436644664566466647664866496650665166526653665466556656665766586659666066616662666366646665666666676668666966706671667266736674667566766677667866796680668166826683668466856686668766886689669066916692669366946695669666976698669967006701670267036704670567066707670867096710671167126713671467156716671767186719672067216722672367246725672667276728672967306731673267336734673567366737673867396740674167426743674467456746674767486749675067516752675367546755675667576758675967606761676267636764676567666767676867696770677167726773677467756776677
76778677967806781678267836784678567866787678867896790679167926793679467956796679767986799680068016802680368046805680668076808680968106811681268136814681568166817681868196820682168226823682468256826682768286829683068316832683368346835683668376838683968406841684268436844684568466847684868496850685168526853685468556856685768586859686068616862686368646865686668676868686968706871687268736874687568766877687868796880688168826883688468856886688768886889689068916892689368946895689668976898689969006901690269036904690569066907690869096910691169126913691469156916691769186919692069216922692369246925692669276928692969306931693269336934693569366937693869396940694169426943694469456946694769486949695069516952695369546955695669576958695969606961696269636964696569666967696869696970697169726973697469756976697769786979698069816982698369846985698669876988698969906991699269936994699569966997699869997000700170027003700470057006700770087009701070117012701370147015701670177018701970207021702270237024702570267027702870297030703170327033703470357036703770387039704070417042704370447045704670477048704970507051705270537054705570567057705870597060706170627063706470657066706770687069707070717072707370747075707670777078707970807081708270837084708570867087708870897090709170927093709470957096709770987099710071017102710371047105710671077108710971107111711271137114711571167117711871197120712171227123712471257126712771287129713071317132713371347135713671377138713971407141714271437144714571467147714871497150715171527153715471557156715771587159716071617162716371647165716671677168716971707171717271737174717571767177717871797180718171827183718471857186718771887189719071917192719371947195719671977198719972007201720272037204720572067207720872097210721172127213721472157216721772187219722072217222722372247225722672277228722972307231723272337234723572367237723872397240724172427243724472457246724772487249725072517252725372547255725672577258725972607261726272637264726572667267726872697270727172727273727472757276727
77278727972807281728272837284728572867287728872897290729172927293729472957296729772987299730073017302730373047305730673077308730973107311731273137314731573167317731873197320732173227323732473257326732773287329733073317332733373347335733673377338733973407341734273437344734573467347734873497350735173527353735473557356735773587359736073617362736373647365736673677368736973707371737273737374737573767377737873797380738173827383738473857386738773887389739073917392739373947395739673977398739974007401740274037404740574067407740874097410741174127413741474157416741774187419742074217422742374247425742674277428742974307431743274337434743574367437743874397440744174427443744474457446744774487449745074517452745374547455745674577458745974607461746274637464746574667467746874697470747174727473747474757476747774787479748074817482748374847485748674877488748974907491749274937494749574967497749874997500750175027503750475057506750775087509751075117512751375147515751675177518751975207521752275237524752575267527752875297530753175327533753475357536753775387539754075417542754375447545754675477548754975507551755275537554755575567557755875597560756175627563756475657566756775687569757075717572757375747575757675777578757975807581758275837584758575867587758875897590759175927593759475957596759775987599760076017602760376047605760676077608760976107611761276137614761576167617761876197620762176227623762476257626762776287629763076317632763376347635763676377638763976407641764276437644764576467647764876497650765176527653765476557656765776587659766076617662766376647665766676677668766976707671767276737674767576767677767876797680768176827683768476857686768776887689769076917692769376947695769676977698769977007701770277037704770577067707770877097710771177127713771477157716771777187719772077217722772377247725772677277728772977307731773277337734773577367737773877397740774177427743774477457746774777487749775077517752775377547755775677577758775977607761776277637764776577667767776877697770777177727773777477757776777
77778777977807781778277837784778577867787778877897790779177927793779477957796779777987799780078017802780378047805780678077808780978107811781278137814781578167817781878197820782178227823782478257826782778287829783078317832783378347835783678377838783978407841784278437844784578467847784878497850785178527853785478557856785778587859786078617862786378647865786678677868786978707871787278737874787578767877787878797880788178827883788478857886788778887889789078917892789378947895789678977898789979007901790279037904790579067907790879097910791179127913791479157916791779187919792079217922792379247925792679277928792979307931793279337934793579367937793879397940794179427943794479457946794779487949795079517952795379547955795679577958795979607961796279637964796579667967796879697970797179727973797479757976797779787979798079817982798379847985798679877988798979907991799279937994799579967997799879998000800180028003800480058006800780088009801080118012801380148015801680178018801980208021802280238024802580268027802880298030803180328033803480358036803780388039804080418042804380448045804680478048804980508051805280538054805580568057805880598060806180628063806480658066806780688069807080718072807380748075807680778078807980808081808280838084808580868087808880898090809180928093809480958096809780988099810081018102810381048105810681078108810981108111811281138114811581168117811881198120812181228123812481258126812781288129813081318132813381348135813681378138813981408141814281438144814581468147814881498150815181528153815481558156815781588159816081618162816381648165816681678168816981708171817281738174817581768177817881798180818181828183818481858186818781888189819081918192819381948195819681978198819982008201820282038204820582068207820882098210821182128213821482158216821782188219822082218222822382248225822682278228822982308231823282338234823582368237823882398240824182428243824482458246824782488249825082518252825382548255825682578258825982608261826282638264826582668267826882698270827182728273827482758276827
78278827982808281828282838284828582868287828882898290829182928293829482958296829782988299830083018302830383048305830683078308830983108311831283138314831583168317831883198320832183228323832483258326832783288329833083318332833383348335833683378338833983408341834283438344834583468347834883498350835183528353835483558356835783588359836083618362836383648365836683678368836983708371837283738374837583768377837883798380838183828383838483858386838783888389839083918392839383948395839683978398839984008401840284038404840584068407840884098410841184128413841484158416841784188419842084218422842384248425842684278428842984308431843284338434843584368437843884398440844184428443844484458446844784488449845084518452845384548455845684578458845984608461846284638464846584668467846884698470847184728473847484758476847784788479848084818482848384848485848684878488848984908491849284938494849584968497849884998500850185028503850485058506850785088509851085118512851385148515851685178518851985208521852285238524852585268527852885298530853185328533853485358536853785388539854085418542854385448545854685478548854985508551855285538554855585568557855885598560856185628563856485658566856785688569857085718572857385748575857685778578857985808581858285838584858585868587858885898590859185928593859485958596859785988599860086018602860386048605860686078608860986108611861286138614861586168617861886198620862186228623862486258626862786288629863086318632863386348635863686378638863986408641864286438644864586468647864886498650865186528653865486558656865786588659866086618662866386648665866686678668866986708671867286738674867586768677867886798680868186828683868486858686868786888689869086918692869386948695869686978698869987008701870287038704870587068707870887098710871187128713871487158716871787188719872087218722872387248725872687278728872987308731873287338734873587368737873887398740874187428743874487458746874787488749875087518752875387548755875687578758875987608761876287638764876587668767876887698770877187728773877487758776877
78778877987808781878287838784878587868787878887898790879187928793879487958796879787988799880088018802880388048805880688078808880988108811881288138814881588168817881888198820882188228823882488258826882788288829883088318832883388348835883688378838883988408841884288438844884588468847884888498850885188528853885488558856885788588859886088618862886388648865886688678868886988708871887288738874887588768877887888798880888188828883888488858886888788888889889088918892889388948895889688978898889989008901890289038904890589068907890889098910891189128913891489158916891789188919892089218922892389248925892689278928892989308931893289338934893589368937893889398940894189428943894489458946894789488949895089518952895389548955895689578958895989608961896289638964896589668967896889698970897189728973897489758976897789788979898089818982898389848985898689878988898989908991899289938994899589968997899889999000900190029003900490059006900790089009901090119012901390149015901690179018901990209021902290239024902590269027902890299030903190329033903490359036903790389039904090419042904390449045904690479048904990509051905290539054905590569057905890599060906190629063906490659066906790689069907090719072907390749075907690779078907990809081908290839084908590869087908890899090909190929093909490959096909790989099910091019102910391049105910691079108910991109111911291139114911591169117911891199120912191229123912491259126912791289129913091319132913391349135913691379138913991409141914291439144914591469147914891499150915191529153915491559156915791589159916091619162916391649165916691679168916991709171917291739174917591769177917891799180918191829183918491859186918791889189919091919192919391949195919691979198919992009201920292039204920592069207920892099210921192129213921492159216921792189219922092219222922392249225922692279228922992309231923292339234923592369237923892399240924192429243924492459246924792489249925092519252925392549255925692579258925992609261926292639264926592669267926892699270927192729273927492759276927
79278927992809281928292839284928592869287928892899290929192929293929492959296929792989299930093019302930393049305930693079308930993109311931293139314931593169317931893199320932193229323932493259326932793289329933093319332933393349335933693379338933993409341934293439344934593469347934893499350935193529353935493559356935793589359936093619362936393649365936693679368936993709371937293739374937593769377937893799380938193829383938493859386938793889389939093919392939393949395939693979398939994009401940294039404940594069407940894099410941194129413941494159416941794189419942094219422942394249425942694279428942994309431943294339434943594369437943894399440944194429443944494459446944794489449945094519452945394549455945694579458945994609461946294639464946594669467946894699470947194729473947494759476947794789479948094819482948394849485948694879488948994909491949294939494949594969497949894999500950195029503950495059506950795089509951095119512951395149515951695179518951995209521952295239524952595269527952895299530953195329533953495359536953795389539954095419542954395449545954695479548954995509551955295539554955595569557955895599560956195629563956495659566956795689569957095719572957395749575957695779578957995809581958295839584958595869587958895899590959195929593959495959596959795989599960096019602960396049605960696079608960996109611961296139614961596169617961896199620962196229623962496259626962796289629963096319632963396349635963696379638963996409641964296439644964596469647964896499650965196529653965496559656965796589659966096619662966396649665966696679668966996709671967296739674967596769677967896799680968196829683968496859686968796889689969096919692969396949695969696979698969997009701970297039704970597069707970897099710971197129713971497159716971797189719972097219722972397249725972697279728972997309731973297339734973597369737973897399740974197429743974497459746974797489749975097519752975397549755975697579758975997609761976297639764976597669767976897699770977197729773977497759776977
79778977997809781978297839784978597869787978897899790979197929793979497959796979797989799980098019802980398049805980698079808980998109811981298139814981598169817981898199820982198229823982498259826982798289829983098319832983398349835983698379838983998409841984298439844984598469847984898499850985198529853985498559856985798589859986098619862986398649865986698679868986998709871987298739874987598769877987898799880988198829883988498859886988798889889989098919892989398949895989698979898989999009901990299039904990599069907990899099910991199129913991499159916991799189919992099219922992399249925992699279928992999309931993299339934993599369937993899399940994199429943994499459946994799489949995099519952995399549955995699579958995999609961996299639964996599669967996899699970997199729973997499759976997799789979998099819982998399849985998699879988998999909991999299939994999599969997999899991000010001100021000310004100051000610007100081000910010100111001210013100141001510016100171001810019100201002110022100231002410025100261002710028100291003010031100321003310034100351003610037100381003910040100411004210043100441004510046100471004810049100501005110052100531005410055100561005710058100591006010061100621006310064100651006610067100681006910070100711007210073100741007510076100771007810079100801008110082100831008410085100861008710088100891009010091100921009310094100951009610097100981009910100101011010210103101041010510106101071010810109101101011110112101131011410115101161011710118101191012010121101221012310124101251012610127101281012910130101311013210133101341013510136101371013810139101401014110142101431014410145101461014710148101491015010151101521015310154101551015610157101581015910160101611016210163101641016510166101671016810169101701017110172101731017410175101761017710178101791018010181101821018310184101851018610187101881018910190101911019210193101941019510196101971019810199102001020110202102031020410205102061020710208102091021010211102121021310214102151021610217102181021910220102211
02221022310224102251022610227102281022910230102311023210233102341023510236102371023810239102401024110242102431024410245102461024710248102491025010251102521025310254102551025610257102581025910260102611026210263102641026510266102671026810269102701027110272102731027410275102761027710278102791028010281102821028310284102851028610287102881028910290102911029210293102941029510296102971029810299103001030110302103031030410305103061030710308103091031010311103121031310314103151031610317103181031910320103211032210323103241032510326103271032810329103301033110332103331033410335103361033710338103391034010341103421034310344103451034610347103481034910350103511035210353103541035510356103571035810359103601036110362103631036410365103661036710368103691037010371103721037310374103751037610377103781037910380103811038210383103841038510386103871038810389103901039110392103931039410395103961039710398103991040010401104021040310404104051040610407104081040910410104111041210413104141041510416104171041810419104201042110422104231042410425104261042710428104291043010431104321043310434104351043610437104381043910440104411044210443104441044510446104471044810449104501045110452104531045410455104561045710458104591046010461104621046310464104651046610467104681046910470104711047210473104741047510476104771047810479104801048110482104831048410485104861048710488104891049010491104921049310494104951049610497104981049910500105011050210503105041050510506105071050810509105101051110512105131051410515105161051710518105191052010521105221052310524105251052610527105281052910530105311053210533105341053510536105371053810539105401054110542105431054410545105461054710548105491055010551105521055310554105551055610557105581055910560105611056210563105641056510566105671056810569105701057110572105731057410575105761057710578105791058010581105821058310584105851058610587105881058910590105911059210593105941059510596105971059810599106001060110602106031060410605106061060710608106091061010611106121061310614106151061610617106181061910620106211
06221062310624106251062610627106281062910630106311063210633106341063510636106371063810639106401064110642106431064410645106461064710648106491065010651106521065310654106551065610657106581065910660106611066210663106641066510666106671066810669106701067110672106731067410675106761067710678106791068010681106821068310684106851068610687106881068910690106911069210693106941069510696106971069810699107001070110702107031070410705107061070710708107091071010711107121071310714107151071610717107181071910720107211072210723107241072510726107271072810729107301073110732107331073410735107361073710738107391074010741107421074310744107451074610747107481074910750107511075210753107541075510756107571075810759107601076110762107631076410765107661076710768107691077010771107721077310774107751077610777107781077910780107811078210783107841078510786107871078810789107901079110792107931079410795107961079710798107991080010801108021080310804108051080610807108081080910810108111081210813108141081510816108171081810819108201082110822108231082410825108261082710828108291083010831108321083310834108351083610837108381083910840108411084210843108441084510846108471084810849108501085110852108531085410855108561085710858108591086010861108621086310864108651086610867108681086910870108711087210873108741087510876108771087810879108801088110882108831088410885108861088710888108891089010891108921089310894108951089610897108981089910900109011090210903109041090510906109071090810909109101091110912109131091410915109161091710918109191092010921109221092310924109251092610927109281092910930109311093210933109341093510936109371093810939109401094110942109431094410945109461094710948109491095010951109521095310954109551095610957109581095910960109611096210963109641096510966109671096810969109701097110972109731097410975109761097710978109791098010981109821098310984109851098610987109881098910990109911099210993109941099510996109971099810999110001100111002110031100411005110061100711008110091101011011110121101311014110151101611017110181101911020110211
10221102311024110251102611027110281102911030110311103211033110341103511036110371103811039110401104111042110431104411045110461104711048110491105011051110521105311054110551105611057110581105911060110611106211063110641106511066110671106811069110701107111072110731107411075110761107711078110791108011081110821108311084110851108611087110881108911090110911109211093110941109511096110971109811099111001110111102111031110411105111061110711108111091111011111111121111311114111151111611117111181111911120111211112211123111241112511126111271112811129111301113111132111331113411135111361113711138111391114011141111421114311144111451114611147111481114911150111511115211153111541115511156111571115811159111601116111162111631116411165111661116711168111691117011171111721117311174111751117611177111781117911180111811118211183111841118511186111871118811189111901119111192111931119411195111961119711198111991120011201112021120311204112051120611207112081120911210112111121211213112141121511216112171121811219112201122111222112231122411225112261122711228112291123011231112321123311234112351123611237112381123911240112411124211243112441124511246112471124811249112501125111252112531125411255112561125711258112591126011261112621126311264112651126611267112681126911270112711127211273112741127511276112771127811279112801128111282112831128411285112861128711288112891129011291112921129311294112951129611297112981129911300113011130211303113041130511306113071130811309113101131111312113131131411315113161131711318113191132011321113221132311324113251132611327113281132911330113311133211333113341133511336113371133811339113401134111342113431134411345113461134711348113491135011351113521135311354113551135611357113581135911360113611136211363113641136511366113671136811369113701137111372113731137411375113761137711378113791138011381113821138311384113851138611387113881138911390113911139211393113941139511396113971139811399114001140111402114031140411405114061140711408114091141011411114121141311414114151141611417114181141911420114211
14221142311424114251142611427114281142911430114311143211433114341143511436114371143811439114401144111442114431144411445114461144711448114491145011451114521145311454114551145611457114581145911460114611146211463114641146511466114671146811469114701147111472114731147411475114761147711478114791148011481114821148311484114851148611487114881148911490114911149211493114941149511496114971149811499115001150111502115031150411505115061150711508115091151011511115121151311514115151151611517115181151911520115211152211523115241152511526115271152811529115301153111532115331153411535115361153711538115391154011541115421154311544115451154611547115481154911550115511155211553115541155511556115571155811559115601156111562115631156411565115661156711568115691157011571115721157311574115751157611577115781157911580115811158211583115841158511586115871158811589115901159111592115931159411595115961159711598115991160011601116021160311604116051160611607116081160911610116111161211613116141161511616116171161811619116201162111622116231162411625116261162711628116291163011631116321163311634116351163611637116381163911640116411164211643116441164511646116471164811649116501165111652116531165411655116561165711658116591166011661116621166311664116651166611667116681166911670116711167211673116741167511676116771167811679116801168111682116831168411685116861168711688116891169011691116921169311694116951169611697116981169911700117011170211703117041170511706117071170811709117101171111712117131171411715117161171711718117191172011721117221172311724117251172611727117281172911730117311173211733117341173511736117371173811739117401174111742117431174411745117461174711748117491175011751117521175311754117551175611757117581175911760117611176211763117641176511766117671176811769117701177111772117731177411775117761177711778117791178011781117821178311784117851178611787117881178911790117911179211793117941179511796117971179811799118001180111802118031180411805118061180711808118091181011811118121181311814118151181611817118181181911820118211
18221182311824118251182611827118281182911830118311183211833118341183511836118371183811839118401184111842118431184411845118461184711848118491185011851118521185311854118551185611857118581185911860118611186211863118641186511866118671186811869118701187111872118731187411875118761187711878118791188011881118821188311884118851188611887118881188911890118911189211893118941189511896118971189811899119001190111902119031190411905119061190711908119091191011911119121191311914119151191611917119181191911920119211192211923119241192511926119271192811929119301193111932119331193411935119361193711938119391194011941119421194311944119451194611947119481194911950119511195211953119541195511956119571195811959119601196111962119631196411965119661196711968119691197011971119721197311974119751197611977119781197911980119811198211983119841198511986119871198811989119901199111992119931199411995119961199711998119991200012001120021200312004120051200612007120081200912010120111201212013120141201512016120171201812019120201202112022120231202412025120261202712028120291203012031120321203312034120351203612037120381203912040120411204212043120441204512046120471204812049120501205112052120531205412055120561205712058120591206012061120621206312064120651206612067120681206912070120711207212073120741207512076120771207812079120801208112082120831208412085120861208712088120891209012091120921209312094120951209612097120981209912100121011210212103121041210512106121071210812109121101211112112121131211412115121161211712118121191212012121121221212312124121251212612127121281212912130121311213212133121341213512136121371213812139121401214112142121431214412145121461214712148121491215012151121521215312154121551215612157121581215912160121611216212163121641216512166121671216812169121701217112172121731217412175121761217712178121791218012181121821218312184121851218612187121881218912190121911219212193121941219512196121971219812199122001220112202122031220412205122061220712208122091221012211122121221312214122151221612217122181221912220122211
22221222312224122251222612227122281222912230122311223212233122341223512236122371223812239122401224112242122431224412245122461224712248122491225012251122521225312254122551225612257122581225912260122611226212263122641226512266122671226812269122701227112272122731227412275122761227712278122791228012281122821228312284122851228612287122881228912290122911229212293122941229512296122971229812299123001230112302123031230412305123061230712308123091231012311123121231312314123151231612317123181231912320123211232212323123241232512326123271232812329123301233112332123331233412335123361233712338123391234012341123421234312344123451234612347123481234912350123511235212353123541235512356123571235812359123601236112362123631236412365123661236712368123691237012371123721237312374123751237612377123781237912380123811238212383123841238512386123871238812389123901239112392123931239412395123961239712398123991240012401124021240312404124051240612407124081240912410124111241212413124141241512416124171241812419124201242112422124231242412425124261242712428124291243012431124321243312434124351243612437124381243912440124411244212443124441244512446124471244812449124501245112452124531245412455124561245712458124591246012461124621246312464124651246612467124681246912470124711247212473124741247512476124771247812479124801248112482124831248412485124861248712488124891249012491124921249312494124951249612497124981249912500125011250212503125041250512506125071250812509125101251112512125131251412515125161251712518125191252012521125221252312524125251252612527125281252912530125311253212533125341253512536125371253812539125401254112542125431254412545125461254712548125491255012551125521255312554125551255612557125581255912560125611256212563125641256512566125671256812569125701257112572125731257412575125761257712578125791258012581125821258312584125851258612587125881258912590125911259212593125941259512596125971259812599126001260112602126031260412605126061260712608126091261012611126121261312614126151261612617126181261912620126211
26221262312624126251262612627126281262912630126311263212633126341263512636126371263812639126401264112642126431264412645126461264712648126491265012651126521265312654126551265612657126581265912660126611266212663126641266512666126671266812669126701267112672126731267412675126761267712678126791268012681126821268312684126851268612687126881268912690126911269212693126941269512696126971269812699127001270112702127031270412705127061270712708127091271012711127121271312714127151271612717127181271912720127211272212723127241272512726127271272812729127301273112732127331273412735127361273712738127391274012741127421274312744127451274612747127481274912750127511275212753127541275512756127571275812759127601276112762127631276412765127661276712768127691277012771127721277312774127751277612777127781277912780127811278212783127841278512786127871278812789127901279112792127931279412795127961279712798127991280012801128021280312804128051280612807128081280912810128111281212813128141281512816128171281812819128201282112822128231282412825128261282712828128291283012831128321283312834128351283612837128381283912840128411284212843128441284512846128471284812849128501285112852128531285412855128561285712858128591286012861128621286312864128651286612867128681286912870128711287212873128741287512876128771287812879128801288112882128831288412885128861288712888128891289012891128921289312894128951289612897128981289912900129011290212903129041290512906129071290812909129101291112912129131291412915129161291712918129191292012921129221292312924129251292612927129281292912930129311293212933129341293512936129371293812939129401294112942129431294412945129461294712948129491295012951129521295312954129551295612957129581295912960129611296212963129641296512966129671296812969129701297112972129731297412975129761297712978129791298012981129821298312984129851298612987129881298912990129911299212993129941299512996129971299812999130001300113002130031300413005130061300713008130091301013011130121301313014130151301613017130181301913020130211
30221302313024130251302613027130281302913030130311303213033130341303513036130371303813039130401304113042130431304413045130461304713048130491305013051130521305313054130551305613057130581305913060130611306213063130641306513066130671306813069130701307113072130731307413075130761307713078130791308013081130821308313084130851308613087130881308913090130911309213093130941309513096130971309813099131001310113102131031310413105131061310713108131091311013111131121311313114131151311613117131181311913120131211312213123131241312513126131271312813129131301313113132131331313413135131361313713138131391314013141131421314313144131451314613147131481314913150131511315213153131541315513156131571315813159131601316113162131631316413165131661316713168131691317013171131721317313174131751317613177131781317913180131811318213183131841318513186131871318813189131901319113192131931319413195131961319713198131991320013201132021320313204132051320613207132081320913210132111321213213132141321513216132171321813219132201322113222132231322413225132261322713228132291323013231132321323313234132351323613237132381323913240132411324213243132441324513246132471324813249132501325113252132531325413255132561325713258132591326013261132621326313264132651326613267132681326913270132711327213273132741327513276132771327813279132801328113282132831328413285132861328713288132891329013291132921329313294132951329613297132981329913300133011330213303133041330513306133071330813309133101331113312133131331413315133161331713318133191332013321133221332313324133251332613327133281332913330133311333213333133341333513336133371333813339133401334113342133431334413345133461334713348133491335013351133521335313354133551335613357133581335913360133611336213363133641336513366133671336813369133701337113372133731337413375133761337713378133791338013381133821338313384133851338613387133881338913390133911339213393133941339513396133971339813399134001340113402134031340413405134061340713408134091341013411134121341313414134151341613417134181341913420134211
34221342313424134251342613427134281342913430134311343213433134341343513436134371343813439134401344113442134431344413445134461344713448134491345013451134521345313454134551345613457134581345913460134611346213463134641346513466134671346813469134701347113472134731347413475134761347713478134791348013481134821348313484134851348613487134881348913490134911349213493134941349513496134971349813499135001350113502135031350413505135061350713508135091351013511135121351313514135151351613517135181351913520135211352213523135241352513526135271352813529135301353113532135331353413535135361353713538135391354013541135421354313544135451354613547135481354913550135511355213553135541355513556135571355813559135601356113562135631356413565135661356713568135691357013571135721357313574135751357613577135781357913580135811358213583135841358513586135871358813589135901359113592135931359413595135961359713598135991360013601136021360313604136051360613607136081360913610136111361213613136141361513616136171361813619136201362113622136231362413625136261362713628136291363013631136321363313634136351363613637136381363913640136411364213643136441364513646136471364813649136501365113652136531365413655136561365713658136591366013661136621366313664136651366613667136681366913670136711367213673136741367513676136771367813679136801368113682136831368413685136861368713688136891369013691136921369313694136951369613697136981369913700137011370213703137041370513706137071370813709137101371113712137131371413715137161371713718137191372013721137221372313724137251372613727137281372913730137311373213733137341373513736137371373813739137401374113742137431374413745137461374713748137491375013751137521375313754137551375613757137581375913760137611376213763137641376513766137671376813769137701377113772137731377413775137761377713778137791378013781137821378313784137851378613787137881378913790137911379213793137941379513796137971379813799138001380113802138031380413805138061380713808138091381013811138121381313814138151381613817138181381913820138211
38221382313824138251382613827138281382913830138311383213833138341383513836138371383813839138401384113842138431384413845138461384713848138491385013851138521385313854138551385613857138581385913860138611386213863138641386513866138671386813869138701387113872138731387413875138761387713878138791388013881138821388313884138851388613887138881388913890138911389213893138941389513896138971389813899139001390113902139031390413905139061390713908139091391013911139121391313914139151391613917139181391913920139211392213923139241392513926139271392813929139301393113932139331393413935139361393713938139391394013941139421394313944139451394613947139481394913950139511395213953139541395513956139571395813959139601396113962139631396413965139661396713968139691397013971139721397313974139751397613977139781397913980139811398213983139841398513986139871398813989139901399113992139931399413995139961399713998139991400014001140021400314004140051400614007140081400914010140111401214013140141401514016140171401814019140201402114022140231402414025140261402714028140291403014031140321403314034140351403614037140381403914040140411404214043140441404514046140471404814049140501405114052140531405414055140561405714058140591406014061140621406314064140651406614067140681406914070140711407214073140741407514076140771407814079140801408114082140831408414085140861408714088140891409014091140921409314094140951409614097140981409914100141011410214103141041410514106141071410814109141101411114112141131411414115141161411714118141191412014121141221412314124141251412614127141281412914130141311413214133141341413514136141371413814139141401414114142141431414414145141461414714148141491415014151141521415314154141551415614157141581415914160141611416214163141641416514166141671416814169141701417114172141731417414175141761417714178141791418014181141821418314184141851418614187141881418914190141911419214193141941419514196141971419814199142001420114202142031420414205142061420714208142091421014211142121421314214142151421614217142181421914220142211
42221422314224142251422614227142281422914230142311423214233142341423514236142371423814239142401424114242142431424414245142461424714248142491425014251142521425314254142551425614257142581425914260142611426214263142641426514266142671426814269142701427114272142731427414275142761427714278142791428014281142821428314284142851428614287142881428914290142911429214293142941429514296142971429814299143001430114302143031430414305143061430714308143091431014311143121431314314143151431614317143181431914320143211432214323143241432514326143271432814329143301433114332143331433414335143361433714338143391434014341143421434314344143451434614347143481434914350143511435214353143541435514356143571435814359143601436114362143631436414365143661436714368143691437014371143721437314374143751437614377143781437914380143811438214383143841438514386143871438814389143901439114392143931439414395143961439714398143991440014401144021440314404144051440614407144081440914410144111441214413144141441514416144171441814419144201442114422144231442414425144261442714428144291443014431144321443314434144351443614437144381443914440144411444214443144441444514446144471444814449144501445114452144531445414455144561445714458144591446014461144621446314464144651446614467144681446914470144711447214473144741447514476144771447814479144801448114482144831448414485144861448714488144891449014491144921449314494144951449614497144981449914500145011450214503145041450514506145071450814509145101451114512145131451414515145161451714518145191452014521145221452314524145251452614527145281452914530145311453214533145341453514536145371453814539145401454114542145431454414545145461454714548145491455014551145521455314554145551455614557145581455914560145611456214563145641456514566145671456814569145701457114572145731457414575145761457714578145791458014581145821458314584145851458614587145881458914590145911459214593145941459514596145971459814599146001460114602146031460414605146061460714608146091461014611146121461314614146151461614617146181461914620146211
46221462314624146251462614627146281462914630146311463214633146341463514636146371463814639146401464114642146431464414645146461464714648146491465014651146521465314654146551465614657146581465914660146611466214663146641466514666146671466814669146701467114672146731467414675146761467714678146791468014681146821468314684146851468614687146881468914690146911469214693146941469514696146971469814699147001470114702147031470414705147061470714708147091471014711147121471314714147151471614717147181471914720147211472214723147241472514726147271472814729147301473114732147331473414735147361473714738147391474014741147421474314744147451474614747147481474914750147511475214753147541475514756147571475814759147601476114762147631476414765147661476714768147691477014771147721477314774147751477614777147781477914780147811478214783147841478514786147871478814789147901479114792147931479414795147961479714798147991480014801148021480314804148051480614807148081480914810148111481214813148141481514816148171481814819148201482114822148231482414825148261482714828148291483014831148321483314834148351483614837148381483914840148411484214843148441484514846148471484814849148501485114852148531485414855148561485714858148591486014861148621486314864148651486614867148681486914870148711487214873148741487514876148771487814879148801488114882148831488414885148861488714888148891489014891148921489314894148951489614897148981489914900149011490214903149041490514906149071490814909149101491114912149131491414915149161491714918149191492014921149221492314924149251492614927149281492914930149311493214933149341493514936149371493814939149401494114942149431494414945149461494714948149491495014951149521495314954149551495614957149581495914960149611496214963149641496514966149671496814969149701497114972149731497414975149761497714978149791498014981149821498314984149851498614987149881498914990149911499214993149941499514996149971499814999150001500115002150031500415005150061500715008150091501015011150121501315014150151501615017150181501915020150211
50221502315024150251502615027150281502915030150311503215033150341503515036150371503815039150401504115042150431504415045150461504715048150491505015051150521505315054150551505615057150581505915060150611506215063150641506515066150671506815069150701507115072150731507415075150761507715078150791508015081150821508315084150851508615087150881508915090150911509215093150941509515096150971509815099151001510115102151031510415105151061510715108151091511015111151121511315114151151511615117151181511915120151211512215123151241512515126151271512815129151301513115132151331513415135151361513715138151391514015141151421514315144151451514615147151481514915150151511515215153151541515515156151571515815159151601516115162151631516415165151661516715168151691517015171151721517315174151751517615177151781517915180151811518215183151841518515186151871518815189151901519115192151931519415195151961519715198151991520015201152021520315204152051520615207152081520915210152111521215213152141521515216152171521815219152201522115222152231522415225152261522715228152291523015231152321523315234152351523615237152381523915240152411524215243152441524515246152471524815249152501525115252152531525415255152561525715258152591526015261152621526315264152651526615267152681526915270152711527215273152741527515276152771527815279152801528115282152831528415285152861528715288152891529015291152921529315294152951529615297152981529915300153011530215303153041530515306153071530815309153101531115312153131531415315153161531715318153191532015321153221532315324153251532615327153281532915330153311533215333153341533515336153371533815339153401534115342153431534415345153461534715348153491535015351153521535315354153551535615357153581535915360153611536215363153641536515366153671536815369153701537115372153731537415375153761537715378153791538015381153821538315384153851538615387153881538915390153911539215393153941539515396153971539815399154001540115402154031540415405154061540715408154091541015411154121541315414154151541615417154181541915420154211
54221542315424154251542615427154281542915430154311543215433154341543515436154371543815439154401544115442154431544415445154461544715448154491545015451154521545315454154551545615457154581545915460154611546215463154641546515466154671546815469154701547115472154731547415475154761547715478154791548015481154821548315484154851548615487154881548915490154911549215493154941549515496154971549815499155001550115502155031550415505155061550715508155091551015511155121551315514155151551615517155181551915520155211552215523155241552515526155271552815529155301553115532155331553415535155361553715538155391554015541155421554315544155451554615547155481554915550155511555215553155541555515556155571555815559155601556115562155631556415565155661556715568155691557015571155721557315574155751557615577155781557915580155811558215583155841558515586155871558815589155901559115592155931559415595155961559715598155991560015601156021560315604156051560615607156081560915610156111561215613156141561515616156171561815619156201562115622156231562415625156261562715628156291563015631156321563315634156351563615637156381563915640156411564215643156441564515646156471564815649156501565115652156531565415655156561565715658156591566015661156621566315664156651566615667156681566915670156711567215673156741567515676156771567815679156801568115682156831568415685156861568715688156891569015691156921569315694156951569615697156981569915700157011570215703157041570515706157071570815709157101571115712157131571415715157161571715718157191572015721157221572315724157251572615727157281572915730157311573215733157341573515736157371573815739157401574115742157431574415745157461574715748157491575015751157521575315754157551575615757157581575915760157611576215763157641576515766157671576815769157701577115772157731577415775157761577715778157791578015781157821578315784157851578615787157881578915790157911579215793157941579515796157971579815799158001580115802158031580415805158061580715808158091581015811158121581315814158151581615817158181581915820158211
58221582315824158251582615827158281582915830158311583215833158341583515836158371583815839158401584115842158431584415845158461584715848158491585015851158521585315854158551585615857158581585915860158611586215863158641586515866158671586815869158701587115872158731587415875158761587715878158791588015881158821588315884158851588615887158881588915890158911589215893158941589515896158971589815899159001590115902159031590415905159061590715908159091591015911159121591315914159151591615917159181591915920159211592215923159241592515926159271592815929159301593115932159331593415935159361593715938159391594015941159421594315944159451594615947159481594915950159511595215953159541595515956159571595815959159601596115962159631596415965159661596715968159691597015971159721597315974159751597615977159781597915980159811598215983159841598515986159871598815989159901599115992159931599415995159961599715998159991600016001160021600316004160051600616007160081600916010160111601216013160141601516016160171601816019160201602116022160231602416025160261602716028160291603016031160321603316034160351603616037160381603916040160411604216043160441604516046160471604816049160501605116052160531605416055160561605716058160591606016061160621606316064160651606616067160681606916070160711607216073160741607516076160771607816079160801608116082160831608416085160861608716088160891609016091160921609316094160951609616097160981609916100161011610216103161041610516106161071610816109161101611116112161131611416115161161611716118161191612016121161221612316124161251612616127161281612916130161311613216133161341613516136161371613816139161401614116142161431614416145161461614716148161491615016151161521615316154161551615616157161581615916160161611616216163161641616516166161671616816169161701617116172161731617416175161761617716178161791618016181161821618316184161851618616187161881618916190161911619216193161941619516196161971619816199162001620116202162031620416205162061620716208162091621016211162121621316214162151621616217162181621916220162211
62221622316224162251622616227162281622916230162311623216233162341623516236162371623816239162401624116242162431624416245162461624716248162491625016251162521625316254162551625616257162581625916260162611626216263162641626516266162671626816269162701627116272162731627416275162761627716278162791628016281162821628316284162851628616287162881628916290162911629216293162941629516296162971629816299163001630116302163031630416305163061630716308163091631016311163121631316314163151631616317163181631916320163211632216323163241632516326163271632816329163301633116332163331633416335163361633716338163391634016341163421634316344163451634616347163481634916350163511635216353163541635516356163571635816359163601636116362163631636416365163661636716368163691637016371163721637316374163751637616377163781637916380163811638216383163841638516386163871638816389163901639116392163931639416395163961639716398163991640016401164021640316404164051640616407164081640916410164111641216413164141641516416164171641816419164201642116422164231642416425164261642716428164291643016431164321643316434164351643616437164381643916440164411644216443164441644516446164471644816449164501645116452164531645416455164561645716458164591646016461164621646316464164651646616467164681646916470164711647216473164741647516476164771647816479164801648116482164831648416485164861648716488164891649016491164921649316494164951649616497164981649916500165011650216503165041650516506165071650816509165101651116512165131651416515165161651716518165191652016521165221652316524165251652616527165281652916530165311653216533165341653516536165371653816539165401654116542165431654416545165461654716548165491655016551165521655316554165551655616557165581655916560165611656216563165641656516566165671656816569165701657116572165731657416575165761657716578165791658016581165821658316584165851658616587165881658916590165911659216593165941659516596165971659816599166001660116602166031660416605166061660716608166091661016611166121661316614166151661616617166181661916620166211
66221662316624166251662616627166281662916630166311663216633166341663516636166371663816639166401664116642166431664416645166461664716648166491665016651166521665316654166551665616657166581665916660166611666216663166641666516666166671666816669166701667116672166731667416675166761667716678166791668016681166821668316684166851668616687166881668916690166911669216693166941669516696166971669816699167001670116702167031670416705167061670716708167091671016711167121671316714167151671616717167181671916720167211672216723167241672516726167271672816729167301673116732167331673416735167361673716738167391674016741167421674316744167451674616747167481674916750167511675216753167541675516756167571675816759167601676116762167631676416765167661676716768167691677016771167721677316774167751677616777167781677916780167811678216783167841678516786167871678816789167901679116792167931679416795167961679716798167991680016801168021680316804168051680616807168081680916810168111681216813168141681516816168171681816819168201682116822168231682416825168261682716828168291683016831168321683316834168351683616837168381683916840168411684216843168441684516846168471684816849168501685116852168531685416855168561685716858168591686016861168621686316864168651686616867168681686916870168711687216873168741687516876168771687816879168801688116882168831688416885168861688716888168891689016891168921689316894168951689616897168981689916900169011690216903169041690516906169071690816909169101691116912169131691416915169161691716918169191692016921169221692316924169251692616927169281692916930169311693216933169341693516936169371693816939169401694116942169431694416945169461694716948169491695016951169521695316954169551695616957169581695916960169611696216963169641696516966169671696816969169701697116972169731697416975169761697716978169791698016981169821698316984169851698616987169881698916990169911699216993169941699516996169971699816999170001700117002170031700417005170061700717008170091701017011170121701317014170151701617017170181701917020170211
70221702317024170251702617027170281702917030170311703217033170341703517036170371703817039170401704117042170431704417045170461704717048170491705017051170521705317054170551705617057170581705917060170611706217063170641706517066170671706817069170701707117072170731707417075170761707717078170791708017081170821708317084170851708617087170881708917090170911709217093170941709517096170971709817099171001710117102171031710417105171061710717108171091711017111171121711317114171151711617117171181711917120171211712217123171241712517126171271712817129171301713117132171331713417135171361713717138171391714017141171421714317144171451714617147171481714917150171511715217153171541715517156171571715817159171601716117162171631716417165171661716717168171691717017171171721717317174171751717617177171781717917180171811718217183171841718517186171871718817189171901719117192171931719417195171961719717198171991720017201172021720317204172051720617207172081720917210172111721217213172141721517216172171721817219172201722117222172231722417225172261722717228172291723017231172321723317234172351723617237172381723917240172411724217243172441724517246172471724817249172501725117252172531725417255172561725717258172591726017261172621726317264172651726617267172681726917270172711727217273172741727517276172771727817279172801728117282172831728417285172861728717288172891729017291172921729317294172951729617297172981729917300173011730217303173041730517306173071730817309173101731117312173131731417315173161731717318173191732017321173221732317324173251732617327173281732917330173311733217333173341733517336173371733817339173401734117342173431734417345173461734717348173491735017351173521735317354173551735617357173581735917360173611736217363173641736517366173671736817369173701737117372173731737417375173761737717378173791738017381173821738317384173851738617387173881738917390173911739217393173941739517396173971739817399174001740117402174031740417405174061740717408174091741017411174121741317414174151741617417174181741917420174211
74221742317424174251742617427174281742917430174311743217433174341743517436174371743817439174401744117442174431744417445174461744717448174491745017451174521745317454174551745617457174581745917460174611746217463174641746517466174671746817469174701747117472174731747417475174761747717478174791748017481174821748317484174851748617487174881748917490174911749217493174941749517496174971749817499175001750117502175031750417505175061750717508175091751017511175121751317514175151751617517175181751917520175211752217523175241752517526175271752817529175301753117532175331753417535175361753717538175391754017541175421754317544175451754617547175481754917550175511755217553175541755517556175571755817559175601756117562175631756417565175661756717568175691757017571175721757317574175751757617577175781757917580175811758217583175841758517586175871758817589175901759117592175931759417595175961759717598175991760017601176021760317604176051760617607176081760917610176111761217613176141761517616176171761817619176201762117622176231762417625176261762717628176291763017631176321763317634176351763617637176381763917640176411764217643176441764517646176471764817649176501765117652176531765417655176561765717658176591766017661176621766317664176651766617667176681766917670176711767217673176741767517676176771767817679176801768117682176831768417685176861768717688176891769017691176921769317694176951769617697176981769917700177011770217703177041770517706177071770817709177101771117712177131771417715177161771717718177191772017721177221772317724177251772617727177281772917730177311773217733177341773517736177371773817739177401774117742177431774417745177461774717748177491775017751177521775317754177551775617757177581775917760177611776217763177641776517766177671776817769177701777117772177731777417775177761777717778177791778017781177821778317784177851778617787177881778917790177911779217793177941779517796177971779817799178001780117802178031780417805178061780717808178091781017811178121781317814178151781617817178181781917820178211
78221782317824178251782617827178281782917830178311783217833178341783517836178371783817839178401784117842178431784417845178461784717848178491785017851178521785317854178551785617857178581785917860178611786217863178641786517866178671786817869178701787117872178731787417875178761787717878178791788017881178821788317884178851788617887178881788917890178911789217893178941789517896178971789817899179001790117902179031790417905179061790717908179091791017911179121791317914179151791617917179181791917920179211792217923179241792517926179271792817929179301793117932179331793417935179361793717938179391794017941179421794317944179451794617947179481794917950179511795217953179541795517956179571795817959179601796117962179631796417965179661796717968179691797017971179721797317974179751797617977179781797917980179811798217983179841798517986179871798817989179901799117992179931799417995179961799717998179991800018001180021800318004180051800618007180081800918010180111801218013180141801518016180171801818019180201802118022180231802418025180261802718028180291803018031180321803318034180351803618037180381803918040180411804218043180441804518046180471804818049180501805118052180531805418055180561805718058180591806018061180621806318064180651806618067180681806918070180711807218073180741807518076180771807818079180801808118082180831808418085180861808718088180891809018091180921809318094180951809618097180981809918100181011810218103181041810518106181071810818109181101811118112181131811418115181161811718118181191812018121181221812318124181251812618127181281812918130181311813218133181341813518136181371813818139181401814118142181431814418145181461814718148181491815018151181521815318154181551815618157181581815918160181611816218163181641816518166181671816818169181701817118172181731817418175181761817718178181791818018181181821818318184181851818618187181881818918190181911819218193181941819518196181971819818199182001820118202182031820418205182061820718208182091821018211182121821318214182151821618217182181821918220182211
82221822318224182251822618227182281822918230182311823218233182341823518236182371823818239182401824118242182431824418245182461824718248182491825018251182521825318254182551825618257182581825918260182611826218263182641826518266182671826818269182701827118272182731827418275182761827718278182791828018281182821828318284182851828618287182881828918290182911829218293182941829518296182971829818299183001830118302183031830418305183061830718308183091831018311183121831318314183151831618317183181831918320183211832218323183241832518326183271832818329183301833118332183331833418335183361833718338183391834018341183421834318344183451834618347183481834918350183511835218353183541835518356183571835818359183601836118362183631836418365183661836718368183691837018371183721837318374183751837618377183781837918380183811838218383183841838518386183871838818389183901839118392183931839418395183961839718398183991840018401184021840318404184051840618407184081840918410184111841218413184141841518416184171841818419184201842118422184231842418425184261842718428184291843018431184321843318434184351843618437184381843918440184411844218443184441844518446184471844818449184501845118452184531845418455184561845718458184591846018461184621846318464184651846618467184681846918470184711847218473184741847518476184771847818479184801848118482184831848418485184861848718488184891849018491184921849318494184951849618497184981849918500185011850218503185041850518506185071850818509185101851118512185131851418515185161851718518185191852018521185221852318524185251852618527185281852918530185311853218533185341853518536185371853818539185401854118542185431854418545185461854718548185491855018551185521855318554185551855618557185581855918560185611856218563185641856518566185671856818569185701857118572185731857418575185761857718578185791858018581185821858318584185851858618587185881858918590185911859218593185941859518596185971859818599186001860118602186031860418605186061860718608186091861018611186121861318614186151861618617186181861918620186211
86221862318624186251862618627186281862918630186311863218633186341863518636186371863818639186401864118642186431864418645186461864718648186491865018651186521865318654186551865618657186581865918660186611866218663186641866518666186671866818669186701867118672186731867418675186761867718678186791868018681186821868318684186851868618687186881868918690186911869218693186941869518696186971869818699187001870118702187031870418705187061870718708187091871018711187121871318714187151871618717187181871918720187211872218723187241872518726187271872818729187301873118732187331873418735187361873718738187391874018741187421874318744187451874618747187481874918750187511875218753187541875518756187571875818759187601876118762187631876418765187661876718768187691877018771187721877318774187751877618777187781877918780187811878218783187841878518786187871878818789187901879118792187931879418795187961879718798187991880018801188021880318804188051880618807188081880918810188111881218813188141881518816188171881818819188201882118822188231882418825188261882718828188291883018831188321883318834188351883618837188381883918840188411884218843188441884518846188471884818849188501885118852188531885418855188561885718858188591886018861188621886318864188651886618867188681886918870188711887218873188741887518876188771887818879188801888118882188831888418885188861888718888188891889018891188921889318894188951889618897188981889918900189011890218903189041890518906189071890818909189101891118912189131891418915189161891718918189191892018921189221892318924189251892618927189281892918930189311893218933189341893518936189371893818939189401894118942189431894418945189461894718948189491895018951189521895318954189551895618957189581895918960189611896218963189641896518966189671896818969189701897118972189731897418975189761897718978189791898018981189821898318984189851898618987189881898918990189911899218993189941899518996189971899818999190001900119002190031900419005190061900719008190091901019011190121901319014190151901619017190181901919020190211
90221902319024190251902619027190281902919030190311903219033190341903519036190371903819039190401904119042190431904419045190461904719048190491905019051190521905319054190551905619057190581905919060190611906219063190641906519066190671906819069190701907119072190731907419075190761907719078190791908019081190821908319084190851908619087190881908919090190911909219093190941909519096190971909819099191001910119102191031910419105191061910719108191091911019111191121911319114191151911619117191181911919120191211912219123191241912519126191271912819129191301913119132191331913419135191361913719138191391914019141191421914319144191451914619147191481914919150191511915219153191541915519156191571915819159191601916119162191631916419165191661916719168191691917019171191721917319174191751917619177191781917919180191811918219183191841918519186191871918819189191901919119192191931919419195191961919719198191991920019201192021920319204192051920619207192081920919210192111921219213192141921519216192171921819219192201922119222192231922419225192261922719228192291923019231192321923319234192351923619237192381923919240192411924219243192441924519246192471924819249192501925119252192531925419255192561925719258192591926019261192621926319264192651926619267192681926919270192711927219273192741927519276192771927819279192801928119282192831928419285192861928719288192891929019291192921929319294192951929619297192981929919300193011930219303193041930519306193071930819309193101931119312193131931419315193161931719318193191932019321193221932319324193251932619327193281932919330193311933219333193341933519336193371933819339193401934119342193431934419345193461934719348193491935019351193521935319354193551935619357193581935919360193611936219363193641936519366193671936819369193701937119372193731937419375193761937719378193791938019381193821938319384193851938619387193881938919390193911939219393193941939519396193971939819399194001940119402194031940419405194061940719408194091941019411194121941319414194151941619417194181941919420194211
94221942319424194251942619427194281942919430194311943219433194341943519436194371943819439194401944119442194431944419445194461944719448194491945019451194521945319454194551945619457194581945919460194611946219463194641946519466194671946819469194701947119472194731947419475194761947719478194791948019481194821948319484194851948619487194881948919490194911949219493194941949519496194971949819499195001950119502195031950419505195061950719508195091951019511195121951319514195151951619517195181951919520195211952219523195241952519526195271952819529195301953119532195331953419535195361953719538195391954019541195421954319544195451954619547195481954919550195511955219553195541955519556195571955819559195601956119562195631956419565195661956719568195691957019571195721957319574195751957619577195781957919580195811958219583195841958519586195871958819589195901959119592195931959419595195961959719598195991960019601196021960319604196051960619607196081960919610196111961219613196141961519616196171961819619196201962119622196231962419625196261962719628196291963019631196321963319634196351963619637196381963919640196411964219643196441964519646196471964819649196501965119652196531965419655196561965719658196591966019661196621966319664196651966619667196681966919670196711967219673196741967519676196771967819679196801968119682196831968419685196861968719688196891969019691196921969319694196951969619697196981969919700197011970219703197041970519706197071970819709197101971119712197131971419715197161971719718197191972019721197221972319724197251972619727197281972919730197311973219733197341973519736197371973819739197401974119742197431974419745197461974719748197491975019751197521975319754197551975619757197581975919760197611976219763197641976519766197671976819769197701977119772197731977419775197761977719778197791978019781197821978319784197851978619787197881978919790197911979219793197941979519796197971979819799198001980119802198031980419805198061980719808198091981019811198121981319814198151981619817198181981919820198211
98221982319824198251982619827198281982919830198311983219833198341983519836198371983819839198401984119842198431984419845198461984719848198491985019851198521985319854198551985619857198581985919860198611986219863198641986519866198671986819869198701987119872198731987419875198761987719878198791988019881198821988319884198851988619887198881988919890198911989219893198941989519896198971989819899199001990119902199031990419905199061990719908199091991019911199121991319914199151991619917199181991919920199211992219923199241992519926199271992819929199301993119932199331993419935199361993719938199391994019941199421994319944199451994619947199481994919950199511995219953199541995519956199571995819959199601996119962199631996419965199661996719968199691997019971199721997319974199751997619977199781997919980199811998219983199841998519986199871998819989199901999119992199931999419995199961999719998199992000020001200022000320004200052000620007200082000920010200112001220013200142001520016200172001820019200202002120022200232002420025200262002720028200292003020031200322003320034200352003620037200382003920040200412004220043200442004520046200472004820049200502005120052200532005420055200562005720058200592006020061200622006320064200652006620067200682006920070200712007220073200742007520076200772007820079200802008120082200832008420085200862008720088200892009020091200922009320094200952009620097200982009920100201012010220103201042010520106201072010820109201102011120112201132011420115201162011720118201192012020121201222012320124201252012620127201282012920130201312013220133201342013520136201372013820139201402014120142201432014420145201462014720148201492015020151201522015320154201552015620157201582015920160201612016220163201642016520166201672016820169201702017120172201732017420175201762017720178201792018020181201822018320184201852018620187201882018920190201912019220193201942019520196201972019820199202002020120202202032020420205202062020720208202092021020211202122021320214202152021620217202182021920220202212
02222022320224202252022620227202282022920230202312023220233202342023520236202372023820239202402024120242202432024420245202462024720248202492025020251202522025320254202552025620257202582025920260202612026220263202642026520266202672026820269202702027120272202732027420275202762027720278202792028020281202822028320284202852028620287202882028920290202912029220293202942029520296202972029820299203002030120302203032030420305203062030720308203092031020311203122031320314203152031620317203182031920320203212032220323203242032520326203272032820329203302033120332203332033420335203362033720338203392034020341203422034320344203452034620347203482034920350203512035220353203542035520356203572035820359203602036120362203632036420365203662036720368203692037020371203722037320374203752037620377203782037920380203812038220383203842038520386203872038820389203902039120392203932039420395203962039720398203992040020401204022040320404204052040620407204082040920410204112041220413204142041520416204172041820419204202042120422204232042420425204262042720428204292043020431204322043320434204352043620437204382043920440204412044220443204442044520446204472044820449204502045120452204532045420455204562045720458204592046020461204622046320464204652046620467204682046920470204712047220473204742047520476204772047820479204802048120482204832048420485204862048720488204892049020491204922049320494204952049620497204982049920500205012050220503205042050520506205072050820509205102051120512205132051420515205162051720518205192052020521205222052320524205252052620527205282052920530205312053220533205342053520536205372053820539205402054120542205432054420545205462054720548205492055020551205522055320554205552055620557205582055920560205612056220563205642056520566205672056820569205702057120572205732057420575205762057720578205792058020581205822058320584205852058620587205882058920590205912059220593205942059520596205972059820599206002060120602206032060420605206062060720608206092061020611206122061320614206152061620617206182061920620206212
06222062320624206252062620627206282062920630206312063220633206342063520636206372063820639206402064120642206432064420645206462064720648206492065020651206522065320654206552065620657206582065920660206612066220663206642066520666206672066820669206702067120672206732067420675206762067720678206792068020681206822068320684206852068620687206882068920690206912069220693206942069520696206972069820699207002070120702207032070420705207062070720708207092071020711207122071320714207152071620717207182071920720207212072220723207242072520726207272072820729207302073120732207332073420735207362073720738207392074020741207422074320744207452074620747207482074920750207512075220753207542075520756207572075820759207602076120762207632076420765207662076720768207692077020771207722077320774207752077620777207782077920780207812078220783207842078520786207872078820789207902079120792207932079420795207962079720798207992080020801208022080320804208052080620807208082080920810208112081220813208142081520816208172081820819208202082120822208232082420825208262082720828208292083020831208322083320834208352083620837208382083920840208412084220843208442084520846208472084820849208502085120852208532085420855208562085720858208592086020861208622086320864208652086620867208682086920870208712087220873208742087520876208772087820879208802088120882208832088420885208862088720888208892089020891208922089320894208952089620897208982089920900209012090220903209042090520906209072090820909209102091120912209132091420915209162091720918209192092020921209222092320924209252092620927209282092920930209312093220933209342093520936209372093820939209402094120942209432094420945209462094720948209492095020951209522095320954209552095620957209582095920960209612096220963209642096520966209672096820969209702097120972209732097420975209762097720978209792098020981209822098320984209852098620987209882098920990209912099220993209942099520996209972099820999210002100121002210032100421005210062100721008210092101021011210122101321014210152101621017210182101921020210212
// NOTE(review): the six lines that stood here were an extraction artifact —
// a concatenated line-number gutter (21022…23025) produced by the tool that
// generated this listing. They were never part of the Vc source and carried
// no program content.
- #ifndef VC_VECTOR_H_
- #define VC_VECTOR_H_
- #ifndef VC_SCALAR_VECTOR_H_
- #define VC_SCALAR_VECTOR_H_
- #include <assert.h>
- #include <algorithm>
- #include <cmath>
- #ifdef _MSC_VER
- #include <float.h>
- #endif
- #ifndef VC_COMMON_TYPES_H_
- #define VC_COMMON_TYPES_H_
- #ifdef Vc_CHECK_ALIGNMENT
- #include <cstdlib>
- #include <cstdio>
- #endif
- #include <ratio>
- #ifndef VC_GLOBAL_H_
- #define VC_GLOBAL_H_
- #include <cstdint>
- #ifndef VC_FWDDECL_H_
- #define VC_FWDDECL_H_
- #include <cstddef>
- #define Vc_VERSIONED_NAMESPACE Vc_1
// Forward declarations of Vc's public types (mirrors <Vc/fwddecl.h>):
// declarations only, no definitions, so headers can name Vector/Mask/
// SimdArray without pulling in their implementations.
- namespace Vc_VERSIONED_NAMESPACE
- {
// Tag types naming the SIMD instruction-set backends ("ABIs").
- namespace VectorAbi
- {
- struct Scalar {};
- struct Sse {};
- struct Avx {};
- struct Mic {};
// Map an element type T to the compatible / best-available ABI
// (definitions live elsewhere in the library).
- template <class T> struct DeduceCompatible;
- template <class T> struct DeduceBest;
- }
- namespace Common
- {
// Chooses the vector type used to implement a SimdArray<T, N>
// (definition elsewhere).
- template <class T, std::size_t N> struct select_best_vector_type;
- }
- template <class T, class Abi> class Mask;
- template <class T, class Abi> class Vector;
// Fixed-size (N-lane) array of T, built from native vectors V of Wt lanes.
- template <class T, std::size_t N,
- class V = typename Common::select_best_vector_type<T, N>::type,
- std::size_t Wt = V::Size>
- class SimdArray;
- template <class T, std::size_t N,
- class V = typename Common::select_best_vector_type<T, N>::type,
- std::size_t Wt = V::Size>
- class SimdMaskArray;
// ABI tags spelled like the Parallelism TS (std::experimental::simd_abi).
- namespace simd_abi
- {
- using scalar = VectorAbi::Scalar;
- template <int N> struct fixed_size;
- template <class T> using compatible = typename VectorAbi::DeduceCompatible<T>::type;
- template <class T> using native = typename VectorAbi::DeduceBest<T>::type;
- using __sse = VectorAbi::Sse;
- using __avx = VectorAbi::Avx;
- struct __avx512;
- struct __neon;
- }
// TS-style aliases onto Vc's Vector/Mask templates.
- template <class T, class Abi = simd_abi::compatible<T>> using simd = Vector<T, Abi>;
- template <class T, class Abi = simd_abi::compatible<T>> using simd_mask = Mask<T, Abi>;
- template <class T> using native_simd = simd<T, simd_abi::native<T>>;
- template <class T> using native_simd_mask = simd_mask<T, simd_abi::native<T>>;
- template <class T, int N> using fixed_size_simd = simd<T, simd_abi::fixed_size<N>>;
- template <class T, int N>
- using fixed_size_simd_mask = simd_mask<T, simd_abi::fixed_size<N>>;
- }
// Publish the ABI-versioned namespace (Vc_1) under its public name `Vc`.
- #ifndef DOXYGEN
- namespace Vc = Vc_VERSIONED_NAMESPACE;
- #endif
- #endif
- #ifdef DOXYGEN
- #define Vc_ICC __INTEL_COMPILER_BUILD_DATE
- #undef Vc_ICC
- #define Vc_CLANG (__clang_major__ * 0x10000 + __clang_minor__ * 0x100 + __clang_patchlevel__)
- #undef Vc_CLANG
- #define Vc_APPLECLANG (__clang_major__ * 0x10000 + __clang_minor__ * 0x100 + __clang_patchlevel__)
- #undef Vc_APPLECLANG
- #define Vc_GCC (__GNUC__ * 0x10000 + __GNUC_MINOR__ * 0x100 + __GNUC_PATCHLEVEL__)
- #define Vc_MSVC _MSC_FULL_VER
- #undef Vc_MSVC
- #else
- #ifdef __INTEL_COMPILER
- #define Vc_ICC __INTEL_COMPILER_BUILD_DATE
- #elif defined(__clang__) && defined(__apple_build_version__)
- #define Vc_APPLECLANG (__clang_major__ * 0x10000 + __clang_minor__ * 0x100 + __clang_patchlevel__)
- #elif defined(__clang__)
- #define Vc_CLANG (__clang_major__ * 0x10000 + __clang_minor__ * 0x100 + __clang_patchlevel__)
- #elif defined(__GNUC__)
- #define Vc_GCC (__GNUC__ * 0x10000 + __GNUC_MINOR__ * 0x100 + __GNUC_PATCHLEVEL__)
- #elif defined(_MSC_VER)
- #define Vc_MSVC _MSC_FULL_VER
- #else
- #define Vc_UNSUPPORTED_COMPILER 1
- #endif
- #if defined Vc_GCC && Vc_GCC >= 0x60000
- #define Vc_RESET_DIAGNOSTICS _Pragma("GCC diagnostic pop")
- #pragma GCC diagnostic push
- #pragma GCC diagnostic ignored "-Wignored-attributes"
- #else
- #define Vc_RESET_DIAGNOSTICS
- #endif
- #if defined Vc_ICC
- #pragma warning disable 2922
- #endif
- #if __cplusplus < 201103 && (!defined Vc_MSVC || _MSC_VER < 1900)
- # error "Vc requires support for C++11."
- #elif __cplusplus >= 201402L
- #define Vc_CXX14 1
- # if __cplusplus > 201700L
- #define Vc_CXX17 1
- # endif
- #endif
- #if defined(__GNUC__) && !defined(Vc_NO_INLINE_ASM)
- #define Vc_GNU_ASM 1
- #endif
- #ifdef Vc_GCC
- #define Vc_HAVE_MAX_ALIGN_T 1
- #elif !defined(Vc_CLANG) && !defined(Vc_ICC)
- #define Vc_HAVE_STD_MAX_ALIGN_T 1
- #endif
- #if defined(Vc_GCC) || defined(Vc_CLANG) || defined Vc_APPLECLANG
- #define Vc_USE_BUILTIN_VECTOR_TYPES 1
- #endif
- #ifdef Vc_MSVC
- #define Vc_CDECL __cdecl
- #define Vc_VDECL __vectorcall
- #else
- #define Vc_CDECL
- #define Vc_VDECL
- #endif
- #define Scalar 0x00100000
- #define SSE 0x00200000
- #define SSE2 0x00300000
- #define SSE3 0x00400000
- #define SSSE3 0x00500000
- #define SSE4_1 0x00600000
- #define SSE4_2 0x00700000
- #define AVX 0x00800000
- #define AVX2 0x00900000
- #define XOP 0x00000001
- #define FMA4 0x00000002
- #define F16C 0x00000004
- #define POPCNT 0x00000008
- #define SSE4a 0x00000010
- #define FMA 0x00000020
- #define BMI2 0x00000040
- #define IMPL_MASK 0xFFF00000
- #define EXT_MASK 0x000FFFFF
- #ifdef Vc_MSVC
- # ifdef _M_IX86_FP
- # if _M_IX86_FP >= 1
- # ifndef __SSE__
- #define __SSE__ 1
- # endif
- # endif
- # if _M_IX86_FP >= 2
- # ifndef __SSE2__
- #define __SSE2__ 1
- # endif
- # endif
- # elif defined(_M_AMD64)
- # ifndef __SSE__
- #define __SSE__ 1
- # endif
- # ifndef __SSE2__
- #define __SSE2__ 1
- # endif
- # endif
- #endif
- #if defined Vc_ICC && !defined __POPCNT__
- # if defined __SSE4_2__ || defined __SSE4A__
- #define __POPCNT__ 1
- # endif
- #endif
- #ifdef VC_IMPL
- #error "You are using the old VC_IMPL macro. Since Vc 1.0 all Vc macros start with Vc_, i.e. a lower-case 'c'"
- #endif
- #ifndef Vc_IMPL
- # if defined(__AVX2__)
- #define Vc_IMPL_AVX2 1
- #define Vc_IMPL_AVX 1
- # elif defined(__AVX__)
- #define Vc_IMPL_AVX 1
- # else
- # if defined(__SSE4_2__)
- #define Vc_IMPL_SSE 1
- #define Vc_IMPL_SSE4_2 1
- # endif
- # if defined(__SSE4_1__)
- #define Vc_IMPL_SSE 1
- #define Vc_IMPL_SSE4_1 1
- # endif
- # if defined(__SSE3__)
- #define Vc_IMPL_SSE 1
- #define Vc_IMPL_SSE3 1
- # endif
- # if defined(__SSSE3__)
- #define Vc_IMPL_SSE 1
- #define Vc_IMPL_SSSE3 1
- # endif
- # if defined(__SSE2__)
- #define Vc_IMPL_SSE 1
- #define Vc_IMPL_SSE2 1
- # endif
- # if defined(Vc_IMPL_SSE)
- # else
- #define Vc_IMPL_Scalar 1
- # endif
- # endif
- # if !defined(Vc_IMPL_Scalar)
- # ifdef __FMA4__
- #define Vc_IMPL_FMA4 1
- # endif
- # ifdef __XOP__
- #define Vc_IMPL_XOP 1
- # endif
- # ifdef __F16C__
- #define Vc_IMPL_F16C 1
- # endif
- # ifdef __POPCNT__
- #define Vc_IMPL_POPCNT 1
- # endif
- # ifdef __SSE4A__
- #define Vc_IMPL_SSE4a 1
- # endif
- # ifdef __FMA__
- #define Vc_IMPL_FMA 1
- # endif
- # ifdef __BMI2__
- #define Vc_IMPL_BMI2 1
- # endif
- # endif
- #else
- # if (Vc_IMPL & IMPL_MASK) == AVX2
- #define Vc_IMPL_AVX2 1
- #define Vc_IMPL_AVX 1
- # elif (Vc_IMPL & IMPL_MASK) == AVX
- #define Vc_IMPL_AVX 1
- # elif (Vc_IMPL & IMPL_MASK) == Scalar
- #define Vc_IMPL_Scalar 1
- # elif (Vc_IMPL & IMPL_MASK) == SSE4_2
- #define Vc_IMPL_SSE4_2 1
- #define Vc_IMPL_SSE4_1 1
- #define Vc_IMPL_SSSE3 1
- #define Vc_IMPL_SSE3 1
- #define Vc_IMPL_SSE2 1
- #define Vc_IMPL_SSE 1
- # elif (Vc_IMPL & IMPL_MASK) == SSE4_1
- #define Vc_IMPL_SSE4_1 1
- #define Vc_IMPL_SSSE3 1
- #define Vc_IMPL_SSE3 1
- #define Vc_IMPL_SSE2 1
- #define Vc_IMPL_SSE 1
- # elif (Vc_IMPL & IMPL_MASK) == SSSE3
- #define Vc_IMPL_SSSE3 1
- #define Vc_IMPL_SSE3 1
- #define Vc_IMPL_SSE2 1
- #define Vc_IMPL_SSE 1
- # elif (Vc_IMPL & IMPL_MASK) == SSE3
- #define Vc_IMPL_SSE3 1
- #define Vc_IMPL_SSE2 1
- #define Vc_IMPL_SSE 1
- # elif (Vc_IMPL & IMPL_MASK) == SSE2
- #define Vc_IMPL_SSE2 1
- #define Vc_IMPL_SSE 1
- # elif (Vc_IMPL & IMPL_MASK) == SSE
- #define Vc_IMPL_SSE 1
- # if defined(__SSE4_2__)
- #define Vc_IMPL_SSE4_2 1
- # endif
- # if defined(__SSE4_1__)
- #define Vc_IMPL_SSE4_1 1
- # endif
- # if defined(__SSE3__)
- #define Vc_IMPL_SSE3 1
- # endif
- # if defined(__SSSE3__)
- #define Vc_IMPL_SSSE3 1
- # endif
- # if defined(__SSE2__)
- #define Vc_IMPL_SSE2 1
- # endif
- # elif (Vc_IMPL & IMPL_MASK) == 0 && (Vc_IMPL & SSE4a)
- #define Vc_IMPL_SSE3 1
- #define Vc_IMPL_SSE2 1
- #define Vc_IMPL_SSE 1
- # endif
- # if (Vc_IMPL & XOP)
- #define Vc_IMPL_XOP 1
- # endif
- # if (Vc_IMPL & FMA4)
- #define Vc_IMPL_FMA4 1
- # endif
- # if (Vc_IMPL & F16C)
- #define Vc_IMPL_F16C 1
- # endif
- # if (!defined(Vc_IMPL_Scalar) && defined(__POPCNT__)) || (Vc_IMPL & POPCNT)
- #define Vc_IMPL_POPCNT 1
- # endif
- # if (Vc_IMPL & SSE4a)
- #define Vc_IMPL_SSE4a 1
- # endif
- # if (Vc_IMPL & FMA)
- #define Vc_IMPL_FMA 1
- # endif
- # if (Vc_IMPL & BMI2)
- #define Vc_IMPL_BMI2 1
- # endif
- #undef Vc_IMPL
- #endif
- #ifdef __AVX__
- #define Vc_USE_VEX_CODING 1
- #endif
- #ifdef Vc_IMPL_AVX
- #define Vc_IMPL_SSE4_2 1
- #define Vc_IMPL_SSE4_1 1
- #define Vc_IMPL_SSSE3 1
- #define Vc_IMPL_SSE3 1
- #define Vc_IMPL_SSE2 1
- #define Vc_IMPL_SSE 1
- #endif
- #if defined(Vc_CLANG) && Vc_CLANG >= 0x30600 && Vc_CLANG < 0x30700
- # if defined(Vc_IMPL_AVX)
- # warning "clang 3.6.x miscompiles AVX code, frequently losing 50% of the data. Vc will fall back to SSE4 instead."
- #undef Vc_IMPL_AVX
- # if defined(Vc_IMPL_AVX2)
- #undef Vc_IMPL_AVX2
- # endif
- # endif
- #endif
- # if !defined(Vc_IMPL_Scalar) && !defined(Vc_IMPL_SSE) && !defined(Vc_IMPL_AVX)
- # error "No suitable Vc implementation was selected! Probably Vc_IMPL was set to an invalid value."
- # elif defined(Vc_IMPL_SSE) && !defined(Vc_IMPL_SSE2)
- # error "SSE requested but no SSE2 support. Vc needs at least SSE2!"
- # endif
- #undef Scalar
- #undef SSE
- #undef SSE2
- #undef SSE3
- #undef SSSE3
- #undef SSE4_1
- #undef SSE4_2
- #undef AVX
- #undef AVX2
- #undef XOP
- #undef FMA4
- #undef F16C
- #undef POPCNT
- #undef SSE4a
- #undef FMA
- #undef BMI2
- #undef IMPL_MASK
- #undef EXT_MASK
- #if defined Vc_IMPL_AVX2
- #define Vc_DEFAULT_IMPL_AVX2
- #elif defined Vc_IMPL_AVX
- #define Vc_DEFAULT_IMPL_AVX
- #elif defined Vc_IMPL_SSE
- #define Vc_DEFAULT_IMPL_SSE
- #elif defined Vc_IMPL_Scalar
- #define Vc_DEFAULT_IMPL_Scalar
- #else
- #error "Preprocessor logic broken. Please report a bug."
- #endif
- #endif
// Vc-local fixed-width integer typedefs (defined inside the Vc namespace,
// so Vc::int32_t etc. resolve without depending on <cstdint> names in the
// public interface) and the alignment options for Vc's allocator.
- namespace Vc_VERSIONED_NAMESPACE
- {
- typedef signed char int8_t;
- typedef unsigned char uint8_t;
- typedef signed short int16_t;
- typedef unsigned short uint16_t;
- typedef signed int int32_t;
- typedef unsigned int uint32_t;
- typedef signed long long int64_t;
- typedef unsigned long long uint64_t;
// Alignment request passed to Vc's allocation functions.
- enum MallocAlignment {
- AlignOnVector, // align on the vector-width boundary
- AlignOnCacheline, // align on a cache-line boundary
- AlignOnPage // align on a memory-page boundary
- };
// Identifies the SIMD instruction set code was compiled for.  A "feature
// word" packs one Implementation value in its low 12 bits and a set of
// ExtraInstructions flags in the remaining bits.
enum Implementation : std::uint_least32_t {
    ScalarImpl,
    SSE2Impl,
    SSE3Impl,
    SSSE3Impl,
    SSE41Impl,
    SSE42Impl,
    AVXImpl,
    AVX2Impl,
    MICImpl,
    ImplementationMask = 0xfff  // extracts the Implementation from a feature word
};
// Optional instruction-set extensions, one bit each, occupying the bits
// above ImplementationMask.
enum ExtraInstructions : std::uint_least32_t {
    Float16cInstructions = 0x01000,
    Fma4Instructions = 0x02000,
    XopInstructions = 0x04000,
    PopcntInstructions = 0x08000,
    Sse4aInstructions = 0x10000,
    FmaInstructions = 0x20000,
    VexInstructions = 0x40000,
    Bmi2Instructions = 0x80000,
    ExtraInstructionsMask = 0xfffff000u
};
// Compile-time queries against the feature word Features (an Implementation
// value plus any number of ExtraInstructions flags).
template <unsigned int Features> struct ImplementationT {
    /// The Implementation encoded in Features.
    static constexpr Implementation current()
    {
        return static_cast<Implementation>(ImplementationMask & Features);
    }
    /// True iff Features encodes exactly \p impl.
    static constexpr bool is(Implementation impl)
    {
        return current() == static_cast<unsigned int>(impl);
    }
    /// True iff the encoded implementation lies in the closed range
    /// [\p low, \p high].
    static constexpr bool is_between(Implementation low, Implementation high)
    {
        return current() >= static_cast<unsigned int>(low) &&
               current() <= static_cast<unsigned int>(high);
    }
    /// True iff hardware advertising \p extraInstructions can run this code,
    /// i.e. every extra-instruction flag required by Features is present in
    /// the argument.
    static constexpr bool runs_on(unsigned int extraInstructions)
    {
        return (Features & ExtraInstructionsMask & extraInstructions) ==
               (Features & ExtraInstructionsMask);
    }
};
// The feature word this translation unit is compiled for, assembled from
// the Vc_IMPL_* macros selected by the detection logic above.
- using CurrentImplementation = ImplementationT<
- #ifdef Vc_IMPL_Scalar
- ScalarImpl
- #elif defined(Vc_IMPL_AVX2)
- AVX2Impl
- #elif defined(Vc_IMPL_AVX)
- AVXImpl
- #elif defined(Vc_IMPL_SSE4_2)
- SSE42Impl
- #elif defined(Vc_IMPL_SSE4_1)
- SSE41Impl
- #elif defined(Vc_IMPL_SSSE3)
- SSSE3Impl
- #elif defined(Vc_IMPL_SSE3)
- SSE3Impl
- #elif defined(Vc_IMPL_SSE2)
- SSE2Impl
- #endif
// The nesting below records XOP only when SSE4a is also enabled, and FMA4
// only when XOP is — the flags are added hierarchically.
- #ifdef Vc_IMPL_SSE4a
- + Vc::Sse4aInstructions
- #ifdef Vc_IMPL_XOP
- + Vc::XopInstructions
- #ifdef Vc_IMPL_FMA4
- + Vc::Fma4Instructions
- #endif
- #endif
- #endif
- #ifdef Vc_IMPL_POPCNT
- + Vc::PopcntInstructions
- #endif
- #ifdef Vc_IMPL_FMA
- + Vc::FmaInstructions
- #endif
- #ifdef Vc_IMPL_BMI2
- + Vc::Bmi2Instructions
- #endif
- #ifdef Vc_USE_VEX_CODING
- + Vc::VexInstructions
- #endif
- >;
- }
#ifndef VC_VERSION_H_
#define VC_VERSION_H_
// Library version identification.  Vc_VERSION_NUMBER packs
// major/minor/patch as 0xMMmmPP with the patch level shifted left by one;
// the low bit appears to mark a development snapshot (0x010403 with
// "1.4.1-dev") — the packing is defined by Vc_VERSION_CHECK below.
#define Vc_VERSION_STRING "1.4.1-dev"
#define Vc_VERSION_NUMBER 0x010403
#define Vc_VERSION_CHECK(major,minor,patch) ((major << 16) | (minor << 8) | (patch << 1))
#define Vc_LIBRARY_ABI_VERSION 5
#define Vc_IS_VERSION_2 (Vc_VERSION_NUMBER >= Vc_VERSION_CHECK(1, 70, 0))
#define Vc_IS_VERSION_1 (Vc_VERSION_NUMBER < Vc_VERSION_CHECK(1, 70, 0))
namespace Vc_VERSIONED_NAMESPACE
{
/// Human-readable library version, e.g. "1.4.1-dev".
inline const char *versionString()
{
    return Vc_VERSION_STRING;
}
/// Packed numeric version; compare against Vc_VERSION_CHECK(...).
constexpr unsigned int versionNumber()
{
    return Vc_VERSION_NUMBER;
}
}
#endif
- #endif
- #ifndef VC_TRAITS_TYPE_TRAITS_H_
- #define VC_TRAITS_TYPE_TRAITS_H_
- #include <type_traits>
#ifndef VC_TRAITS_DECAY_H_
#define VC_TRAITS_DECAY_H_
namespace Vc_VERSIONED_NAMESPACE
{
namespace Traits
{
/// Shorthand for std::decay: strips references and cv-qualifiers and
/// applies array-to-pointer / function-to-pointer decay.
template <class T> using decay = typename std::decay<T>::type;
}  // namespace Traits
}  // namespace Vc_VERSIONED_NAMESPACE
#endif  // VC_TRAITS_DECAY_H_
#ifndef VC_TRAITS_HAS_NO_ALLOCATED_DATA_H_
#define VC_TRAITS_HAS_NO_ALLOCATED_DATA_H_
#include <array>
namespace Vc_VERSIONED_NAMESPACE
{
namespace Traits
{
// Defaults to false; specialized below for types known to keep all of
// their elements inside the object itself.
template <class T> struct has_no_allocated_data_impl : public std::false_type {};
// Whether T stores its data inline (no heap allocation): true for C arrays
// and std::array.  References and cv-qualifiers on T are ignored.
template <class T>
struct has_no_allocated_data
    : public has_no_allocated_data_impl<
          typename std::remove_cv<typename std::remove_reference<T>::type>::type>
{
};
template <class T, std::size_t N>
struct has_no_allocated_data_impl<std::array<T, N>> : public std::true_type {};
template <class T, std::size_t N>
struct has_no_allocated_data_impl<T[N]> : public std::true_type {};
template <class T>
struct has_no_allocated_data_impl<T[]> : public std::true_type {};
}  // namespace Traits
}  // namespace Vc_VERSIONED_NAMESPACE
#endif
#ifndef VC_TRAITS_HAS_CONTIGUOUS_STORAGE_H_
#define VC_TRAITS_HAS_CONTIGUOUS_STORAGE_H_
#include <initializer_list>
#include <memory>
// Forward-declare std::array / std::vector so the specializations below do
// not force inclusion of the full headers.  libc++ nests std in an inline
// versioning namespace, hence the macro dance; MSVC declares array as a
// class, other library implementations as a struct.
#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
_LIBCPP_BEGIN_NAMESPACE_STD
#else
namespace std
{
#endif
#ifdef _WIN32
template <typename T, size_t N> class array;
#else
template <typename T, size_t N> struct array;
#endif
template <typename T, typename Allocator> class vector;
#ifdef _LIBCPP_END_NAMESPACE_STD
_LIBCPP_END_NAMESPACE_STD
#else
}
#endif
namespace Vc_VERSIONED_NAMESPACE
{
namespace Traits
{
namespace has_contiguous_storage_detail
{
// Preferred probe: T is a container — classify via T::iterator.
template <typename T, typename It = typename T::iterator>
std::is_base_of<std::random_access_iterator_tag,
                typename std::iterator_traits<It>::iterator_category>
test(int);
// T is itself an iterator/pointer — classify it directly.
template <typename T>
std::is_base_of<std::random_access_iterator_tag,
                typename std::iterator_traits<T>::iterator_category>
test(long);
// Fallback: neither container nor iterator.
template <typename T> std::false_type test(...);
}  // namespace has_contiguous_storage_detail
template <typename T>
struct has_contiguous_storage_impl
    : public decltype(has_contiguous_storage_detail::test<T>(int())) {
};
// Approximates "T keeps its elements in contiguous memory".  NOTE(review):
// the generic probe only checks for random-access iterators, so a type like
// std::deque would be mis-classified as contiguous; the explicit
// specializations below cover the common known-contiguous types.
template <typename T>
struct has_contiguous_storage
    : public has_contiguous_storage_impl<
          typename std::remove_cv<typename std::remove_reference<T>::type>::type>
{
};
template <typename T> struct has_contiguous_storage_impl<const T *> : public std::true_type {};
template <typename T> struct has_contiguous_storage_impl<T *> : public std::true_type {};
template <typename T> struct has_contiguous_storage_impl<std::unique_ptr<T[]>> : public std::true_type {};
template <typename T> struct has_contiguous_storage_impl<std::initializer_list<T>> : public std::true_type {};
template <typename T, std::size_t N> struct has_contiguous_storage_impl<T[N]> : public std::true_type {};
template <typename T, std::size_t N> struct has_contiguous_storage_impl<std::array<T, N>> : public std::true_type {};
template <typename T, typename A> struct has_contiguous_storage_impl<std::vector<T, A>> : public std::true_type {};
}  // namespace Traits
}  // namespace Vc_VERSIONED_NAMESPACE
#endif
- #ifndef VC_TRAITS_IS_FUNCTOR_ARGUMENT_IMMUTABLE_H_
- #define VC_TRAITS_IS_FUNCTOR_ARGUMENT_IMMUTABLE_H_
// Determines whether invoking functor/function F with an argument of type A
// can mutate the caller's argument: true when the parameter is taken by
// value or as a reference to const, false for mutable (lvalue or rvalue)
// references to non-const A.
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace Traits
- {
- namespace is_functor_argument_immutable_impl
- {
// Classify by the type of &F::operator().  By-value parameters can never
// mutate the caller's argument ...
- template <typename F, typename A> std::true_type test(void (F::*)(A));
- template <typename F, typename A> std::true_type test(void (F::*)(A) const);
// ... while reference parameters are immutable only if A is const.
- template <typename F, typename A> std::is_const<A> test(void (F::*)(A &));
- template <typename F, typename A> std::is_const<A> test(void (F::*)(A &) const);
- template <typename F, typename A> std::is_const<A> test(void (F::*)(A &&));
- template <typename F, typename A> std::is_const<A> test(void (F::*)(A &&) const);
- struct dummy {};
// Preferred probe: F has a templated operator() (e.g. a generic lambda) —
// take its address instantiated for A.  MSVC rejects the `template`
// disambiguator in this position, hence the conditional Vc_TEMPLATE_ macro.
- template <
- typename F, typename A,
- #ifdef Vc_MSVC
- #define Vc_TEMPLATE_
- #else
- #define Vc_TEMPLATE_ template
- #endif
- typename MemberPtr = decltype(&F::Vc_TEMPLATE_ operator()<A>)>
- decltype(is_functor_argument_immutable_impl::test(std::declval<MemberPtr>())) test2(int);
- #undef Vc_TEMPLATE_
// Fallback probe (worse conversion int -> float): non-template operator().
- template <typename F, typename A>
- decltype(
- is_functor_argument_immutable_impl::test(std::declval<decltype(&F::operator())>()))
- test2(float);
// Same classification for plain function pointers.
- template <typename A> std::true_type test3(void(*)(A));
- template <typename A> std::is_const<A> test3(void(*)(A &));
- template <typename A> std::is_const<A> test3(void(*)(A &&));
- }
// Public trait; the third (bool) parameter dispatches between functor
// probes (false) and plain-function probes (true).
- template <typename F, typename A, bool = std::is_function<F>::value>
- struct is_functor_argument_immutable;
- template <typename F, typename A>
- struct is_functor_argument_immutable<F, A, false>
- : decltype(is_functor_argument_immutable_impl::test2<
- typename std::remove_reference<F>::type, A>(int())) {
- };
- template <typename F, typename A>
- struct is_functor_argument_immutable<F, A, true>
- : decltype(is_functor_argument_immutable_impl::test3(std::declval<F>())) {
- };
- }
- }
- #endif
#ifndef VC_TRAITS_IS_OUTPUT_ITERATOR_H_
#define VC_TRAITS_IS_OUTPUT_ITERATOR_H_
#include <iterator>
namespace Vc_VERSIONED_NAMESPACE
{
namespace Traits
{
namespace is_output_iterator_impl
{
// SFINAE probe: is assigning a value_type through the dereferenced
// iterator well-formed?
template <typename T,
          typename ValueType = typename std::iterator_traits<T>::value_type,
          typename = decltype(*std::declval<T &>() = std::declval<ValueType>())>
std::true_type test(int);
template <typename T> std::false_type test(...);
}  // namespace is_output_iterator_impl
// True if T can be written through like an output iterator.  Pure output
// iterators advertise value_type void — the assignment probe cannot apply
// to them, so they are accepted unconditionally.
template <typename T>
struct is_output_iterator
    : public std::conditional<
          std::is_void<typename std::iterator_traits<T>::value_type>::value,
          std::true_type, decltype(is_output_iterator_impl::test<T>(int()))>::type
{
};
static_assert(!std::is_void<std::iterator_traits<int *>::value_type>::value, "");
static_assert(is_output_iterator<int *>::value, "");
static_assert(!is_output_iterator<const int *>::value, "");
}  // namespace Traits
}  // namespace Vc_VERSIONED_NAMESPACE
#endif
- #ifndef VC_IS_INDEX_SEQUENCE_H_
- #define VC_IS_INDEX_SEQUENCE_H_
#ifndef VC_COMMON_INDEXSEQUENCE_H_
#define VC_COMMON_INDEXSEQUENCE_H_
namespace Vc_VERSIONED_NAMESPACE
{
/** C++11-compatible stand-in for std::index_sequence. */
template <std::size_t... I> struct index_sequence
{
    static constexpr std::size_t size() noexcept { return sizeof...(I); }
};
/** Builds index_sequence<0, ..., N-1> with O(log N) instantiation depth by
 * doubling: the sequence for N/2 is concatenated with a copy of itself
 * shifted by (N+1)/2, inserting the middle index when N is odd. */
template <std::size_t N> struct make_index_sequence_impl {
    // Even N: Is... followed by Is... shifted by Off.
    template <std::size_t Off, std::size_t... Is>
    static index_sequence<Is..., (Is + Off)...> join(std::false_type,
                                                     index_sequence<Is...>);
    // Odd N: additionally splice in the middle index Off - 1.
    template <std::size_t Off, std::size_t... Is>
    static index_sequence<Is..., Off - 1, (Is + Off)...> join(
        std::true_type, index_sequence<Is...>);
    using odd_tag = std::integral_constant<bool, N & 1>;
    using lower_half = typename make_index_sequence_impl<N / 2>::type;
    using type = decltype(join<(N + 1) / 2>(odd_tag(), lower_half()));
};
// Recursion anchors.
template <> struct make_index_sequence_impl<0> {
    using type = index_sequence<>;
};
template <> struct make_index_sequence_impl<1> {
    using type = index_sequence<0>;
};
template <> struct make_index_sequence_impl<2> {
    using type = index_sequence<0, 1>;
};
template <std::size_t N>
using make_index_sequence = typename make_index_sequence_impl<N>::type;
}
#endif
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace Traits
- {
- template <typename T> struct is_index_sequence : public std::false_type {};
- template <std::size_t... I>
- struct is_index_sequence<Vc::index_sequence<I...>> : public std::true_type {};
- static_assert(!is_index_sequence<int>::value, "");
- static_assert(is_index_sequence<make_index_sequence<2>>::value, "");
- }
- }
- #endif
#ifndef VC_TRAITS_IS_IMPLICIT_CAST_ALLOWED_H_
#define VC_TRAITS_IS_IMPLICIT_CAST_ALLOWED_H_
namespace Vc_VERSIONED_NAMESPACE
{
namespace Traits
{
// Integral From: allow the identity conversion plus the conversion that
// only flips signedness (same width) — i.e. To equals From, or To equals
// make_signed/make_unsigned of From.
template <typename From, typename To, bool = std::is_integral<From>::value>
struct is_implicit_cast_allowed
    : public std::integral_constant<
          bool,
          std::is_same<From, To>::value ||
              (std::is_integral<To>::value &&
               (std::is_same<typename std::make_unsigned<From>::type, To>::value ||
                std::is_same<typename std::make_signed<From>::type, To>::value))> {
};
// Non-integral From: only the identity conversion is implicit.
template <typename From, typename To>
struct is_implicit_cast_allowed<From, To, false> : public std::is_same<From, To>::type {
};
// Mask conversions follow the same rule as value conversions.
template <typename From, typename To>
struct is_implicit_cast_allowed_mask : public is_implicit_cast_allowed<From, To> {
};
}  // namespace Traits
}  // namespace Vc_VERSIONED_NAMESPACE
#endif
- namespace Vc_VERSIONED_NAMESPACE
- {
// Tag type used as the default result of Vc's enable_if, so that
// SFINAE-defaulted function parameters can be given the value `nullarg`.
- struct enable_if_default_type
- {
- constexpr enable_if_default_type() {}
- };
- static constexpr enable_if_default_type nullarg;
// Vc's enable_if alias: yields T (default enable_if_default_type) when Test
// is true, SFINAE failure otherwise.
- template <bool Test, typename T = enable_if_default_type> using enable_if = typename std::enable_if<Test, T>::type;
// Convenience aliases; NOTE(review): presumably local copies of the C++14
// std::conditional_t / C++20 std::remove_cvref_t because this header only
// requires C++11 (see the version check near the top of the file).
- template <bool B, class T, class F>
- using conditional_t = typename std::conditional<B, T, F>::type;
- template <class T>
- using remove_cvref_t =
- typename std::remove_cv<typename std::remove_reference<T>::type>::type;
- namespace Traits
- {
#ifndef VC_TRAITS_HAS_SUBSCRIPT_OPERATOR_H_
#define VC_TRAITS_HAS_SUBSCRIPT_OPERATOR_H_
namespace has_subscript_operator_impl
{
// SFINAE probe: is declval<T&>()[declval<I>()] well-formed?
template <typename T, typename I,
          typename = decltype(std::declval<T &>()[std::declval<I>()])>
std::true_type test(int);
template <typename T, typename I> std::false_type test(float);
}  // namespace has_subscript_operator_impl
/// True when t[i] is a valid expression for a T lvalue t and an I value i
/// (I defaults to std::size_t).
template <typename T, typename I = std::size_t>
struct has_subscript_operator
    : public decltype(has_subscript_operator_impl::test<T, I>(1))
{
};
#endif
#ifndef VC_TRAITS_HAS_MULTIPLY_OPERATOR_H_
#define VC_TRAITS_HAS_MULTIPLY_OPERATOR_H_
namespace has_multiply_operator_impl
{
// SFINAE probe: is declval<T>() * declval<U>() well-formed?
template <typename T, typename U,
          typename = decltype(std::declval<T>() * std::declval<U>())>
std::true_type test(int);
template <typename T, typename U> std::false_type test(...);
}  // namespace has_multiply_operator_impl
/// True when t * u is a valid expression (U defaults to T).
template <typename T, typename U = T>
struct has_multiply_operator
    : public decltype(has_multiply_operator_impl::test<T, U>(1))
{
};
#endif
#ifndef VC_TRAITS_HAS_ADDITION_OPERATOR_H_
#define VC_TRAITS_HAS_ADDITION_OPERATOR_H_
namespace has_addition_operator_impl
{
// SFINAE probe: is declval<T>() + declval<U>() well-formed?
template <typename T, typename U,
          typename = decltype(std::declval<T>() + std::declval<U>())>
std::true_type test(int);
template <typename T, typename U> std::false_type test(...);
}  // namespace has_addition_operator_impl
/// True when t + u is a valid expression (U defaults to T).
template <typename T, typename U = T>
struct has_addition_operator
    : public decltype(has_addition_operator_impl::test<T, U>(1))
{
};
#endif
- #ifndef VC_TRAITS_HAS_EQUALITY_OPERATOR_H_
- #define VC_TRAITS_HAS_EQUALITY_OPERATOR_H_
- namespace has_equality_operator_impl
- {
- template <typename T, typename U,
- typename = enable_if<!std::is_same<void, decltype(std::declval<T>() == std::declval<U>())>::value>>
- std::true_type test(int);
- template <typename T, typename U> std::false_type test(...);
- }
- template <typename T, typename U = T>
- struct has_equality_operator : public decltype(has_equality_operator_impl::test<T, U>(1))
- {
- };
- #endif
// Whitelist of the element types Vc vectors support.
template <typename T> struct is_valid_vector_argument : public std::false_type {};
template <> struct is_valid_vector_argument<double> : public std::true_type {};
template <> struct is_valid_vector_argument<float> : public std::true_type {};
template <> struct is_valid_vector_argument<int> : public std::true_type {};
template <> struct is_valid_vector_argument<unsigned int> : public std::true_type {};
template <> struct is_valid_vector_argument<short> : public std::true_type {};
template <> struct is_valid_vector_argument<unsigned short> : public std::true_type {};
// Internal detection tags, all false here; specialized to true_type for the
// corresponding Vc types elsewhere in the library.
template <typename T> struct is_simd_mask_internal : public std::false_type {};
template <typename T> struct is_simd_vector_internal : public std::false_type {};
template <typename T> struct is_simdarray_internal : public std::false_type {};
template <typename T> struct is_simd_mask_array_internal : public std::false_type {};
template <typename T> struct is_loadstoreflag_internal : public std::false_type {};
// Classification helpers that look through Vc vector types: when T is a Vc
// vector (second parameter true) the query applies to T::EntryType,
// otherwise to T itself.
template <typename T, bool = is_simd_vector_internal<T>::value> struct is_integral_internal;
template <typename T, bool = is_simd_vector_internal<T>::value> struct is_floating_point_internal;
template <typename T, bool = is_simd_vector_internal<T>::value> struct is_signed_internal;
template <typename T, bool = is_simd_vector_internal<T>::value> struct is_unsigned_internal;
// Fundamental types: defer to the std traits.
template <typename T> struct is_integral_internal<T, false> : public std::is_integral<T> {};
template <typename T> struct is_floating_point_internal<T, false> : public std::is_floating_point<T> {};
template <typename T> struct is_signed_internal<T, false> : public std::is_signed<T> {};
template <typename T> struct is_unsigned_internal<T, false> : public std::is_unsigned<T> {};
// Vc vector types: apply the std trait to the element type.
template <typename V> struct is_integral_internal<V, true> : public std::is_integral<typename V::EntryType> {};
template <typename V> struct is_floating_point_internal<V, true> : public std::is_floating_point<typename V::EntryType> {};
template <typename V> struct is_signed_internal<V, true> : public std::is_signed<typename V::EntryType> {};
template <typename V> struct is_unsigned_internal<V, true> : public std::is_unsigned<typename V::EntryType> {};
- template <typename T>
- struct is_arithmetic_internal
- : public std::integral_constant<
- bool,
- (is_floating_point_internal<T>::value || is_integral_internal<T>::value)>
- {
- };
- template <class T, class = void>
- struct vector_size_internal : std::integral_constant<std::size_t, 0> {
- };
- template <class T>
- struct vector_size_internal<T, decltype((void)(T::size() > 0))>
- : std::integral_constant<std::size_t, T::size()> {
- };
// Public trait: T (after decay) is any Vc mask type, incl. SimdMaskArray.
template <typename T>
struct is_simd_mask : public std::integral_constant<bool,
(is_simd_mask_internal<decay<T>>::value ||
is_simd_mask_array_internal<decay<T>>::value)>
{
};
// Public trait: T (after decay) is any Vc vector type, incl. SimdArray.
template <typename T>
struct is_simd_vector
: public std::integral_constant<bool,
(is_simd_vector_internal<decay<T>>::value ||
is_simdarray_internal<decay<T>>::value)>
{
};
template <typename T>
struct isSimdArray : public is_simdarray_internal<decay<T>>
{
};
template <typename T>
struct isSimdMaskArray : public is_simd_mask_array_internal<decay<T>>
{
};
template <typename T> struct is_load_store_flag : public is_loadstoreflag_internal<decay<T>> {};
// "Atomic" SimdArray traits: specialized elsewhere; false by default.
template <typename T> struct is_atomic_simdarray_internal : public std::false_type {};
template <typename T> using isAtomicSimdArray = is_atomic_simdarray_internal<decay<T>>;
template <typename T> struct is_atomic_simd_mask_array_internal : public std::false_type {};
template <typename T> using isAtomicSimdMaskArray = is_atomic_simd_mask_array_internal<decay<T>>;
// decay-applying public wrappers over the *_internal traits above.
template <typename T> struct simd_vector_size : public vector_size_internal<decay<T>> {};
template <typename T> struct is_integral : public is_integral_internal<decay<T>> {};
template <typename T> struct is_floating_point : public is_floating_point_internal<decay<T>> {};
template <typename T> struct is_arithmetic : public is_arithmetic_internal<decay<T>> {};
template <typename T> struct is_signed : public is_signed_internal<decay<T>> {};
template <typename T> struct is_unsigned : public is_unsigned_internal<decay<T>> {};
// scalar_type<T>: EntryType for SIMD vector types, T itself otherwise.
template <typename T, bool IsSimdVector> struct scalar_type_internal { using type = T; };
template <typename T> struct scalar_type_internal<T, true> { using type = typename T::EntryType; };
template <typename T> using scalar_type = typename scalar_type_internal<decay<T>, is_simd_vector<T>::value>::type;
}
}
#ifndef VC_TRAITS_ENTRY_TYPE_OF_H_
#define VC_TRAITS_ENTRY_TYPE_OF_H_
namespace Vc_VERSIONED_NAMESPACE
{
namespace Traits
{
namespace entry_type_of_internal
{
// entry_type<T>::type is T::EntryType for Vc SIMD vector types, and the
// cv- and reference-stripped T otherwise.
template <typename T, bool = Traits::is_simd_vector<T>::value> struct entry_type;
template <typename T> struct entry_type<T, true>
{
using type = typename decay<T>::EntryType;
};
template <typename T> struct entry_type<T, false>
{
using type = typename std::remove_cv<typename std::remove_reference<T>::type>::type;
};
}
// Convenience alias over entry_type_of_internal::entry_type.
template <typename T> using entry_type_of = typename entry_type_of_internal::entry_type<T>::type;
}
}
#endif
#endif
#ifndef VC_COMMON_PERMUTATION_H_
#define VC_COMMON_PERMUTATION_H_
#ifndef VC_COMMON_MACROS_H_
#define VC_COMMON_MACROS_H_
// Vc_ALIGNED_TYPEDEF(n, type, name): declare `name` as `type` with alignment
// n, spelled per compiler.
#ifdef Vc_MSVC
#define Vc_ALIGNED_TYPEDEF(n_,type_,new_type_) \
typedef __declspec(align(n_)) type_ new_type_
#elif __GNUC__
#define Vc_ALIGNED_TYPEDEF(n_,type_,new_type_) \
typedef type_ new_type_[[gnu::aligned(n_)]]
#else
// NOTE(review): alignas(sizeof(n_)) aligns to the size of the constant n_,
// not to n_ itself, and alignas on an alias-declaration is not standard C++
// — confirm whether this fallback branch is ever exercised.
#define Vc_ALIGNED_TYPEDEF(n_,type_,new_type_) \
using new_type_ alignas(sizeof(n_)) = type_
#endif
// Keep <windows.h> from defining min/max macros, which would break std::min,
// std::max, and same-named member functions.
#ifdef WIN32
#define NOMINMAX 1
#if defined min
#undef min
#endif
#if defined max
#undef max
#endif
#endif
#if defined Vc_GCC && Vc_GCC >= 0x60000
#define Vc_TEMPLATES_DROP_ATTRIBUTES 1
#endif
#if Vc_IS_VERSION_2 || (defined Vc_GCC && Vc_GCC >= 0x60000)
#define Vc_RECURSIVE_MEMORY 1
#endif
// Compiler-specific function attribute macros. Three branches: clang (and
// Apple clang), other GNU-compatible compilers, and everything else.
#if defined Vc_CLANG || defined Vc_APPLECLANG
#define Vc_UNREACHABLE __builtin_unreachable
#define Vc_NEVER_INLINE [[gnu::noinline]]
#define Vc_INTRINSIC_L inline
#define Vc_INTRINSIC_R __attribute__((always_inline))
#define Vc_INTRINSIC Vc_INTRINSIC_L Vc_INTRINSIC_R
#define Vc_FLATTEN
#define Vc_CONST __attribute__((const))
#define Vc_CONST_L
#define Vc_CONST_R Vc_CONST
#define Vc_PURE __attribute__((pure))
#define Vc_PURE_L
#define Vc_PURE_R Vc_PURE
#define Vc_MAY_ALIAS __attribute__((may_alias))
#define Vc_ALWAYS_INLINE_L inline
#define Vc_ALWAYS_INLINE_R __attribute__((always_inline))
#define Vc_ALWAYS_INLINE Vc_ALWAYS_INLINE_L Vc_ALWAYS_INLINE_R
#define Vc_IS_UNLIKELY(x) __builtin_expect(x, 0)
#define Vc_IS_LIKELY(x) __builtin_expect(x, 1)
#define Vc_RESTRICT __restrict__
#define Vc_DEPRECATED(msg)
#define Vc_DEPRECATED_ALIAS(msg)
#define Vc_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__))
#elif defined(__GNUC__)
#define Vc_UNREACHABLE __builtin_unreachable
// may_alias is dropped in unoptimized GCC builds (it is only needed for the
// aliasing analysis the optimizer performs).
# if defined Vc_GCC && !defined __OPTIMIZE__
#define Vc_MAY_ALIAS
# else
#define Vc_MAY_ALIAS __attribute__((__may_alias__))
# endif
#define Vc_INTRINSIC_R __attribute__((__always_inline__, __artificial__))
#define Vc_INTRINSIC_L inline
#define Vc_INTRINSIC Vc_INTRINSIC_L Vc_INTRINSIC_R
#define Vc_FLATTEN __attribute__((__flatten__))
#define Vc_ALWAYS_INLINE_L inline
#define Vc_ALWAYS_INLINE_R __attribute__((__always_inline__))
#define Vc_ALWAYS_INLINE Vc_ALWAYS_INLINE_L Vc_ALWAYS_INLINE_R
// ICC accepts the GNU syntax but these attributes are disabled for it.
# ifdef Vc_ICC
#define Vc_PURE
#define Vc_CONST
#define Vc_NEVER_INLINE
# else
#define Vc_NEVER_INLINE [[gnu::noinline]]
#define Vc_PURE __attribute__((__pure__))
#define Vc_CONST __attribute__((__const__))
# endif
#define Vc_CONST_L
#define Vc_CONST_R Vc_CONST
#define Vc_PURE_L
#define Vc_PURE_R Vc_PURE
#define Vc_IS_UNLIKELY(x) __builtin_expect(x, 0)
#define Vc_IS_LIKELY(x) __builtin_expect(x, 1)
#define Vc_RESTRICT __restrict__
# ifdef Vc_ICC
#define Vc_DEPRECATED(msg)
#define Vc_DEPRECATED_ALIAS(msg)
# else
#define Vc_DEPRECATED(msg) __attribute__((__deprecated__(msg)))
#define Vc_DEPRECATED_ALIAS(msg) __attribute__((__deprecated__(msg)))
# endif
#define Vc_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__))
#else
// Non-GNU compilers: MSVC gets __forceinline/__declspec spellings, anything
// else gets empty no-op definitions.
#define Vc_NEVER_INLINE
#define Vc_FLATTEN
# ifdef Vc_PURE
#undef Vc_PURE
# endif
#define Vc_MAY_ALIAS
# ifdef Vc_MSVC
#define Vc_ALWAYS_INLINE inline __forceinline
#define Vc_ALWAYS_INLINE_L Vc_ALWAYS_INLINE
#define Vc_ALWAYS_INLINE_R
#define Vc_CONST __declspec(noalias)
#define Vc_CONST_L Vc_CONST
#define Vc_CONST_R
#define Vc_PURE
#define Vc_PURE_L Vc_PURE
#define Vc_PURE_R
#define Vc_INTRINSIC inline __forceinline
#define Vc_INTRINSIC_L Vc_INTRINSIC
#define Vc_INTRINSIC_R
namespace Vc_VERSIONED_NAMESPACE {
namespace detail
{
// MSVC spelling of "unreachable": __assume(0).
static Vc_INTRINSIC void unreachable() { __assume(0); }
}
}
#define Vc_UNREACHABLE Vc::detail::unreachable
# else
#define Vc_ALWAYS_INLINE
#define Vc_ALWAYS_INLINE_L
#define Vc_ALWAYS_INLINE_R
#define Vc_CONST
#define Vc_CONST_L
#define Vc_CONST_R
#define Vc_PURE
#define Vc_PURE_L
#define Vc_PURE_R
#define Vc_INTRINSIC
#define Vc_INTRINSIC_L
#define Vc_INTRINSIC_R
#define Vc_UNREACHABLE std::abort
# endif
#define Vc_IS_UNLIKELY(x) x
#define Vc_IS_LIKELY(x) x
#define Vc_RESTRICT __restrict
#define Vc_DEPRECATED(msg) __declspec(deprecated(msg))
#define Vc_DEPRECATED_ALIAS(msg)
#define Vc_WARN_UNUSED_RESULT
#endif
// With C++14 available, prefer the standard [[deprecated]] attribute.
#ifdef Vc_CXX14
#undef Vc_DEPRECATED
#define Vc_DEPRECATED(msg_) [[deprecated(msg_)]]
#endif
// Expands to a harmless declaration so a macro invocation can be followed by
// a semicolon without triggering -Wextra-semi style warnings.
#define Vc_NOTHING_EXPECTING_SEMICOLON static_assert(true, "")
// Defines the full set of operator new/delete overloads for a class whose
// instances must be allocated with `align_` byte alignment: scalar and array
// forms allocate via Vc::Common::aligned_malloc and free via Vc::Common::free;
// placement forms are forwarded unchanged.
#define Vc_FREE_STORE_OPERATORS_ALIGNED(align_) \
\
\
\
Vc_ALWAYS_INLINE void *operator new(size_t size) \
{ \
return Vc::Common::aligned_malloc<align_>(size); \
} \
\
Vc_ALWAYS_INLINE void *operator new(size_t, void *p) { return p; } \
\
Vc_ALWAYS_INLINE void *operator new[](size_t size) \
{ \
return Vc::Common::aligned_malloc<align_>(size); \
} \
\
Vc_ALWAYS_INLINE void *operator new[](size_t, void *p) { return p; } \
\
Vc_ALWAYS_INLINE void operator delete(void *ptr, size_t) { Vc::Common::free(ptr); } \
\
Vc_ALWAYS_INLINE void operator delete(void *, void *) {} \
\
Vc_ALWAYS_INLINE void operator delete[](void *ptr, size_t) \
{ \
Vc::Common::free(ptr); \
} \
\
Vc_ALWAYS_INLINE void operator delete[](void *, void *) {} \
\
Vc_NOTHING_EXPECTING_SEMICOLON
// Vc_ASSERT is user-overridable; by default it maps to <assert.h> assert and
// compiles away under NDEBUG.
#ifdef Vc_ASSERT
#define Vc_EXTERNAL_ASSERT 1
#else
#ifdef NDEBUG
#define Vc_ASSERT(x)
#else
#include <assert.h>
#define Vc_ASSERT(x) assert(x);
#endif
#endif
// __has_builtin is only reliably available on clang.
#if defined Vc_CLANG || defined Vc_APPLECLANG
#define Vc_HAS_BUILTIN(x) __has_builtin(x)
#else
#define Vc_HAS_BUILTIN(x) 0
#endif
// Token-pasting helpers (two-level so macro arguments get expanded first).
#define Vc_CAT_HELPER_(a,b,c,d) a ##b ##c ##d
#define Vc_CAT(a,b,c,d) Vc_CAT_HELPER_(a, b, c, d)
#define Vc_CAT_IMPL(a,b) a ##b
#define Vc_CAT2(a,b) Vc_CAT_IMPL(a, b)
// Arity adapters: invoke `macro` with the first 1..5 of the bundled args.
#define Vc_APPLY_IMPL_1_(macro,a,b,c,d,e) macro(a)
#define Vc_APPLY_IMPL_2_(macro,a,b,c,d,e) macro(a, b)
#define Vc_APPLY_IMPL_3_(macro,a,b,c,d,e) macro(a, b, c)
#define Vc_APPLY_IMPL_4_(macro,a,b,c,d,e) macro(a, b, c, d)
#define Vc_APPLY_IMPL_5_(macro,a,b,c,d,e) macro(a, b, c, d, e)
// X-macro lists: expand `macro` once per vector type resp. per operator.
#define Vc_LIST_FLOAT_VECTOR_TYPES(size,macro,a,b,c,d) \
size(macro, double_v, a, b, c, d) \
size(macro, float_v, a, b, c, d)
#define Vc_LIST_INT_VECTOR_TYPES(size,macro,a,b,c,d) \
size(macro, int_v, a, b, c, d) \
size(macro, uint_v, a, b, c, d) \
size(macro, short_v, a, b, c, d) \
size(macro, ushort_v, a, b, c, d)
#define Vc_LIST_VECTOR_TYPES(size,macro,a,b,c,d) \
Vc_LIST_FLOAT_VECTOR_TYPES(size, macro, a, b, c, d) \
Vc_LIST_INT_VECTOR_TYPES(size, macro, a, b, c, d)
#define Vc_LIST_COMPARES(size,macro,a,b,c,d) \
size(macro, ==, a, b, c, d) \
size(macro, !=, a, b, c, d) \
size(macro, <=, a, b, c, d) \
size(macro, >=, a, b, c, d) \
size(macro, < , a, b, c, d) \
size(macro, > , a, b, c, d)
#define Vc_LIST_LOGICAL(size,macro,a,b,c,d) \
size(macro, &&, a, b, c, d) \
size(macro, ||, a, b, c, d)
#define Vc_LIST_BINARY(size,macro,a,b,c,d) \
size(macro, |, a, b, c, d) \
size(macro, &, a, b, c, d) \
size(macro, ^, a, b, c, d)
#define Vc_LIST_SHIFTS(size,macro,a,b,c,d) \
size(macro, <<, a, b, c, d) \
size(macro, >>, a, b, c, d)
#define Vc_LIST_ARITHMETICS(size,macro,a,b,c,d) \
size(macro, +, a, b, c, d) \
size(macro, -, a, b, c, d) \
size(macro, *, a, b, c, d) \
size(macro, /, a, b, c, d) \
size(macro, %, a, b, c, d)
// Apply a macro over a list with 0..4 extra bound arguments.
#define Vc_APPLY_0(_list,macro) _list(Vc_APPLY_IMPL_1_, macro, 0, 0, 0, 0) Vc_NOTHING_EXPECTING_SEMICOLON
#define Vc_APPLY_1(_list,macro,a) _list(Vc_APPLY_IMPL_2_, macro, a, 0, 0, 0) Vc_NOTHING_EXPECTING_SEMICOLON
#define Vc_APPLY_2(_list,macro,a,b) _list(Vc_APPLY_IMPL_3_, macro, a, b, 0, 0) Vc_NOTHING_EXPECTING_SEMICOLON
#define Vc_APPLY_3(_list,macro,a,b,c) _list(Vc_APPLY_IMPL_4_, macro, a, b, c, 0) Vc_NOTHING_EXPECTING_SEMICOLON
#define Vc_APPLY_4(_list,macro,a,b,c,d) _list(Vc_APPLY_IMPL_5_, macro, a, b, c, d) Vc_NOTHING_EXPECTING_SEMICOLON
#define Vc_ALL_COMPARES(macro) Vc_APPLY_0(Vc_LIST_COMPARES, macro)
#define Vc_ALL_LOGICAL(macro) Vc_APPLY_0(Vc_LIST_LOGICAL, macro)
#define Vc_ALL_BINARY(macro) Vc_APPLY_0(Vc_LIST_BINARY, macro)
#define Vc_ALL_SHIFTS(macro) Vc_APPLY_0(Vc_LIST_SHIFTS, macro)
#define Vc_ALL_ARITHMETICS(macro) Vc_APPLY_0(Vc_LIST_ARITHMETICS, macro)
#define Vc_ALL_FLOAT_VECTOR_TYPES(macro) Vc_APPLY_0(Vc_LIST_FLOAT_VECTOR_TYPES, macro)
#define Vc_ALL_VECTOR_TYPES(macro) Vc_APPLY_0(Vc_LIST_VECTOR_TYPES, macro)
#define Vc_EXACT_TYPE(_test,_reference,_type) \
typename std::enable_if<std::is_same<_test, _reference>::value, _type>::type
#define Vc_make_unique(name) Vc_CAT(Vc_,name,_,__LINE__)
// Some compilers miscompile/reject offsetof on these types; fall back to the
// classic pointer-arithmetic emulation there.
#if defined(Vc_ICC) || defined(Vc_CLANG) || defined Vc_APPLECLANG
#define Vc_OFFSETOF(Type,member) (reinterpret_cast<const char *>(&reinterpret_cast<const Type *>(0)->member) - reinterpret_cast<const char *>(0))
#else
#define Vc_OFFSETOF(Type,member) offsetof(Type, member)
#endif
#if defined(Vc_NO_NOEXCEPT)
#define Vc_NOEXCEPT throw()
#else
#define Vc_NOEXCEPT noexcept
#endif
// Vc_NO_ALWAYS_INLINE: user escape hatch demoting forced inlining to plain
// `inline` (useful for debugging).
#ifdef Vc_NO_ALWAYS_INLINE
#undef Vc_ALWAYS_INLINE
#undef Vc_ALWAYS_INLINE_L
#undef Vc_ALWAYS_INLINE_R
#define Vc_ALWAYS_INLINE inline
#define Vc_ALWAYS_INLINE_L inline
#define Vc_ALWAYS_INLINE_R
#undef Vc_INTRINSIC
#undef Vc_INTRINSIC_L
#undef Vc_INTRINSIC_R
#define Vc_INTRINSIC inline
#define Vc_INTRINSIC_L inline
#define Vc_INTRINSIC_R
#endif
#endif
namespace Vc_VERSIONED_NAMESPACE
{
// Tag type and constant for requesting reversed element order (passed to
// APIs that accept a permutation tag).
namespace Permutation
{
struct ReversedTag {};
constexpr ReversedTag Reversed{};
}
}
#endif
namespace Vc_VERSIONED_NAMESPACE
{
using std::size_t;
// Short spellings for the builtin integer types, used throughout Vc.
using llong = long long;
using ullong = unsigned long long;
using ulong = unsigned long;
using uint = unsigned int;
using ushort = unsigned short;
using uchar = unsigned char;
using schar = signed char;
// Tag types and matching constants selecting special Vector initialization
// (zero fill, one fill, and 0,1,2,... — cf. the deprecated Vector::Zero/
// One/IndexesFromZero factories defined later in this file).
struct VectorSpecialInitializerZero {};
struct VectorSpecialInitializerOne {};
struct VectorSpecialInitializerIndexesFromZero {};
constexpr VectorSpecialInitializerZero Zero = {};
constexpr VectorSpecialInitializerOne One = {};
constexpr VectorSpecialInitializerIndexesFromZero IndexesFromZero = {};
namespace Detail
{
// MayAliasImpl<T>::type is T with the may_alias attribute attached, which
// keeps strict-aliasing optimizations from miscompiling the reinterpret
// casts done by aliasing_cast below. GCC warns when the attribute appears
// on a template-dependent typedef; the pragmas silence that warning.
template<typename T> struct MayAliasImpl {
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wattributes"
#endif
typedef T type Vc_MAY_ALIAS;
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
};
}
// MayAlias<T>: alias-safe spelling of T. ICC supports the attribute directly
// on an alias template; other compilers go through MayAliasImpl.
#ifdef Vc_ICC
template <typename T> using MayAlias [[gnu::may_alias]] = T;
#else
template <typename T> using MayAlias = typename Detail::MayAliasImpl<T>::type;
#endif
- template <class To, class From> MayAlias<To> &aliasing_cast(From &x)
- {
- return *reinterpret_cast<MayAlias<To> *>(&x);
- }
- template <class To, class From> const MayAlias<To> &aliasing_cast(const From &x)
- {
- return *reinterpret_cast<const MayAlias<To> *>(&x);
- }
- template <class To, class From> MayAlias<To> *aliasing_cast(From *x)
- {
- return reinterpret_cast<MayAlias<To> *>(x);
- }
- template <class To, class From> const MayAlias<To> *aliasing_cast(const From *x)
- {
- return reinterpret_cast<const MayAlias<To> *>(x);
- }
// Enumerates every C++ operator that Vc's expression forwarding machinery
// may need to identify. Char-sized; enumerator order is part of the
// internal ABI, do not reorder.
enum class Operator : char {
// assignment and compound assignment
Assign,
Multiply,
MultiplyAssign,
Divide,
DivideAssign,
Remainder,
RemainderAssign,
Plus,
PlusAssign,
Minus,
MinusAssign,
RightShift,
RightShiftAssign,
LeftShift,
LeftShiftAssign,
And,
AndAssign,
Xor,
XorAssign,
Or,
OrAssign,
// increment / decrement
PreIncrement,
PostIncrement,
PreDecrement,
PostDecrement,
// logical, comma, unary
LogicalAnd,
LogicalOr,
Comma,
UnaryPlus,
UnaryMinus,
UnaryNot,
UnaryOnesComplement,
// comparisons
CompareEqual,
CompareNotEqual,
CompareLess,
CompareGreater,
CompareLessEqual,
CompareGreaterEqual
};
template <typename T, std::size_t N> struct array;
namespace Common {
template <typename T, std::ptrdiff_t N> class span;
}
// Debugging aid enabled via Vc_CHECK_ALIGNMENT: aborts with a message when a
// pointer does not satisfy alignof(_T). Compiles to nothing by default.
#ifndef Vc_CHECK_ALIGNMENT
template<typename _T> static Vc_ALWAYS_INLINE void assertCorrectAlignment(const _T *){}
#else
template<typename _T> static Vc_ALWAYS_INLINE void assertCorrectAlignment(const _T *ptr)
{
const size_t s = alignof(_T);
// (s ^ (s & (s - 1))) isolates the lowest set bit of s; the test checks
// that ptr is a multiple of that power of two.
if((reinterpret_cast<size_t>(ptr) & ((s ^ (s & (s - 1))) - 1)) != 0) {
fprintf(stderr, "A vector with incorrect alignment has just been created. Look at the stacktrace to find the guilty object.\n");
abort();
}
}
#endif
namespace Common
{
template <typename T, std::size_t Pieces, std::size_t Index> struct Segment;
// Index-vector stand-in describing evenly strided indices: entry i maps to
// m_first + i * StructSize (StructSize is presumably the element stride of
// an array-of-structs — confirm at the use sites).
template<size_t StructSize> class SuccessiveEntries
{
#ifdef Vc_MSVC
using size_type = unsigned;
#else
using size_type = size_t;
#endif
const size_type m_first;
public:
typedef SuccessiveEntries AsArg;
Vc_INTRINSIC SuccessiveEntries(size_type first) : m_first(first) {}
// Index of the offset-th entry.
Vc_INTRINSIC Vc_PURE size_type operator[](size_type offset) const
{
return m_first + offset * StructSize;
}
Vc_INTRINSIC Vc_PURE size_type data() const { return m_first; }
// Arithmetic combines the start offsets only; the stride stays StructSize.
Vc_INTRINSIC Vc_PURE SuccessiveEntries operator+(const SuccessiveEntries &rhs) const
{
return SuccessiveEntries(m_first + rhs.m_first);
}
Vc_INTRINSIC Vc_PURE SuccessiveEntries operator*(const SuccessiveEntries &rhs) const
{
return SuccessiveEntries(m_first * rhs.m_first);
}
Vc_INTRINSIC Vc_PURE SuccessiveEntries operator<<(size_type x) const
{
return {m_first << x};
}
// internal_data() on a SuccessiveEntries is the object itself.
friend Vc_INTRINSIC SuccessiveEntries &internal_data(SuccessiveEntries &x)
{
return x;
}
friend Vc_INTRINSIC const SuccessiveEntries &internal_data(const SuccessiveEntries &x)
{
return x;
}
};
// Aligned free-store helpers (defined elsewhere in Vc).
template <std::size_t alignment>
Vc_INTRINSIC_L void *aligned_malloc(std::size_t n) Vc_INTRINSIC_R;
Vc_ALWAYS_INLINE_L void free(void *p) Vc_ALWAYS_INLINE_R;
// SFINAE gates for mask constructors: enabled when U is a (different) Vc
// mask whose entry type converts implicitly — resp. only explicitly — to T.
template <typename Mask, typename T, typename U>
using enable_if_mask_converts_implicitly =
enable_if<(!std::is_same<Mask, Traits::decay<U>>::value &&
Traits::is_simd_mask<U>::value && !Traits::isSimdMaskArray<U>::value &&
Traits::is_implicit_cast_allowed_mask<
Traits::entry_type_of<typename Traits::decay<U>::Vector>, T>::value)>;
template <typename T, typename U>
using enable_if_mask_converts_explicitly = enable_if<(
Traits::isSimdMaskArray<U>::value ||
(Traits::is_simd_mask<U>::value &&
!Traits::is_implicit_cast_allowed_mask<
Traits::entry_type_of<typename Traits::decay<U>::Vector>, T>::value))>;
// WidthT<T>: sizeof(T) packaged as an integral_constant.
template <typename T> using WidthT = std::integral_constant<std::size_t, sizeof(T)>;
template <std::size_t Bytes> class MaskBool;
template <typename T, typename IndexVector, typename Scale, bool>
class SubscriptOperation;
- template <class T, class IndexVector, int Scale = 1>
- struct GatherArguments {
- static_assert(std::is_same<T, remove_cvref_t<T>>::value && !std::is_pointer<T>::value,
- "GatherArguments expects an cv unqualified non-ref/ptr type");
- const IndexVector indexes;
- const T *const address;
- };
- template <int Scale, class T, class I>
- GatherArguments<T, I, Scale> make_gather(const T *m, const I &i)
- {
- return {i, m};
- }
- template <typename T, typename IndexVector> struct ScatterArguments
- {
- const IndexVector indexes;
- T *const address;
- };
// Compile-time-unrolled loop: invokes f(Begin), f(Begin+1), ..., f(End-1).
// The Begin >= End overload terminates the recursion.
template <typename I, I Begin, I End, typename F>
Vc_INTRINSIC enable_if<(Begin >= End), void> unrolled_loop(F &&)
{
}
template <typename I, I Begin, I End, typename F>
Vc_INTRINSIC Vc_FLATTEN enable_if<(Begin < End), void> unrolled_loop(F &&f)
{
f(Begin);
// f is deliberately passed as an lvalue: the same callable is reused by
// every recursion step, so it must not be moved from.
unrolled_loop<I, Begin + 1, End>(f);
}
// Convenience wrapper: f(0), f(1), ..., f(Size-1) with std::size_t indices.
template <std::size_t Size, typename F> Vc_INTRINSIC void for_all_vector_entries(F &&f)
{
unrolled_loop<std::size_t, 0u, Size>(std::forward<F>(f));
}
}
}
#ifndef VC_COMMON_VECTOR_H_
#define VC_COMMON_VECTOR_H_
#include <ratio>
#ifndef VC_COMMON_ELEMENTREFERENCE_H_
#define VC_COMMON_ELEMENTREFERENCE_H_
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
// Proxy returned by operator[] of vector/mask types: all reads and writes go
// through Accessor::get / Accessor::set so the container can keep elements
// in packed form. The mutating operators are rvalue-qualified (&&) so a
// proxy cannot be stored and written through after the full expression that
// produced it; copying is deleted, moving allowed.
template <typename U, typename Accessor = U> class ElementReference
{
friend U;
friend Accessor;
Vc_INTRINSIC ElementReference(U &o, int i) noexcept : index(i), obj(o) {}
// Helpers for propagating the Accessor's noexcept into the operators below.
static constexpr bool get_noexcept =
noexcept(Accessor::get(std::declval<U &>(), int()));
template <typename T> static constexpr bool set_noexcept()
{
return noexcept(Accessor::set(std::declval<U &>(), int(), std::declval<T>()));
}
public:
using value_type = typename U::value_type;
Vc_INTRINSIC ElementReference(const ElementReference &) = delete;
Vc_INTRINSIC ElementReference(ElementReference &&) = default;
// Read: convert the proxy to the stored element value.
Vc_INTRINSIC operator value_type() const noexcept(get_noexcept)
{
return Accessor::get(obj, index);
}
// Write: forward the assigned value to the accessor.
template <typename T>
Vc_INTRINSIC ElementReference &operator=(T &&x) &&
noexcept(noexcept(Accessor::set(std::declval<U &>(), int(), std::declval<T>())))
{
Accessor::set(obj, index, std::forward<T>(x));
return *this;
}
// Compound assignment (+=, <<=, &=, ...): read-modify-write via the
// accessor; instantiated for all arithmetic, shift, and bitwise operators
// by the Vc_ALL_* x-macros below.
#define Vc_OP_(op_) \
template <typename T, typename R = decltype(std::declval<const value_type &>() \
op_ std::declval<T>())> \
Vc_INTRINSIC ElementReference &operator op_##=(T &&x) && \
noexcept(get_noexcept && noexcept(Accessor::set(std::declval<U &>(), int(), \
std::declval<R &&>()))) \
{ \
const value_type &lhs = Accessor::get(obj, index); \
Accessor::set(obj, index, lhs op_ std::forward<T>(x)); \
return *this; \
}
Vc_ALL_ARITHMETICS(Vc_OP_);
Vc_ALL_SHIFTS(Vc_OP_);
Vc_ALL_BINARY(Vc_OP_);
#undef Vc_OP_
// Increment/decrement, also as read-modify-write round trips.
template <typename = void>
Vc_INTRINSIC ElementReference &operator++() &&
noexcept(noexcept(std::declval<value_type &>() =
Accessor::get(std::declval<U &>(), int())) &&
set_noexcept<decltype(++std::declval<value_type &>())>())
{
value_type x = Accessor::get(obj, index);
Accessor::set(obj, index, ++x);
return *this;
}
template <typename = void>
Vc_INTRINSIC value_type operator++(int) &&
noexcept(noexcept(std::declval<value_type &>() =
Accessor::get(std::declval<U &>(), int())) &&
set_noexcept<decltype(std::declval<value_type &>()++)>())
{
const value_type r = Accessor::get(obj, index);
value_type x = r;
Accessor::set(obj, index, ++x);
return r;
}
template <typename = void>
Vc_INTRINSIC ElementReference &operator--() &&
noexcept(noexcept(std::declval<value_type &>() =
Accessor::get(std::declval<U &>(), int())) &&
set_noexcept<decltype(--std::declval<value_type &>())>())
{
value_type x = Accessor::get(obj, index);
Accessor::set(obj, index, --x);
return *this;
}
template <typename = void>
Vc_INTRINSIC value_type operator--(int) &&
noexcept(noexcept(std::declval<value_type &>() =
Accessor::get(std::declval<U &>(), int())) &&
set_noexcept<decltype(std::declval<value_type &>()--)>())
{
const value_type r = Accessor::get(obj, index);
value_type x = r;
Accessor::set(obj, index, --x);
return r;
}
// swap overloads so std::swap-style code works on proxies — proxy/proxy
// and proxy/plain-value combinations.
friend void swap(ElementReference &&a, ElementReference &&b) {
value_type tmp(a);
static_cast<ElementReference &&>(a) = static_cast<value_type>(b);
static_cast<ElementReference &&>(b) = tmp;
}
friend void swap(value_type &a, ElementReference &&b) {
value_type tmp(a);
a = static_cast<value_type>(b);
static_cast<ElementReference &&>(b) = tmp;
}
friend void swap(ElementReference &&a, value_type &b) {
value_type tmp(a);
static_cast<ElementReference &&>(a) = b;
b = tmp;
}
private:
int index;
U &obj;
};
}
}
#endif
#ifndef VC_COMMON_VECTORABI_H_
#define VC_COMMON_VECTORABI_H_
namespace Vc_VERSIONED_NAMESPACE
{
namespace VectorAbi
{
// AVX1 lacks 256-bit integer operations: integral element types fall back
// to the SSE ABI, floating-point types get the AVX ABI.
template <typename T>
using Avx1Abi = typename std::conditional<std::is_integral<T>::value, VectorAbi::Sse,
VectorAbi::Avx>::type;
// "Compatible" ABI: a fixed, link-compatible choice — SSE on x86-64 (always
// available there), Scalar elsewhere.
template <typename T> struct DeduceCompatible {
#ifdef __x86_64__
using type = Sse;
#else
using type = Scalar;
#endif
};
// "Best" ABI: the widest ABI of the implementation selected at compile time
// (Scalar < SSE < AVX < AVX2); void if none matches.
template <typename T>
struct DeduceBest {
using type = typename std::conditional<
CurrentImplementation::is(ScalarImpl), Scalar,
typename std::conditional<
CurrentImplementation::is_between(SSE2Impl, SSE42Impl), Sse,
typename std::conditional<
CurrentImplementation::is(AVXImpl), Avx1Abi<T>,
typename std::conditional<CurrentImplementation::is(AVX2Impl), Avx,
void>::type>::type>::type>::type;
};
template <typename T> using Best = typename DeduceBest<T>::type;
}
}
#ifndef VC_COMMON_SIMDARRAYFWD_H_
#define VC_COMMON_SIMDARRAYFWD_H_
- #ifndef VC_SSE_TYPES_H_
- #define VC_SSE_TYPES_H_
- #ifdef Vc_DEFAULT_IMPL_SSE
- #define Vc_DOUBLE_V_SIZE 2
- #define Vc_FLOAT_V_SIZE 4
- #define Vc_INT_V_SIZE 4
- #define Vc_UINT_V_SIZE 4
- #define Vc_SHORT_V_SIZE 8
- #define Vc_USHORT_V_SIZE 8
- #endif
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace SSE
- {
- template <typename T> using Vector = Vc::Vector<T, VectorAbi::Sse>;
- typedef Vector<double> double_v;
- typedef Vector<float> float_v;
- typedef Vector<int> int_v;
- typedef Vector<unsigned int> uint_v;
- typedef Vector<short> short_v;
- typedef Vector<unsigned short> ushort_v;
- template <typename T> using Mask = Vc::Mask<T, VectorAbi::Sse>;
- typedef Mask<double> double_m;
- typedef Mask<float> float_m;
- typedef Mask<int> int_m;
- typedef Mask<unsigned int> uint_m;
- typedef Mask<short> short_m;
- typedef Mask<unsigned short> ushort_m;
- template <typename T> struct Const;
- template <typename T> struct is_vector : public std::false_type {};
- template <typename T> struct is_vector<Vector<T>> : public std::true_type {};
- template <typename T> struct is_mask : public std::false_type {};
- template <typename T> struct is_mask<Mask<T>> : public std::true_type {};
- }
- namespace Traits
- {
- template <class T> struct
- is_simd_vector_internal<Vector<T, VectorAbi::Sse>>
- : public is_valid_vector_argument<T> {};
- template<typename T> struct is_simd_mask_internal<Mask<T, VectorAbi::Sse>>
- : public std::true_type {};
- }
- }
- #endif
#ifndef VC_AVX_TYPES_H_
#define VC_AVX_TYPES_H_
#ifndef VC_AVX_MACROS_H_
#define VC_AVX_MACROS_H_
#endif
// Default vector widths when built with AVX2 resp. AVX as the default
// implementation (AVX1 keeps SSE widths for the integer types).
#ifdef Vc_DEFAULT_IMPL_AVX2
#define Vc_DOUBLE_V_SIZE 4
#define Vc_FLOAT_V_SIZE 8
#define Vc_INT_V_SIZE 8
#define Vc_UINT_V_SIZE 8
#define Vc_SHORT_V_SIZE 16
#define Vc_USHORT_V_SIZE 16
#elif defined Vc_DEFAULT_IMPL_AVX
#define Vc_DOUBLE_V_SIZE 4
#define Vc_FLOAT_V_SIZE 8
#define Vc_INT_V_SIZE 4
#define Vc_UINT_V_SIZE 4
#define Vc_SHORT_V_SIZE 8
#define Vc_USHORT_V_SIZE 8
#endif
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace AVX
- {
- template <typename T> using Vector = Vc::Vector<T, VectorAbi::Avx1Abi<T>>;
- typedef Vector<double> double_v;
- typedef Vector<float> float_v;
- typedef Vector<int> int_v;
- typedef Vector<unsigned int> uint_v;
- typedef Vector<short> short_v;
- typedef Vector<unsigned short> ushort_v;
- template <typename T> using Mask = Vc::Mask<T, VectorAbi::Avx1Abi<T>>;
- typedef Mask<double> double_m;
- typedef Mask<float> float_m;
- typedef Mask<int> int_m;
- typedef Mask<unsigned int> uint_m;
- typedef Mask<short> short_m;
- typedef Mask<unsigned short> ushort_m;
- template <typename T> struct Const;
- template <typename T> struct is_vector : public std::false_type {};
- template <typename T> struct is_vector<Vector<T>> : public std::true_type {};
- template <typename T> struct is_mask : public std::false_type {};
- template <typename T> struct is_mask<Mask<T>> : public std::true_type {};
- }
- namespace AVX2
- {
- template <typename T> using Vector = Vc::Vector<T, VectorAbi::Avx>;
- using double_v = Vector<double>;
- using float_v = Vector< float>;
- using int_v = Vector< int>;
- using uint_v = Vector< uint>;
- using short_v = Vector< short>;
- using ushort_v = Vector<ushort>;
- template <typename T> using Mask = Vc::Mask<T, VectorAbi::Avx>;
- using double_m = Mask<double>;
- using float_m = Mask< float>;
- using llong_m = Mask< llong>;
- using ullong_m = Mask<ullong>;
- using long_m = Mask< long>;
- using ulong_m = Mask< ulong>;
- using int_m = Mask< int>;
- using uint_m = Mask< uint>;
- using short_m = Mask< short>;
- using ushort_m = Mask<ushort>;
- using schar_m = Mask< schar>;
- using uchar_m = Mask< uchar>;
- template <typename T> struct is_vector : public std::false_type {};
- template <typename T> struct is_vector<Vector<T>> : public std::true_type {};
- template <typename T> struct is_mask : public std::false_type {};
- template <typename T> struct is_mask<Mask<T>> : public std::true_type {};
- }
- namespace Traits
- {
- template <class T> struct
- is_simd_vector_internal<Vector<T, VectorAbi::Avx>>
- : public is_valid_vector_argument<T> {};
- template<typename T> struct is_simd_mask_internal<Mask<T, VectorAbi::Avx>>
- : public std::true_type {};
- }
- }
- #endif
#ifndef VC_COMMON_UTILITY_H_
#define VC_COMMON_UTILITY_H_
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
// NextPowerOfTwo<x>::value is the smallest power of two >= x. The bool
// parameter dispatches on "x already is a power of two" ((x & (x-1)) == 0).
template <size_t x, bool = (x & (x - 1)) == 0> struct NextPowerOfTwo;
template <size_t x>
struct NextPowerOfTwo<x, true> : public std::integral_constant<size_t, x> {
};
// Otherwise smear some high bits downward, add one, and recurse.
// NOTE(review): the shift set {1, 2, 5} is unusual (not the classic
// 1,2,4,8,...); the recursion still converges because each step strictly
// increases the value without overshooting the next power of two — confirm
// this is intentional rather than a typo for (x >> 4).
template <size_t x>
struct NextPowerOfTwo<x, false>
: public std::integral_constant<
size_t, NextPowerOfTwo<(x | (x >> 1) | (x >> 2) | (x >> 5)) + 1>::value> {
};
// Clamp a requested alignment A to what the platform supports: 32 on MSVC,
// 256 with AVX, 128 otherwise on GCC; unbounded on other compilers.
template <size_t A>
struct BoundedAlignment : public std::integral_constant<size_t,
#if defined Vc_MSVC || defined Vc_GCC
((A - 1) &
#ifdef Vc_MSVC
31
#elif defined __AVX__
255
#else
127
#endif
) + 1
#else
A
#endif
> {
};
// Split an N-element SimdArray into two parts: the left part holds
// NextPowerOfTwo(ceil(N/2)) elements, the right part holds the remainder.
template <std::size_t N> static constexpr std::size_t left_size()
{
return Common::NextPowerOfTwo<(N + 1) / 2>::value;
}
template <std::size_t N> static constexpr std::size_t right_size()
{
return N - left_size<N>();
}
}
}
#endif
namespace Vc_VERSIONED_NAMESPACE
{
// fixed_size ABI: Vector<T, fixed_size<N>> is implemented in terms of
// SimdArray<T, N>. The wrapper inherits the constructors, re-declares
// copy operations, and adds the abi typedefs plus deprecated factories.
template <class T, int N>
class Vector<T, simd_abi::fixed_size<N>> : public SimdArray<T, N>
{
using SimdArray<T, N>::SimdArray;
public:
Vc_INTRINSIC Vector(const Vector &x) : SimdArray<T, N>(x) {}
Vc_INTRINSIC Vector &operator=(const Vector &x)
{
SimdArray<T, N>::operator=(x);
return *this;
}
Vector() = default;
using abi_type = simd_abi::fixed_size<N>;
using abi = abi_type;
// Deprecated factory functions kept for backwards compatibility.
Vc_DEPRECATED("use Vector([](int n) { return n; }) instead of "
"Vector::IndexesFromZero()") static Vector IndexesFromZero()
{
return Vector([](size_t i) -> T { return i; });
}
Vc_DEPRECATED("use 0 instead of Vector::Zero()") static Vector Zero() { return 0; }
Vc_DEPRECATED("use 1 instead of Vector::One()") static Vector One() { return 1; }
};
// Mask counterpart: Mask<T, fixed_size<N>> wraps SimdMaskArray<T, N>.
template <class T, int N>
class Mask<T, simd_abi::fixed_size<N>> : public SimdMaskArray<T, N>
{
using SimdMaskArray<T, N>::SimdMaskArray;
public:
Vc_INTRINSIC Mask(const Mask &x) : SimdMaskArray<T, N>(x) {}
Vc_INTRINSIC Mask &operator=(const Mask &x)
{
SimdMaskArray<T, N>::operator=(x);
return *this;
}
Mask() = default;
using abi_type = simd_abi::fixed_size<N>;
using abi = abi_type;
};
// Storage decomposition for SimdArray<T, N>: N is split into a left part of
// Common::left_size<N>() elements and a right part with the remainder, each
// stored as a fixed_size_simd.
template <typename T, std::size_t N> struct SimdArrayTraits {
static constexpr std::size_t N0 = Common::left_size<N>();
static constexpr std::size_t N1 = Common::right_size<N>();
using storage_type0 = fixed_size_simd<T, N0>;
using storage_type1 = fixed_size_simd<T, N1>;
};
// Accessors to the two storage halves of a SimdArray (declarations only;
// definitions follow elsewhere).
template <typename T, std::size_t N, typename VectorType, std::size_t VectorSize>
Vc_INTRINSIC_L typename SimdArrayTraits<T, N>::storage_type0 &internal_data0(
SimdArray<T, N, VectorType, VectorSize> &x) Vc_INTRINSIC_R;
template <typename T, std::size_t N, typename VectorType, std::size_t VectorSize>
Vc_INTRINSIC_L typename SimdArrayTraits<T, N>::storage_type1 &internal_data1(
SimdArray<T, N, VectorType, VectorSize> &x) Vc_INTRINSIC_R;
template <typename T, std::size_t N, typename VectorType, std::size_t VectorSize>
Vc_INTRINSIC_L const typename SimdArrayTraits<T, N>::storage_type0 &internal_data0(
const SimdArray<T, N, VectorType, VectorSize> &x) Vc_INTRINSIC_R;
template <typename T, std::size_t N, typename VectorType, std::size_t VectorSize>
Vc_INTRINSIC_L const typename SimdArrayTraits<T, N>::storage_type1 &internal_data1(
const SimdArray<T, N, VectorType, VectorSize> &x) Vc_INTRINSIC_R;
// Accessor to the single backing vector of a SimdArray whose element count
// equals the native vector width (SimdArray<T, N, V, N>).
template <typename T, std::size_t N, typename V>
Vc_INTRINSIC_L V &internal_data(SimdArray<T, N, V, N> &x) Vc_INTRINSIC_R;
template <typename T, std::size_t N, typename V>
Vc_INTRINSIC_L const V &internal_data(const SimdArray<T, N, V, N> &x) Vc_INTRINSIC_R;
- namespace Traits
- {
- template <class T> struct is_fixed_size_simd : std::false_type {
- };
- template <class T, int N>
- struct is_fixed_size_simd<fixed_size_simd<T, N>> : std::true_type {
- };
- template <class T, int N>
- struct is_fixed_size_simd<fixed_size_simd_mask<T, N>> : std::true_type {
- };
- template <class T, int N>
- struct is_simd_vector_internal<fixed_size_simd<T, N>> : is_valid_vector_argument<T> {};
- template <class T, int N>
- struct is_simd_mask_internal<fixed_size_simd_mask<T, N>> : is_valid_vector_argument<T> {};
- template <typename T, std::size_t N, typename V>
- struct is_atomic_simdarray_internal<SimdArray<T, N, V, N>> : is_valid_vector_argument<T> {};
- template <typename T, int N>
- struct is_atomic_simdarray_internal<fixed_size_simd<T, N>>
- : is_atomic_simdarray_internal<SimdArray<T, N>> {
- };
- template <typename T, std::size_t N, typename V>
- struct is_atomic_simd_mask_array_internal<SimdMaskArray<T, N, V, N>>
- : is_valid_vector_argument<T> {
- };
- template <typename T, int N>
- struct is_atomic_simd_mask_array_internal<fixed_size_simd_mask<T, N>>
- : is_atomic_simd_mask_array_internal<SimdMaskArray<T, N>> {
- };
- template <typename T, std::size_t N, typename VectorType, std::size_t M>
- struct is_simdarray_internal<SimdArray<T, N, VectorType, M>>
- : is_valid_vector_argument<T> {
- };
- template <typename T, int N>
- struct is_simdarray_internal<fixed_size_simd<T, N>> : is_valid_vector_argument<T> {
- };
- template <typename T, std::size_t N, typename VectorType, std::size_t M>
- struct is_simd_mask_array_internal<SimdMaskArray<T, N, VectorType, M>>
- : is_valid_vector_argument<T> {
- };
- template <typename T, int N>
- struct is_simd_mask_array_internal<fixed_size_simd_mask<T, N>>
- : is_valid_vector_argument<T> {
- };
- template <typename T, std::size_t N, typename V, std::size_t M>
- struct is_integral_internal<SimdArray<T, N, V, M>, false> : std::is_integral<T> {
- };
- template <typename T, std::size_t N, typename V, std::size_t M>
- struct is_floating_point_internal<SimdArray<T, N, V, M>, false>
- : std::is_floating_point<T> {
- };
- template <typename T, std::size_t N, typename V, std::size_t M>
- struct is_signed_internal<SimdArray<T, N, V, M>, false> : std::is_signed<T> {
- };
- template <typename T, std::size_t N, typename V, std::size_t M>
- struct is_unsigned_internal<SimdArray<T, N, V, M>, false> : std::is_unsigned<T> {
- };
- template <typename T, std::size_t N>
- struct has_no_allocated_data_impl<Vc::SimdArray<T, N>> : std::true_type {
- };
- }
- }
- #endif
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace detail
- {
- // Detects whether an ABI tag is simd_abi::fixed_size<N>.
- template <class T> struct is_fixed_size_abi : std::false_type {
- };
- template <int N> struct is_fixed_size_abi<simd_abi::fixed_size<N>> : std::true_type {
- };
- // SFINAE alias: yields T only for non-fixed_size ABIs (used to constrain
- // the free math-function overloads below).
- template <class T>
- using not_fixed_size_abi = typename std::enable_if<!is_fixed_size_abi<T>::value, T>::type;
- }
- }
- #endif
- #ifndef VC_COMMON_VECTORTRAITS_H_
- #define VC_COMMON_VECTORTRAITS_H_
- namespace Vc_VERSIONED_NAMESPACE
- {
- // Primary template, specialized per (T, Abi) elsewhere; provides the size,
- // alignment, and member types that Vector<T, Abi> and Mask<T, Abi> consume.
- template <typename T, typename Abi> struct VectorTraits;
- }
- #endif
- #ifndef VC_COMMON_LOADSTOREFLAGS_H_
- #define VC_COMMON_LOADSTOREFLAGS_H_
- namespace Vc_VERSIONED_NAMESPACE
- {
- // Tag types selecting exclusive vs. shared prefetch semantics.
- struct Exclusive {};
- struct Shared {};
- namespace LoadStoreFlags
- {
- struct StreamingFlag {};
- struct UnalignedFlag {};
- struct PrefetchFlagBase {};
- // Prefetch request with L1/L2 stride parameters (defaults: 16*64 and
- // 128*64 bytes) and an optional Exclusive/Shared hint.
- template <size_t L1 = 16 * 64, size_t L2 = 128 * 64, typename ExclusiveOrShared_ = void>
- struct PrefetchFlag : public PrefetchFlagBase {
- typedef ExclusiveOrShared_ ExclusiveOrShared;
- static constexpr size_t L1Stride = L1;
- static constexpr size_t L2Stride = L2;
- static constexpr bool IsExclusive = std::is_same<ExclusiveOrShared, Exclusive>::value;
- static constexpr bool IsShared = std::is_same<ExclusiveOrShared, Shared>::value;
- };
- // Linear search through the flag pack: yields the first flag deriving from
- // Base, or Default if none is present.
- template<typename Base, typename Default, typename... LoadStoreFlags> struct ExtractType
- {
- typedef Default type;
- };
- template<typename Base, typename Default, typename T, typename... LoadStoreFlags> struct ExtractType<Base, Default, T, LoadStoreFlags...>
- {
- typedef typename std::conditional<std::is_base_of<Base, T>::value, T, typename ExtractType<Base, Default, LoadStoreFlags...>::type>::type type;
- };
- #ifdef Vc_ICC
- #pragma warning(disable: 177)
- #endif
- // Compile-time description of a load/store: aggregates the flag pack into
- // boolean constants and enable_if-style typedefs (void* when enabled, void
- // when not) that the load/store overloads dispatch on.
- template<typename... Flags> struct LoadStoreFlags
- {
- private:
- typedef typename ExtractType<PrefetchFlagBase, PrefetchFlag<0, 0>, Flags...>::type Prefetch;
- public:
- constexpr LoadStoreFlags() {}
- static constexpr bool IsStreaming = !std::is_same<typename ExtractType<StreamingFlag, void, Flags...>::type, void>::value;
- static constexpr bool IsUnaligned = !std::is_same<typename ExtractType<UnalignedFlag, void, Flags...>::type, void>::value;
- static constexpr bool IsAligned = !IsUnaligned;
- static constexpr bool IsPrefetch = !std::is_same<typename ExtractType<PrefetchFlagBase, void, Flags...>::type, void>::value;
- static constexpr bool IsExclusivePrefetch = Prefetch::IsExclusive;
- static constexpr bool IsSharedPrefetch = Prefetch::IsShared;
- static constexpr size_t L1Stride = Prefetch::L1Stride;
- static constexpr size_t L2Stride = Prefetch::L2Stride;
- typedef LoadStoreFlags<typename std::conditional<std::is_same<Flags, UnalignedFlag>::value, void, Flags>::type...> UnalignedRemoved;
- typedef typename std::conditional<IsAligned && !IsStreaming, void *, void>::type EnableIfAligned;
- typedef typename std::conditional<IsAligned && IsStreaming, void *, void>::type EnableIfStreaming;
- typedef typename std::conditional<IsUnaligned && !IsStreaming, void *, void>::type EnableIfUnalignedNotStreaming;
- typedef typename std::conditional<IsUnaligned && IsStreaming, void *, void>::type EnableIfUnalignedAndStreaming;
- typedef typename std::conditional<IsUnaligned , void *, void>::type EnableIfUnaligned;
- typedef typename std::conditional<!IsUnaligned , void *, void>::type EnableIfNotUnaligned;
- typedef typename std::conditional<IsPrefetch , void *, void>::type EnableIfPrefetch;
- typedef typename std::conditional<!IsPrefetch , void *, void>::type EnableIfNotPrefetch;
- };
- // Empty flag pack: aligned, non-streaming, no prefetch.
- template<> struct LoadStoreFlags<>
- {
- constexpr LoadStoreFlags() {}
- static constexpr bool IsStreaming = false;
- static constexpr bool IsUnaligned = false;
- static constexpr bool IsAligned = !IsUnaligned;
- static constexpr bool IsPrefetch = false;
- static constexpr bool IsExclusivePrefetch = false;
- static constexpr bool IsSharedPrefetch = false;
- static constexpr size_t L1Stride = 0;
- static constexpr size_t L2Stride = 0;
- typedef void* EnableIfAligned;
- typedef void* EnableIfNotUnaligned;
- typedef void* EnableIfNotPrefetch;
- };
- // Combining two flag objects concatenates their flag packs.
- template<typename... LFlags, typename... RFlags>
- constexpr LoadStoreFlags<LFlags..., RFlags...> operator|(LoadStoreFlags<LFlags...>, LoadStoreFlags<RFlags...>)
- {
- return LoadStoreFlags<LFlags..., RFlags...>();
- }
- }
- using LoadStoreFlags::PrefetchFlag;
- typedef LoadStoreFlags::LoadStoreFlags<> AlignedTag;
- typedef LoadStoreFlags::LoadStoreFlags<LoadStoreFlags::StreamingFlag> StreamingTag;
- typedef LoadStoreFlags::LoadStoreFlags<LoadStoreFlags::UnalignedFlag> UnalignedTag;
- // NOTE: the default for both loads and stores is the unaligned tag.
- typedef UnalignedTag DefaultLoadTag;
- typedef UnalignedTag DefaultStoreTag;
- // Tag objects users pass to load()/store(), e.g. v.load(p, Vc::Aligned).
- constexpr AlignedTag Aligned;
- constexpr UnalignedTag Unaligned;
- constexpr StreamingTag Streaming;
- constexpr LoadStoreFlags::LoadStoreFlags<PrefetchFlag<>> PrefetchDefault;
- // Prefetch tag with user-selectable strides and exclusivity hint.
- template <size_t L1 = PrefetchFlag<>::L1Stride,
- size_t L2 = PrefetchFlag<>::L2Stride,
- typename ExclusiveOrShared = PrefetchFlag<>::ExclusiveOrShared>
- struct Prefetch : public LoadStoreFlags::LoadStoreFlags<PrefetchFlag<L1, L2, ExclusiveOrShared>>
- {
- };
- namespace Traits
- {
- template <typename... Ts>
- struct is_loadstoreflag_internal<LoadStoreFlags::LoadStoreFlags<Ts...>> : public std::true_type
- {
- };
- template <size_t L1, size_t L2, typename ExclusiveOrShared>
- struct is_loadstoreflag_internal<Prefetch<L1, L2, ExclusiveOrShared>> : public std::true_type
- {
- };
- }
- }
- #endif
- #ifndef VC_COMMON_WRITEMASKEDVECTOR_H_
- #define VC_COMMON_WRITEMASKEDVECTOR_H_
- #include <utility>
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace Common
- {
- // Proxy returned by Vector::operator()(mask): holds a reference to the
- // vector plus the mask, so that assignments and compound operators only
- // modify the mask-selected lanes (via Vector::assign / gather with mask).
- template <typename V, typename M = typename V::Mask> class WriteMaskedVector
- {
- static_assert(
- V::Size == M::Size,
- "incorrect use of Vc::Common::WriteMaskedVector<V, M>. V and M must have the same «Size».");
- public:
- typedef M Mask;
- static constexpr size_t Size = V::Size;
- Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(Mask));
- Vc_INTRINSIC WriteMaskedVector(V &v, const Mask &k) : mask(k), vec(v)
- {
- }
- // Masked increment/decrement: adds/subtracts One() with inactive lanes
- - // zeroed out, so only selected lanes change.
- Vc_INTRINSIC V &operator++()
- {
- V one = V::One();
- one.setZeroInverted(mask);
- return vec += one;
- }
- Vc_INTRINSIC V &operator--()
- {
- V one = V::One();
- one.setZeroInverted(mask);
- return vec -= one;
- }
- Vc_INTRINSIC V operator++(int)
- {
- V ret(vec);
- operator++();
- return ret;
- }
- Vc_INTRINSIC V operator--(int)
- {
- V ret(vec);
- operator--();
- return ret;
- }
- // Compound operators compute the full-width result, then write back only
- - // the masked lanes through operator=.
- #define Vc_OPERATOR_(op) \
- template <typename U> Vc_ALWAYS_INLINE void operator op##=(U &&x) \
- { \
- operator=(static_cast<V>(vec op std::forward<U>(x))); \
- }
- Vc_ALL_BINARY(Vc_OPERATOR_);
- Vc_ALL_ARITHMETICS(Vc_OPERATOR_);
- Vc_ALL_SHIFTS(Vc_OPERATOR_);
- #undef Vc_OPERATOR_
- Vc_ALWAYS_INLINE void operator=(const V &x)
- {
- vec.assign(x, mask);
- }
- // Masked gather assignment from a subscript expression.
- template <typename T, typename I, typename S>
- Vc_ALWAYS_INLINE void operator=(SubscriptOperation<T, I, S, true> &&x)
- {
- vec.gather(std::move(x).gatherArguments(), mask);
- }
- template <typename F> Vc_INTRINSIC void call(const F &f) const
- {
- return vec.call(f, mask);
- }
- template <typename F> Vc_INTRINSIC V apply(const F &f) const
- {
- return vec.apply(f, mask);
- }
- template <typename F> Vc_INTRINSIC void call(F &&f) const
- {
- return vec.call(std::forward<F>(f), mask);
- }
- template <typename F> Vc_INTRINSIC V apply(F &&f) const
- {
- return vec.apply(std::forward<F>(f), mask);
- }
- private:
- // NOTE(review): on ICC the mask is held by reference rather than by value
- - // — presumably a compiler-specific workaround; confirm before changing.
- #ifdef Vc_ICC
- const Mask &mask;
- #else
- const Mask mask;
- #endif
- V &vec;
- };
- }
- }
- #endif
- #ifndef VC_COMMON_DETAIL_H_
- #define VC_COMMON_DETAIL_H_
- #include <vector>
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace Common
- {
- // convertIndexVector: normalizes the many accepted index-container types
- // for gather/scatter into something with at-least-int-wide elements.
- // SIMD index vector with elements >= int: pass through unchanged.
- template <typename IV>
- Vc_INTRINSIC enable_if<(Traits::is_simd_vector<IV>::value &&
- sizeof(typename IV::EntryType) >= sizeof(int)),
- const IV &>
- convertIndexVector(const IV &indexVector)
- {
- return indexVector;
- }
- // SIMD index vector with narrow elements: widen to fixed_size_simd<int>.
- template <typename IV>
- Vc_INTRINSIC enable_if<(Traits::is_simd_vector<IV>::value &&
- sizeof(typename IV::EntryType) < sizeof(int)),
- fixed_size_simd<int, IV::Size>>
- convertIndexVector(const IV &indexVector)
- {
- return static_cast<fixed_size_simd<int, IV::Size>>(indexVector);
- }
- // Integral promotion of T (e.g. short -> int) via the usual arithmetic
- // conversions of T + 1.
- template <class T> using promoted_type = decltype(std::declval<T>() + 1);
- // Fixed-size containers of integrals (std::array, Vc::array, C array):
- // load unaligned into a fixed_size_simd of the promoted element type.
- template <typename T, std::size_t N>
- Vc_INTRINSIC enable_if<std::is_integral<T>::value, fixed_size_simd<promoted_type<T>, N>>
- convertIndexVector(const std::array<T, N> &indexVector)
- {
- return fixed_size_simd<promoted_type<T>, N>{std::addressof(indexVector[0]),
- Vc::Unaligned};
- }
- template <typename T, std::size_t N>
- Vc_INTRINSIC enable_if<std::is_integral<T>::value, fixed_size_simd<promoted_type<T>, N>>
- convertIndexVector(const Vc::array<T, N> &indexVector)
- {
- return fixed_size_simd<promoted_type<T>, N>{std::addressof(indexVector[0]),
- Vc::Unaligned};
- }
- template <typename T, std::size_t N>
- Vc_INTRINSIC enable_if<std::is_integral<T>::value, fixed_size_simd<promoted_type<T>, N>>
- convertIndexVector(const T (&indexVector)[N])
- {
- return fixed_size_simd<promoted_type<T>, N>{std::addressof(indexVector[0]),
- Vc::Unaligned};
- }
- // A bare pointer carries no length information, so it is rejected.
- #ifndef Vc_MSVC
- template <class T>
- enable_if<std::is_pointer<T>::value, void> convertIndexVector(T indexVector) = delete;
- #endif
- // Dynamically sized index sequences are converted to std::vector of the
- // promoted element type (or passed through when already wide enough).
- template <typename T>
- Vc_INTRINSIC std::vector<promoted_type<T>> convertIndexVector(
- const std::initializer_list<T> &indexVector)
- {
- return {begin(indexVector), end(indexVector)};
- }
- template <typename T>
- Vc_INTRINSIC
- enable_if<(std::is_integral<T>::value && sizeof(T) >= sizeof(int)), std::vector<T>>
- convertIndexVector(const std::vector<T> &indexVector)
- {
- return indexVector;
- }
- template <typename T>
- Vc_INTRINSIC enable_if<(std::is_integral<T>::value && sizeof(T) < sizeof(int)),
- std::vector<promoted_type<T>>>
- convertIndexVector(const std::vector<T> &indexVector)
- {
- return {std::begin(indexVector), std::end(indexVector)};
- }
- // Fallback for other subscriptable types whose operator[] returns by
- // value: pass through by reference.
- template <class T,
- class = enable_if<
- (!std::is_pointer<T>::value && !Traits::is_simd_vector<T>::value &&
- !std::is_lvalue_reference<decltype(std::declval<const T &>()[0])>::value)>>
- Vc_INTRINSIC const T &convertIndexVector(const T &i)
- {
- return i;
- }
- }
- }
- #endif
- namespace Vc_VERSIONED_NAMESPACE
- {
- // Free math functions for non-fixed_size ABIs (declarations only; defined
- // in the per-ABI implementation headers).
- template <typename T, typename Abi,
- typename = enable_if<std::is_floating_point<T>::value &&
- !detail::is_fixed_size_abi<Abi>::value>>
- inline Vector<T, Abi> copysign(Vector<T, Abi> magnitude, Vector<T, Abi> sign);
- template <typename T, typename Abi,
- typename = enable_if<std::is_floating_point<T>::value &&
- !detail::is_fixed_size_abi<Abi>::value>>
- inline Vector<T, Abi> exponent(Vector<T, Abi> x);
- // Lane-wise "is negative" test, implemented as comparison against zero.
- template <typename T, typename Abi>
- Vc_INTRINSIC Vc_CONST typename Vector<T, detail::not_fixed_size_abi<Abi>>::MaskType
- isnegative(Vector<T, Abi> x)
- {
- return x < Vector<T, Abi>::Zero();
- }
- // Primary Vc::Vector<T, Abi> class template. This is the declaration-only
- // interface; member definitions live in the per-ABI headers. All size,
- // alignment, and member-type information comes from VectorTraits<T, Abi>.
- template<typename T, typename Abi = VectorAbi::Best<T>> class Vector
- {
- public:
- static constexpr size_t size() { return VectorTraits<T, Abi>::size(); }
- static constexpr size_t MemoryAlignment = VectorTraits<T, Abi>::memoryAlignment();
- using abi = Abi;
- using EntryType = typename VectorTraits<T, Abi>::EntryType;
- using value_type = EntryType;
- using VectorEntryType = typename VectorTraits<T, Abi>::VectorEntryType;
- using VectorType = typename VectorTraits<T, Abi>::VectorType;
- using vector_type = VectorType;
- using MaskType = Vc::Mask<T, Abi>;
- using mask_type = MaskType;
- using MaskArgument = MaskType;
- using VectorArgument = Vector;
- using IndexType = Vc::fixed_size_simd<int, VectorTraits<T, Abi>::size()>;
- using index_type = IndexType;
- using reference = Detail::ElementReference<Vector>;
- // Named constructors / factories.
- static inline Vector Zero();
- static inline Vector One();
- static inline Vector IndexesFromZero();
- static inline Vector Random();
- template <typename G> static inline Vector generate(G gen);
- inline Vector() = default;
- explicit inline Vector(VectorSpecialInitializerZero);
- explicit inline Vector(VectorSpecialInitializerOne);
- explicit inline Vector(VectorSpecialInitializerIndexesFromZero);
- // Implicit conversion from a Vector of a compatible element type.
- template <typename U>
- inline Vector(Vector<U, abi> x,
- enable_if<Traits::is_implicit_cast_allowed<U, T>::value> = nullarg);
- #if Vc_IS_VERSION_1
- template <typename U>
- Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between "
- "vector types") inline explicit Vector(
- Vector<U, abi> x,
- enable_if<!Traits::is_implicit_cast_allowed<U, T>::value> = nullarg);
- #endif
- // Broadcast constructors.
- inline Vector(EntryType a);
- template <typename U>
- inline Vector(U a, enable_if<std::is_same<U, int>::value &&
- !std::is_same<U, EntryType>::value> = nullarg);
- // Load constructors; the default flag is the unaligned tag.
- explicit Vc_INTRINSIC Vector(const EntryType *mem)
- {
- load(mem);
- }
- template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
- explicit Vc_INTRINSIC Vector(const EntryType *mem, Flags flags)
- {
- load(mem, flags);
- }
- // Converting load from U*; only allowed when U is arithmetic and does not
- - // narrow an integral EntryType.
- template <typename U, typename Flags = DefaultLoadTag,
- typename = enable_if<
- (!std::is_integral<U>::value || !std::is_integral<EntryType>::value ||
- sizeof(EntryType) >= sizeof(U)) &&
- std::is_arithmetic<U>::value &&Traits::is_load_store_flag<Flags>::value>>
- explicit Vc_INTRINSIC Vector(const U *x, Flags flags = Flags())
- {
- load<U, Flags>(x, flags);
- }
- Vc_INTRINSIC void load(const EntryType *mem)
- {
- load(mem, DefaultLoadTag());
- }
- template <typename Flags>
- Vc_INTRINSIC enable_if<Traits::is_load_store_flag<Flags>::value, void>
- load(const EntryType *mem, Flags flags)
- {
- load<EntryType, Flags>(mem, flags);
- }
- private:
- // Shared constraint for converting loads (mirrors the constructor above).
- template <typename U, typename Flags>
- struct load_concept : public std::enable_if<
- (!std::is_integral<U>::value || !std::is_integral<EntryType>::value ||
- sizeof(EntryType) >= sizeof(U)) &&
- std::is_arithmetic<U>::value && Traits::is_load_store_flag<Flags>::value, void>
- {};
- public:
- template <typename U, typename Flags = DefaultLoadTag>
- Vc_INTRINSIC_L typename load_concept<U, Flags>::type load(const U *mem, Flags = Flags()) Vc_INTRINSIC_R;
- // Stores, optionally masked; the default flag is the unaligned tag.
- template <
- typename U,
- typename Flags = DefaultStoreTag,
- typename = enable_if<std::is_arithmetic<U>::value &&Traits::is_load_store_flag<Flags>::value>>
- Vc_INTRINSIC_L void store(U *mem, Flags flags = Flags()) const Vc_INTRINSIC_R;
- template <
- typename U,
- typename Flags = DefaultStoreTag,
- typename = enable_if<std::is_arithmetic<U>::value &&Traits::is_load_store_flag<Flags>::value>>
- Vc_INTRINSIC_L void Vc_VDECL store(U *mem, MaskType mask, Flags flags = Flags()) const Vc_INTRINSIC_R;
- Vc_INTRINSIC void store(EntryType *mem) const
- {
- store<EntryType, DefaultStoreTag>(mem, DefaultStoreTag());
- }
- template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
- Vc_INTRINSIC void store(EntryType *mem, Flags flags) const
- {
- store<EntryType, Flags>(mem, flags);
- }
- Vc_INTRINSIC void Vc_VDECL store(EntryType *mem, MaskType mask) const
- {
- store<EntryType, DefaultStoreTag>(mem, mask, DefaultStoreTag());
- }
- template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
- Vc_INTRINSIC void Vc_VDECL store(EntryType *mem, MaskType mask, Flags flags) const
- {
- store<EntryType, Flags>(mem, mask, flags);
- }
- inline void setZero();
- inline void setZero(MaskType mask);
- inline void setZeroInverted(MaskType mask);
- inline void setQnan();
- inline void setQnan(MaskType mask);
- // Gather/scatter interface (amalgamated from common/gatherinterface.h;
- - // the macro below parameterizes the constructor names it declares).
- #define Vc_CURRENT_CLASS_NAME Vector
- #ifndef Vc_CURRENT_CLASS_NAME
- #error "incorrect use of common/gatherinterface.h: Vc_CURRENT_CLASS_NAME must be defined to the current class name for declaring constructors."
- #endif
- private:
- template <class MT, class IT, int Scale = 1>
- inline void gatherImplementation(const Common::GatherArguments<MT, IT, Scale> &);
- template <class MT, class IT, int Scale = 1>
- inline void gatherImplementation(const Common::GatherArguments<MT, IT, Scale> &,
- MaskArgument mask);
- public:
- #define Vc_ASSERT_GATHER_PARAMETER_TYPES_ \
- static_assert( \
- std::is_convertible<MT, EntryType>::value, \
- "The memory pointer needs to point to a type that can be converted to the " \
- "EntryType of this SIMD vector type."); \
- static_assert( \
- Vc::Traits::has_subscript_operator<IT>::value, \
- "The indexes argument must be a type that implements the subscript operator."); \
- static_assert( \
- !Traits::is_simd_vector<IT>::value || \
- Traits::simd_vector_size<IT>::value >= Size, \
- "If you use a SIMD vector for the indexes parameter, the index vector must " \
- "have at least as many entries as this SIMD vector."); \
- static_assert( \
- !std::is_array<T>::value || \
- (std::rank<T>::value == 1 && \
- (std::extent<T>::value == 0 || std::extent<T>::value >= Size)), \
- "If you use a simple array for the indexes parameter, the array must have " \
- "at least as many entries as this SIMD vector.")
- // Gather constructors and gather() members, with and without a mask.
- template <typename MT, typename IT,
- typename = enable_if<Traits::has_subscript_operator<IT>::value>>
- Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes)
- {
- Vc_ASSERT_GATHER_PARAMETER_TYPES_;
- gatherImplementation(
- Common::make_gather<1>(mem, Common::convertIndexVector(indexes)));
- }
- template <class MT, class IT, int Scale>
- Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments<MT, IT, Scale> &args)
- {
- Vc_ASSERT_GATHER_PARAMETER_TYPES_;
- gatherImplementation(args);
- }
- template <typename MT, typename IT,
- typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
- Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes,
- MaskArgument mask)
- {
- Vc_ASSERT_GATHER_PARAMETER_TYPES_;
- gatherImplementation(
- Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask);
- }
- template <class MT, class IT, int Scale>
- Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments<MT, IT, Scale> &args,
- MaskArgument mask)
- {
- Vc_ASSERT_GATHER_PARAMETER_TYPES_;
- gatherImplementation(args, mask);
- }
- template <typename MT, typename IT,
- typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
- Vc_INTRINSIC void gather(const MT *mem, const IT &indexes)
- {
- Vc_ASSERT_GATHER_PARAMETER_TYPES_;
- gatherImplementation(
- Common::make_gather<1>(mem, Common::convertIndexVector(indexes)));
- }
- template <typename MT, typename IT,
- typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
- Vc_INTRINSIC void gather(const MT *mem, const IT &indexes, MaskArgument mask)
- {
- Vc_ASSERT_GATHER_PARAMETER_TYPES_;
- gatherImplementation(
- Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask);
- }
- template <class MT, class IT, int Scale>
- Vc_INTRINSIC void gather(const Common::GatherArguments<MT, IT, Scale> &args)
- {
- Vc_ASSERT_GATHER_PARAMETER_TYPES_;
- gatherImplementation(args);
- }
- template <class MT, class IT, int Scale>
- Vc_INTRINSIC void gather(const Common::GatherArguments<MT, IT, Scale> &args,
- MaskArgument mask)
- {
- Vc_ASSERT_GATHER_PARAMETER_TYPES_;
- gatherImplementation(args, mask);
- }
- #undef Vc_ASSERT_GATHER_PARAMETER_TYPES_
- private:
- template <typename MT, typename IT>
- inline void scatterImplementation(MT *mem, IT &&indexes) const;
- template <typename MT, typename IT>
- inline void scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const;
- public:
- #define Vc_ASSERT_SCATTER_PARAMETER_TYPES_ \
- static_assert( \
- std::is_convertible<EntryType, MT>::value, \
- "The memory pointer needs to point to a type that the EntryType of this " \
- "SIMD vector type can be converted to."); \
- static_assert( \
- Vc::Traits::has_subscript_operator<IT>::value, \
- "The indexes argument must be a type that implements the subscript operator."); \
- static_assert( \
- !Traits::is_simd_vector<IT>::value || \
- Traits::simd_vector_size<IT>::value >= Size, \
- "If you use a SIMD vector for the indexes parameter, the index vector must " \
- "have at least as many entries as this SIMD vector."); \
- static_assert( \
- !std::is_array<T>::value || \
- (std::rank<T>::value == 1 && \
- (std::extent<T>::value == 0 || std::extent<T>::value >= Size)), \
- "If you use a simple array for the indexes parameter, the array must have " \
- "at least as many entries as this SIMD vector.")
- // Scatter members, with and without a mask.
- template <typename MT,
- typename IT,
- typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
- Vc_INTRINSIC void scatter(MT *mem, IT &&indexes) const
- {
- Vc_ASSERT_SCATTER_PARAMETER_TYPES_;
- scatterImplementation(mem, std::forward<IT>(indexes));
- }
- template <typename MT,
- typename IT,
- typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
- Vc_INTRINSIC void scatter(MT *mem, IT &&indexes, MaskArgument mask) const
- {
- Vc_ASSERT_SCATTER_PARAMETER_TYPES_;
- scatterImplementation(mem, std::forward<IT>(indexes), mask);
- }
- template <typename MT, typename IT>
- Vc_INTRINSIC void scatter(const Common::ScatterArguments<MT, IT> &args) const
- {
- scatter(args.address, args.indexes);
- }
- template <typename MT, typename IT>
- Vc_INTRINSIC void scatter(const Common::ScatterArguments<MT, IT> &args, MaskArgument mask) const
- {
- scatter(args.address, args.indexes, mask);
- }
- #undef Vc_ASSERT_SCATTER_PARAMETER_TYPES_
- #undef Vc_CURRENT_CLASS_NAME
- // Element access, unary/binary/compare operators (declared via macros).
- inline reference operator[](size_t index) noexcept;
- inline EntryType operator[](size_t index) const noexcept;
- inline MaskType operator!() const;
- inline Vector operator~() const;
- inline Vector operator-() const;
- inline Vector operator+() const;
- inline Vector &operator++();
- inline Vector operator++(int);
- inline Vector &operator--();
- inline Vector operator--(int);
- #define Vc_OP(symbol) \
- inline Vc_PURE Vector operator symbol(const Vector &x) const;
- Vc_ALL_ARITHMETICS(Vc_OP);
- Vc_ALL_BINARY(Vc_OP);
- Vc_ALL_SHIFTS(Vc_OP);
- #undef Vc_OP
- #define Vc_CMP_OP(symbol) inline Vc_PURE MaskType operator symbol(const Vector &x) const;
- Vc_ALL_COMPARES(Vc_CMP_OP);
- #undef Vc_CMP_OP
- // Write-masking: v(mask) = x; only changes the selected lanes.
- inline Common::WriteMaskedVector<Vector, MaskType> operator()(MaskType mask);
- // Horizontal reductions (whole vector or masked).
- inline EntryType min() const;
- inline EntryType max() const;
- inline EntryType product() const;
- inline EntryType sum() const;
- inline Vector partialSum() const;
- inline EntryType min(MaskType mask) const;
- inline EntryType max(MaskType mask) const;
- inline EntryType product(MaskType mask) const;
- inline EntryType sum(MaskType mask) const;
- // Lane permutations.
- inline Vector shifted(int amount) const;
- inline Vector shifted(int amount, Vector shiftIn) const;
- inline Vector rotated(int amount) const;
- inline Vector reversed() const;
- inline Vector sorted() const;
- // Per-lane function application.
- template <typename F> void callWithValuesSorted(F &&f);
- template <typename F> inline void call(F &&f) const;
- template <typename F> inline void call(F &&f, MaskType mask) const;
- template <typename F> inline Vector apply(F &&f) const;
- template <typename F> inline Vector apply(F &&f, MaskType mask) const;
- template <typename IndexT> inline void fill(EntryType(&f)(IndexT));
- inline void fill(EntryType(&f)());
- inline Vector interleaveLow(Vector x) const;
- inline Vector interleaveHigh(Vector x) const;
- inline void assign(const Vector &v, const MaskType &m);
- // Access to the underlying native vector register type.
- inline VectorType &data();
- inline const VectorType &data() const;
- Vc_DEPRECATED("use exponent(x) instead") inline Vector exponent() const;
- Vc_DEPRECATED("use isnegative(x) instead") inline MaskType isNegative() const;
- static constexpr size_t Size = VectorTraits<T, Abi>::size();
- template <typename V2> inline V2 staticCast() const;
- template <typename V2>
- Vc_DEPRECATED("use reinterpret_components_cast instead") inline V2
- reinterpretCast() const;
- Vc_DEPRECATED("use copysign(x, y) instead") inline Vector
- copySign(Vector reference) const;
- Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(Vector));
- private:
- VectorType d;
- };
- // Bit-reinterprets a vector as another vector type of equal size, element
- // width, and compatible alignment (checked via enable_if).
- template <typename V, typename T, typename Abi>
- Vc_ALWAYS_INLINE Vc_CONST enable_if<
- (V::size() == Vector<T, Abi>::size() &&
- sizeof(typename V::VectorEntryType) ==
- sizeof(typename Vector<T, Abi>::VectorEntryType) &&
- sizeof(V) == sizeof(Vector<T, Abi>) && alignof(V) <= alignof(Vector<T, Abi>)),
- V>
- reinterpret_components_cast(const Vector<T, Abi> &x)
- {
- return reinterpret_cast<const V &>(x);
- }
- // Declares the compound-assignment operator templates; the macro is
- // defined and immediately undefined without being invoked here.
- #define Vc_OP(symbol) \
- template <typename T, typename Abi> \
- inline Vector<T, Abi> &operator symbol##=(Vector<T, Abi> &, \
- const Vector<T, Abi> &x);
- #undef Vc_OP
- }
- #endif
- #ifndef VC_COMMON_MASK_H_
- #define VC_COMMON_MASK_H_
- namespace Vc_VERSIONED_NAMESPACE
- {
- // Primary Vc::Mask<T, Abi> class template: a per-lane boolean companion to
- // Vector<T, Abi>. Declaration-only interface; definitions live in the
- // per-ABI headers. Sizes/types come from VectorTraits<T, Abi>.
- template <typename T, typename Abi = VectorAbi::Best<T>> class Mask
- {
- public:
- static constexpr size_t size() { return VectorTraits<T, Abi>::size(); }
- static constexpr size_t Size = VectorTraits<T, Abi>::size();
- static constexpr size_t MemoryAlignment = VectorTraits<T, Abi>::maskMemoryAlignment();
- using abi = Abi;
- using EntryType = bool;
- using value_type = EntryType;
- using EntryReference = typename VectorTraits<T, Abi>::EntryReference;
- using value_reference = EntryReference;
- using VectorEntryType = typename VectorTraits<T, Abi>::VectorEntryType;
- using VectorType = typename VectorTraits<T, Abi>::VectorType;
- using vector_type = VectorType;
- Vc_INTRINSIC static Mask Zero();
- Vc_INTRINSIC static Mask One();
- template <typename G> static Vc_INTRINSIC Mask generate(G &&gen);
- Vc_INTRINSIC Mask() = default;
- Vc_INTRINSIC explicit Mask(VectorSpecialInitializerZero);
- Vc_INTRINSIC explicit Mask(VectorSpecialInitializerOne);
- Vc_INTRINSIC explicit Mask(bool b);
- // Implicit conversion from compatible mask types; explicit (deprecated)
- - // conversion otherwise in Vc 1.x mode.
- template <typename U>
- Vc_INTRINSIC Mask(U &&otherMask,
- Common::enable_if_mask_converts_implicitly<Mask, T, U> = nullarg);
- #if Vc_IS_VERSION_1
- template <typename U>
- Vc_DEPRECATED(
- "use simd_cast instead of explicit type casting to convert between mask types")
- Vc_INTRINSIC_L
- explicit Mask(U &&otherMask, Common::enable_if_mask_converts_explicitly<T, U> =
- nullarg) Vc_INTRINSIC_R;
- #endif
- // Load/store of the mask as an array of bool.
- Vc_ALWAYS_INLINE explicit Mask(const bool *mem);
- template <typename Flags> Vc_ALWAYS_INLINE explicit Mask(const bool *mem, Flags flags);
- Vc_ALWAYS_INLINE void load(const bool *mem);
- template <typename Flags> Vc_ALWAYS_INLINE void load(const bool *mem, Flags flags);
- Vc_ALWAYS_INLINE void store(bool *mem) const;
- template <typename Flags> Vc_ALWAYS_INLINE void store(bool *mem, Flags flags) const;
- // Whole-mask comparison and lane-wise logic.
- Vc_ALWAYS_INLINE bool operator==(const Mask &mask) const;
- Vc_ALWAYS_INLINE bool operator!=(const Mask &mask) const;
- Vc_ALWAYS_INLINE Mask operator&&(const Mask &mask) const;
- Vc_ALWAYS_INLINE Mask operator&(const Mask &mask) const;
- Vc_ALWAYS_INLINE Mask operator||(const Mask &mask) const;
- Vc_ALWAYS_INLINE Mask operator|(const Mask &mask) const;
- Vc_ALWAYS_INLINE Mask operator^(const Mask &mask) const;
- Vc_ALWAYS_INLINE Mask operator!() const;
- Vc_ALWAYS_INLINE Mask &operator&=(const Mask &mask);
- Vc_ALWAYS_INLINE Mask &operator|=(const Mask &mask);
- Vc_ALWAYS_INLINE Mask &operator^=(const Mask &mask);
- // Reductions over the lanes.
- Vc_ALWAYS_INLINE bool isFull() const;
- Vc_ALWAYS_INLINE bool isNotEmpty() const;
- Vc_ALWAYS_INLINE bool isEmpty() const;
- Vc_ALWAYS_INLINE bool isMix() const;
- Vc_ALWAYS_INLINE bool data() const;
- Vc_ALWAYS_INLINE bool dataI() const;
- Vc_ALWAYS_INLINE bool dataD() const;
- Vc_ALWAYS_INLINE EntryReference operator[](size_t index);
- Vc_ALWAYS_INLINE EntryType operator[](size_t index) const;
- Vc_ALWAYS_INLINE int count() const;
- Vc_ALWAYS_INLINE int firstOne() const;
- Vc_ALWAYS_INLINE int toInt() const;
- Vc_INTRINSIC Vc_PURE Mask shifted(int amount) const;
- Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(Mask));
- private:
- VectorType d;
- };
- // Boolean reductions, with plain-bool overloads so generic code can pass
- // either a mask or a scalar bool. Note some_of(bool) is always false.
- template<typename Mask> constexpr bool all_of(const Mask &m) { return m.isFull(); }
- constexpr bool all_of(bool b) { return b; }
- template<typename Mask> constexpr bool any_of(const Mask &m) { return m.isNotEmpty(); }
- constexpr bool any_of(bool b) { return b; }
- template<typename Mask> constexpr bool none_of(const Mask &m) { return m.isEmpty(); }
- constexpr bool none_of(bool b) { return !b; }
- template<typename Mask> constexpr bool some_of(const Mask &m) { return m.isMix(); }
- constexpr bool some_of(bool) { return false; }
- }
- #endif
- #ifndef VC_COMMON_MEMORYFWD_H_
- #define VC_COMMON_MEMORYFWD_H_
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace Common
- {
- // Forward declarations of the aligned-memory container and its CRTP base;
- // full definitions live in common/memory.h.
- template <typename V, std::size_t Size1 = 0, std::size_t Size2 = 0,
- bool InitPadding = true>
- class Memory;
- template <typename V, typename Parent, int Dimension, typename RowMemory>
- class MemoryBase;
- }
- using Common::Memory;
- }
- #endif
- #endif
- #ifndef VC_SCALAR_TYPES_H_
- #define VC_SCALAR_TYPES_H_
- #ifdef Vc_DEFAULT_IMPL_Scalar
- #define Vc_DOUBLE_V_SIZE 1
- #define Vc_FLOAT_V_SIZE 1
- #define Vc_INT_V_SIZE 1
- #define Vc_UINT_V_SIZE 1
- #define Vc_SHORT_V_SIZE 1
- #define Vc_USHORT_V_SIZE 1
- #endif
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace Scalar
- {
- template <typename T> using Vector = Vc::Vector<T, VectorAbi::Scalar>;
- typedef Vector<double> double_v;
- typedef Vector<float> float_v;
- typedef Vector<int> int_v;
- typedef Vector<unsigned int> uint_v;
- typedef Vector<short> short_v;
- typedef Vector<unsigned short> ushort_v;
- template <typename T> using Mask = Vc::Mask<T, VectorAbi::Scalar>;
- typedef Mask<double> double_m;
- typedef Mask<float> float_m;
- typedef Mask<int> int_m;
- typedef Mask<unsigned int> uint_m;
- typedef Mask<short> short_m;
- typedef Mask<unsigned short> ushort_m;
- template <typename T> struct is_vector : public std::false_type {};
- template <typename T> struct is_vector<Vector<T>> : public std::true_type {};
- template <typename T> struct is_mask : public std::false_type {};
- template <typename T> struct is_mask<Mask<T>> : public std::true_type {};
- }
- namespace Traits
- {
- template <typename T> struct is_simd_mask_internal<Scalar::Mask<T>>
- : public std::true_type {};
- template <class T> struct
- is_simd_vector_internal<Vector<T, VectorAbi::Scalar>>
- : public is_valid_vector_argument<T> {};
- }
- }
- #endif
- #ifndef VC_SCALAR_DETAIL_H_
- #define VC_SCALAR_DETAIL_H_
- #ifndef VC_SCALAR_MACROS_H_
- #define VC_SCALAR_MACROS_H_
- #endif
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace Detail
- {
- template<typename V, size_t Size, size_t VSize> struct InterleaveImpl;
- template<typename V, size_t VSize> struct InterleaveImpl<V, 1, VSize> {
- template <typename I>
- static inline void interleave(typename V::EntryType *const data, const I &i,
- const typename V::AsArg v0, const typename V::AsArg v1)
- {
- data[i[0] + 0] = v0.data();
- data[i[0] + 1] = v1.data();
- }
- template <typename I>
- static inline void interleave(typename V::EntryType *const data, const I &i,
- const typename V::AsArg v0, const typename V::AsArg v1,
- const typename V::AsArg v2)
- {
- data[i[0] + 0] = v0.data();
- data[i[0] + 1] = v1.data();
- data[i[0] + 2] = v2.data();
- }
- template <typename I>
- static inline void interleave(typename V::EntryType *const data, const I &i,
- const typename V::AsArg v0, const typename V::AsArg v1,
- const typename V::AsArg v2, const typename V::AsArg v3)
- {
- data[i[0] + 0] = v0.data();
- data[i[0] + 1] = v1.data();
- data[i[0] + 2] = v2.data();
- data[i[0] + 3] = v3.data();
- }
- template <typename I>
- static inline void interleave(typename V::EntryType *const data, const I &i,
- const typename V::AsArg v0, const typename V::AsArg v1,
- const typename V::AsArg v2, const typename V::AsArg v3,
- const typename V::AsArg v4)
- {
- interleave(data, i, v0, v1, v2, v3);
- data[i[0] + 4] = v4.data();
- }
- template <typename I>
- static inline void interleave(typename V::EntryType *const data, const I &i,
- const typename V::AsArg v0, const typename V::AsArg v1,
- const typename V::AsArg v2, const typename V::AsArg v3,
- const typename V::AsArg v4, const typename V::AsArg v5)
- {
- interleave(data, i, v0, v1, v2, v3);
- interleave(data + 4, i, v4, v5);
- }
- template <typename I>
- static inline void interleave(typename V::EntryType *const data, const I &i,
- const typename V::AsArg v0, const typename V::AsArg v1,
- const typename V::AsArg v2, const typename V::AsArg v3,
- const typename V::AsArg v4, const typename V::AsArg v5,
- const typename V::AsArg v6)
- {
- interleave(data, i, v0, v1, v2, v3);
- interleave(data + 4, i, v4, v5, v6);
- }
- template <typename I>
- static inline void interleave(typename V::EntryType *const data, const I &i,
- const typename V::AsArg v0, const typename V::AsArg v1,
- const typename V::AsArg v2, const typename V::AsArg v3,
- const typename V::AsArg v4, const typename V::AsArg v5,
- const typename V::AsArg v6, const typename V::AsArg v7)
- {
- interleave(data, i, v0, v1, v2, v3);
- interleave(data + 4, i, v4, v5, v6, v7);
- }
- template <typename I>
- static inline void deinterleave(typename V::EntryType const *const data, const I &i,
- V &v0, V &v1)
- {
- v0.data() = data[i[0] + 0];
- v1.data() = data[i[0] + 1];
- }
- template <typename I>
- static inline void deinterleave(typename V::EntryType const *const data, const I &i,
- V &v0, V &v1, V &v2)
- {
- v0.data() = data[i[0] + 0];
- v1.data() = data[i[0] + 1];
- v2.data() = data[i[0] + 2];
- }
- template <typename I>
- static inline void deinterleave(typename V::EntryType const *const data, const I &i,
- V &v0, V &v1, V &v2, V &v3)
- {
- v0.data() = data[i[0] + 0];
- v1.data() = data[i[0] + 1];
- v2.data() = data[i[0] + 2];
- v3.data() = data[i[0] + 3];
- }
- template <typename I>
- static inline void deinterleave(typename V::EntryType const *const data, const I &i,
- V &v0, V &v1, V &v2, V &v3, V &v4)
- {
- deinterleave(data, i, v0, v1, v2, v3);
- v4.data() = data[i[0] + 4];
- }
- template <typename I>
- static inline void deinterleave(typename V::EntryType const *const data, const I &i,
- V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
- {
- deinterleave(data, i, v0, v1, v2, v3);
- deinterleave(data + 4, i, v4, v5);
- }
- template <typename I>
- static inline void deinterleave(typename V::EntryType const *const data, const I &i,
- V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6)
- {
- deinterleave(data, i, v0, v1, v2, v3);
- deinterleave(data + 4, i, v4, v5, v6);
- }
- template <typename I>
- static inline void deinterleave(typename V::EntryType const *const data, const I &i,
- V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6,
- V &v7)
- {
- deinterleave(data, i, v0, v1, v2, v3);
- deinterleave(data + 4, i, v4, v5, v6, v7);
- }
- };
- }
- }
- #endif
- #ifndef VC_SCALAR_MASK_H_
- #define VC_SCALAR_MASK_H_
- namespace Vc_VERSIONED_NAMESPACE
- {
- template <typename T> class Mask<T, VectorAbi::Scalar>
- {
- friend class Mask< double, VectorAbi::Scalar>;
- friend class Mask< float, VectorAbi::Scalar>;
- friend class Mask< int32_t, VectorAbi::Scalar>;
- friend class Mask<uint32_t, VectorAbi::Scalar>;
- friend class Mask< int16_t, VectorAbi::Scalar>;
- friend class Mask<uint16_t, VectorAbi::Scalar>;
- public:
- using abi = VectorAbi::Scalar;
- static constexpr size_t Size = 1;
- static constexpr size_t MemoryAlignment = 1;
- static constexpr std::size_t size() { return 1; }
- typedef bool EntryType;
- using value_type = EntryType;
- using EntryReference = Vc::Detail::ElementReference<Mask>;
- using reference = EntryReference;
- typedef bool VectorEntryType;
- using VectorType = bool;
- using Vector = Scalar::Vector<T>;
- Vc_INTRINSIC Mask() = default;
- Vc_INTRINSIC explicit Mask(bool b) : m(b) {}
- Vc_INTRINSIC explicit Mask(VectorSpecialInitializerZero) : m(false) {}
- Vc_INTRINSIC explicit Mask(VectorSpecialInitializerOne) : m(true) {}
- Vc_INTRINSIC static Mask Zero() { return Mask(false); }
- Vc_INTRINSIC static Mask One() { return Mask(true); }
- template <typename U>
- Vc_INTRINSIC Mask(U &&rhs, Common::enable_if_mask_converts_implicitly<Mask, T, U> = nullarg)
- : m(rhs.m) {}
- #if Vc_IS_VERSION_1
- template <typename U>
- Vc_DEPRECATED(
- "use simd_cast instead of explicit type casting to convert between mask types")
- Vc_INTRINSIC_L
- explicit Mask(U &&rhs, Common::enable_if_mask_converts_explicitly<T, U> = nullarg)
- Vc_INTRINSIC_R;
- #endif
- Vc_ALWAYS_INLINE explicit Mask(const bool *mem) : m(mem[0]) {}
- template<typename Flags> Vc_ALWAYS_INLINE explicit Mask(const bool *mem, Flags) : m(mem[0]) {}
- Vc_ALWAYS_INLINE void load(const bool *mem) { m = mem[0]; }
- template<typename Flags> Vc_ALWAYS_INLINE void load(const bool *mem, Flags) { m = mem[0]; }
- Vc_ALWAYS_INLINE void store(bool *mem) const { *mem = m; }
- template<typename Flags> Vc_ALWAYS_INLINE void store(bool *mem, Flags) const { *mem = m; }
- Vc_ALWAYS_INLINE bool operator==(const Mask &rhs) const { return m == rhs.m; }
- Vc_ALWAYS_INLINE bool operator!=(const Mask &rhs) const { return m != rhs.m; }
- Vc_ALWAYS_INLINE Mask operator&&(const Mask &rhs) const { return Mask(m && rhs.m); }
- Vc_ALWAYS_INLINE Mask operator& (const Mask &rhs) const { return Mask(m && rhs.m); }
- Vc_ALWAYS_INLINE Mask operator||(const Mask &rhs) const { return Mask(m || rhs.m); }
- Vc_ALWAYS_INLINE Mask operator| (const Mask &rhs) const { return Mask(m || rhs.m); }
- Vc_ALWAYS_INLINE Mask operator^ (const Mask &rhs) const { return Mask(m ^ rhs.m); }
- Vc_ALWAYS_INLINE Mask operator!() const { return Mask(!m); }
- Vc_ALWAYS_INLINE Mask &operator&=(const Mask &rhs) { m &= rhs.m; return *this; }
- Vc_ALWAYS_INLINE Mask &operator|=(const Mask &rhs) { m |= rhs.m; return *this; }
- Vc_ALWAYS_INLINE Mask &operator^=(const Mask &rhs) { m ^= rhs.m; return *this; }
- Vc_ALWAYS_INLINE bool isFull () const { return m; }
- Vc_ALWAYS_INLINE bool isNotEmpty() const { return m; }
- Vc_ALWAYS_INLINE bool isEmpty() const { return !m; }
- Vc_ALWAYS_INLINE bool isMix () const { return false; }
- Vc_ALWAYS_INLINE bool data () const { return m; }
- Vc_ALWAYS_INLINE bool dataI() const { return m; }
- Vc_ALWAYS_INLINE bool dataD() const { return m; }
- private:
- friend reference;
- static Vc_INTRINSIC bool get(const Mask &o, int) noexcept { return o.m; }
- template <typename U>
- static Vc_INTRINSIC void set(Mask &o, int, U &&v) noexcept(
- noexcept(std::declval<bool &>() = std::declval<U>()))
- {
- o.m = std::forward<U>(v);
- }
- public:
- Vc_ALWAYS_INLINE reference operator[](size_t i) noexcept
- {
- Vc_ASSERT(i == 0); if (i) {}
- return {*this, 0};
- }
- Vc_ALWAYS_INLINE value_type operator[](size_t i) const noexcept
- {
- Vc_ASSERT(i == 0); if (i) {}
- return m;
- }
- Vc_ALWAYS_INLINE int count() const { return m ? 1 : 0; }
- Vc_ALWAYS_INLINE int firstOne() const { return 0; }
- Vc_ALWAYS_INLINE int toInt() const { return m ? 1 : 0; }
- template <typename G> static Vc_INTRINSIC Mask generate(G &&gen)
- {
- return Mask(gen(0));
- }
- Vc_INTRINSIC Vc_PURE Mask shifted(int amount) const
- {
- if (amount == 0) {
- return *this;
- } else {
- return Zero();
- }
- }
- private:
- bool m;
- };
- template <typename T> constexpr size_t Mask<T, VectorAbi::Scalar>::Size;
- template <typename T> constexpr size_t Mask<T, VectorAbi::Scalar>::MemoryAlignment;
- }
- #endif
- namespace Vc_VERSIONED_NAMESPACE
- {
- #define Vc_CURRENT_CLASS_NAME Vector
- template <typename T> class Vector<T, VectorAbi::Scalar>
- {
- static_assert(std::is_arithmetic<T>::value,
- "Vector<T> only accepts arithmetic builtin types as template parameter T.");
- public:
- using abi = VectorAbi::Scalar;
- using EntryType = T;
- using VectorEntryType = EntryType;
- using value_type = EntryType;
- using VectorType = EntryType;
- using vector_type = VectorType;
- using reference = Detail::ElementReference<Vector>;
- protected:
- VectorType m_data = VectorType();
- template <typename U> using V = Vector<U, abi>;
- public:
- typedef Scalar::Mask<T> Mask;
- using MaskType = Mask;
- using mask_type = Mask;
- typedef Mask MaskArgument;
- typedef Vector AsArg;
- Vc_ALWAYS_INLINE VectorType &data() { return m_data; }
- Vc_ALWAYS_INLINE const VectorType &data() const { return m_data; }
- static constexpr size_t Size = 1;
- static constexpr size_t MemoryAlignment = alignof(VectorType);
- using IndexType = fixed_size_simd<int, 1>;
- public:
- Vc_INTRINSIC Vector() = default;
- static constexpr std::size_t size() { return Size; }
- explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerZero) Vc_INTRINSIC_R;
- explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerOne) Vc_INTRINSIC_R;
- explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerIndexesFromZero) Vc_INTRINSIC_R;
- static Vc_INTRINSIC Vc_CONST Vector Zero() { return Vector(Vc::Zero); }
- static Vc_INTRINSIC Vc_CONST Vector One() { return Vector(Vc::One); }
- static Vc_INTRINSIC Vc_CONST Vector IndexesFromZero()
- {
- return Vector(Vc::IndexesFromZero);
- }
- template <class G, int = 0,
- class = typename std::enable_if<std::is_convertible<
- decltype(std::declval<G>()(size_t())), value_type>::value>::type>
- explicit Vector(G &&g) : Vector(generate(std::forward<G>(g)))
- {
- }
- static Vc_INTRINSIC_L Vector Random() Vc_INTRINSIC_R;
- template <typename U>
- Vc_INTRINSIC Vector(
- V<U> x, typename std::enable_if<Traits::is_implicit_cast_allowed<U, T>::value,
- void *>::type = nullptr)
- : m_data(static_cast<EntryType>(x.data()))
- {
- }
- #if Vc_IS_VERSION_1
- template <typename U>
- Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between "
- "vector types") Vc_INTRINSIC
- explicit Vector(
- V<U> x,
- typename std::enable_if<!Traits::is_implicit_cast_allowed<U, T>::value,
- void *>::type = nullptr)
- : m_data(static_cast<EntryType>(x.data()))
- {
- }
- #endif
- Vc_INTRINSIC Vector(EntryType a) : m_data(a) {}
- template <typename U>
- Vc_INTRINSIC Vector(U a,
- typename std::enable_if<std::is_same<U, int>::value &&
- !std::is_same<U, EntryType>::value,
- void *>::type = nullptr)
- : Vector(static_cast<EntryType>(a))
- {
- }
- explicit Vc_INTRINSIC Vector(const EntryType *mem)
- {
- load(mem);
- }
- template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
- explicit Vc_INTRINSIC Vector(const EntryType *mem, Flags flags)
- {
- load(mem, flags);
- }
- template <typename U, typename Flags = DefaultLoadTag,
- typename = enable_if<
- (!std::is_integral<U>::value || !std::is_integral<EntryType>::value ||
- sizeof(EntryType) >= sizeof(U)) &&
- std::is_arithmetic<U>::value &&Traits::is_load_store_flag<Flags>::value>>
- explicit Vc_INTRINSIC Vector(const U *x, Flags flags = Flags())
- {
- load<U, Flags>(x, flags);
- }
- Vc_INTRINSIC void load(const EntryType *mem)
- {
- load(mem, DefaultLoadTag());
- }
- template <typename Flags>
- Vc_INTRINSIC enable_if<Traits::is_load_store_flag<Flags>::value, void>
- load(const EntryType *mem, Flags flags)
- {
- load<EntryType, Flags>(mem, flags);
- }
- private:
- template <typename U, typename Flags>
- struct load_concept : public std::enable_if<
- (!std::is_integral<U>::value || !std::is_integral<EntryType>::value ||
- sizeof(EntryType) >= sizeof(U)) &&
- std::is_arithmetic<U>::value && Traits::is_load_store_flag<Flags>::value, void>
- {};
- public:
- template <typename U, typename Flags = DefaultLoadTag>
- Vc_INTRINSIC_L typename load_concept<U, Flags>::type load(const U *mem, Flags = Flags()) Vc_INTRINSIC_R;
- template <
- typename U,
- typename Flags = DefaultStoreTag,
- typename = enable_if<std::is_arithmetic<U>::value &&Traits::is_load_store_flag<Flags>::value>>
- Vc_INTRINSIC_L void store(U *mem, Flags flags = Flags()) const Vc_INTRINSIC_R;
- template <
- typename U,
- typename Flags = DefaultStoreTag,
- typename = enable_if<std::is_arithmetic<U>::value &&Traits::is_load_store_flag<Flags>::value>>
- Vc_INTRINSIC_L void Vc_VDECL store(U *mem, MaskType mask, Flags flags = Flags()) const Vc_INTRINSIC_R;
- Vc_INTRINSIC void store(EntryType *mem) const
- {
- store<EntryType, DefaultStoreTag>(mem, DefaultStoreTag());
- }
- template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
- Vc_INTRINSIC void store(EntryType *mem, Flags flags) const
- {
- store<EntryType, Flags>(mem, flags);
- }
- Vc_INTRINSIC void Vc_VDECL store(EntryType *mem, MaskType mask) const
- {
- store<EntryType, DefaultStoreTag>(mem, mask, DefaultStoreTag());
- }
- template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
- Vc_INTRINSIC void Vc_VDECL store(EntryType *mem, MaskType mask, Flags flags) const
- {
- store<EntryType, Flags>(mem, mask, flags);
- }
- Vc_ALWAYS_INLINE void setZero() { m_data = 0; }
- Vc_ALWAYS_INLINE void setZero(Mask k) { if (k.data()) m_data = 0; }
- Vc_ALWAYS_INLINE void setZeroInverted(Mask k) { if (!k.data()) m_data = 0; }
- Vc_INTRINSIC_L void setQnan() Vc_INTRINSIC_R;
- Vc_INTRINSIC_L void setQnan(Mask m) Vc_INTRINSIC_R;
- #ifndef Vc_CURRENT_CLASS_NAME
- #error "incorrect use of common/gatherinterface.h: Vc_CURRENT_CLASS_NAME must be defined to the current class name for declaring constructors."
- #endif
- private:
- template <class MT, class IT, int Scale = 1>
- inline void gatherImplementation(const Common::GatherArguments<MT, IT, Scale> &);
- template <class MT, class IT, int Scale = 1>
- inline void gatherImplementation(const Common::GatherArguments<MT, IT, Scale> &,
- MaskArgument mask);
- public:
- #define Vc_ASSERT_GATHER_PARAMETER_TYPES_ \
- static_assert( \
- std::is_convertible<MT, EntryType>::value, \
- "The memory pointer needs to point to a type that can be converted to the " \
- "EntryType of this SIMD vector type."); \
- static_assert( \
- Vc::Traits::has_subscript_operator<IT>::value, \
- "The indexes argument must be a type that implements the subscript operator."); \
- static_assert( \
- !Traits::is_simd_vector<IT>::value || \
- Traits::simd_vector_size<IT>::value >= Size, \
- "If you use a SIMD vector for the indexes parameter, the index vector must " \
- "have at least as many entries as this SIMD vector."); \
- static_assert( \
- !std::is_array<T>::value || \
- (std::rank<T>::value == 1 && \
- (std::extent<T>::value == 0 || std::extent<T>::value >= Size)), \
- "If you use a simple array for the indexes parameter, the array must have " \
- "at least as many entries as this SIMD vector.")
- template <typename MT, typename IT,
- typename = enable_if<Traits::has_subscript_operator<IT>::value>>
- Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes)
- {
- Vc_ASSERT_GATHER_PARAMETER_TYPES_;
- gatherImplementation(
- Common::make_gather<1>(mem, Common::convertIndexVector(indexes)));
- }
- template <class MT, class IT, int Scale>
- Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments<MT, IT, Scale> &args)
- {
- Vc_ASSERT_GATHER_PARAMETER_TYPES_;
- gatherImplementation(args);
- }
- template <typename MT, typename IT,
- typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
- Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes,
- MaskArgument mask)
- {
- Vc_ASSERT_GATHER_PARAMETER_TYPES_;
- gatherImplementation(
- Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask);
- }
- template <class MT, class IT, int Scale>
- Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments<MT, IT, Scale> &args,
- MaskArgument mask)
- {
- Vc_ASSERT_GATHER_PARAMETER_TYPES_;
- gatherImplementation(args, mask);
- }
- template <typename MT, typename IT,
- typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
- Vc_INTRINSIC void gather(const MT *mem, const IT &indexes)
- {
- Vc_ASSERT_GATHER_PARAMETER_TYPES_;
- gatherImplementation(
- Common::make_gather<1>(mem, Common::convertIndexVector(indexes)));
- }
- template <typename MT, typename IT,
- typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
- Vc_INTRINSIC void gather(const MT *mem, const IT &indexes, MaskArgument mask)
- {
- Vc_ASSERT_GATHER_PARAMETER_TYPES_;
- gatherImplementation(
- Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask);
- }
- template <class MT, class IT, int Scale>
- Vc_INTRINSIC void gather(const Common::GatherArguments<MT, IT, Scale> &args)
- {
- Vc_ASSERT_GATHER_PARAMETER_TYPES_;
- gatherImplementation(args);
- }
- template <class MT, class IT, int Scale>
- Vc_INTRINSIC void gather(const Common::GatherArguments<MT, IT, Scale> &args,
- MaskArgument mask)
- {
- Vc_ASSERT_GATHER_PARAMETER_TYPES_;
- gatherImplementation(args, mask);
- }
- #undef Vc_ASSERT_GATHER_PARAMETER_TYPES_
- private:
- template <typename MT, typename IT>
- inline void scatterImplementation(MT *mem, IT &&indexes) const;
- template <typename MT, typename IT>
- inline void scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const;
- public:
- #define Vc_ASSERT_SCATTER_PARAMETER_TYPES_ \
- static_assert( \
- std::is_convertible<EntryType, MT>::value, \
- "The memory pointer needs to point to a type that the EntryType of this " \
- "SIMD vector type can be converted to."); \
- static_assert( \
- Vc::Traits::has_subscript_operator<IT>::value, \
- "The indexes argument must be a type that implements the subscript operator."); \
- static_assert( \
- !Traits::is_simd_vector<IT>::value || \
- Traits::simd_vector_size<IT>::value >= Size, \
- "If you use a SIMD vector for the indexes parameter, the index vector must " \
- "have at least as many entries as this SIMD vector."); \
- static_assert( \
- !std::is_array<T>::value || \
- (std::rank<T>::value == 1 && \
- (std::extent<T>::value == 0 || std::extent<T>::value >= Size)), \
- "If you use a simple array for the indexes parameter, the array must have " \
- "at least as many entries as this SIMD vector.")
- template <typename MT,
- typename IT,
- typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
- Vc_INTRINSIC void scatter(MT *mem, IT &&indexes) const
- {
- Vc_ASSERT_SCATTER_PARAMETER_TYPES_;
- scatterImplementation(mem, std::forward<IT>(indexes));
- }
- template <typename MT,
- typename IT,
- typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
- Vc_INTRINSIC void scatter(MT *mem, IT &&indexes, MaskArgument mask) const
- {
- Vc_ASSERT_SCATTER_PARAMETER_TYPES_;
- scatterImplementation(mem, std::forward<IT>(indexes), mask);
- }
- template <typename MT, typename IT>
- Vc_INTRINSIC void scatter(const Common::ScatterArguments<MT, IT> &args) const
- {
- scatter(args.address, args.indexes);
- }
- template <typename MT, typename IT>
- Vc_INTRINSIC void scatter(const Common::ScatterArguments<MT, IT> &args, MaskArgument mask) const
- {
- scatter(args.address, args.indexes, mask);
- }
- #undef Vc_ASSERT_SCATTER_PARAMETER_TYPES_
- Vc_ALWAYS_INLINE Vector &operator++() { ++m_data; return *this; }
- Vc_ALWAYS_INLINE Vector &operator--() { --m_data; return *this; }
- Vc_ALWAYS_INLINE Vector operator++(int) { return m_data++; }
- Vc_ALWAYS_INLINE Vector operator--(int) { return m_data--; }
- private:
- friend reference;
- Vc_INTRINSIC static value_type get(const Vector &o, int i) noexcept
- {
- Vc_ASSERT(i == 0); if (i) {}
- return o.m_data;
- }
- template <typename U>
- Vc_INTRINSIC static void set(Vector &o, int i, U &&v) noexcept(
- noexcept(std::declval<value_type &>() = v))
- {
- Vc_ASSERT(i == 0); if (i) {}
- o.m_data = v;
- }
- public:
- Vc_ALWAYS_INLINE reference operator[](size_t index) noexcept
- {
- static_assert(noexcept(reference{std::declval<Vector &>(), int()}), "");
- return {*this, int(index)};
- }
- Vc_ALWAYS_INLINE value_type operator[](size_t index) const noexcept
- {
- Vc_ASSERT(index == 0); if (index) {}
- return m_data;
- }
- Vc_ALWAYS_INLINE Mask operator!() const
- {
- return Mask(!m_data);
- }
- Vc_ALWAYS_INLINE Vector operator~() const
- {
- #ifndef Vc_ENABLE_FLOAT_BIT_OPERATORS
- static_assert(std::is_integral<T>::value, "bit-complement can only be used with Vectors of integral type");
- #endif
- return Vector(~m_data);
- }
- Vc_ALWAYS_INLINE Vector operator-() const
- {
- return -m_data;
- }
- Vc_INTRINSIC Vector Vc_PURE operator+() const { return *this; }
- #define Vc_OP(symbol) \
- Vc_ALWAYS_INLINE Vc_PURE Vector operator symbol(const Vector &x) const { return Vector(m_data symbol x.m_data); }
- Vc_ALL_SHIFTS(Vc_OP);
- #undef Vc_OP
- Vc_DEPRECATED("use isnegative(x) instead") Vc_INTRINSIC Vc_PURE Mask
- isNegative() const
- {
- return Vc::isnegative(*this);
- }
- Vc_ALWAYS_INLINE void assign(const Vector &v, const Mask &m) {
- if (m.data()) m_data = v.m_data;
- }
- template <typename V2>
- Vc_DEPRECATED("Use simd_cast instead of Vector::staticCast") Vc_ALWAYS_INLINE V2
- staticCast() const
- {
- return V2(static_cast<typename V2::EntryType>(m_data));
- }
- template <typename V2>
- Vc_DEPRECATED("use reinterpret_components_cast instead") Vc_ALWAYS_INLINE V2
- reinterpretCast() const
- {
- typedef typename V2::EntryType AliasT2 Vc_MAY_ALIAS;
- return V2(*reinterpret_cast<const AliasT2 *>(&m_data));
- }
- Vc_ALWAYS_INLINE Common::WriteMaskedVector<Vector, Mask> operator()(Mask m)
- {
- return {*this, m};
- }
- Vc_ALWAYS_INLINE EntryType min() const { return m_data; }
- Vc_ALWAYS_INLINE EntryType max() const { return m_data; }
- Vc_ALWAYS_INLINE EntryType product() const { return m_data; }
- Vc_ALWAYS_INLINE EntryType sum() const { return m_data; }
- Vc_ALWAYS_INLINE Vector partialSum() const { return *this; }
- Vc_ALWAYS_INLINE EntryType min(Mask) const { return m_data; }
- Vc_ALWAYS_INLINE EntryType max(Mask) const { return m_data; }
- Vc_ALWAYS_INLINE EntryType product(Mask m) const
- {
- if (m.data()) {
- return m_data;
- } else {
- return EntryType(1);
- }
- }
- Vc_ALWAYS_INLINE EntryType sum(Mask m) const { if (m.data()) return m_data; return static_cast<EntryType>(0); }
- Vc_INTRINSIC Vector Vc_VDECL shifted(int amount, Vector shiftIn) const {
- Vc_ASSERT(amount >= -1 && amount <= 1);
- return amount == 0 ? *this : shiftIn;
- }
- Vc_INTRINSIC Vector shifted(int amount) const { return amount == 0 ? *this : Zero(); }
- Vc_INTRINSIC Vector rotated(int) const { return *this; }
- Vc_INTRINSIC Vector reversed() const { return *this; }
- Vc_INTRINSIC Vector sorted() const { return *this; }
- template <typename F> void callWithValuesSorted(F &&f) { f(m_data); }
- template <typename F> Vc_INTRINSIC void call(F &&f) const { f(m_data); }
- template <typename F> Vc_INTRINSIC void call(F &&f, Mask mask) const
- {
- if (mask.data()) {
- f(m_data);
- }
- }
- template <typename F> Vc_INTRINSIC Vector apply(F &&f) const { return Vector(f(m_data)); }
- template <typename F> Vc_INTRINSIC Vector apply(F &&f, Mask mask) const
- {
- if (mask.data()) {
- return Vector(f(m_data));
- } else {
- return *this;
- }
- }
- template<typename IndexT> Vc_INTRINSIC void fill(EntryType (&f)(IndexT)) {
- m_data = f(0);
- }
- Vc_INTRINSIC void fill(EntryType (&f)()) {
- m_data = f();
- }
- template <typename G> static Vc_INTRINSIC Vector generate(G gen)
- {
- return gen(0);
- }
- Vc_DEPRECATED("use copysign(x, y) instead") Vc_INTRINSIC Vector Vc_VDECL
- copySign(Vector x) const
- {
- return Vc::copysign(*this, x);
- }
- Vc_DEPRECATED("use exponent(x) instead") Vc_INTRINSIC Vector exponent() const
- {
- return Vc::exponent(*this);
- }
- Vc_INTRINSIC Vector Vc_VDECL interleaveLow(Vector) const { return *this; }
- Vc_INTRINSIC Vector Vc_VDECL interleaveHigh(Vector x) const { return x; }
- };
- #undef Vc_CURRENT_CLASS_NAME
- template <typename T> constexpr size_t Vector<T, VectorAbi::Scalar>::Size;
- template <typename T> constexpr size_t Vector<T, VectorAbi::Scalar>::MemoryAlignment;
- #define Vc_OP(symbol) \
- template <typename T, typename U, \
- typename = decltype(std::declval<T &>() symbol## = std::declval<T>())> \
- Vc_INTRINSIC enable_if<std::is_convertible<U, Vector<T, VectorAbi::Scalar>>::value, \
- Vector<T, VectorAbi::Scalar>> \
- &operator symbol##=(Vector<T, VectorAbi::Scalar> &lhs, U &&rhs) \
- { \
- lhs.data() symbol## = Vector<T, VectorAbi::Scalar>(std::forward<U>(rhs)).data(); \
- return lhs; \
- }
- Vc_ALL_SHIFTS(Vc_OP);
- #undef Vc_OP
- #define Vc_CONDITIONAL_ASSIGN(name_,op_) \
- template <Operator O, typename T, typename M, typename U> \
- Vc_INTRINSIC enable_if<O == Operator::name_, void> conditional_assign( \
- Vector<T, VectorAbi::Scalar> &lhs, M &&mask, U &&rhs) \
- { \
- if (mask.isFull()) { \
- lhs op_ std::forward<U>(rhs); \
- } \
- } \
- Vc_NOTHING_EXPECTING_SEMICOLON
- Vc_CONDITIONAL_ASSIGN( Assign, =);
- Vc_CONDITIONAL_ASSIGN( PlusAssign, +=);
- Vc_CONDITIONAL_ASSIGN( MinusAssign, -=);
- Vc_CONDITIONAL_ASSIGN( MultiplyAssign, *=);
- Vc_CONDITIONAL_ASSIGN( DivideAssign, /=);
- Vc_CONDITIONAL_ASSIGN( RemainderAssign, %=);
- Vc_CONDITIONAL_ASSIGN( XorAssign, ^=);
- Vc_CONDITIONAL_ASSIGN( AndAssign, &=);
- Vc_CONDITIONAL_ASSIGN( OrAssign, |=);
- Vc_CONDITIONAL_ASSIGN( LeftShiftAssign,<<=);
- Vc_CONDITIONAL_ASSIGN(RightShiftAssign,>>=);
- #undef Vc_CONDITIONAL_ASSIGN
- #define Vc_CONDITIONAL_ASSIGN(name_,expr_) \
- template <Operator O, typename T, typename M> \
- Vc_INTRINSIC enable_if<O == Operator::name_, Vector<T, VectorAbi::Scalar>> \
- conditional_assign(Vector<T, VectorAbi::Scalar> &lhs, M &&mask) \
- { \
- return mask.isFull() ? (expr_) : lhs; \
- } \
- Vc_NOTHING_EXPECTING_SEMICOLON
- Vc_CONDITIONAL_ASSIGN(PostIncrement, lhs++);
- Vc_CONDITIONAL_ASSIGN( PreIncrement, ++lhs);
- Vc_CONDITIONAL_ASSIGN(PostDecrement, lhs--);
- Vc_CONDITIONAL_ASSIGN( PreDecrement, --lhs);
- #undef Vc_CONDITIONAL_ASSIGN
- }
- #include <cmath>
- #ifndef VC_COMMON_CONST_DATA_H_
- #define VC_COMMON_CONST_DATA_H_
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace Common
- {
- alignas(64) extern unsigned int RandomState[];
- alignas(32) extern const unsigned int AllBitsSet[8];
- }
- }
- #endif
- #ifndef VC_COMMON_WHERE_H_
- #define VC_COMMON_WHERE_H_
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace WhereImpl
- {
- template<typename _Mask, typename _LValue> struct MaskedLValue
- {
- typedef _Mask Mask;
- typedef _LValue LValue;
- const Mask &mask;
- LValue &lhs;
- constexpr MaskedLValue(const Mask &m, LValue &l) : mask(m), lhs(l) {}
- MaskedLValue(const MaskedLValue &) = delete;
- #ifndef __cpp_guaranteed_copy_elision
- constexpr MaskedLValue(MaskedLValue &&) = default;
- #endif
- template<typename T> Vc_ALWAYS_INLINE void operator =(T &&rhs) { conditional_assign<Operator:: Assign>(lhs, mask, std::forward<T>(rhs)); }
- template<typename T> Vc_ALWAYS_INLINE void operator +=(T &&rhs) { conditional_assign<Operator:: PlusAssign>(lhs, mask, std::forward<T>(rhs)); }
- template<typename T> Vc_ALWAYS_INLINE void operator -=(T &&rhs) { conditional_assign<Operator:: MinusAssign>(lhs, mask, std::forward<T>(rhs)); }
- template<typename T> Vc_ALWAYS_INLINE void operator *=(T &&rhs) { conditional_assign<Operator:: MultiplyAssign>(lhs, mask, std::forward<T>(rhs)); }
- template<typename T> Vc_ALWAYS_INLINE void operator /=(T &&rhs) { conditional_assign<Operator:: DivideAssign>(lhs, mask, std::forward<T>(rhs)); }
- template<typename T> Vc_ALWAYS_INLINE void operator %=(T &&rhs) { conditional_assign<Operator:: RemainderAssign>(lhs, mask, std::forward<T>(rhs)); }
- template<typename T> Vc_ALWAYS_INLINE void operator ^=(T &&rhs) { conditional_assign<Operator:: XorAssign>(lhs, mask, std::forward<T>(rhs)); }
- template<typename T> Vc_ALWAYS_INLINE void operator &=(T &&rhs) { conditional_assign<Operator:: AndAssign>(lhs, mask, std::forward<T>(rhs)); }
- template<typename T> Vc_ALWAYS_INLINE void operator |=(T &&rhs) { conditional_assign<Operator:: OrAssign>(lhs, mask, std::forward<T>(rhs)); }
- template<typename T> Vc_ALWAYS_INLINE void operator<<=(T &&rhs) { conditional_assign<Operator:: LeftShiftAssign>(lhs, mask, std::forward<T>(rhs)); }
- template<typename T> Vc_ALWAYS_INLINE void operator>>=(T &&rhs) { conditional_assign<Operator::RightShiftAssign>(lhs, mask, std::forward<T>(rhs)); }
- Vc_ALWAYS_INLINE void operator++() { conditional_assign<Operator:: PreIncrement>(lhs, mask); }
- Vc_ALWAYS_INLINE void operator++(int) { conditional_assign<Operator::PostIncrement>(lhs, mask); }
- Vc_ALWAYS_INLINE void operator--() { conditional_assign<Operator:: PreDecrement>(lhs, mask); }
- Vc_ALWAYS_INLINE void operator--(int) { conditional_assign<Operator::PostDecrement>(lhs, mask); }
- template <class T, class IV, class S>
- Vc_INTRINSIC void operator=(Common::SubscriptOperation<T, IV, S, true> &&rhs)
- {
- lhs.gather(std::move(rhs).gatherArguments(), mask);
- }
- template <class T, class IV, class S>
- void operator+=(Common::SubscriptOperation<T, IV, S, true> &&rhs) = delete;
- template <class T, class IV, class S>
- void operator-=(Common::SubscriptOperation<T, IV, S, true> &&rhs) = delete;
- template <class T, class IV, class S>
- void operator*=(Common::SubscriptOperation<T, IV, S, true> &&rhs) = delete;
- template <class T, class IV, class S>
- void operator/=(Common::SubscriptOperation<T, IV, S, true> &&rhs) = delete;
- template <class T, class IV, class S>
- void operator%=(Common::SubscriptOperation<T, IV, S, true> &&rhs) = delete;
- template <class T, class IV, class S>
- void operator^=(Common::SubscriptOperation<T, IV, S, true> &&rhs) = delete;
- template <class T, class IV, class S>
- void operator&=(Common::SubscriptOperation<T, IV, S, true> &&rhs) = delete;
- template <class T, class IV, class S>
- void operator|=(Common::SubscriptOperation<T, IV, S, true> &&rhs) = delete;
- template <class T, class IV, class S>
- void operator<<=(Common::SubscriptOperation<T, IV, S, true> &&rhs) = delete;
- template <class T, class IV, class S>
- void operator>>=(Common::SubscriptOperation<T, IV, S, true> &&rhs) = delete;
- };
// Specialization of MaskedLValue for subscript (scatter) expressions:
// `where(mask) | data[indexes] = x` must perform a masked scatter to memory
// rather than a masked assignment through a reference.
template <typename _Mask, typename T_, typename I_, typename S_>
struct MaskedLValue<_Mask, Common::SubscriptOperation<T_, I_, S_, true>>
{
    typedef _Mask Mask;
    typedef Common::SubscriptOperation<T_, I_, S_, true> SO;
    const Mask &mask;  // borrowed; must outlive the full expression
    SO &lhs;           // bound from the rvalue SubscriptOperation in the ctor
    template <typename T> using Decay = typename std::decay<T>::type;
    // NOTE(review): `lhs` binds to the named rvalue-reference parameter, i.e.
    // to a temporary owned by the caller's full expression — safe only because
    // this proxy is always consumed within that same expression.
    constexpr MaskedLValue(const Mask &m, SO &&l) : mask(m), lhs(l) {}
    MaskedLValue(const MaskedLValue &) = delete;
#ifndef __cpp_guaranteed_copy_elision
    // Pre-C++17 compilers need a move ctor to return the proxy by value.
    constexpr MaskedLValue(MaskedLValue &&) = default;
#endif
    // Masked scatter: only the lanes selected by `mask` are written. The
    // rvalue-qualification (&&) forces use on a freshly created proxy.
    template <class T> Vc_ALWAYS_INLINE void operator=(T &&rhs) &&
    {
        std::forward<T>(rhs).scatter(std::move(lhs).scatterArguments(), mask);
    }
};
// Specialization for a plain bool mask: the masked operation degenerates to an
// ordinary scalar operation guarded by a runtime `if`.
template<typename _LValue> struct MaskedLValue<bool, _LValue>
{
    typedef bool Mask;
    typedef _LValue LValue;
    const Mask &mask;  // borrowed bool condition
    LValue &lhs;       // the object to conditionally modify
    constexpr MaskedLValue(const Mask &m, LValue &l) : mask(m), lhs(l) {}
    MaskedLValue(const MaskedLValue &) = delete;
    constexpr MaskedLValue(MaskedLValue &&) = default;
    // Each assignment / compound assignment / inc / dec applies only when the
    // mask is true; otherwise it is a no-op.
    template<typename T> Vc_ALWAYS_INLINE void operator =(T &&rhs) { if (mask) lhs = std::forward<T>(rhs); }
    template<typename T> Vc_ALWAYS_INLINE void operator +=(T &&rhs) { if (mask) lhs += std::forward<T>(rhs); }
    template<typename T> Vc_ALWAYS_INLINE void operator -=(T &&rhs) { if (mask) lhs -= std::forward<T>(rhs); }
    template<typename T> Vc_ALWAYS_INLINE void operator *=(T &&rhs) { if (mask) lhs *= std::forward<T>(rhs); }
    template<typename T> Vc_ALWAYS_INLINE void operator /=(T &&rhs) { if (mask) lhs /= std::forward<T>(rhs); }
    template<typename T> Vc_ALWAYS_INLINE void operator %=(T &&rhs) { if (mask) lhs %= std::forward<T>(rhs); }
    template<typename T> Vc_ALWAYS_INLINE void operator ^=(T &&rhs) { if (mask) lhs ^= std::forward<T>(rhs); }
    template<typename T> Vc_ALWAYS_INLINE void operator &=(T &&rhs) { if (mask) lhs &= std::forward<T>(rhs); }
    template<typename T> Vc_ALWAYS_INLINE void operator |=(T &&rhs) { if (mask) lhs |= std::forward<T>(rhs); }
    template<typename T> Vc_ALWAYS_INLINE void operator<<=(T &&rhs) { if (mask) lhs <<= std::forward<T>(rhs); }
    template<typename T> Vc_ALWAYS_INLINE void operator>>=(T &&rhs) { if (mask) lhs >>= std::forward<T>(rhs); }
    Vc_ALWAYS_INLINE void operator++() { if (mask) ++lhs; }
    Vc_ALWAYS_INLINE void operator++(int) { if (mask) lhs++; }
    Vc_ALWAYS_INLINE void operator--() { if (mask) --lhs; }
    Vc_ALWAYS_INLINE void operator--(int) { if (mask) lhs--; }
};
// Proxy returned by where(mask); combined with an lvalue via operator| or
// operator() it yields a MaskedLValue that applies operations write-masked.
template<typename _Mask> struct WhereMask
{
    typedef _Mask Mask;
    const Mask &mask;  // borrowed; must outlive the full expression
    constexpr WhereMask(const Mask &m) : mask(m) {}
    WhereMask(const WhereMask &) = delete;
    // Masked scatter target: `where(mask) | data[indexes] = x`. Writing into
    // const memory is rejected at compile time.
    template <typename T, typename I, typename S>
    constexpr Vc_WARN_UNUSED_RESULT
    MaskedLValue<Mask, Common::SubscriptOperation<T, I, S, true>>
    operator|(Common::SubscriptOperation<T, I, S, true> &&lhs) const
    {
        static_assert(!std::is_const<T>::value,
                      "masked scatter to constant memory not possible.");
        return {mask, std::move(lhs)};
    }
    // Generic masked lvalue. T deduces to an lvalue reference only when the
    // operand is an lvalue; an rvalue here indicates an operator-precedence
    // mistake, which the static_assert explains.
    template<typename T> constexpr Vc_WARN_UNUSED_RESULT MaskedLValue<Mask, T> operator|(T &&lhs) const
    {
        static_assert(std::is_lvalue_reference<T>::value, "Syntax error: Incorrect use of Vc::where. Maybe operator precedence got you by surprise. Examples of correct usage:\n"
                      "  Vc::where(x < 2) | x += 1;\n"
                      "  (Vc::where(x < 2) | x)++;\n"
                      "  Vc::where(x < 2)(x) += 1;\n"
                      "  Vc::where(x < 2)(x)++;\n"
                      );
        return { mask, lhs };
    }
    // Function-call alternative: `where(mask)(x) += 1`. SFINAE restricts the
    // operand to copy-assignable types, then forwards to operator|.
    template <class T,
              class = decltype(std::declval<T>() = std::declval<const T &>())>
    constexpr Vc_WARN_UNUSED_RESULT MaskedLValue<Mask, T> operator()(T &&lhs) const
    {
        return operator|(std::forward<T>(lhs));
    }
};
- }
/// Entry point of the write-masking DSL: `where(mask) | lhs += rhs` applies the
/// operation only to the lanes selected by \p mask.
/// NOTE(review): the `return { ... }` list-initialization is load-bearing —
/// WhereMask/MaskedLValue delete their copy constructors, and this form needs
/// no copy or move even before C++17 guaranteed elision.
template<typename M> constexpr Vc_WARN_UNUSED_RESULT WhereImpl::WhereMask<M> where(const M &mask)
{
    return { mask };
}
/// Two-argument form: `where(mask, v) += x` is equivalent to
/// `where(mask) | v += x`.
template <class M, class V>
constexpr Vc_WARN_UNUSED_RESULT WhereImpl::MaskedLValue<M, V> where(const M &mask,
                                                                    V &value)
{
    return {mask, value};
}
/// Overload for subscript (gather/scatter) expressions:
/// `where(mask, data[indexes]) = v` becomes a masked scatter.
template <class M, class T, class IT, class Scale>
constexpr Vc_WARN_UNUSED_RESULT
WhereImpl::MaskedLValue<M, Common::SubscriptOperation<T, IT, Scale, true>>
where(const M &mask, Common::SubscriptOperation<T, IT, Scale, true> &&value)
{
    return {mask, std::move(value)};
}
/// Readability alias for where(): `_if(cond) | x = y`.
template<typename M> constexpr Vc_WARN_UNUSED_RESULT WhereImpl::WhereMask<M> _if(const M &m)
{
    return { m };
}
- }
- #endif
- #ifndef VC_COMMON_TRANSPOSE_H_
- #define VC_COMMON_TRANSPOSE_H_
- #include <tuple>
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace Common
- {
// Holds const references to the vectors passed to transpose(); consumed by a
// transpose_impl overload selected via TransposeTag.
template <typename... Inputs> struct TransposeProxy
{
    TransposeProxy(const Inputs &... inputs) : in{inputs...} {}
    std::tuple<const Inputs &...> in;  // borrowed — proxy must not outlive the arguments
};
// Dispatch tag encoding the output (LhsLength) and input (RhsLength) counts.
template <int LhsLength, size_t RhsLength> struct TransposeTag {
};
}
// Builds a TransposeProxy from the given vectors (taken by value, referenced
// by the proxy for the duration of the assignment expression).
template <typename... Vs> Common::TransposeProxy<Vs...> transpose(Vs... vs)
{
    return {vs...};
}
- }
- #endif
- #ifndef VC_SCALAR_OPERATORS_H_
- #define VC_SCALAR_OPERATORS_H_
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace Detail
- {
// Comparison operators for the scalar implementation: each comparison wraps
// the plain scalar result into a one-element mask.
#define Vc_OP(op_)                                                                       \
    template <typename T>                                                                \
    Vc_INTRINSIC Scalar::Mask<T> operator op_(Scalar::Vector<T> a, Scalar::Vector<T> b)  \
    {                                                                                    \
        return Scalar::Mask<T>(a.data() op_ b.data());                                   \
    }
Vc_ALL_COMPARES(Vc_OP);
#undef Vc_OP
// Bitwise operators (&, |, ^): integral vectors apply the operator directly;
// floating-point vectors reinterpret the bits as a same-size unsigned integer
// (through the MayAlias wrapper, so the compiler's aliasing analysis stays
// valid) and combine the bit patterns in place.
// NOTE(review): the floating-point overload takes its left operand by
// non-const reference and modifies it before returning it — callers in the
// Detail namespace presumably rely on this; verify before restyling.
#define Vc_OP(symbol)                                                                    \
    template <typename T>                                                                \
    Vc_INTRINSIC enable_if<std::is_integral<T>::value, Scalar::Vector<T>>                \
    operator symbol(Scalar::Vector<T> a, Scalar::Vector<T> b)                            \
    {                                                                                    \
        return a.data() symbol b.data();                                                 \
    }                                                                                    \
    template <typename T>                                                                \
    Vc_INTRINSIC enable_if<std::is_floating_point<T>::value, Scalar::Vector<T>>          \
    operator symbol(Scalar::Vector<T> &lhs, Scalar::Vector<T> rhs)                       \
    {                                                                                    \
        using uinta =                                                                    \
            MayAlias<typename std::conditional<sizeof(T) == sizeof(int), unsigned int,   \
                                               unsigned long long>::type>;               \
        uinta *left = reinterpret_cast<uinta *>(&lhs.data());                            \
        const uinta *right = reinterpret_cast<const uinta *>(&rhs.data());               \
        *left symbol## = *right;                                                         \
        return lhs;                                                                      \
    }
Vc_ALL_BINARY(Vc_OP);
#undef Vc_OP
- template <typename T>
- Vc_INTRINSIC Scalar::Vector<T> operator+(Scalar::Vector<T> a, Scalar::Vector<T> b)
- {
- return a.data() + b.data();
- }
- template <typename T>
- Vc_INTRINSIC Scalar::Vector<T> operator-(Scalar::Vector<T> a, Scalar::Vector<T> b)
- {
- return a.data() - b.data();
- }
- template <typename T>
- Vc_INTRINSIC Scalar::Vector<T> operator*(Scalar::Vector<T> a, Scalar::Vector<T> b)
- {
- return a.data() * b.data();
- }
- template <typename T>
- Vc_INTRINSIC Scalar::Vector<T> operator/(Scalar::Vector<T> a, Scalar::Vector<T> b)
- {
- return a.data() / b.data();
- }
- template <typename T>
- Vc_INTRINSIC Scalar::Vector<T> operator%(Scalar::Vector<T> a, Scalar::Vector<T> b)
- {
- return a.data() % b.data();
- }
- }
- }
- #endif
- namespace Vc_VERSIONED_NAMESPACE
- {
// Special-initializer constructors for the scalar ABI (a single lane).
// Zero-initialized vector.
template <typename T>
Vc_INTRINSIC Vector<T, VectorAbi::Scalar>::Vector(VectorSpecialInitializerZero)
    : m_data(0)
{
}
// Broadcast of 1.
template <typename T>
Vc_INTRINSIC Vector<T, VectorAbi::Scalar>::Vector(VectorSpecialInitializerOne)
    : m_data(1)
{
}
// IndexesFromZero for a one-element vector is simply 0.
template <typename T>
Vc_INTRINSIC Vector<T, VectorAbi::Scalar>::Vector(VectorSpecialInitializerIndexesFromZero)
    : m_data(0)
{
}
// Load: the scalar "vector" reads exactly one element. The load_concept return
// type SFINAE-restricts U; the `template` disambiguator is omitted for MSVC,
// which rejects it in this position.
template <typename T>
template <typename U, typename Flags>
Vc_INTRINSIC typename Vector<T, VectorAbi::Scalar>::
#ifndef Vc_MSVC
    template
#endif
    load_concept<U, Flags>::type Vector<T, VectorAbi::Scalar>::load(const U *mem, Flags)
{
    m_data = mem[0];
}
// Unmasked store of the single element (alignment Flags are irrelevant for a
// scalar access).
template <typename T>
template <typename U, typename Flags, typename>
Vc_INTRINSIC void Vector<T, VectorAbi::Scalar>::store(U *mem, Flags) const
{
    mem[0] = m_data;
}
// Masked store: no memory access at all when the single mask lane is off.
template <typename T>
template <typename U, typename Flags, typename>
Vc_INTRINSIC void Vector<T, VectorAbi::Scalar>::store(U *mem, Mask mask, Flags) const
{
    if (mask.data())
        mem[0] = m_data;
}
// Scalar gather: read the one element addressed by indexes[0], scaled by the
// compile-time Scale factor carried in the GatherArguments.
template <typename T>
template <class MT, class IT, int Scale>
Vc_ALWAYS_INLINE void Vector<T, VectorAbi::Scalar>::gatherImplementation(
    const Common::GatherArguments<MT, IT, Scale> &args)
{
    m_data = args.address[Scale * args.indexes[0]];
}
// Masked gather: skips the memory access entirely when the mask lane is off.
template <typename T>
template <class MT, class IT, int Scale>
Vc_ALWAYS_INLINE void Vector<T, VectorAbi::Scalar>::gatherImplementation(
    const Common::GatherArguments<MT, IT, Scale> &args, MaskArgument mask)
{
    if (mask.data()) {
        m_data = args.address[Scale * args.indexes[0]];
    }
}
// Scalar scatter: write the one element to mem[indexes[0]].
// NOTE(review): unlike the gather path there is no Scale factor here —
// presumably the scatter callers pass pre-scaled indexes; confirm upstream.
template <typename T>
template <typename MT, typename IT>
Vc_ALWAYS_INLINE void Vector<T, VectorAbi::Scalar>::scatterImplementation(MT *mem,
                                                                          IT &&indexes)
    const
{
    mem[indexes[0]] = m_data;
}
// Masked scatter: writes only when the mask lane is set.
template <typename T>
template <typename MT, typename IT>
Vc_ALWAYS_INLINE void Vector<T, VectorAbi::Scalar>::scatterImplementation(
    MT *mem, IT &&indexes, MaskArgument mask) const
{
    if (mask.data()) {
        mem[indexes[0]] = m_data;
    }
}
// Extracts the unbiased base-2 exponent of x from its IEEE-754 bit pattern
// (exponent field in bits 23..30, bias 0x7f). The assertion documents the
// precondition x >= 0; zero and non-finite inputs are not handled specially.
Vc_INTRINSIC Vc_CONST Scalar::float_v exponent(Scalar::float_v x)
{
    Vc_ASSERT(x.data() >= 0.f);
    union { float f; int i; } value;  // type punning via union, per file convention
    value.f = x.data();
    return Scalar::float_v(static_cast<float>((value.i >> 23) - 0x7f));
}
// double variant: exponent field in bits 52..62, bias 0x3ff.
Vc_INTRINSIC Vc_CONST Scalar::double_v Vc_VDECL exponent(Scalar::double_v x)
{
    Vc_ASSERT(x.data() >= 0.);
    union { double f; long long i; } value;
    value.f = x.data();
    return Scalar::double_v(static_cast<double>((value.i >> 52) - 0x3ff));
}
// Advances the global RNG state (Common::RandomState) by one step of a linear
// congruential generator and returns the pre-update state in state0/state1.
// Multiplier 0xdeece66d / increment 11 are the drand48 constants truncated to
// 32 bits. Note: reads and writes global state — not thread-safe.
static Vc_ALWAYS_INLINE void _doRandomStep(Scalar::uint_v &state0, Scalar::uint_v &state1)
{
    using Scalar::uint_v;
    state0.load(&Common::RandomState[0]);
    state1.load(&Common::RandomState[uint_v::Size]);
    // state1' = state1 * a + c
    Detail::operator+(Detail::operator*(state1, uint_v(0xdeece66du)),
                      uint_v(11))
        .store(&Common::RandomState[uint_v::Size]);
    // state0' = (state0 * a + c) ^ (state1 >> 16) — mixes the two streams
    uint_v(Detail::operator+(Detail::operator*(state0, uint_v(0xdeece66du)), uint_v(11))
               .data() ^
           (state1.data() >> 16))
        .store(&Common::RandomState[0]);
}
// Integer Random(): the raw 32-bit state converted to the vector's EntryType.
template<typename T> Vc_INTRINSIC Vector<T, VectorAbi::Scalar> Vector<T, VectorAbi::Scalar>::Random()
{
    Scalar::uint_v state0, state1;
    _doRandomStep(state0, state1);
    return Vector<T, VectorAbi::Scalar>(static_cast<EntryType>(state0.data()));
}
// float Random() in [0, 1): build a float in [1, 2) by forcing the exponent to
// biased 0x7f and filling the mantissa with state bits, then subtract 1.
// ORing state bits 23..27 into the exponent field is harmless because those
// bits are already set in 0x3f800000.
template<> Vc_INTRINSIC Scalar::float_v Scalar::float_v::Random()
{
    Scalar::uint_v state0, state1;
    _doRandomStep(state0, state1);
    union { unsigned int i; float f; } x;
    x.i = (state0.data() & 0x0fffffffu) | 0x3f800000u;
    return Scalar::float_v(x.f - 1.f);
}
// double Random() in [0, 1): full 48-bit drand48-style LCG on a separate part
// of the state array; the 52-bit mantissa is filled and [1,2) is mapped to
// [0,1) by subtracting 1.
template<> Vc_INTRINSIC Scalar::double_v Scalar::double_v::Random()
{
    typedef unsigned long long uint64 Vc_MAY_ALIAS;
    uint64 state0 = *reinterpret_cast<const uint64 *>(&Common::RandomState[8]);
    state0 = (state0 * 0x5deece66dull + 11) & 0x000fffffffffffffull;
    *reinterpret_cast<uint64 *>(&Common::RandomState[8]) = state0;
    union { unsigned long long i; double f; } x;
    x.i = state0 | 0x3ff0000000000000ull;
    return Scalar::double_v(x.f - 1.);
}
// Tests the IEEE-754 sign bit directly, so -0.0 counts as negative (unlike
// `x < 0`), and NaNs report their sign bit.
Vc_INTRINSIC Vc_CONST Scalar::float_m isnegative(Scalar::float_v x)
{
    static_assert(sizeof(float) == sizeof(unsigned int),
                  "This code assumes float and unsigned int have the same number of "
                  "Bytes. Please file a bug report if this is a problem.");
    union { float f; unsigned int i; } u;  // union punning, per file convention
    u.f = x.data();
    return Scalar::float_m(0u != (u.i & 0x80000000u));
}
// double variant: sign bit is bit 63.
Vc_INTRINSIC Vc_CONST Scalar::double_m Vc_VDECL isnegative(Scalar::double_v x)
{
    static_assert(sizeof(double) == sizeof(unsigned long long),
                  "This code assumes double and unsigned long long have the same number "
                  "of Bytes. Please file a bug report if this is a problem.");
    union { double d; unsigned long long l; } u;
    u.d = x.data();
    return Scalar::double_m(0ull != (u.l & 0x8000000000000000ull));
}
// Fills the value with an all-ones bit pattern — a (negative) quiet NaN for
// IEEE floats.
// NOTE(review): this unspecialized template punns through a float-sized union,
// so it is only meaningful when EntryType is float; double_v has its own
// specialization below. Presumably integer vectors never call setQnan — verify.
template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Scalar>::setQnan()
{
    union { float f; unsigned int i; } u;
    u.i = 0xffffffffu;
    m_data = u.f;
}
// double specialization: 64-bit all-ones pattern.
template<> Vc_INTRINSIC void Scalar::double_v::setQnan()
{
    union { double d; unsigned long long l; } u;
    u.l = 0xffffffffffffffffull;
    m_data = u.d;
}
// Masked variant: only overwrite when the single mask lane is set.
template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Scalar>::setQnan(Mask m)
{
    if (m.data()) {
        setQnan();
    }
}
template<> Vc_INTRINSIC void Scalar::double_v::setQnan(Scalar::double_v::Mask m)
{
    if (m.data()) {
        setQnan();
    }
}
- namespace Common
- {
- Vc_ALWAYS_INLINE void transpose_impl(TransposeTag<1, 1>, Scalar::float_v *Vc_RESTRICT r[],
- const TransposeProxy<Scalar::float_v> &proxy)
- {
- *r[0] = std::get<0>(proxy.in).data();
- }
- }
- }
- #ifndef VC_SCALAR_SIMD_CAST_H_
- #define VC_SCALAR_SIMD_CAST_H_
- #ifndef VC_COMMON_SIMD_CAST_H_
- #define VC_COMMON_SIMD_CAST_H_
- #include <type_traits>
// Undefined catch-all: makes the name `simd_cast` visible for explicit
// template-argument calls while guaranteeing an error when no real overload
// matches.
template <class> void simd_cast();
namespace Vc_VERSIONED_NAMESPACE
{
// Identity cast: when To equals the decayed source type, forward unchanged.
template <typename To, typename From>
Vc_INTRINSIC Vc_CONST To
simd_cast(From &&x, enable_if<std::is_same<To, Traits::decay<From>>::value> = nullarg)
{
    return std::forward<From>(x);
}
// Zero-argument form yields a value-initialized To.
template <typename To> Vc_INTRINSIC Vc_CONST To simd_cast() { return To(); }
}
- #endif
- #ifndef VC_SCALAR_TYPE_TRAITS_H_
- #define VC_SCALAR_TYPE_TRAITS_H_
namespace Vc_VERSIONED_NAMESPACE
{
namespace Scalar
{
namespace Traits
{
// Detection traits for the scalar ABI: true only for Scalar::Vector / Mask.
template <typename T> struct is_vector : public std::false_type {};
template <typename T> struct is_vector<Vector<T>> : public std::true_type {};
template <typename T> struct is_mask : public std::false_type {};
template <typename T> struct is_mask<Mask<T>> : public std::true_type {};
}
}
}
- #endif
namespace Vc_VERSIONED_NAMESPACE
{
// scalar Vector -> scalar Vector: a plain value conversion of the one element.
template <typename To, typename From>
Vc_INTRINSIC Vc_CONST To
simd_cast(Scalar::Vector<From> x, enable_if<Scalar::is_vector<To>::value> = nullarg)
{
    return static_cast<To>(x.data());
}
// scalar Mask -> scalar Mask.
template <typename To, typename From>
Vc_INTRINSIC Vc_CONST To
simd_cast(Scalar::Mask<From> x, enable_if<Scalar::is_mask<To>::value> = nullarg)
{
    return static_cast<To>(x.data());
}
// any SIMD vector -> scalar Vector: select the lane at compile-time `offset`.
template <typename Return, int offset, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(
    T &&x,
    enable_if<Traits::is_simd_vector<T>::value && Scalar::is_vector<Return>::value> = nullarg)
{
    return Return(x[offset]);
}
// scalar Vector -> wider SIMD vector: value goes into lane 0, all other lanes
// are value-initialized; only offset == 0 is valid here.
template <typename Return, int offset, typename T>
Vc_INTRINSIC Vc_CONST enable_if<offset == 0 && Traits::is_simd_vector<Return>::value &&
                                    !Scalar::is_vector<Return>::value,
                                Return>
simd_cast(Scalar::Vector<T> x)
{
    Return r{};
    r[0] = static_cast<typename Return::EntryType>(x.data());
    return r;
}
// any SIMD mask -> scalar Mask: select the lane at `offset`.
template <typename Return, int offset, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(
    T &&x,
    enable_if<Traits::is_simd_mask<T>::value && Scalar::is_mask<Return>::value> = nullarg)
{
    return Return(bool(x[offset]));
}
// scalar Mask -> wider SIMD mask: lane 0 from x, remaining lanes false.
template <typename Return, int offset, typename T>
Vc_INTRINSIC Vc_CONST enable_if<
    offset == 0 && Traits::is_simd_mask<Return>::value && !Scalar::is_mask<Return>::value,
    Return>
simd_cast(Scalar::Mask<T> x)
{
    Return r(false);
    r[0] = x[0];
    return r;
}
}
- #endif
- #endif
- #if defined(Vc_IMPL_SSE)
- #ifndef VC_SSE_VECTOR_H_
- #define VC_SSE_VECTOR_H_
- #ifndef VC_SSE_INTRINSICS_H_
- #define VC_SSE_INTRINSICS_H_
- #ifdef Vc_MSVC
- #include <intrin.h>
- #else
- #include <x86intrin.h>
- #endif
- #ifndef VC_COMMON_STORAGE_H_
- #define VC_COMMON_STORAGE_H_
- #ifndef VC_COMMON_ALIASINGENTRYHELPER_H_
- #define VC_COMMON_ALIASINGENTRYHELPER_H_
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace Common
- {
// Proxy for mutable element access into SIMD storage without violating strict
// aliasing. Two implementations:
//  - ICC: keeps (storage pointer, index) and routes every write through the
//    storage's assign()/m() accessors.
//  - all other compilers: holds a Vc_MAY_ALIAS-qualified reference directly
//    into the storage and reads/writes through it.
template<class StorageType> class AliasingEntryHelper
{
    private:
        typedef typename StorageType::EntryType T;
#ifdef Vc_ICC
        StorageType *const m_storage;
        const int m_index;
    public:
        Vc_ALWAYS_INLINE AliasingEntryHelper(StorageType *d, int index) : m_storage(d), m_index(index) {}
        Vc_ALWAYS_INLINE AliasingEntryHelper(const AliasingEntryHelper &) = default;
        Vc_ALWAYS_INLINE AliasingEntryHelper(AliasingEntryHelper &&) = default;
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator=(const AliasingEntryHelper &rhs) {
            m_storage->assign(m_index, rhs);
            return *this;
        }
        // Every compound assignment reads via m() and writes via assign().
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator  =(T x) { m_storage->assign(m_index, x); return *this; }
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator +=(T x) { m_storage->assign(m_index, m_storage->m(m_index) + x); return *this; }
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator -=(T x) { m_storage->assign(m_index, m_storage->m(m_index) - x); return *this; }
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator /=(T x) { m_storage->assign(m_index, m_storage->m(m_index) / x); return *this; }
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator *=(T x) { m_storage->assign(m_index, m_storage->m(m_index) * x); return *this; }
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator |=(T x) { m_storage->assign(m_index, m_storage->m(m_index) | x); return *this; }
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator &=(T x) { m_storage->assign(m_index, m_storage->m(m_index) & x); return *this; }
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator ^=(T x) { m_storage->assign(m_index, m_storage->m(m_index) ^ x); return *this; }
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator %=(T x) { m_storage->assign(m_index, m_storage->m(m_index) % x); return *this; }
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator<<=(T x) { m_storage->assign(m_index, m_storage->m(m_index)<< x); return *this; }
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator>>=(T x) { m_storage->assign(m_index, m_storage->m(m_index)>> x); return *this; }
        // Makes the shared read-only operators below compile on ICC by
        // redirecting `m_data` to a storage read; undefined again at the end.
#define m_data m_storage->read(m_index)
#else
        typedef T A Vc_MAY_ALIAS;
        A &m_data;  // may-alias reference straight into the SIMD register storage
    public:
        template<typename T2>
        Vc_ALWAYS_INLINE AliasingEntryHelper(T2 &d) : m_data(reinterpret_cast<A &>(d)) {}
        Vc_ALWAYS_INLINE AliasingEntryHelper(A &d) : m_data(d) {}
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator=(const AliasingEntryHelper &rhs) {
            m_data = rhs.m_data;
            return *this;
        }
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator =(T x) { m_data = x; return *this; }
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator+=(T x) { m_data += x; return *this; }
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator-=(T x) { m_data -= x; return *this; }
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator/=(T x) { m_data /= x; return *this; }
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator*=(T x) { m_data *= x; return *this; }
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator|=(T x) { m_data |= x; return *this; }
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator&=(T x) { m_data &= x; return *this; }
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator^=(T x) { m_data ^= x; return *this; }
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator%=(T x) { m_data %= x; return *this; }
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator<<=(T x) { m_data <<= x; return *this; }
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator>>=(T x) { m_data >>= x; return *this; }
#endif
        // Read-only operators shared by both implementations (on ICC `m_data`
        // expands to m_storage->read(m_index) via the macro above).
        Vc_ALWAYS_INLINE Vc_PURE operator const T() const { return m_data; }
        Vc_ALWAYS_INLINE Vc_PURE bool operator==(T x) const { return static_cast<T>(m_data) == x; }
        Vc_ALWAYS_INLINE Vc_PURE bool operator!=(T x) const { return static_cast<T>(m_data) != x; }
        Vc_ALWAYS_INLINE Vc_PURE bool operator<=(T x) const { return static_cast<T>(m_data) <= x; }
        Vc_ALWAYS_INLINE Vc_PURE bool operator>=(T x) const { return static_cast<T>(m_data) >= x; }
        Vc_ALWAYS_INLINE Vc_PURE bool operator< (T x) const { return static_cast<T>(m_data) <  x; }
        Vc_ALWAYS_INLINE Vc_PURE bool operator> (T x) const { return static_cast<T>(m_data) >  x; }
        Vc_ALWAYS_INLINE Vc_PURE T operator-() const { return -static_cast<T>(m_data); }
        Vc_ALWAYS_INLINE Vc_PURE T operator~() const { return ~static_cast<T>(m_data); }
        Vc_ALWAYS_INLINE Vc_PURE T operator+(T x) const { return static_cast<T>(m_data) + x; }
        Vc_ALWAYS_INLINE Vc_PURE T operator-(T x) const { return static_cast<T>(m_data) - x; }
        Vc_ALWAYS_INLINE Vc_PURE T operator/(T x) const { return static_cast<T>(m_data) / x; }
        Vc_ALWAYS_INLINE Vc_PURE T operator*(T x) const { return static_cast<T>(m_data) * x; }
        Vc_ALWAYS_INLINE Vc_PURE T operator|(T x) const { return static_cast<T>(m_data) | x; }
        Vc_ALWAYS_INLINE Vc_PURE T operator&(T x) const { return static_cast<T>(m_data) & x; }
        Vc_ALWAYS_INLINE Vc_PURE T operator^(T x) const { return static_cast<T>(m_data) ^ x; }
        Vc_ALWAYS_INLINE Vc_PURE T operator%(T x) const { return static_cast<T>(m_data) % x; }
#ifdef m_data
#undef m_data
#endif
};
- }
- }
- #endif
- #ifndef VC_COMMON_MASKENTRY_H_
- #define VC_COMMON_MASKENTRY_H_
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace Common
- {
namespace
{
// Maps a byte count to the signed integer type of that size; used as the
// underlying storage of MaskBool.
// NOTE(review): an unnamed namespace in a header gives each TU its own copy of
// these types — unusual for a header, presumably intentional upstream; verify.
template<size_t Bytes> struct MaskBoolStorage;
template<> struct MaskBoolStorage<1> { typedef std::int8_t type; };
template<> struct MaskBoolStorage<2> { typedef std::int16_t type; };
template<> struct MaskBoolStorage<4> { typedef std::int32_t type; };
template<> struct MaskBoolStorage<8> { typedef std::int64_t type; };
}
// Bool replacement whose storage matches the SIMD mask representation: all
// bits set (-1) for true, all bits clear for false, sized like the vector
// element it masks.
template<size_t Bytes> class MaskBool
{
    typedef typename MaskBoolStorage<Bytes>::type storage_type Vc_MAY_ALIAS;
    storage_type data;  // -1 == true, 0 == false
public:
    constexpr MaskBool(bool x) noexcept : data(x ? -1 : 0) {}
    Vc_ALWAYS_INLINE MaskBool &operator=(bool x) noexcept { data = x ? -1 : 0; return *this; }
    // Assignment from a non-bool fundamental type stores the raw bit pattern
    // unchanged (used when writing back SIMD comparison results).
    template <typename T, typename = enable_if<(!std::is_same<T, bool>::value &&
                                                std::is_fundamental<T>::value)>>
    Vc_ALWAYS_INLINE MaskBool &operator=(T x) noexcept
    {
        data = reinterpret_cast<const storage_type &>(x);
        return *this;
    }
    Vc_ALWAYS_INLINE MaskBool(const MaskBool &) noexcept = default;
    Vc_ALWAYS_INLINE MaskBool &operator=(const MaskBool &) noexcept = default;
    // Converts to bool by testing bit 0, or to a same-size fundamental type by
    // reinterpreting the stored bits.
    template <typename T, typename = enable_if<(std::is_same<T, bool>::value ||
                                                (std::is_fundamental<T>::value &&
                                                 sizeof(storage_type) == sizeof(T)))>>
    constexpr operator T() const noexcept
    {
        return std::is_same<T, bool>::value ? T((data & 1) != 0) : aliasing_cast<T>(data);
    }
} Vc_MAY_ALIAS;
// Heterogeneous comparisons for MaskBool and anything convertible to bool:
// compare truth values, not raw bit patterns (true is -1 internally).
template <typename A,
          typename B,
          typename std::enable_if<
              std::is_convertible<A, bool>::value &&std::is_convertible<B, bool>::value,
              int>::type = 0>
constexpr bool operator==(A &&a, B &&b)
{
    return static_cast<bool>(a) == static_cast<bool>(b);
}
template <typename A,
          typename B,
          typename std::enable_if<
              std::is_convertible<A, bool>::value &&std::is_convertible<B, bool>::value,
              int>::type = 0>
constexpr bool operator!=(A &&a, B &&b)
{
    return static_cast<bool>(a) != static_cast<bool>(b);
}
- }
- }
- #endif
- #ifdef Vc_IMPL_AVX
- #ifndef VC_AVX_INTRINSICS_H_
- #define VC_AVX_INTRINSICS_H_
- extern "C" {
- #include <immintrin.h>
- #if (defined(Vc_IMPL_XOP) || defined(Vc_IMPL_FMA4)) && !defined(Vc_MSVC)
- #include <x86intrin.h>
- #endif
- }
- #ifndef VC_COMMON_FIX_CLANG_EMMINTRIN_H_
- #define VC_COMMON_FIX_CLANG_EMMINTRIN_H_
- #if (defined Vc_CLANG && Vc_CLANG < 0x30700) || (defined Vc_APPLECLANG && Vc_APPLECLANG < 0x70000)
- #ifdef _mm_slli_si128
- #undef _mm_slli_si128
- #define _mm_slli_si128(a,count) __extension__ ({ \
- (__m128i)__builtin_ia32_pslldqi128((__m128i)(a), (count)*8); })
- #endif
- #ifdef _mm_srli_si128
- #undef _mm_srli_si128
- #define _mm_srli_si128(a,count) __extension__ ({ \
- (__m128i)__builtin_ia32_psrldqi128((__m128i)(a), (count)*8); })
- #endif
- #ifdef _mm_shuffle_epi32
- #undef _mm_shuffle_epi32
- #define _mm_shuffle_epi32(a,imm) __extension__ ({ \
- (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), (__v4si) _mm_set1_epi32(0), \
- (imm) & 0x3, ((imm) & 0xc) >> 2, \
- ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6); })
- #endif
- #ifdef _mm_shufflelo_epi16
- #undef _mm_shufflelo_epi16
- #define _mm_shufflelo_epi16(a,imm) __extension__ ({ \
- (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), (__v8hi) _mm_set1_epi16(0), \
- (imm) & 0x3, ((imm) & 0xc) >> 2, \
- ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
- 4, 5, 6, 7); })
- #endif
- #ifdef _mm_shufflehi_epi16
- #undef _mm_shufflehi_epi16
- #define _mm_shufflehi_epi16(a,imm) __extension__ ({ \
- (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), (__v8hi) _mm_set1_epi16(0), \
- 0, 1, 2, 3, \
- 4 + (((imm) & 0x03) >> 0), \
- 4 + (((imm) & 0x0c) >> 2), \
- 4 + (((imm) & 0x30) >> 4), \
- 4 + (((imm) & 0xc0) >> 6)); })
- #endif
- #ifdef _mm_shuffle_pd
- #undef _mm_shuffle_pd
- #define _mm_shuffle_pd(a,b,i) __extension__ ({ \
- __builtin_shufflevector((__m128d)(a), (__m128d)(b), (i) & 1, (((i) & 2) >> 1) + 2); })
- #endif
- #endif
- #endif
- #ifndef VC_AVX_CONST_DATA_H_
- #define VC_AVX_CONST_DATA_H_
namespace Vc_VERSIONED_NAMESPACE
{
namespace AVX
{
// Index sequences (0, 1, 2, ...) used for IndexesFromZero initialization;
// aligned for vector loads. Definitions live in the compiled library.
alignas(64) extern const unsigned int _IndexesFromZero32[ 8];
alignas(16) extern const unsigned short _IndexesFromZero16[16];
alignas(16) extern const unsigned char _IndexesFromZero8 [32];
// Bit-pattern constants broadcast by the intrinsics wrappers (setone_ps,
// setabsmask_pd, ...); definitions are out of line in the library.
struct alignas(64) c_general
{
    static const float oneFloat;
    static const unsigned int absMaskFloat[2];
    static const unsigned int signMaskFloat[2];
    static const unsigned int highMaskFloat;
    static const unsigned short minShort[2];
    static const unsigned short one16[2];
    static const float _2power31;
    static const double oneDouble;
    static const unsigned long long frexpMask;
    static const unsigned long long highMaskDouble;
};
// Coefficient tables for the trigonometric implementations.
template<typename T> struct c_trig
{
    alignas(64) static const T data[];
};
#ifndef Vc_MSVC
// Declaration-only specializations (definitions elsewhere); skipped on MSVC,
// which rejects this form.
template <> alignas(64) const float c_trig<float>::data[];
template <> alignas(64) const double c_trig<double>::data[];
#endif
// log() constant table; d(i) reinterprets the stored 32-bit pattern as float
// through a may-alias typedef.
template<typename T> struct c_log
{
    typedef float floatAlias Vc_MAY_ALIAS;
    static Vc_ALWAYS_INLINE float d(int i) { return *reinterpret_cast<const floatAlias *>(&data[i]); }
    alignas(64) static const unsigned int data[21];
};
#ifndef Vc_MSVC
template<> alignas(64) const unsigned int c_log<float>::data[21];
#endif
// Double-precision log table, same pattern with 64-bit entries.
template<> struct c_log<double>
{
    enum VectorSize { Size = 16 / sizeof(double) };
    typedef double doubleAlias Vc_MAY_ALIAS;
    static Vc_ALWAYS_INLINE double d(int i) { return *reinterpret_cast<const doubleAlias *>(&data[i]); }
    alignas(64) static const unsigned long long data[21];
};
}
}
namespace Vc_VERSIONED_NAMESPACE
{
// AVX2 reuses the AVX constant tables unchanged.
namespace AVX2
{
using AVX::_IndexesFromZero8;
using AVX::_IndexesFromZero16;
using AVX::_IndexesFromZero32;
using AVX::c_general;
using AVX::c_trig;
using AVX::c_log;
}
}
- #endif
- #include <cstdlib>
- #if (defined Vc_CLANG && Vc_CLANG >= 0x30900 && Vc_CLANG < 0x70000)
- #ifdef _mm256_permute2f128_si256
- #undef _mm256_permute2f128_si256
- #define _mm256_permute2f128_si256(V1,V2,M) __extension__ ({ \
- (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
- (__v8si)(__m256i)(V2), (char)(M)); })
- #endif
- #ifdef _mm256_permute2f128_ps
- #undef _mm256_permute2f128_ps
- #define _mm256_permute2f128_ps(V1,V2,M) __extension__ ({ \
- (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
- (__v8sf)(__m256)(V2), (char)(M)); })
- #endif
- #ifdef _mm256_permute2x128_si256
- #undef _mm256_permute2x128_si256
- #define _mm256_permute2x128_si256(V1,V2,M) __extension__ ({ \
- (__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (char)(M)); })
- #endif
- #endif
namespace Vc_VERSIONED_NAMESPACE
{
namespace AvxIntrinsics
{
using AVX::c_general;
using AVX::_IndexesFromZero32;
using AVX::_IndexesFromZero16;
using AVX::_IndexesFromZero8;
// Short aliases for the raw intrinsic vector types.
typedef __m128 m128 ;
typedef __m128d m128d;
typedef __m128i m128i;
typedef __m256 m256 ;
typedef __m256d m256d;
typedef __m256i m256i;
#ifdef Vc_GCC
// On GCC, shadow the _mm256 arithmetic intrinsics with GCC vector-extension
// arithmetic — presumably to give the optimizer more freedom (e.g. constant
// folding / contraction) than the opaque builtins allow; verify upstream.
static Vc_INTRINSIC Vc_CONST m256d _mm256_mul_pd(m256d a, m256d b) { return static_cast<m256d>(static_cast<__v4df>(a) * static_cast<__v4df>(b)); }
static Vc_INTRINSIC Vc_CONST m256d _mm256_add_pd(m256d a, m256d b) { return static_cast<m256d>(static_cast<__v4df>(a) + static_cast<__v4df>(b)); }
static Vc_INTRINSIC Vc_CONST m256d _mm256_sub_pd(m256d a, m256d b) { return static_cast<m256d>(static_cast<__v4df>(a) - static_cast<__v4df>(b)); }
static Vc_INTRINSIC Vc_CONST m256 _mm256_mul_ps(m256 a, m256 b) { return static_cast<m256>(static_cast<__v8sf>(a) * static_cast<__v8sf>(b)); }
static Vc_INTRINSIC Vc_CONST m256 _mm256_add_ps(m256 a, m256 b) { return static_cast<m256>(static_cast<__v8sf>(a) + static_cast<__v8sf>(b)); }
static Vc_INTRINSIC Vc_CONST m256 _mm256_sub_ps(m256 a, m256 b) { return static_cast<m256>(static_cast<__v8sf>(a) - static_cast<__v8sf>(b)); }
#endif
// Broadcast / constant generators. Most constants are loaded or broadcast from
// the precomputed bit patterns in Common::AllBitsSet and AVX::c_general rather
// than materialized as immediates.
static Vc_INTRINSIC m256d Vc_CONST set1_pd (double a) { return _mm256_set1_pd (a); }
static Vc_INTRINSIC m256i Vc_CONST set1_epi32(int a) { return _mm256_set1_epi32(a); }
// All-ones (i.e. ~0) vectors in each register flavor.
static Vc_INTRINSIC Vc_CONST m128i _mm_setallone_si128() { return _mm_load_si128(reinterpret_cast<const __m128i *>(Common::AllBitsSet)); }
static Vc_INTRINSIC Vc_CONST m128 _mm_setallone_ps() { return _mm_load_ps(reinterpret_cast<const float *>(Common::AllBitsSet)); }
static Vc_INTRINSIC Vc_CONST m128d _mm_setallone_pd() { return _mm_load_pd(reinterpret_cast<const double *>(Common::AllBitsSet)); }
static Vc_INTRINSIC Vc_CONST m256i setallone_si256() { return _mm256_castps_si256(_mm256_load_ps(reinterpret_cast<const float *>(Common::AllBitsSet))); }
static Vc_INTRINSIC Vc_CONST m256d setallone_pd() { return _mm256_load_pd(reinterpret_cast<const double *>(Common::AllBitsSet)); }
static Vc_INTRINSIC Vc_CONST m256 setallone_ps() { return _mm256_load_ps(reinterpret_cast<const float *>(Common::AllBitsSet)); }
// Integer broadcasts of the value 1; setone_epi32 reuses _IndexesFromZero32[1]
// (which holds 1) as the broadcast source.
static Vc_INTRINSIC m256i Vc_CONST setone_epi8 () { return _mm256_set1_epi8(1); }
static Vc_INTRINSIC m256i Vc_CONST setone_epu8 () { return setone_epi8(); }
static Vc_INTRINSIC m256i Vc_CONST setone_epi16() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(c_general::one16))); }
static Vc_INTRINSIC m256i Vc_CONST setone_epu16() { return setone_epi16(); }
static Vc_INTRINSIC m256i Vc_CONST setone_epi32() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(&_IndexesFromZero32[1]))); }
static Vc_INTRINSIC m256i Vc_CONST setone_epu32() { return setone_epi32(); }
// Floating-point 1.0 broadcasts.
static Vc_INTRINSIC m256 Vc_CONST setone_ps() { return _mm256_broadcast_ss(&c_general::oneFloat); }
static Vc_INTRINSIC m256d Vc_CONST setone_pd() { return _mm256_broadcast_sd(&c_general::oneDouble); }
// abs/sign masks: clear or isolate the IEEE sign bit.
static Vc_INTRINSIC m256d Vc_CONST setabsmask_pd() { return _mm256_broadcast_sd(reinterpret_cast<const double *>(&c_general::absMaskFloat[0])); }
static Vc_INTRINSIC m256 Vc_CONST setabsmask_ps() { return _mm256_broadcast_ss(reinterpret_cast<const float *>(&c_general::absMaskFloat[1])); }
static Vc_INTRINSIC m256d Vc_CONST setsignmask_pd(){ return _mm256_broadcast_sd(reinterpret_cast<const double *>(&c_general::signMaskFloat[0])); }
static Vc_INTRINSIC m256 Vc_CONST setsignmask_ps(){ return _mm256_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1])); }
// 2^31 as float, and as the unsigned-integer bit pattern 0x80000000.
static Vc_INTRINSIC m256 Vc_CONST set2power31_ps() { return _mm256_broadcast_ss(&c_general::_2power31); }
static Vc_INTRINSIC m128 Vc_CONST _mm_set2power31_ps() { return _mm_broadcast_ss(&c_general::_2power31); }
static Vc_INTRINSIC m256i Vc_CONST set2power31_epu32() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1]))); }
static Vc_INTRINSIC m128i Vc_CONST _mm_set2power31_epu32() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1]))); }
// Minimum values of the signed integer element types.
static Vc_INTRINSIC m256i Vc_CONST setmin_epi8 () { return _mm256_set1_epi8(-0x80); }
static Vc_INTRINSIC m128i Vc_CONST _mm_setmin_epi16() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast<const float *>(c_general::minShort))); }
static Vc_INTRINSIC m128i Vc_CONST _mm_setmin_epi32() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1]))); }
static Vc_INTRINSIC m256i Vc_CONST setmin_epi16() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(c_general::minShort))); }
static Vc_INTRINSIC m256i Vc_CONST setmin_epi32() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1]))); }
// Extracts lane i as unsigned (wraps the signed _mm_extract_epi32).
template <int i>
static Vc_INTRINSIC Vc_CONST unsigned int extract_epu32(__m128i x)
{
    return _mm_extract_epi32(x, i);
}
- // insert128/extract128: move a 128-bit lane into/out of a 256-bit vector.
- // offset selects the lane (0 = low, 1 = high). The integer overloads use the
- // AVX2 integer-domain intrinsics when available to avoid a potential
- // float<->int domain-crossing penalty; otherwise the AVX1 float-domain forms.
- template <int offset> Vc_INTRINSIC __m256 insert128(__m256 a, __m128 b) { return _mm256_insertf128_ps(a, b, offset); }
- template <int offset> Vc_INTRINSIC __m256d insert128(__m256d a, __m128d b) { return _mm256_insertf128_pd(a, b, offset); }
- template <int offset> Vc_INTRINSIC __m256i insert128(__m256i a, __m128i b) {
- #ifdef Vc_IMPL_AVX2
- return _mm256_inserti128_si256(a, b, offset);
- #else
- return _mm256_insertf128_si256(a, b, offset);
- #endif
- }
- template <int offset> Vc_INTRINSIC __m128 extract128(__m256 a) { return _mm256_extractf128_ps(a, offset); }
- template <int offset> Vc_INTRINSIC __m128d extract128(__m256d a) { return _mm256_extractf128_pd(a, offset); }
- template <int offset> Vc_INTRINSIC __m128i extract128(__m256i a) {
- #ifdef Vc_IMPL_AVX2
- return _mm256_extracti128_si256(a, offset);
- #else
- return _mm256_extractf128_si256(a, offset);
- #endif
- }
- // Floating-point comparison wrappers returning all-ones/all-zeros masks.
- // On GCC the builtin vector comparison operators are used (they lower to the
- // same vcmpps/vcmppd instructions); elsewhere _mm256_cmp_* with an explicit
- // predicate constant. Note the predicate choices: eq/lt/le use the ordered
- // forms, neq uses the unordered form (NaN != x is true), and ge/gt use the
- // negated unordered predicates NLT/NLE.
- #ifdef Vc_GCC
- Vc_INTRINSIC __m256d cmpeq_pd (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a == b); }
- Vc_INTRINSIC __m256d cmpneq_pd (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a != b); }
- Vc_INTRINSIC __m256d cmplt_pd (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a < b); }
- Vc_INTRINSIC __m256d cmpge_pd (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a >= b); }
- Vc_INTRINSIC __m256d cmple_pd (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a <= b); }
- Vc_INTRINSIC __m256d cmpgt_pd (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a > b); }
- Vc_INTRINSIC __m256 cmpeq_ps (__m256 a, __m256 b) { return reinterpret_cast<__m256 >(a == b); }
- Vc_INTRINSIC __m256 cmpneq_ps (__m256 a, __m256 b) { return reinterpret_cast<__m256 >(a != b); }
- Vc_INTRINSIC __m256 cmplt_ps (__m256 a, __m256 b) { return reinterpret_cast<__m256 >(a < b); }
- Vc_INTRINSIC __m256 cmpge_ps (__m256 a, __m256 b) { return reinterpret_cast<__m256 >(a >= b); }
- Vc_INTRINSIC __m256 cmple_ps (__m256 a, __m256 b) { return reinterpret_cast<__m256 >(a <= b); }
- Vc_INTRINSIC __m256 cmpgt_ps (__m256 a, __m256 b) { return reinterpret_cast<__m256 >(a > b); }
- #else
- Vc_INTRINSIC __m256d cmpeq_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_EQ_OQ); }
- Vc_INTRINSIC __m256d cmpneq_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_NEQ_UQ); }
- Vc_INTRINSIC __m256d cmplt_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_LT_OS); }
- Vc_INTRINSIC __m256d cmpge_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_NLT_US); }
- Vc_INTRINSIC __m256d cmple_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_LE_OS); }
- Vc_INTRINSIC __m256d cmpgt_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_NLE_US); }
- Vc_INTRINSIC __m256 cmpeq_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_EQ_OQ); }
- Vc_INTRINSIC __m256 cmpneq_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_NEQ_UQ); }
- Vc_INTRINSIC __m256 cmplt_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_LT_OS); }
- Vc_INTRINSIC __m256 cmpge_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_NLT_US); }
- Vc_INTRINSIC __m256 cmple_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_LE_OS); }
- Vc_INTRINSIC __m256 cmpgt_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_NLE_US); }
- #endif
- // "not less than" / "not less or equal" are simply aliases of ge/gt here;
- // NOTE(review): with the GCC operator path, ge/gt are the ordered compares,
- // so nlt/nle differ from the intrinsic NLT_US/NLE_US forms for NaN inputs.
- Vc_INTRINSIC __m256d cmpnlt_pd (__m256d a, __m256d b) { return cmpge_pd(a, b); }
- Vc_INTRINSIC __m256d cmpnle_pd (__m256d a, __m256d b) { return cmpgt_pd(a, b); }
- Vc_INTRINSIC __m256 cmpnlt_ps (__m256 a, __m256 b) { return cmpge_ps(a, b); }
- Vc_INTRINSIC __m256 cmpnle_ps (__m256 a, __m256 b) { return cmpgt_ps(a, b); }
- // ordered: true where neither operand is NaN; unordered: true where either is.
- Vc_INTRINSIC __m256d cmpord_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_ORD_Q); }
- Vc_INTRINSIC __m256d cmpunord_pd(__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_UNORD_Q); }
- Vc_INTRINSIC __m256 cmpord_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_ORD_Q); }
- Vc_INTRINSIC __m256 cmpunord_ps(__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_UNORD_Q); }
- // Unsigned 16-bit compares on 128-bit vectors. XOP provides them natively;
- // otherwise flip the sign bit of both operands (xor with 0x8000) so that
- // unsigned ordering maps onto the available signed compare.
- #if defined(Vc_IMPL_XOP)
- static Vc_INTRINSIC m128i cmplt_epu16(__m128i a, __m128i b) {
- return _mm_comlt_epu16(a, b);
- }
- static Vc_INTRINSIC m128i cmpgt_epu16(__m128i a, __m128i b) {
- return _mm_comgt_epu16(a, b);
- }
- #else
- static Vc_INTRINSIC m128i cmplt_epu16(__m128i a, __m128i b) {
- return _mm_cmplt_epi16(_mm_xor_si128(a, _mm_setmin_epi16()), _mm_xor_si128(b, _mm_setmin_epi16()));
- }
- static Vc_INTRINSIC m128i cmpgt_epu16(__m128i a, __m128i b) {
- return _mm_cmpgt_epi16(_mm_xor_si128(a, _mm_setmin_epi16()), _mm_xor_si128(b, _mm_setmin_epi16()));
- }
- #endif
- // Byte-wise alignr on each 128-bit lane (matching _mm256_alignr_epi8
- // semantics, which operates per lane). Without AVX2, perform the SSE
- // _mm_alignr_epi8 on the low and high lanes separately and recombine.
- #ifdef Vc_IMPL_AVX2
- template <int shift> Vc_INTRINSIC Vc_CONST m256i alignr(__m256i s1, __m256i s2)
- {
- return _mm256_alignr_epi8(s1, s2, shift);
- }
- #else
- template <int shift> Vc_INTRINSIC Vc_CONST m256i alignr(__m256i s1, __m256i s2)
- {
- return insert128<1>(
- _mm256_castsi128_si256(_mm_alignr_epi8(_mm256_castsi256_si128(s1),
- _mm256_castsi256_si128(s2), shift)),
- _mm_alignr_epi8(extract128<1>(s1), extract128<1>(s2), shift));
- }
- #endif
- // Macros generating 256-bit integer wrappers. With AVX2 each wrapper is the
- // native _mm256_<name> intrinsic; without AVX2 each expands to "split into
- // two 128-bit lanes, run the SSE _mm_<name> on each, recombine".
- // Variants:
- //   _2_NEW   : (m256i, m256i) -> m256i
- //   _256_128 : (m256i, m128i) -> m256i (e.g. shift-by-count, b0 shared by lanes)
- //   _1i      : (m256i) -> m256i with a compile-time int argument
- //   _1       : (m256i) -> m256i
- //   _1_128   : (m128i) -> m256i widening ops; shift__ is the byte offset of
- //              the source data for the high half in the SSE fallback
- #ifdef Vc_IMPL_AVX2
- #define Vc_AVX_TO_SSE_2_NEW(name) \
- Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m256i b0) \
- { \
- return _mm256_##name(a0, b0); \
- }
- #define Vc_AVX_TO_SSE_256_128(name) \
- Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m128i b0) \
- { \
- return _mm256_##name(a0, b0); \
- }
- #define Vc_AVX_TO_SSE_1i(name) \
- template <int i> Vc_INTRINSIC Vc_CONST m256i name(__m256i a0) \
- { \
- return _mm256_##name(a0, i); \
- }
- #define Vc_AVX_TO_SSE_1(name) \
- Vc_INTRINSIC Vc_CONST __m256i name(__m256i a0) { return _mm256_##name(a0); }
- #define Vc_AVX_TO_SSE_1_128(name,shift__) \
- Vc_INTRINSIC Vc_CONST __m256i name(__m128i a0) { return _mm256_##name(a0); }
- #else
- #define Vc_AVX_TO_SSE_1(name) \
- Vc_INTRINSIC Vc_CONST __m256i name(__m256i a0) \
- { \
- __m128i a1 = extract128<1>(a0); \
- __m128i r0 = _mm_##name(_mm256_castsi256_si128(a0)); \
- __m128i r1 = _mm_##name(a1); \
- return insert128<1>(_mm256_castsi128_si256(r0), r1); \
- }
- #define Vc_AVX_TO_SSE_1_128(name,shift__) \
- Vc_INTRINSIC Vc_CONST __m256i name(__m128i a0) \
- { \
- __m128i r0 = _mm_##name(a0); \
- __m128i r1 = _mm_##name(_mm_srli_si128(a0, shift__)); \
- return insert128<1>(_mm256_castsi128_si256(r0), r1); \
- }
- #define Vc_AVX_TO_SSE_2_NEW(name) \
- Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m256i b0) \
- { \
- m128i a1 = extract128<1>(a0); \
- m128i b1 = extract128<1>(b0); \
- m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0)); \
- m128i r1 = _mm_##name(a1, b1); \
- return insert128<1>(_mm256_castsi128_si256(r0), r1); \
- }
- #define Vc_AVX_TO_SSE_256_128(name) \
- Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m128i b0) \
- { \
- m128i a1 = extract128<1>(a0); \
- m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), b0); \
- m128i r1 = _mm_##name(a1, b0); \
- return insert128<1>(_mm256_castsi128_si256(r0), r1); \
- }
- #define Vc_AVX_TO_SSE_1i(name) \
- template <int i> Vc_INTRINSIC Vc_CONST m256i name(__m256i a0) \
- { \
- m128i a1 = extract128<1>(a0); \
- m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), i); \
- m128i r1 = _mm_##name(a1, i); \
- return insert128<1>(_mm256_castsi128_si256(r0), r1); \
- }
- #endif
- // Plain 128-bit shift wrappers (shift count in the low 64 bits of b).
- Vc_INTRINSIC Vc_CONST __m128i sll_epi16(__m128i a, __m128i b) { return _mm_sll_epi16(a, b); }
- Vc_INTRINSIC Vc_CONST __m128i sll_epi32(__m128i a, __m128i b) { return _mm_sll_epi32(a, b); }
- Vc_INTRINSIC Vc_CONST __m128i sll_epi64(__m128i a, __m128i b) { return _mm_sll_epi64(a, b); }
- Vc_INTRINSIC Vc_CONST __m128i srl_epi16(__m128i a, __m128i b) { return _mm_srl_epi16(a, b); }
- Vc_INTRINSIC Vc_CONST __m128i srl_epi32(__m128i a, __m128i b) { return _mm_srl_epi32(a, b); }
- Vc_INTRINSIC Vc_CONST __m128i srl_epi64(__m128i a, __m128i b) { return _mm_srl_epi64(a, b); }
- Vc_INTRINSIC Vc_CONST __m128i sra_epi16(__m128i a, __m128i b) { return _mm_sra_epi16(a, b); }
- Vc_INTRINSIC Vc_CONST __m128i sra_epi32(__m128i a, __m128i b) { return _mm_sra_epi32(a, b); }
- // Instantiate the 256-bit wrappers (native AVX2 or dual-SSE fallback,
- // depending on which macro definitions above are active).
- Vc_AVX_TO_SSE_1i(slli_epi16)
- Vc_AVX_TO_SSE_1i(slli_epi32)
- Vc_AVX_TO_SSE_1i(slli_epi64)
- Vc_AVX_TO_SSE_1i(srai_epi16)
- Vc_AVX_TO_SSE_1i(srai_epi32)
- Vc_AVX_TO_SSE_1i(srli_epi16)
- Vc_AVX_TO_SSE_1i(srli_epi32)
- Vc_AVX_TO_SSE_1i(srli_epi64)
- Vc_AVX_TO_SSE_256_128(sll_epi16)
- Vc_AVX_TO_SSE_256_128(sll_epi32)
- Vc_AVX_TO_SSE_256_128(sll_epi64)
- Vc_AVX_TO_SSE_256_128(srl_epi16)
- Vc_AVX_TO_SSE_256_128(srl_epi32)
- Vc_AVX_TO_SSE_256_128(srl_epi64)
- Vc_AVX_TO_SSE_256_128(sra_epi16)
- Vc_AVX_TO_SSE_256_128(sra_epi32)
- Vc_AVX_TO_SSE_2_NEW(cmpeq_epi8)
- Vc_AVX_TO_SSE_2_NEW(cmpeq_epi16)
- Vc_AVX_TO_SSE_2_NEW(cmpeq_epi32)
- Vc_AVX_TO_SSE_2_NEW(cmpeq_epi64)
- Vc_AVX_TO_SSE_2_NEW(cmpgt_epi8)
- Vc_AVX_TO_SSE_2_NEW(cmpgt_epi16)
- Vc_AVX_TO_SSE_2_NEW(cmpgt_epi32)
- Vc_AVX_TO_SSE_2_NEW(cmpgt_epi64)
- Vc_AVX_TO_SSE_2_NEW(unpackhi_epi16)
- Vc_AVX_TO_SSE_2_NEW(unpacklo_epi16)
- Vc_AVX_TO_SSE_2_NEW(add_epi16)
- Vc_AVX_TO_SSE_2_NEW(add_epi32)
- Vc_AVX_TO_SSE_2_NEW(add_epi64)
- Vc_AVX_TO_SSE_2_NEW(sub_epi16)
- Vc_AVX_TO_SSE_2_NEW(sub_epi32)
- Vc_AVX_TO_SSE_2_NEW(mullo_epi16)
- Vc_AVX_TO_SSE_2_NEW(sign_epi16)
- Vc_AVX_TO_SSE_2_NEW(sign_epi32)
- Vc_AVX_TO_SSE_2_NEW(min_epi8)
- Vc_AVX_TO_SSE_2_NEW(max_epi8)
- Vc_AVX_TO_SSE_2_NEW(min_epu16)
- Vc_AVX_TO_SSE_2_NEW(max_epu16)
- Vc_AVX_TO_SSE_2_NEW(min_epi32)
- Vc_AVX_TO_SSE_2_NEW(max_epi32)
- Vc_AVX_TO_SSE_2_NEW(min_epu32)
- Vc_AVX_TO_SSE_2_NEW(max_epu32)
- Vc_AVX_TO_SSE_2_NEW(mullo_epi32)
- Vc_AVX_TO_SSE_1(abs_epi8)
- Vc_AVX_TO_SSE_1(abs_epi16)
- Vc_AVX_TO_SSE_1(abs_epi32)
- // Widening conversions; the second argument is the byte shift that exposes
- // the source elements of the high half for the SSE fallback.
- Vc_AVX_TO_SSE_1_128(cvtepi8_epi16, 8)
- Vc_AVX_TO_SSE_1_128(cvtepi8_epi32, 4)
- Vc_AVX_TO_SSE_1_128(cvtepi8_epi64, 2)
- Vc_AVX_TO_SSE_1_128(cvtepi16_epi32, 8)
- Vc_AVX_TO_SSE_1_128(cvtepi16_epi64, 4)
- Vc_AVX_TO_SSE_1_128(cvtepi32_epi64, 8)
- Vc_AVX_TO_SSE_1_128(cvtepu8_epi16, 8)
- Vc_AVX_TO_SSE_1_128(cvtepu8_epi32, 4)
- Vc_AVX_TO_SSE_1_128(cvtepu8_epi64, 2)
- Vc_AVX_TO_SSE_1_128(cvtepu16_epi32, 8)
- Vc_AVX_TO_SSE_1_128(cvtepu16_epi64, 4)
- Vc_AVX_TO_SSE_1_128(cvtepu32_epi64, 8)
- // 256-bit integer bit operations, movemask, and blends. AVX1 has no 256-bit
- // integer ALU, so the fallback routes the bit ops through the float domain
- // (bitwise results are identical) and splits movemask/blend into two
- // 128-bit halves.
- #ifndef Vc_IMPL_AVX2
- static Vc_INTRINSIC m256i Vc_CONST and_si256(__m256i x, __m256i y) {
- return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
- }
- static Vc_INTRINSIC m256i Vc_CONST andnot_si256(__m256i x, __m256i y) {
- return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
- }
- static Vc_INTRINSIC m256i Vc_CONST or_si256(__m256i x, __m256i y) {
- return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
- }
- static Vc_INTRINSIC m256i Vc_CONST xor_si256(__m256i x, __m256i y) {
- return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
- }
- // Collect the MSB of each of the 32 bytes: high lane into bits 31..16,
- // low lane into bits 15..0.
- Vc_INTRINSIC Vc_CONST int movemask_epi8(__m256i a0)
- {
- m128i a1 = extract128<1>(a0);
- return (_mm_movemask_epi8(a1) << 16) | _mm_movemask_epi8(_mm256_castsi256_si128(a0));
- }
- // 16-bit blend with a 16-bit compile-time mask m: low byte of m selects in
- // the low lane, high byte in the high lane.
- template <int m> Vc_INTRINSIC Vc_CONST m256i blend_epi16(__m256i a0, __m256i b0)
- {
- m128i a1 = extract128<1>(a0);
- m128i b1 = extract128<1>(b0);
- m128i r0 = _mm_blend_epi16(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0), m & 0xff);
- m128i r1 = _mm_blend_epi16(a1, b1, m >> 8);
- return insert128<1>(_mm256_castsi128_si256(r0), r1);
- }
- Vc_INTRINSIC Vc_CONST m256i blendv_epi8(__m256i a0, __m256i b0, __m256i m0) {
- m128i a1 = extract128<1>(a0);
- m128i b1 = extract128<1>(b0);
- m128i m1 = extract128<1>(m0);
- m128i r0 = _mm_blendv_epi8(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0), _mm256_castsi256_si128(m0));
- m128i r1 = _mm_blendv_epi8(a1, b1, m1);
- return insert128<1>(_mm256_castsi128_si256(r0), r1);
- }
- #else
- static Vc_INTRINSIC Vc_CONST m256i xor_si256(__m256i x, __m256i y) { return _mm256_xor_si256(x, y); }
- static Vc_INTRINSIC Vc_CONST m256i or_si256(__m256i x, __m256i y) { return _mm256_or_si256(x, y); }
- static Vc_INTRINSIC Vc_CONST m256i and_si256(__m256i x, __m256i y) { return _mm256_and_si256(x, y); }
- static Vc_INTRINSIC Vc_CONST m256i andnot_si256(__m256i x, __m256i y) { return _mm256_andnot_si256(x, y); }
- Vc_INTRINSIC Vc_CONST m256i blendv_epi8(__m256i a0, __m256i b0, __m256i m0)
- {
- return _mm256_blendv_epi8(a0, b0, m0);
- }
- Vc_INTRINSIC Vc_CONST int movemask_epi8(__m256i a0)
- {
- return _mm256_movemask_epi8(a0);
- }
- #endif
- // a < b expressed as b > a (only cmpgt exists as an instruction).
- static Vc_INTRINSIC m256i cmplt_epi64(__m256i a, __m256i b) {
- return cmpgt_epi64(b, a);
- }
- static Vc_INTRINSIC m256i cmplt_epi32(__m256i a, __m256i b) {
- return cmpgt_epi32(b, a);
- }
- static Vc_INTRINSIC m256i cmplt_epi16(__m256i a, __m256i b) {
- return cmpgt_epi16(b, a);
- }
- static Vc_INTRINSIC m256i cmplt_epi8(__m256i a, __m256i b) {
- return cmpgt_epi8(b, a);
- }
- // Unsigned 8-bit >: flip the sign bit of both operands so the signed
- // compare yields the unsigned ordering.
- static Vc_INTRINSIC m256i cmpgt_epu8(__m256i a, __m256i b) {
- return cmpgt_epi8(xor_si256(a, setmin_epi8()), xor_si256(b, setmin_epi8()));
- }
- // 256-bit unsigned compares. With XOP, build on the native com*_epu*
- // instructions (lifted to 256 bit by the Vc_AVX_TO_SSE macro); otherwise
- // apply the usual sign-bit flip (xor with setmin) so signed compares give
- // the unsigned ordering. The xor is done in the float domain because AVX1
- // lacks a 256-bit integer xor.
- #if defined(Vc_IMPL_XOP)
- Vc_AVX_TO_SSE_2_NEW(comlt_epu32)
- Vc_AVX_TO_SSE_2_NEW(comgt_epu32)
- Vc_AVX_TO_SSE_2_NEW(comlt_epu16)
- Vc_AVX_TO_SSE_2_NEW(comgt_epu16)
- static Vc_INTRINSIC m256i Vc_CONST cmplt_epu32(__m256i a, __m256i b) { return comlt_epu32(a, b); }
- static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu32(__m256i a, __m256i b) { return comgt_epu32(a, b); }
- static Vc_INTRINSIC m256i Vc_CONST cmplt_epu16(__m256i a, __m256i b) { return comlt_epu16(a, b); }
- static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu16(__m256i a, __m256i b) { return comgt_epu16(a, b); }
- #else
- static Vc_INTRINSIC m256i Vc_CONST cmplt_epu32(__m256i _a, __m256i _b) {
- m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi32())));
- m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi32())));
- return cmplt_epi32(a, b);
- }
- static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu32(__m256i _a, __m256i _b) {
- m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi32())));
- m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi32())));
- return cmpgt_epi32(a, b);
- }
- static Vc_INTRINSIC m256i Vc_CONST cmplt_epu16(__m256i _a, __m256i _b) {
- m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi16())));
- m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi16())));
- return cmplt_epi16(a, b);
- }
- static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu16(__m256i _a, __m256i _b) {
- m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi16())));
- m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi16())));
- return cmpgt_epi16(a, b);
- }
- #endif
- // Type-dispatched masked stores: elements whose mask element has the MSB set
- // are written to mem, others are left untouched.
- static Vc_INTRINSIC void _mm256_maskstore(float *mem, const __m256 mask, const __m256 v) {
- _mm256_maskstore_ps(mem, _mm256_castps_si256(mask), v);
- }
- static Vc_INTRINSIC void _mm256_maskstore(double *mem, const __m256d mask, const __m256d v) {
- _mm256_maskstore_pd(mem, _mm256_castpd_si256(mask), v);
- }
- static Vc_INTRINSIC void _mm256_maskstore(int *mem, const __m256i mask, const __m256i v) {
- #ifdef Vc_IMPL_AVX2
- _mm256_maskstore_epi32(mem, mask, v);
- #else
- // AVX1: reuse the float maskstore; the stored bit pattern is identical.
- _mm256_maskstore_ps(reinterpret_cast<float *>(mem), mask, _mm256_castsi256_ps(v));
- #endif
- }
- static Vc_INTRINSIC void _mm256_maskstore(unsigned int *mem, const __m256i mask, const __m256i v) {
- _mm256_maskstore(reinterpret_cast<int *>(mem), mask, v);
- }
- // No 16-bit maskstore exists; fall back to byte-granular maskmoveu on each
- // 128-bit half. NOTE(review): _mm_maskmoveu_si128 is a non-temporal store,
- // unlike the other overloads -- presumably intentional; confirm callers.
- static Vc_INTRINSIC void _mm256_maskstore(short *mem, const __m256i mask, const __m256i v) {
- using namespace AVX;
- _mm_maskmoveu_si128(_mm256_castsi256_si128(v), _mm256_castsi256_si128(mask), reinterpret_cast<char *>(&mem[0]));
- _mm_maskmoveu_si128(extract128<1>(v), extract128<1>(mask), reinterpret_cast<char *>(&mem[8]));
- }
- static Vc_INTRINSIC void _mm256_maskstore(unsigned short *mem, const __m256i mask, const __m256i v) {
- _mm256_maskstore(reinterpret_cast<short *>(mem), mask, v);
- }
- #undef Vc_AVX_TO_SSE_1
- #undef Vc_AVX_TO_SSE_1_128
- #undef Vc_AVX_TO_SSE_2_NEW
- #undef Vc_AVX_TO_SSE_256_128
- #undef Vc_AVX_TO_SSE_1i
- // Non-temporal (streaming) loads via movntdqa. The const_casts are only
- // needed because _mm_stream_load_si128 takes a non-const pointer; the memory
- // is not written. 256-bit variants load two 128-bit halves and recombine.
- template<typename R> Vc_INTRINSIC_L R stream_load(const float *mem) Vc_INTRINSIC_R;
- template<> Vc_INTRINSIC m128 stream_load<m128>(const float *mem)
- {
- return _mm_castsi128_ps(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<float *>(mem))));
- }
- template<> Vc_INTRINSIC m256 stream_load<m256>(const float *mem)
- {
- return insert128<1>(_mm256_castps128_ps256(stream_load<m128>(mem)),
- stream_load<m128>(mem + 4));
- }
- template<typename R> Vc_INTRINSIC_L R stream_load(const double *mem) Vc_INTRINSIC_R;
- template<> Vc_INTRINSIC m128d stream_load<m128d>(const double *mem)
- {
- return _mm_castsi128_pd(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<double *>(mem))));
- }
- template<> Vc_INTRINSIC m256d stream_load<m256d>(const double *mem)
- {
- return insert128<1>(_mm256_castpd128_pd256(stream_load<m128d>(mem)),
- stream_load<m128d>(mem + 2));
- }
- template<typename R> Vc_INTRINSIC_L R stream_load(const void *mem) Vc_INTRINSIC_R;
- template<> Vc_INTRINSIC m128i stream_load<m128i>(const void *mem)
- {
- return _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<void *>(mem)));
- }
- template<> Vc_INTRINSIC m256i stream_load<m256i>(const void *mem)
- {
- return insert128<1>(_mm256_castsi128_si256(stream_load<m128i>(mem)),
- stream_load<m128i>(static_cast<const __m128i *>(mem) + 1));
- }
- // Masked non-temporal stores built on _mm_maskmoveu_si128 (byte-granular:
- // only bytes whose mask byte has the MSB set are written). 256-bit variants
- // store the two 128-bit halves separately.
- Vc_INTRINSIC void stream_store(float *mem, __m128 value, __m128 mask)
- {
- _mm_maskmoveu_si128(_mm_castps_si128(value), _mm_castps_si128(mask), reinterpret_cast<char *>(mem));
- }
- Vc_INTRINSIC void stream_store(float *mem, __m256 value, __m256 mask)
- {
- stream_store(mem, _mm256_castps256_ps128(value), _mm256_castps256_ps128(mask));
- stream_store(mem + 4, extract128<1>(value), extract128<1>(mask));
- }
- Vc_INTRINSIC void stream_store(double *mem, __m128d value, __m128d mask)
- {
- _mm_maskmoveu_si128(_mm_castpd_si128(value), _mm_castpd_si128(mask), reinterpret_cast<char *>(mem));
- }
- Vc_INTRINSIC void stream_store(double *mem, __m256d value, __m256d mask)
- {
- stream_store(mem, _mm256_castpd256_pd128(value), _mm256_castpd256_pd128(mask));
- stream_store(mem + 2, extract128<1>(value), extract128<1>(mask));
- }
- Vc_INTRINSIC void stream_store(void *mem, __m128i value, __m128i mask)
- {
- _mm_maskmoveu_si128(value, mask, reinterpret_cast<char *>(mem));
- }
- Vc_INTRINSIC void stream_store(void *mem, __m256i value, __m256i mask)
- {
- stream_store(mem, _mm256_castsi256_si128(value), _mm256_castsi256_si128(mask));
- stream_store(static_cast<__m128i *>(mem) + 1, extract128<1>(value), extract128<1>(mask));
- }
- // 32-bit targets lack the _mm_cvtsi64_si128 intrinsic; emulate it with a
- // 64-bit scalar load into the low half of the vector (upper half zeroed).
- #ifndef __x86_64__
- Vc_INTRINSIC Vc_PURE __m128i _mm_cvtsi64_si128(int64_t x) {
- return _mm_castpd_si128(_mm_load_sd(reinterpret_cast<const double *>(&x)));
- }
- #endif
- // AVX2 gathers: load elements from addr[idx[i]] with 32-bit indices; Scale
- // is the byte scale applied to each index (must be 1, 2, 4, or 8). The
- // masked overloads load only lanes whose mask element has the MSB set,
- // keeping src in the others.
- #ifdef Vc_IMPL_AVX2
- template <int Scale> __m256 gather(const float *addr, __m256i idx)
- {
- return _mm256_i32gather_ps(addr, idx, Scale);
- }
- template <int Scale> __m256d gather(const double *addr, __m128i idx)
- {
- return _mm256_i32gather_pd(addr, idx, Scale);
- }
- template <int Scale> __m256i gather(const int *addr, __m256i idx)
- {
- return _mm256_i32gather_epi32(addr, idx, Scale);
- }
- template <int Scale> __m256i gather(const unsigned *addr, __m256i idx)
- {
- // aliasing_cast: the intrinsic only has an int* signature.
- return _mm256_i32gather_epi32(aliasing_cast<int>(addr), idx, Scale);
- }
- template <int Scale> __m256 gather(__m256 src, __m256 k, const float *addr, __m256i idx)
- {
- return _mm256_mask_i32gather_ps(src, addr, idx, k, Scale);
- }
- template <int Scale>
- __m256d gather(__m256d src, __m256d k, const double *addr, __m128i idx)
- {
- return _mm256_mask_i32gather_pd(src, addr, idx, k, Scale);
- }
- template <int Scale> __m256i gather(__m256i src, __m256i k, const int *addr, __m256i idx)
- {
- return _mm256_mask_i32gather_epi32(src, addr, idx, k, Scale);
- }
- template <int Scale>
- __m256i gather(__m256i src, __m256i k, const unsigned *addr, __m256i idx)
- {
- return _mm256_mask_i32gather_epi32(src, aliasing_cast<int>(addr), idx, k, Scale);
- }
- #endif
- }
- }
- // Re-export the intrinsics wrappers into both ISA namespaces, then define
- // the trait mapping scalar element types to their 256-bit vector register
- // type (all integral types -> __m256i, float -> __m256, double -> __m256d).
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace AVX
- {
- using namespace AvxIntrinsics;
- }
- namespace AVX2
- {
- using namespace AvxIntrinsics;
- }
- namespace AVX
- {
- template<typename T> struct VectorTypeHelper;
- template<> struct VectorTypeHelper< char > { typedef __m256i Type; };
- template<> struct VectorTypeHelper< signed char > { typedef __m256i Type; };
- template<> struct VectorTypeHelper<unsigned char > { typedef __m256i Type; };
- template<> struct VectorTypeHelper< short> { typedef __m256i Type; };
- template<> struct VectorTypeHelper<unsigned short> { typedef __m256i Type; };
- template<> struct VectorTypeHelper< int > { typedef __m256i Type; };
- template<> struct VectorTypeHelper<unsigned int > { typedef __m256i Type; };
- template<> struct VectorTypeHelper< long > { typedef __m256i Type; };
- template<> struct VectorTypeHelper<unsigned long > { typedef __m256i Type; };
- template<> struct VectorTypeHelper< long long> { typedef __m256i Type; };
- template<> struct VectorTypeHelper<unsigned long long> { typedef __m256i Type; };
- template<> struct VectorTypeHelper< float> { typedef __m256 Type; };
- template<> struct VectorTypeHelper< double> { typedef __m256d Type; };
- // Pick a 128- or 256-bit register type based on the size of T (a vector
- // type): 16-byte T -> __m128 family, otherwise __m256 family.
- template <typename T>
- using IntegerVectorType =
- typename std::conditional<sizeof(T) == 16, __m128i, __m256i>::type;
- template <typename T>
- using DoubleVectorType =
- typename std::conditional<sizeof(T) == 16, __m128d, __m256d>::type;
- template <typename T>
- using FloatVectorType =
- typename std::conditional<sizeof(T) == 16, __m128, __m256>::type;
- // Specialized elsewhere; primary templates are intentionally empty/declared.
- template<typename T> struct VectorHelper {};
- template<typename T> struct VectorHelperSize;
- }
- }
- #endif
- #endif
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace Detail
- {
- template <typename V> inline V zero();
- }
- namespace Common
- {
- namespace Detail
- {
- // IntrinsicType<ValueType, Size>::type: the native register type that holds
- // Size elements of ValueType. With AVX the total byte size (sizeof * Size)
- // selects between the 128- and 256-bit register; with SSE only 128-bit
- // registers exist; without SIMD support only scalar (Size == 1) is allowed.
- #ifdef Vc_IMPL_AVX
- template <typename ValueType, size_t Size> struct IntrinsicType {
- using type = typename std::conditional<
- std::is_integral<ValueType>::value,
- typename std::conditional<sizeof(ValueType) * Size == 16, __m128i, __m256i>::type,
- typename std::conditional<
- std::is_same<ValueType, double>::value,
- typename std::conditional<sizeof(ValueType) * Size == 16, __m128d,
- __m256d>::type,
- typename std::conditional<sizeof(ValueType) * Size == 16, __m128,
- __m256>::type>::type>::type;
- };
- #elif defined Vc_IMPL_SSE
- template <typename ValueType, size_t Size> struct IntrinsicType {
- using type = typename std::conditional<
- std::is_integral<ValueType>::value, __m128i,
- typename std::conditional<std::is_same<ValueType, double>::value, __m128d,
- __m128>::type>::type;
- };
- #else
- template <typename ValueType, size_t Size> struct IntrinsicType {
- static_assert(Size == 1,
- "IntrinsicType without SIMD target support may only have Size = 1");
- using type = ValueType;
- };
- #endif
- // BuiltinType<ValueType, Size>::type: the GCC/Clang vector-extension type of
- // the same total byte size (16 or 32), used by the VectorBuiltin aliasing
- // strategy. bool maps to unsigned char. Only defined when the compiler
- // supports __attribute__((vector_size)).
- template <typename ValueType, size_t Size, size_t Bytes = sizeof(ValueType) * Size>
- struct BuiltinType;
- #ifdef Vc_USE_BUILTIN_VECTOR_TYPES
- #define Vc_VECBUILTIN __attribute__((__vector_size__(16)))
- template <size_t Size> struct BuiltinType< double , Size, 16> { typedef double type Vc_VECBUILTIN; };
- template <size_t Size> struct BuiltinType< float , Size, 16> { typedef float type Vc_VECBUILTIN; };
- template <size_t Size> struct BuiltinType< long long, Size, 16> { typedef long long type Vc_VECBUILTIN; };
- template <size_t Size> struct BuiltinType<unsigned long long, Size, 16> { typedef unsigned long long type Vc_VECBUILTIN; };
- template <size_t Size> struct BuiltinType< long , Size, 16> { typedef long type Vc_VECBUILTIN; };
- template <size_t Size> struct BuiltinType<unsigned long , Size, 16> { typedef unsigned long type Vc_VECBUILTIN; };
- template <size_t Size> struct BuiltinType< int , Size, 16> { typedef int type Vc_VECBUILTIN; };
- template <size_t Size> struct BuiltinType<unsigned int , Size, 16> { typedef unsigned int type Vc_VECBUILTIN; };
- template <size_t Size> struct BuiltinType< short , Size, 16> { typedef short type Vc_VECBUILTIN; };
- template <size_t Size> struct BuiltinType<unsigned short , Size, 16> { typedef unsigned short type Vc_VECBUILTIN; };
- template <size_t Size> struct BuiltinType< char , Size, 16> { typedef char type Vc_VECBUILTIN; };
- template <size_t Size> struct BuiltinType<unsigned char , Size, 16> { typedef unsigned char type Vc_VECBUILTIN; };
- template <size_t Size> struct BuiltinType< signed char , Size, 16> { typedef signed char type Vc_VECBUILTIN; };
- template <size_t Size> struct BuiltinType< bool , Size, 16> { typedef unsigned char type Vc_VECBUILTIN; };
- #undef Vc_VECBUILTIN
- #define Vc_VECBUILTIN __attribute__((__vector_size__(32)))
- template <size_t Size> struct BuiltinType< double , Size, 32> { typedef double type Vc_VECBUILTIN; };
- template <size_t Size> struct BuiltinType< float , Size, 32> { typedef float type Vc_VECBUILTIN; };
- template <size_t Size> struct BuiltinType< long long, Size, 32> { typedef long long type Vc_VECBUILTIN; };
- template <size_t Size> struct BuiltinType<unsigned long long, Size, 32> { typedef unsigned long long type Vc_VECBUILTIN; };
- template <size_t Size> struct BuiltinType< long , Size, 32> { typedef long type Vc_VECBUILTIN; };
- template <size_t Size> struct BuiltinType<unsigned long , Size, 32> { typedef unsigned long type Vc_VECBUILTIN; };
- template <size_t Size> struct BuiltinType< int , Size, 32> { typedef int type Vc_VECBUILTIN; };
- template <size_t Size> struct BuiltinType<unsigned int , Size, 32> { typedef unsigned int type Vc_VECBUILTIN; };
- template <size_t Size> struct BuiltinType< short , Size, 32> { typedef short type Vc_VECBUILTIN; };
- template <size_t Size> struct BuiltinType<unsigned short , Size, 32> { typedef unsigned short type Vc_VECBUILTIN; };
- template <size_t Size> struct BuiltinType< char , Size, 32> { typedef char type Vc_VECBUILTIN; };
- template <size_t Size> struct BuiltinType<unsigned char , Size, 32> { typedef unsigned char type Vc_VECBUILTIN; };
- template <size_t Size> struct BuiltinType< signed char , Size, 32> { typedef signed char type Vc_VECBUILTIN; };
- template <size_t Size> struct BuiltinType< bool , Size, 32> { typedef unsigned char type Vc_VECBUILTIN; };
- #undef Vc_VECBUILTIN
- #endif
- }
- // Convenience aliases for the traits above.
- template <typename ValueType, size_t Size>
- using IntrinsicType = typename Detail::IntrinsicType<ValueType, Size>::type;
- template <typename ValueType, size_t Size>
- using BuiltinType = typename Detail::BuiltinType<ValueType, Size>::type;
- // Tag types selecting how Storage gains element access to a SIMD register
- // without violating strict aliasing:
- //   Union         - read/modify through a union of register and element array
- //   MayAlias      - aliasing_cast (may_alias-qualified pointer access)
- //   VectorBuiltin - store a GCC/Clang builtin vector, index it directly
- //   UnionMembers  - use MSVC's named union members of __m128/__m256 types
- namespace AliasStrategy
- {
- struct Union {};
- struct MayAlias {};
- struct VectorBuiltin {};
- struct UnionMembers {};
- }
- // Per-compiler default, chosen by what each toolchain supports/guarantees.
- using DefaultStrategy =
- #if defined Vc_USE_BUILTIN_VECTOR_TYPES
- AliasStrategy::VectorBuiltin;
- #elif defined Vc_MSVC
- AliasStrategy::UnionMembers;
- #elif defined Vc_ICC
- AliasStrategy::Union;
- #elif defined __GNUC__
- AliasStrategy::MayAlias;
- #else
- AliasStrategy::Union;
- #endif
- template <typename ValueType, size_t Size, typename Strategy = DefaultStrategy>
- class Storage;
- // Storage using the Union strategy: element access goes through a temporary
- // union of the register type and an element array, so no pointer aliasing
- // is involved (each m()/set() copies the whole register).
- template <typename ValueType, size_t Size>
- class Storage<ValueType, Size, AliasStrategy::Union>
- {
- static_assert(std::is_fundamental<ValueType>::value &&
- std::is_arithmetic<ValueType>::value,
- "Only works for fundamental arithmetic types.");
- public:
- using VectorType = IntrinsicType<ValueType, Size>;
- using EntryType = ValueType;
- union Alias {
- Vc_INTRINSIC Alias(VectorType vv) : v(vv) {}
- VectorType v;
- EntryType m[Size];
- };
- Vc_INTRINSIC Storage() : data(Vc::Detail::zero<VectorType>()) {}
- Vc_INTRINSIC Storage(const VectorType &x) : data(x) { assertCorrectAlignment(&data); }
- // Bit-reinterpreting construction from any equally-sized type.
- template <typename U>
- Vc_INTRINSIC explicit Storage(const U &x,
- enable_if<sizeof(U) == sizeof(VectorType)> = nullarg)
- : data(reinterpret_cast<VectorType>(x))
- {
- assertCorrectAlignment(&data);
- }
- Vc_INTRINSIC Storage(const Storage &) = default;
- Vc_INTRINSIC Storage &operator=(const Storage &) = default;
- Vc_INTRINSIC operator const VectorType &() const { return data; }
- Vc_INTRINSIC Vc_PURE VectorType &v() { return data; }
- Vc_INTRINSIC Vc_PURE const VectorType &v() const { return data; }
- // Read element i via a union copy of the register.
- Vc_INTRINSIC Vc_PURE EntryType m(size_t i) const { return Alias(data).m[i]; }
- // Write element i: copy out, modify, copy back.
- Vc_INTRINSIC void set(size_t i, EntryType x)
- {
- Alias a(data);
- a.m[i] = x;
- data = a.v;
- }
- private:
- VectorType data;
- };
- // Storage using the MayAlias strategy: element access through
- // aliasing_cast, i.e. a may_alias-qualified pointer into the register
- // object (GCC/Clang extension that makes the access well-defined).
- template <typename ValueType, size_t Size>
- class Storage<ValueType, Size, AliasStrategy::MayAlias>
- {
- static_assert(std::is_fundamental<ValueType>::value &&
- std::is_arithmetic<ValueType>::value,
- "Only works for fundamental arithmetic types.");
- public:
- using VectorType = IntrinsicType<ValueType, Size>;
- using EntryType = ValueType;
- Vc_INTRINSIC Storage() : data() { assertCorrectAlignment(&data); }
- Vc_INTRINSIC Storage(const VectorType &x) : data(x)
- {
- assertCorrectAlignment(&data);
- }
- // Bit-reinterpreting construction from any equally-sized type.
- template <typename U>
- Vc_INTRINSIC explicit Storage(const U &x,
- enable_if<sizeof(U) == sizeof(VectorType)> = nullarg)
- : data(reinterpret_cast<const VectorType &>(x))
- {
- assertCorrectAlignment(&data);
- }
- Vc_INTRINSIC Storage &operator=(const VectorType &x)
- {
- data = x;
- return *this;
- }
- Vc_INTRINSIC Storage(const Storage &) = default;
- Vc_INTRINSIC Storage &operator=(const Storage &) = default;
- Vc_INTRINSIC operator const VectorType &() const { return v(); }
- Vc_INTRINSIC Vc_PURE VectorType &v() { return data; }
- Vc_INTRINSIC Vc_PURE const VectorType &v() const { return data; }
- Vc_INTRINSIC Vc_PURE EntryType m(size_t i) const
- {
- return aliasing_cast<EntryType>(&data)[i];
- }
- Vc_INTRINSIC void set(size_t i, EntryType x)
- {
- aliasing_cast<EntryType>(&data)[i] = x;
- }
- private:
- VectorType data;
- };
- // Storage using the VectorBuiltin strategy: the data member is a GCC/Clang
- // builtin vector (see BuiltinType), which supports direct operator[] access;
- // v() reinterprets it as the matching intrinsic register type on demand.
- template <typename ValueType, size_t Size>
- class Storage<ValueType, Size, AliasStrategy::VectorBuiltin>
- {
- static_assert(std::is_fundamental<ValueType>::value &&
- std::is_arithmetic<ValueType>::value,
- "Only works for fundamental arithmetic types.");
- using Builtin = BuiltinType<ValueType, Size>;
- public:
- using VectorType =
- #ifdef Vc_TEMPLATES_DROP_ATTRIBUTES
- MayAlias<IntrinsicType<ValueType, Size>>;
- #else
- IntrinsicType<ValueType, Size>;
- #endif
- using EntryType = ValueType;
- Vc_INTRINSIC Storage() : data() { assertCorrectAlignment(&data); }
- Vc_INTRINSIC Storage(const Storage &) = default;
- Vc_INTRINSIC Storage &operator=(const Storage &) = default;
- Vc_INTRINSIC Storage(const VectorType &x)
- : data(aliasing_cast<Builtin>(x))
- {
- assertCorrectAlignment(&data);
- }
- // Bit-reinterpreting construction from any equally-sized type.
- template <typename U>
- Vc_INTRINSIC explicit Storage(const U &x,
- enable_if<sizeof(U) == sizeof(VectorType)> = nullarg)
- : data(aliasing_cast<Builtin>(x))
- {
- assertCorrectAlignment(&data);
- }
- Vc_INTRINSIC Storage &operator=(const VectorType &x)
- {
- data = aliasing_cast<Builtin>(x);
- return *this;
- }
- Vc_INTRINSIC operator const VectorType &() const { return v(); }
- Vc_INTRINSIC Vc_PURE VectorType &v() { return reinterpret_cast<VectorType &>(data); }
- Vc_INTRINSIC Vc_PURE const VectorType &v() const { return reinterpret_cast<const VectorType &>(data); }
- // Builtin vectors support direct subscripting -- no aliasing tricks needed.
- Vc_INTRINSIC Vc_PURE EntryType m(size_t i) const { return data[i]; }
- Vc_INTRINSIC void set(size_t i, EntryType x) { data[i] = x; }
- Vc_INTRINSIC Builtin &builtin() { return data; }
- Vc_INTRINSIC const Builtin &builtin() const { return data; }
- private:
- Builtin data;
- };
- // Storage specialization for compilers (MSVC) whose vector types are
- // unions with named per-element members. Element access is implemented
- // out of line via explicit specializations of m() and ref() below.
- template <typename ValueType, size_t Size>
- class Storage<ValueType, Size, AliasStrategy::UnionMembers>
- {
- static_assert(std::is_fundamental<ValueType>::value &&
- std::is_arithmetic<ValueType>::value,
- "Only works for fundamental arithmetic types.");
- public:
- using VectorType = IntrinsicType<ValueType, Size>;
- using EntryType = ValueType;
- Vc_INTRINSIC Storage() : data() { assertCorrectAlignment(&data); }
- Vc_INTRINSIC Storage(const VectorType &x) : data(x)
- {
- assertCorrectAlignment(&data);
- }
- // Bit-level conversion from any equally sized type U.
- template <typename U>
- Vc_INTRINSIC explicit Storage(const U &x,
- enable_if<sizeof(U) == sizeof(VectorType)> = nullarg)
- : data(reinterpret_cast<const VectorType &>(x))
- {
- assertCorrectAlignment(&data);
- }
- Vc_INTRINSIC Storage &operator=(const VectorType &x)
- {
- data = x;
- return *this;
- }
- Vc_INTRINSIC Storage(const Storage &) = default;
- Vc_INTRINSIC Storage &operator=(const Storage &) = default;
- Vc_INTRINSIC Vc_PURE VectorType &v() { return data; }
- Vc_INTRINSIC Vc_PURE const VectorType &v() const { return data; }
- // Element read; specialized per element type using the union members.
- Vc_INTRINSIC_L Vc_PURE_L EntryType m(size_t i) const Vc_INTRINSIC_R Vc_PURE_R;
- Vc_INTRINSIC void set(size_t i, EntryType x) { ref(i) = x; }
- private:
- // Writable reference to element i; specialized per element type.
- Vc_INTRINSIC_L Vc_PURE_L EntryType &ref(size_t i) Vc_INTRINSIC_R Vc_PURE_R;
- VectorType data;
- };
- // MSVC-only out-of-line definitions of Storage::m()/ref() for the
- // UnionMembers strategy, mapping each (element type, width) pair to the
- // corresponding named union member of __m128*/__m256*.
- #ifdef Vc_MSVC
- template <> Vc_INTRINSIC Vc_PURE double Storage< double, 2, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128d_f64[i]; }
- template <> Vc_INTRINSIC Vc_PURE float Storage< float , 4, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128_f32[i]; }
- template <> Vc_INTRINSIC Vc_PURE signed int Storage< signed int , 4, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128i_i32[i]; }
- template <> Vc_INTRINSIC Vc_PURE signed short Storage< signed short , 8, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128i_i16[i]; }
- template <> Vc_INTRINSIC Vc_PURE signed char Storage< signed char ,16, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128i_i8[i]; }
- template <> Vc_INTRINSIC Vc_PURE unsigned int Storage<unsigned int , 4, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128i_u32[i]; }
- template <> Vc_INTRINSIC Vc_PURE unsigned short Storage<unsigned short , 8, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128i_u16[i]; }
- template <> Vc_INTRINSIC Vc_PURE unsigned char Storage<unsigned char ,16, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128i_u8[i]; }
- template <> Vc_INTRINSIC Vc_PURE double &Storage< double, 2, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128d_f64[i]; }
- template <> Vc_INTRINSIC Vc_PURE float &Storage< float , 4, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128_f32[i]; }
- template <> Vc_INTRINSIC Vc_PURE signed int &Storage< signed int , 4, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128i_i32[i]; }
- template <> Vc_INTRINSIC Vc_PURE signed short &Storage< signed short , 8, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128i_i16[i]; }
- // NOTE(review): m128i_i8 is declared as char on MSVC, hence the
- // reinterpret_cast to signed char here (and for m256i_i8 below).
- template <> Vc_INTRINSIC Vc_PURE signed char &Storage< signed char ,16, AliasStrategy::UnionMembers>::ref(size_t i) { return reinterpret_cast<signed char &>(data.m128i_i8[i]); }
- template <> Vc_INTRINSIC Vc_PURE unsigned int &Storage<unsigned int , 4, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128i_u32[i]; }
- template <> Vc_INTRINSIC Vc_PURE unsigned short &Storage<unsigned short , 8, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128i_u16[i]; }
- template <> Vc_INTRINSIC Vc_PURE unsigned char &Storage<unsigned char ,16, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128i_u8[i]; }
- #ifdef Vc_IMPL_AVX
- // 256-bit (AVX) variants of the same mapping.
- template <> Vc_INTRINSIC Vc_PURE double Storage< double, 4, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256d_f64[i]; }
- template <> Vc_INTRINSIC Vc_PURE float Storage< float , 8, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256_f32[i]; }
- template <> Vc_INTRINSIC Vc_PURE signed int Storage< signed int , 8, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256i_i32[i]; }
- template <> Vc_INTRINSIC Vc_PURE signed short Storage< signed short ,16, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256i_i16[i]; }
- template <> Vc_INTRINSIC Vc_PURE signed char Storage< signed char ,32, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256i_i8[i]; }
- template <> Vc_INTRINSIC Vc_PURE unsigned int Storage<unsigned int , 8, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256i_u32[i]; }
- template <> Vc_INTRINSIC Vc_PURE unsigned short Storage<unsigned short ,16, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256i_u16[i]; }
- template <> Vc_INTRINSIC Vc_PURE unsigned char Storage<unsigned char ,32, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256i_u8[i]; }
- template <> Vc_INTRINSIC Vc_PURE double &Storage< double, 4, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256d_f64[i]; }
- template <> Vc_INTRINSIC Vc_PURE float &Storage< float , 8, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256_f32[i]; }
- template <> Vc_INTRINSIC Vc_PURE signed int &Storage< signed int , 8, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256i_i32[i]; }
- template <> Vc_INTRINSIC Vc_PURE signed short &Storage< signed short ,16, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256i_i16[i]; }
- template <> Vc_INTRINSIC Vc_PURE signed char &Storage< signed char ,32, AliasStrategy::UnionMembers>::ref(size_t i) { return reinterpret_cast<signed char &>(data.m256i_i8[i]); }
- template <> Vc_INTRINSIC Vc_PURE unsigned int &Storage<unsigned int , 8, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256i_u32[i]; }
- template <> Vc_INTRINSIC Vc_PURE unsigned short &Storage<unsigned short ,16, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256i_u16[i]; }
- template <> Vc_INTRINSIC Vc_PURE unsigned char &Storage<unsigned char ,32, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256i_u8[i]; }
- #endif
- #endif
- // Convenience alias: derive the Storage width from the intrinsic vector
- // type so callers only name (VectorType, EntryType).
- template <typename VectorType, typename EntryType>
- using VectorMemoryUnion = Storage<EntryType, sizeof(VectorType) / sizeof(EntryType)>;
- }
- }
- #endif
- #ifndef VC_SSE_CONST_DATA_H_
- #define VC_SSE_CONST_DATA_H_
- #ifndef VC_SSE_MACROS_H_
- #define VC_SSE_MACROS_H_
- // Use the SSE4.1 PTEST instruction for mask reductions unless disabled.
- #if defined(Vc_IMPL_SSE4_1) && !defined(Vc_DISABLE_PTEST)
- #define Vc_USE_PTEST
- #endif
- #endif
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace SSE
- {
- // 16-byte aligned index ramps 0,1,2,... for gather/iota-style inits.
- alignas(16) extern const unsigned int _IndexesFromZero4[4];
- alignas(16) extern const unsigned short _IndexesFromZero8[8];
- alignas(16) extern const unsigned char _IndexesFromZero16[16];
- // Bit-pattern constants (defined in a .cpp) used by the intrinsic
- // wrappers below to synthesize masks and literal vectors via loads.
- struct c_general
- {
- alignas(64) static const int absMaskFloat[4];
- alignas(16) static const unsigned int signMaskFloat[4];
- alignas(16) static const unsigned int highMaskFloat[4];
- alignas(16) static const short minShort[8];
- alignas(16) static const unsigned short one16[8];
- alignas(16) static const unsigned int one32[4];
- alignas(16) static const float oneFloat[4];
- alignas(16) static const unsigned long long highMaskDouble[2];
- alignas(16) static const double oneDouble[2];
- alignas(16) static const long long absMaskDouble[2];
- alignas(16) static const unsigned long long signMaskDouble[2];
- alignas(16) static const unsigned long long frexpMask[2];
- };
- // Coefficient tables for the trigonometric implementations.
- template<typename T> struct c_trig
- {
- alignas(64) static const T data[];
- };
- // MSVC rejects these explicit-specialization declarations; elsewhere
- // they pin down the instantiations defined in the library.
- #ifndef Vc_MSVC
- template <> alignas(64) const float c_trig<float>::data[];
- template <> alignas(64) const double c_trig<double>::data[];
- #endif
- // Constant table for log(); d(i) returns the i-th vector-sized row.
- template<typename T> struct c_log
- {
- enum VectorSize { Size = 16 / sizeof(T) };
- static Vc_ALWAYS_INLINE Vc_CONST const float *d(int i) { return reinterpret_cast<const float *>(&data[i * Size]); }
- alignas(64) static const unsigned int data[21 * Size];
- };
- #ifndef Vc_MSVC
- template<> alignas(64) const unsigned int c_log<float>::data[21 * 4];
- #endif
- template<> struct c_log<double>
- {
- enum VectorSize { Size = 16 / sizeof(double) };
- static Vc_ALWAYS_INLINE Vc_CONST const double *d(int i) { return reinterpret_cast<const double *>(&data[i * Size]); }
- alignas(64) static const unsigned long long data[21 * Size];
- };
- }
- }
- #endif
- #include <cstdlib>
- #if defined(Vc_GCC) && !defined(__OPTIMIZE__)
- #pragma GCC diagnostic push
- #pragma GCC diagnostic ignored "-Wold-style-cast"
- #endif
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace SseIntrinsics
- {
- using SSE::c_general;
- constexpr std::size_t VectorAlignment = 16;
- // Work around miscompiled SSE shift intrinsics in GCC < 4.6 by emitting
- // the instructions directly via inline asm.
- #if defined(Vc_GCC) && Vc_GCC < 0x40600 && !defined(Vc_DONT_FIX_SSE_SHIFT)
- static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi16(__m128i a, __m128i count) { __asm__("psllw %1,%0" : "+x"(a) : "x"(count)); return a; }
- static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi32(__m128i a, __m128i count) { __asm__("pslld %1,%0" : "+x"(a) : "x"(count)); return a; }
- static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi64(__m128i a, __m128i count) { __asm__("psllq %1,%0" : "+x"(a) : "x"(count)); return a; }
- static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi16(__m128i a, __m128i count) { __asm__("psrlw %1,%0" : "+x"(a) : "x"(count)); return a; }
- static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi32(__m128i a, __m128i count) { __asm__("psrld %1,%0" : "+x"(a) : "x"(count)); return a; }
- static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi64(__m128i a, __m128i count) { __asm__("psrlq %1,%0" : "+x"(a) : "x"(count)); return a; }
- #endif
- // On GCC, express float arithmetic through the builtin vector operators
- // so the optimizer can fold/combine these operations.
- #ifdef Vc_GCC
- static Vc_INTRINSIC Vc_CONST __m128d _mm_mul_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) * static_cast<__v2df>(b)); }
- static Vc_INTRINSIC Vc_CONST __m128d _mm_add_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) + static_cast<__v2df>(b)); }
- static Vc_INTRINSIC Vc_CONST __m128d _mm_sub_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) - static_cast<__v2df>(b)); }
- static Vc_INTRINSIC Vc_CONST __m128 _mm_mul_ps(__m128 a, __m128 b) { return static_cast<__m128 >(static_cast<__v4sf>(a) * static_cast<__v4sf>(b)); }
- static Vc_INTRINSIC Vc_CONST __m128 _mm_add_ps(__m128 a, __m128 b) { return static_cast<__m128 >(static_cast<__v4sf>(a) + static_cast<__v4sf>(b)); }
- static Vc_INTRINSIC Vc_CONST __m128 _mm_sub_ps(__m128 a, __m128 b) { return static_cast<__m128 >(static_cast<__v4sf>(a) - static_cast<__v4sf>(b)); }
- #endif
- // Constant generators: load well-known bit patterns from the tables in
- // SSE::c_general (all-ones, 1, sign/abs masks, INT_MIN, ...).
- static Vc_INTRINSIC Vc_CONST __m128i _mm_setallone_si128() { return _mm_load_si128(reinterpret_cast<const __m128i *>(Common::AllBitsSet)); }
- static Vc_INTRINSIC Vc_CONST __m128d _mm_setallone_pd() { return _mm_load_pd(reinterpret_cast<const double *>(Common::AllBitsSet)); }
- static Vc_INTRINSIC Vc_CONST __m128 _mm_setallone_ps() { return _mm_load_ps(reinterpret_cast<const float *>(Common::AllBitsSet)); }
- static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi16() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::one16)); }
- static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu16() { return _mm_setone_epi16(); }
- static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi32() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::one32)); }
- static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu32() { return _mm_setone_epi32(); }
- static Vc_INTRINSIC __m128 Vc_CONST _mm_setone_ps() { return _mm_load_ps(c_general::oneFloat); }
- static Vc_INTRINSIC __m128d Vc_CONST _mm_setone_pd() { return _mm_load_pd(c_general::oneDouble); }
- static Vc_INTRINSIC __m128d Vc_CONST _mm_setabsmask_pd() { return _mm_load_pd(reinterpret_cast<const double *>(c_general::absMaskDouble)); }
- static Vc_INTRINSIC __m128 Vc_CONST _mm_setabsmask_ps() { return _mm_load_ps(reinterpret_cast<const float *>(c_general::absMaskFloat)); }
- static Vc_INTRINSIC __m128d Vc_CONST _mm_setsignmask_pd(){ return _mm_load_pd(reinterpret_cast<const double *>(c_general::signMaskDouble)); }
- static Vc_INTRINSIC __m128 Vc_CONST _mm_setsignmask_ps(){ return _mm_load_ps(reinterpret_cast<const float *>(c_general::signMaskFloat)); }
- static Vc_INTRINSIC __m128i Vc_CONST setmin_epi8 () { return _mm_set1_epi8(-0x80); }
- static Vc_INTRINSIC __m128i Vc_CONST setmin_epi16() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::minShort)); }
- static Vc_INTRINSIC __m128i Vc_CONST setmin_epi32() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::signMaskFloat)); }
- // Unsigned integer compares: native with XOP; otherwise emulated by
- // flipping the sign bit of both operands and using the signed compare.
- #if defined(Vc_IMPL_XOP)
- static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu8(__m128i a, __m128i b) { return _mm_comgt_epu8(a, b); }
- static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu16(__m128i a, __m128i b) { return _mm_comlt_epu16(a, b); }
- static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu16(__m128i a, __m128i b) { return _mm_comgt_epu16(a, b); }
- static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu32(__m128i a, __m128i b) { return _mm_comlt_epu32(a, b); }
- static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu32(__m128i a, __m128i b) { return _mm_comgt_epu32(a, b); }
- static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu64(__m128i a, __m128i b) { return _mm_comlt_epu64(a, b); }
- #else
- static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu8(__m128i a, __m128i b)
- {
- return _mm_cmpgt_epi8(_mm_xor_si128(a, setmin_epi8()),
- _mm_xor_si128(b, setmin_epi8()));
- }
- static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu16(__m128i a, __m128i b)
- {
- return _mm_cmplt_epi16(_mm_xor_si128(a, setmin_epi16()),
- _mm_xor_si128(b, setmin_epi16()));
- }
- static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu16(__m128i a, __m128i b)
- {
- return _mm_cmpgt_epi16(_mm_xor_si128(a, setmin_epi16()),
- _mm_xor_si128(b, setmin_epi16()));
- }
- static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu32(__m128i a, __m128i b)
- {
- return _mm_cmplt_epi32(_mm_xor_si128(a, setmin_epi32()),
- _mm_xor_si128(b, setmin_epi32()));
- }
- static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu32(__m128i a, __m128i b)
- {
- return _mm_cmpgt_epi32(_mm_xor_si128(a, setmin_epi32()),
- _mm_xor_si128(b, setmin_epi32()));
- }
- // 64-bit signed greater-than: native with SSE4.2, otherwise composed
- // from 32-bit compares on biased halves (high-word >, or high-word ==
- // combined with unsigned-low-word >).
- Vc_INTRINSIC __m128i Vc_CONST cmpgt_epi64(__m128i a, __m128i b)
- {
- #ifdef Vc_IMPL_SSE4_2
- return _mm_cmpgt_epi64(a, b);
- #else
- const auto aa = _mm_xor_si128(a, _mm_srli_epi64(setmin_epi32(),32));
- const auto bb = _mm_xor_si128(b, _mm_srli_epi64(setmin_epi32(),32));
- const auto gt = _mm_cmpgt_epi32(aa, bb);
- const auto eq = _mm_cmpeq_epi32(aa, bb);
- const auto gt2 =
- _mm_shuffle_epi32(gt, 0xf5);
- const auto lo =
- _mm_shuffle_epi32(_mm_and_si128(_mm_srli_epi64(eq, 32), gt), 0xa0);
- return _mm_or_si128(gt2, lo);
- #endif
- }
- #endif
- }
- }
- // abs/alignr wrappers: native SSSE3 intrinsics when available, otherwise
- // SSE2 emulations with identical semantics.
- #ifdef Vc_IMPL_SSSE3
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace SseIntrinsics
- {
- Vc_INTRINSIC Vc_CONST __m128i abs_epi8(__m128i a) { return _mm_abs_epi8(a); }
- Vc_INTRINSIC Vc_CONST __m128i abs_epi16(__m128i a) { return _mm_abs_epi16(a); }
- Vc_INTRINSIC Vc_CONST __m128i abs_epi32(__m128i a) { return _mm_abs_epi32(a); }
- template <int s> Vc_INTRINSIC Vc_CONST __m128i alignr_epi8(__m128i a, __m128i b)
- {
- return _mm_alignr_epi8(a, b, s & 0x1fu);
- }
- }
- }
- #else
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace SseIntrinsics
- {
- // abs(x) = (x ^ m) + (m & 1) where m = (x < 0 ? -1 : 0), i.e. two's
- // complement negation applied only to negative lanes.
- Vc_INTRINSIC Vc_CONST __m128i abs_epi8 (__m128i a) {
- __m128i negative = _mm_cmplt_epi8 (a, _mm_setzero_si128());
- return _mm_add_epi8 (_mm_xor_si128(a, negative), _mm_and_si128(negative, _mm_set1_epi8(1)));
- }
- Vc_INTRINSIC Vc_CONST __m128i abs_epi16(__m128i a) {
- __m128i negative = _mm_cmplt_epi16(a, _mm_setzero_si128());
- return _mm_add_epi16(_mm_xor_si128(a, negative), _mm_srli_epi16(negative, 15));
- }
- Vc_INTRINSIC Vc_CONST __m128i abs_epi32(__m128i a) {
- __m128i negative = _mm_cmplt_epi32(a, _mm_setzero_si128());
- return _mm_add_epi32(_mm_xor_si128(a, negative), _mm_srli_epi32(negative, 31));
- }
- // palignr emulation: concatenate a:b and extract 16 bytes starting at
- // byte offset s (mod 32). The switch enumerates every possible shift;
- // the trailing return is unreachable but keeps the compiler satisfied.
- template <int s> Vc_INTRINSIC Vc_CONST __m128i alignr_epi8(__m128i a, __m128i b)
- {
- switch (s & 0x1fu) {
- case 0: return b;
- case 1: return _mm_or_si128(_mm_slli_si128(a, 15), _mm_srli_si128(b, 1));
- case 2: return _mm_or_si128(_mm_slli_si128(a, 14), _mm_srli_si128(b, 2));
- case 3: return _mm_or_si128(_mm_slli_si128(a, 13), _mm_srli_si128(b, 3));
- case 4: return _mm_or_si128(_mm_slli_si128(a, 12), _mm_srli_si128(b, 4));
- case 5: return _mm_or_si128(_mm_slli_si128(a, 11), _mm_srli_si128(b, 5));
- case 6: return _mm_or_si128(_mm_slli_si128(a, 10), _mm_srli_si128(b, 6));
- case 7: return _mm_or_si128(_mm_slli_si128(a, 9), _mm_srli_si128(b, 7));
- case 8: return _mm_or_si128(_mm_slli_si128(a, 8), _mm_srli_si128(b, 8));
- case 9: return _mm_or_si128(_mm_slli_si128(a, 7), _mm_srli_si128(b, 9));
- case 10: return _mm_or_si128(_mm_slli_si128(a, 6), _mm_srli_si128(b, 10));
- case 11: return _mm_or_si128(_mm_slli_si128(a, 5), _mm_srli_si128(b, 11));
- case 12: return _mm_or_si128(_mm_slli_si128(a, 4), _mm_srli_si128(b, 12));
- case 13: return _mm_or_si128(_mm_slli_si128(a, 3), _mm_srli_si128(b, 13));
- case 14: return _mm_or_si128(_mm_slli_si128(a, 2), _mm_srli_si128(b, 14));
- case 15: return _mm_or_si128(_mm_slli_si128(a, 1), _mm_srli_si128(b, 15));
- case 16: return a;
- case 17: return _mm_srli_si128(a, 1);
- case 18: return _mm_srli_si128(a, 2);
- case 19: return _mm_srli_si128(a, 3);
- case 20: return _mm_srli_si128(a, 4);
- case 21: return _mm_srli_si128(a, 5);
- case 22: return _mm_srli_si128(a, 6);
- case 23: return _mm_srli_si128(a, 7);
- case 24: return _mm_srli_si128(a, 8);
- case 25: return _mm_srli_si128(a, 9);
- case 26: return _mm_srli_si128(a, 10);
- case 27: return _mm_srli_si128(a, 11);
- case 28: return _mm_srli_si128(a, 12);
- case 29: return _mm_srli_si128(a, 13);
- case 30: return _mm_srli_si128(a, 14);
- case 31: return _mm_srli_si128(a, 15);
- }
- return _mm_setzero_si128();
- }
- }
- }
- #endif
- // SSE4.1 available: every wrapper forwards directly to the native
- // intrinsic; the #else branch below provides SSE2 emulations with the
- // same names and semantics.
- #ifdef Vc_IMPL_SSE4_1
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace SseIntrinsics
- {
- Vc_INTRINSIC Vc_CONST __m128i cmpeq_epi64(__m128i a, __m128i b)
- {
- return _mm_cmpeq_epi64(a, b);
- }
- template <int index> Vc_INTRINSIC Vc_CONST int extract_epi32(__m128i v)
- {
- return _mm_extract_epi32(v, index);
- }
- Vc_INTRINSIC Vc_CONST __m128d blendv_pd(__m128d a, __m128d b, __m128d c)
- {
- return _mm_blendv_pd(a, b, c);
- }
- Vc_INTRINSIC Vc_CONST __m128 blendv_ps(__m128 a, __m128 b, __m128 c)
- {
- return _mm_blendv_ps(a, b, c);
- }
- Vc_INTRINSIC Vc_CONST __m128i blendv_epi8(__m128i a, __m128i b, __m128i c)
- {
- return _mm_blendv_epi8(a, b, c);
- }
- template <int mask> Vc_INTRINSIC Vc_CONST __m128d blend_pd(__m128d a, __m128d b)
- {
- return _mm_blend_pd(a, b, mask);
- }
- template <int mask> Vc_INTRINSIC Vc_CONST __m128 blend_ps(__m128 a, __m128 b)
- {
- return _mm_blend_ps(a, b, mask);
- }
- template <int mask> Vc_INTRINSIC Vc_CONST __m128i blend_epi16(__m128i a, __m128i b)
- {
- return _mm_blend_epi16(a, b, mask);
- }
- Vc_INTRINSIC Vc_CONST __m128i max_epi8(__m128i a, __m128i b)
- {
- return _mm_max_epi8(a, b);
- }
- Vc_INTRINSIC Vc_CONST __m128i max_epi32(__m128i a, __m128i b)
- {
- return _mm_max_epi32(a, b);
- }
- Vc_INTRINSIC Vc_CONST __m128i max_epu16(__m128i a, __m128i b)
- {
- return _mm_max_epu16(a, b);
- }
- Vc_INTRINSIC Vc_CONST __m128i max_epu32(__m128i a, __m128i b)
- {
- return _mm_max_epu32(a, b);
- }
- Vc_INTRINSIC Vc_CONST __m128i min_epu16(__m128i a, __m128i b)
- {
- return _mm_min_epu16(a, b);
- }
- Vc_INTRINSIC Vc_CONST __m128i min_epu32(__m128i a, __m128i b)
- {
- return _mm_min_epu32(a, b);
- }
- Vc_INTRINSIC Vc_CONST __m128i min_epi8(__m128i a, __m128i b)
- {
- return _mm_min_epi8(a, b);
- }
- Vc_INTRINSIC Vc_CONST __m128i min_epi32(__m128i a, __m128i b)
- {
- return _mm_min_epi32(a, b);
- }
- Vc_INTRINSIC Vc_CONST __m128i cvtepu8_epi16(__m128i epu8)
- {
- return _mm_cvtepu8_epi16(epu8);
- }
- Vc_INTRINSIC Vc_CONST __m128i cvtepi8_epi16(__m128i epi8)
- {
- return _mm_cvtepi8_epi16(epi8);
- }
- Vc_INTRINSIC Vc_CONST __m128i cvtepu16_epi32(__m128i epu16)
- {
- return _mm_cvtepu16_epi32(epu16);
- }
- Vc_INTRINSIC Vc_CONST __m128i cvtepi16_epi32(__m128i epu16)
- {
- return _mm_cvtepi16_epi32(epu16);
- }
- Vc_INTRINSIC Vc_CONST __m128i cvtepu8_epi32(__m128i epu8)
- {
- return _mm_cvtepu8_epi32(epu8);
- }
- Vc_INTRINSIC Vc_CONST __m128i cvtepi8_epi32(__m128i epi8)
- {
- return _mm_cvtepi8_epi32(epi8);
- }
- }
- }
- #else
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace SseIntrinsics
- {
- // 64-bit equality from 32-bit equality: both halves of a lane must
- // match, so AND the compare result with its 32-bit-swapped self.
- Vc_INTRINSIC Vc_CONST __m128i cmpeq_epi64(__m128i a, __m128i b) {
- auto tmp = _mm_cmpeq_epi32(a, b);
- return _mm_and_si128(tmp, _mm_shuffle_epi32(tmp, 1*1 + 0*4 + 3*16 + 2*64));
- }
- // Extract lane `index`: via the GCC/Clang builtin vector subscript if
- // available, otherwise by byte-shifting the lane into position 0.
- template <int index> Vc_INTRINSIC Vc_CONST int extract_epi32(__m128i v)
- {
- #ifdef Vc_USE_BUILTIN_VECTOR_TYPES
- typedef int int32v4 __attribute__((__vector_size__(16)));
- return aliasing_cast<int32v4>(v)[index];
- #else
- return _mm_cvtsi128_si32(_mm_srli_si128(v, index * 4));
- #endif
- }
- // blendv emulations: (~c & a) | (c & b), i.e. c selects b per bit.
- Vc_INTRINSIC Vc_CONST __m128d blendv_pd(__m128d a, __m128d b, __m128d c) {
- #ifdef Vc_GCC
- return reinterpret_cast<__m128d>(
- (~reinterpret_cast<__m128i>(c) & reinterpret_cast<__m128i>(a)) |
- (reinterpret_cast<__m128i>(c) & reinterpret_cast<__m128i>(b)));
- #else
- return _mm_or_pd(_mm_andnot_pd(c, a), _mm_and_pd(c, b));
- #endif
- }
- Vc_INTRINSIC Vc_CONST __m128 blendv_ps(__m128 a, __m128 b, __m128 c) {
- #ifdef Vc_GCC
- return reinterpret_cast<__m128>(
- (~reinterpret_cast<__m128i>(c) & reinterpret_cast<__m128i>(a)) |
- (reinterpret_cast<__m128i>(c) & reinterpret_cast<__m128i>(b)));
- #else
- return _mm_or_ps(_mm_andnot_ps(c, a), _mm_and_ps(c, b));
- #endif
- }
- Vc_INTRINSIC Vc_CONST __m128i blendv_epi8(__m128i a, __m128i b, __m128i c) {
- #ifdef Vc_GCC
- return (~c & a) | (c & b);
- #else
- return _mm_or_si128(_mm_andnot_si128(c, a), _mm_and_si128(c, b));
- #endif
- }
- // Emulates SSE4.1 _mm_blend_pd for a compile-time immediate: bit i of
- // `mask` selects lane i of b, a cleared bit keeps lane i of a. Only the
- // four 2-bit immediates are valid; anything else aborts.
- template <int mask> Vc_INTRINSIC Vc_CONST __m128d blend_pd(__m128d a, __m128d b)
- {
- if (mask == 0x0) {
- return a;
- }
- if (mask == 0x3) {
- return b;
- }
- if (mask == 0x1) {
- // low lane from b, high lane from a
- return _mm_shuffle_pd(b, a, 2);
- }
- if (mask == 0x2) {
- // low lane from a, high lane from b
- return _mm_shuffle_pd(a, b, 2);
- }
- abort();
- return a;
- }
- // blend_ps emulation: synthesize the 128-bit selection mask implied by
- // the 4-bit immediate (via shifts of all-ones or set_epi32), then blend
- // with andnot/and/or.
- template <int mask> Vc_INTRINSIC Vc_CONST __m128 blend_ps(__m128 a, __m128 b)
- {
- __m128i c;
- switch (mask) {
- case 0x0:
- return a;
- case 0x1:
- c = _mm_srli_si128(_mm_setallone_si128(), 12);
- break;
- case 0x2:
- c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 12), 4);
- break;
- case 0x3:
- c = _mm_srli_si128(_mm_setallone_si128(), 8);
- break;
- case 0x4:
- c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 12), 8);
- break;
- case 0x5:
- c = _mm_set_epi32(0, -1, 0, -1);
- break;
- case 0x6:
- c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 8), 4);
- break;
- case 0x7:
- c = _mm_srli_si128(_mm_setallone_si128(), 4);
- break;
- case 0x8:
- c = _mm_slli_si128(_mm_setallone_si128(), 12);
- break;
- case 0x9:
- c = _mm_set_epi32(-1, 0, 0, -1);
- break;
- case 0xa:
- c = _mm_set_epi32(-1, 0, -1, 0);
- break;
- case 0xb:
- c = _mm_set_epi32(-1, 0, -1, -1);
- break;
- case 0xc:
- c = _mm_slli_si128(_mm_setallone_si128(), 8);
- break;
- case 0xd:
- c = _mm_set_epi32(-1, -1, 0, -1);
- break;
- case 0xe:
- c = _mm_slli_si128(_mm_setallone_si128(), 4);
- break;
- case 0xf:
- return b;
- default:
- // Invalid immediate (only 0x0..0xf are meaningful).
- abort();
- c = _mm_setzero_si128();
- break;
- }
- __m128 _c = _mm_castsi128_ps(c);
- return _mm_or_ps(_mm_andnot_ps(_c, a), _mm_and_ps(_c, b));
- }
- // blend_epi16 emulation: common immediates get dedicated shift/shuffle
- // sequences; the generic default derives the per-word mask arithmetically
- // by multiplying the immediate into each word's sign bit.
- template <int mask> Vc_INTRINSIC Vc_CONST __m128i blend_epi16(__m128i a, __m128i b)
- {
- __m128i c;
- switch (mask) {
- case 0x00:
- return a;
- case 0x01:
- c = _mm_srli_si128(_mm_setallone_si128(), 14);
- break;
- case 0x03:
- c = _mm_srli_si128(_mm_setallone_si128(), 12);
- break;
- case 0x07:
- c = _mm_srli_si128(_mm_setallone_si128(), 10);
- break;
- case 0x0f:
- return _mm_unpackhi_epi64(_mm_slli_si128(b, 8), a);
- case 0x1f:
- c = _mm_srli_si128(_mm_setallone_si128(), 6);
- break;
- case 0x3f:
- c = _mm_srli_si128(_mm_setallone_si128(), 4);
- break;
- case 0x7f:
- c = _mm_srli_si128(_mm_setallone_si128(), 2);
- break;
- case 0x80:
- c = _mm_slli_si128(_mm_setallone_si128(), 14);
- break;
- case 0xc0:
- c = _mm_slli_si128(_mm_setallone_si128(), 12);
- break;
- case 0xe0:
- c = _mm_slli_si128(_mm_setallone_si128(), 10);
- break;
- case 0xf0:
- c = _mm_slli_si128(_mm_setallone_si128(), 8);
- break;
- case 0xf8:
- c = _mm_slli_si128(_mm_setallone_si128(), 6);
- break;
- case 0xfc:
- c = _mm_slli_si128(_mm_setallone_si128(), 4);
- break;
- case 0xfe:
- c = _mm_slli_si128(_mm_setallone_si128(), 2);
- break;
- case 0xff:
- return b;
- case 0xcc:
- return _mm_unpacklo_epi32(_mm_shuffle_epi32(a, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 1, 3, 1)));
- case 0x33:
- return _mm_unpacklo_epi32(_mm_shuffle_epi32(b, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 3, 1)));
- default:
- const __m128i shift = _mm_set_epi16(0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000, -0x7fff);
- c = _mm_srai_epi16(_mm_mullo_epi16(_mm_set1_epi16(mask), shift), 15);
- break;
- }
- return _mm_or_si128(_mm_andnot_si128(c, a), _mm_and_si128(c, b));
- }
- // min/max emulations: select per lane via blendv using the corresponding
- // (possibly emulated) greater-than compare.
- Vc_INTRINSIC Vc_CONST __m128i max_epi8 (__m128i a, __m128i b) {
- return blendv_epi8(b, a, _mm_cmpgt_epi8 (a, b));
- }
- Vc_INTRINSIC Vc_CONST __m128i max_epi32(__m128i a, __m128i b) {
- return blendv_epi8(b, a, _mm_cmpgt_epi32(a, b));
- }
- Vc_INTRINSIC Vc_CONST __m128i max_epu16(__m128i a, __m128i b) {
- return blendv_epi8(b, a, cmpgt_epu16(a, b));
- }
- Vc_INTRINSIC Vc_CONST __m128i max_epu32(__m128i a, __m128i b) {
- return blendv_epi8(b, a, cmpgt_epu32(a, b));
- }
- Vc_INTRINSIC Vc_CONST __m128i min_epu16(__m128i a, __m128i b) {
- return blendv_epi8(a, b, cmpgt_epu16(a, b));
- }
- Vc_INTRINSIC Vc_CONST __m128i min_epu32(__m128i a, __m128i b) {
- return blendv_epi8(a, b, cmpgt_epu32(a, b));
- }
- Vc_INTRINSIC Vc_CONST __m128i min_epi8 (__m128i a, __m128i b) {
- return blendv_epi8(a, b, _mm_cmpgt_epi8 (a, b));
- }
- Vc_INTRINSIC Vc_CONST __m128i min_epi32(__m128i a, __m128i b) {
- return blendv_epi8(a, b, _mm_cmpgt_epi32(a, b));
- }
- // Widening conversions: zero-extension unpacks against zero; sign
- // extension unpacks against the lane's sign mask (x < 0).
- Vc_INTRINSIC Vc_CONST __m128i cvtepu8_epi16(__m128i epu8) {
- return _mm_unpacklo_epi8(epu8, _mm_setzero_si128());
- }
- Vc_INTRINSIC Vc_CONST __m128i cvtepi8_epi16(__m128i epi8) {
- return _mm_unpacklo_epi8(epi8, _mm_cmplt_epi8(epi8, _mm_setzero_si128()));
- }
- Vc_INTRINSIC Vc_CONST __m128i cvtepu16_epi32(__m128i epu16) {
- return _mm_unpacklo_epi16(epu16, _mm_setzero_si128());
- }
- Vc_INTRINSIC Vc_CONST __m128i cvtepi16_epi32(__m128i epu16) {
- return _mm_unpacklo_epi16(epu16, _mm_cmplt_epi16(epu16, _mm_setzero_si128()));
- }
- Vc_INTRINSIC Vc_CONST __m128i cvtepu8_epi32(__m128i epu8) {
- return cvtepu16_epi32(cvtepu8_epi16(epu8));
- }
- Vc_INTRINSIC Vc_CONST __m128i cvtepi8_epi32(__m128i epi8) {
- const __m128i neg = _mm_cmplt_epi8(epi8, _mm_setzero_si128());
- const __m128i epi16 = _mm_unpacklo_epi8(epi8, neg);
- return _mm_unpacklo_epi16(epi16, _mm_unpacklo_epi8(neg, neg));
- }
- }
- }
- #endif
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace SseIntrinsics
- {
- // Streaming (non-temporal) loads: movntdqa needs SSE4.1; without it we
- // fall back to ordinary aligned loads. The const_cast is required only
- // because _mm_stream_load_si128 takes a non-const pointer.
- static Vc_INTRINSIC Vc_PURE __m128 _mm_stream_load(const float *mem) {
- #ifdef Vc_IMPL_SSE4_1
- return _mm_castsi128_ps(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<float *>(mem))));
- #else
- return _mm_load_ps(mem);
- #endif
- }
- static Vc_INTRINSIC Vc_PURE __m128d _mm_stream_load(const double *mem) {
- #ifdef Vc_IMPL_SSE4_1
- return _mm_castsi128_pd(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<double *>(mem))));
- #else
- return _mm_load_pd(mem);
- #endif
- }
- static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const int *mem) {
- #ifdef Vc_IMPL_SSE4_1
- return _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<int *>(mem)));
- #else
- return _mm_load_si128(reinterpret_cast<const __m128i *>(mem));
- #endif
- }
- // The remaining integer element types all reuse the int overload.
- static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned int *mem) {
- return _mm_stream_load(reinterpret_cast<const int *>(mem));
- }
- static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const short *mem) {
- return _mm_stream_load(reinterpret_cast<const int *>(mem));
- }
- static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned short *mem) {
- return _mm_stream_load(reinterpret_cast<const int *>(mem));
- }
- static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const signed char *mem) {
- return _mm_stream_load(reinterpret_cast<const int *>(mem));
- }
- static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned char *mem) {
- return _mm_stream_load(reinterpret_cast<const int *>(mem));
- }
- // 32-bit targets lack _mm_cvtsi64_si128; emulate via a 64-bit load.
- #ifndef __x86_64__
- Vc_INTRINSIC Vc_PURE __m128i _mm_cvtsi64_si128(int64_t x) {
- return _mm_castpd_si128(_mm_load_sd(reinterpret_cast<const double *>(&x)));
- }
- #endif
- // AVX2 gather wrappers (plain and masked), uniform `gather` overload set
- // over float/double/int/unsigned with 32-bit indices.
- #ifdef Vc_IMPL_AVX2
- template <int Scale> __m128 gather(const float *addr, __m128i idx)
- {
- return _mm_i32gather_ps(addr, idx, Scale);
- }
- template <int Scale> __m128d gather(const double *addr, __m128i idx)
- {
- return _mm_i32gather_pd(addr, idx, Scale);
- }
- template <int Scale> __m128i gather(const int *addr, __m128i idx)
- {
- return _mm_i32gather_epi32(addr, idx, Scale);
- }
- template <int Scale> __m128i gather(const unsigned *addr, __m128i idx)
- {
- return _mm_i32gather_epi32(aliasing_cast<int>(addr), idx, Scale);
- }
- template <int Scale> __m128 gather(__m128 src, __m128 k, const float *addr, __m128i idx)
- {
- return _mm_mask_i32gather_ps(src, addr, idx, k, Scale);
- }
- template <int Scale>
- __m128d gather(__m128d src, __m128d k, const double *addr, __m128i idx)
- {
- return _mm_mask_i32gather_pd(src, addr, idx, k, Scale);
- }
- template <int Scale> __m128i gather(__m128i src, __m128i k, const int *addr, __m128i idx)
- {
- return _mm_mask_i32gather_epi32(src, addr, idx, k, Scale);
- }
- template <int Scale>
- __m128i gather(__m128i src, __m128i k, const unsigned *addr, __m128i idx)
- {
- return _mm_mask_i32gather_epi32(src, aliasing_cast<int>(addr), idx, k, Scale);
- }
- #endif
- }
- }
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace SSE
- {
- using namespace SseIntrinsics;
- template <typename T> struct ParameterHelper
- {
- typedef T ByValue;
- typedef T &Reference;
- typedef const T &ConstRef;
- };
- // Primary template left empty; specialized elsewhere per element type.
- template <typename T> struct VectorHelper
- {
- };
- // Maps an element type to its SSE register type; integers share __m128i.
- template <typename T> struct VectorTypeHelper
- {
- typedef __m128i Type;
- };
- template <> struct VectorTypeHelper<double>
- {
- typedef __m128d Type;
- };
- template <> struct VectorTypeHelper<float>
- {
- typedef __m128 Type;
- };
- template <typename T> struct DetermineGatherMask
- {
- typedef T Type;
- };
- // Aggregates the per-type aliases (register type, element type, width,
- // mask and storage types) used throughout the SSE implementation.
- template <typename T> struct VectorTraits
- {
- typedef typename VectorTypeHelper<T>::Type VectorType;
- using EntryType = T;
- static constexpr size_t Size = sizeof(VectorType) / sizeof(EntryType);
- typedef Mask<T> MaskType;
- typedef typename DetermineGatherMask<MaskType>::Type GatherMaskType;
- typedef Common::VectorMemoryUnion<VectorType, EntryType> StorageType;
- };
- template <typename T> struct VectorHelperSize;
- }
- }
- #if defined(Vc_GCC) && !defined(__OPTIMIZE__)
- #pragma GCC diagnostic pop
- #endif
- #ifndef VC_SSE_SHUFFLE_H_
- #define VC_SSE_SHUFFLE_H_
- namespace Vc_VERSIONED_NAMESPACE
- {
- // Lane selectors for the shuffle/blend templates: X0..X7 address lanes of
- // the first operand, Y0..Y7 lanes of the second.
- enum VecPos {
- X0, X1, X2, X3, X4, X5, X6, X7,
- Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7,
- Const0
- };
- namespace Mem
- {
- // shufps: lanes 0/1 must come from x, lanes 2/3 from y (hardware
- // restriction); the immediate packs each selector into 2 bits.
- template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128 Vc_CONST shuffle(__m128 x, __m128 y) {
- static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, "Incorrect_Range");
- static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, "Incorrect_Range");
- return _mm_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64);
- }
- // shufpd: lane 0 from x, lane 1 from y.
- template<VecPos Dst0, VecPos Dst1> static Vc_ALWAYS_INLINE __m128d Vc_CONST shuffle(__m128d x, __m128d y) {
- static_assert(Dst0 >= X0 && Dst1 >= Y0, "Incorrect_Range");
- static_assert(Dst0 <= X1 && Dst1 <= Y1, "Incorrect_Range");
- return _mm_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2);
- }
- // Integer shuffle reuses the float version through bit casts.
- template <VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3>
- Vc_INTRINSIC Vc_CONST __m128i shuffle(__m128i x, __m128i y)
- {
- return _mm_castps_si128(shuffle<Dst0, Dst1, Dst2, Dst3>(_mm_castsi128_ps(x),
- _mm_castsi128_ps(y)));
- }
- // blend: each position may pick the matching lane from either operand;
- // Dstk / Yk yields the k-th bit of the blend immediate.
- template<VecPos Dst0, VecPos Dst1> static Vc_ALWAYS_INLINE __m128d Vc_CONST blend(__m128d x, __m128d y) {
- static_assert(Dst0 == X0 || Dst0 == Y0, "Incorrect_Range");
- static_assert(Dst1 == X1 || Dst1 == Y1, "Incorrect_Range");
- return Vc::SseIntrinsics::blend_pd<(Dst0 / Y0) + (Dst1 / Y0) * 2>(x, y);
- }
- template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128 Vc_CONST blend(__m128 x, __m128 y) {
- static_assert(Dst0 == X0 || Dst0 == Y0, "Incorrect_Range");
- static_assert(Dst1 == X1 || Dst1 == Y1, "Incorrect_Range");
- static_assert(Dst2 == X2 || Dst2 == Y2, "Incorrect_Range");
- static_assert(Dst3 == X3 || Dst3 == Y3, "Incorrect_Range");
- return Vc::SseIntrinsics::blend_ps<(Dst0 / Y0) * 1 + (Dst1 / Y1) * 2 +
- (Dst2 / Y2) * 4 + (Dst3 / Y3) * 8>(x, y);
- }
- template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
- static Vc_ALWAYS_INLINE __m128i Vc_CONST blend(__m128i x, __m128i y) {
- static_assert(Dst0 == X0 || Dst0 == Y0, "Incorrect_Range");
- static_assert(Dst1 == X1 || Dst1 == Y1, "Incorrect_Range");
- static_assert(Dst2 == X2 || Dst2 == Y2, "Incorrect_Range");
- static_assert(Dst3 == X3 || Dst3 == Y3, "Incorrect_Range");
- static_assert(Dst4 == X4 || Dst4 == Y4, "Incorrect_Range");
- static_assert(Dst5 == X5 || Dst5 == Y5, "Incorrect_Range");
- static_assert(Dst6 == X6 || Dst6 == Y6, "Incorrect_Range");
- static_assert(Dst7 == X7 || Dst7 == Y7, "Incorrect_Range");
- return Vc::SseIntrinsics::blend_epi16<
- (Dst0 / Y0) * 1 + (Dst1 / Y1) * 2 + (Dst2 / Y2) * 4 + (Dst3 / Y3) * 8 +
- (Dst4 / Y4) * 16 + (Dst5 / Y5) * 32 + (Dst6 / Y6) * 64 +
- (Dst7 / Y7) * 128>(x, y);
- }
- template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128 Vc_CONST permute(__m128 x) {
- static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
- static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
- return _mm_shuffle_ps(x, x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
- }
- template<VecPos Dst0, VecPos Dst1> static Vc_ALWAYS_INLINE Vc_CONST __m128d permute(__m128d x) {
- static_assert(Dst0 >= X0 && Dst1 >= X0, "Incorrect_Range");
- static_assert(Dst0 <= X1 && Dst1 <= X1, "Incorrect_Range");
- return _mm_shuffle_pd(x, x, Dst0 + Dst1 * 4);
- }
- template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) {
- static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
- static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
- return _mm_shuffle_epi32(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
- }
- template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128i Vc_CONST permuteLo(__m128i x) {
- static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
- static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
- return _mm_shufflelo_epi16(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
- }
- template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128i Vc_CONST permuteHi(__m128i x) {
- static_assert(Dst0 >= X4 && Dst1 >= X4 && Dst2 >= X4 && Dst3 >= X4, "Incorrect_Range");
- static_assert(Dst0 <= X7 && Dst1 <= X7 && Dst2 <= X7 && Dst3 <= X7, "Incorrect_Range");
- return _mm_shufflehi_epi16(x, (Dst0 - X4) + (Dst1 - X4) * 4 + (Dst2 - X4) * 16 + (Dst3 - X4) * 64);
- }
- template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
- static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) {
- static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
- static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
- static_assert(Dst4 >= X4 && Dst5 >= X4 && Dst6 >= X4 && Dst7 >= X4, "Incorrect_Range");
- static_assert(Dst4 <= X7 && Dst5 <= X7 && Dst6 <= X7 && Dst7 <= X7, "Incorrect_Range");
- if (Dst0 != X0 || Dst1 != X1 || Dst2 != X2 || Dst3 != X3) {
- x = _mm_shufflelo_epi16(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
- }
- if (Dst4 != X4 || Dst5 != X5 || Dst6 != X6 || Dst7 != X7) {
- x = _mm_shufflehi_epi16(x, (Dst4 - X4) + (Dst5 - X4) * 4 + (Dst6 - X4) * 16 + (Dst7 - X4) * 64);
- }
- return x;
- }
- }
- namespace Reg
- {
- template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128 Vc_CONST shuffle(__m128 x, __m128 y) {
- return Mem::shuffle<Dst0, Dst1, Dst2, Dst3>(x, y);
- }
- template<VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128d Vc_CONST shuffle(__m128d x, __m128d y) {
- return Mem::shuffle<Dst0, Dst1>(x, y);
- }
- template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) {
- static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
- static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
- return _mm_shuffle_epi32(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
- }
- template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128i Vc_CONST shuffle(__m128i x, __m128i y) {
- static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, "Incorrect_Range");
- static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, "Incorrect_Range");
- return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x), _mm_castsi128_ps(y), Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64));
- }
- template<VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128d Vc_CONST blend(__m128d x, __m128d y) {
- return Mem::blend<Dst0, Dst1>(x, y);
- }
- template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128 Vc_CONST blend(__m128 x, __m128 y) {
- return Mem::blend<Dst0, Dst1, Dst2, Dst3>(x, y);
- }
- }
- }
- #endif
- #endif
- #ifndef VC_SSE_VECTORHELPER_H_
- #define VC_SSE_VECTORHELPER_H_
- #include <limits>
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace SSE
- {
// Generators for static member functions taking 0 to 3 vector arguments;
// 'code' is the implementation expression, with a, b, c naming the parameters.
#define Vc_OP0(name,code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name() { return code; }
#define Vc_OP1(name,code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(const VectorType a) { return code; }
#define Vc_OP2(name,code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(const VectorType a, const VectorType b) { return code; }
#define Vc_OP3(name,code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(const VectorType a, const VectorType b, const VectorType c) { return code; }
// Load/store and basic helpers for raw __m128 registers. The Flags template
// parameter selects exactly one overload via its nested EnableIf* alias
// (aligned, unaligned, streaming, or streaming+unaligned access).
template<> struct VectorHelper<__m128>
{
typedef __m128 VectorType;
template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const float *x, typename Flags::EnableIfAligned = nullptr) { return _mm_load_ps(x); }
template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const float *x, typename Flags::EnableIfUnaligned = nullptr) { return _mm_loadu_ps(x); }
template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const float *x, typename Flags::EnableIfStreaming = nullptr) { return _mm_stream_load(x); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, typename Flags::EnableIfAligned = nullptr) { _mm_store_ps(mem, x); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm_storeu_ps(mem, x); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, typename Flags::EnableIfStreaming = nullptr) { _mm_stream_ps(mem, x); }
// No unaligned streaming store exists; emulated with a full-mask maskmoveu
// (a non-temporal byte-masked store).
template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { _mm_maskmoveu_si128(_mm_castps_si128(x), _mm_setallone_si128(), reinterpret_cast<char *>(mem)); }
// Masked store: only bytes whose mask bit is set are written to memory.
template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, VectorType m) { _mm_maskmoveu_si128(_mm_castps_si128(x), _mm_castps_si128(m), reinterpret_cast<char *>(mem)); }
Vc_OP0(allone, _mm_setallone_ps())
Vc_OP0(zero, _mm_setzero_ps())
Vc_OP3(blend, blendv_ps(a, b, c))
};
// Load/store and basic helpers for raw __m128d registers; overload selection
// via the Flags type works exactly as in the __m128 specialization.
template<> struct VectorHelper<__m128d>
{
typedef __m128d VectorType;
template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const double *x, typename Flags::EnableIfAligned = nullptr) { return _mm_load_pd(x); }
template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const double *x, typename Flags::EnableIfUnaligned = nullptr) { return _mm_loadu_pd(x); }
template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const double *x, typename Flags::EnableIfStreaming = nullptr) { return _mm_stream_load(x); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, typename Flags::EnableIfAligned = nullptr) { _mm_store_pd(mem, x); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm_storeu_pd(mem, x); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, typename Flags::EnableIfStreaming = nullptr) { _mm_stream_pd(mem, x); }
// Unaligned streaming store emulated with a full-mask maskmoveu.
template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { _mm_maskmoveu_si128(_mm_castpd_si128(x), _mm_setallone_si128(), reinterpret_cast<char *>(mem)); }
// Masked store: only bytes whose mask bit is set are written.
template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, VectorType m) { _mm_maskmoveu_si128(_mm_castpd_si128(x), _mm_castpd_si128(m), reinterpret_cast<char *>(mem)); }
Vc_OP0(allone, _mm_setallone_pd())
Vc_OP0(zero, _mm_setzero_pd())
Vc_OP3(blend, blendv_pd(a, b, c))
};
// Load/store and basic helpers for raw __m128i registers. The element type T
// is only used for pointer arithmetic; the data is moved as whole registers.
template<> struct VectorHelper<__m128i>
{
typedef __m128i VectorType;
template<typename Flags, typename T> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const T *x, typename Flags::EnableIfAligned = nullptr) { return _mm_load_si128(reinterpret_cast<const VectorType *>(x)); }
template<typename Flags, typename T> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const T *x, typename Flags::EnableIfUnaligned = nullptr) { return _mm_loadu_si128(reinterpret_cast<const VectorType *>(x)); }
template<typename Flags, typename T> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const T *x, typename Flags::EnableIfStreaming = nullptr) { return _mm_stream_load(x); }
template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, typename Flags::EnableIfAligned = nullptr) { _mm_store_si128(reinterpret_cast<VectorType *>(mem), x); }
template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm_storeu_si128(reinterpret_cast<VectorType *>(mem), x); }
template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, typename Flags::EnableIfStreaming = nullptr) { _mm_stream_si128(reinterpret_cast<VectorType *>(mem), x); }
// Unaligned streaming store emulated with a full-mask maskmoveu.
template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { _mm_maskmoveu_si128(x, _mm_setallone_si128(), reinterpret_cast<char *>(mem)); }
// Masked store: only bytes whose mask bit is set are written.
template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, VectorType m) { _mm_maskmoveu_si128(x, m, reinterpret_cast<char *>(mem)); }
Vc_OP0(allone, _mm_setallone_si128())
Vc_OP0(zero, _mm_setzero_si128())
Vc_OP3(blend, blendv_epi8(a, b, c))
};
#undef Vc_OP1
#undef Vc_OP2
#undef Vc_OP3
// Second macro set: generate members whose implementation is an intrinsic
// chosen by the Vc_SUFFIX token (re)defined inside each specialization below.
// Vc_OP1/Vc_OP: unary/binary member named after the intrinsic (_mm_<op>_<suffix>).
#define Vc_OP1(op) \
static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a) { return Vc_CAT2(_mm_##op##_, Vc_SUFFIX)(a); }
#define Vc_OP(op) \
static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return Vc_CAT2(_mm_##op##_ , Vc_SUFFIX)(a, b); }
// Vc_OP_: for member names that already end in '_' (or_, and_, xor_).
#define Vc_OP_(op) \
static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return Vc_CAT2(_mm_##op , Vc_SUFFIX)(a, b); }
// Vc_OPx: member named 'op' implemented by intrinsic 'op2'.
#define Vc_OPx(op,op2) \
static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return Vc_CAT2(_mm_##op2##_, Vc_SUFFIX)(a, b); }
// Vc_OP_CAST_: bitwise op routed through the float (_ps) domain via bitwise-
// exact casts, for specializations whose suffix has no bitwise instructions.
#define Vc_OP_CAST_(op) \
static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return Vc_CAT2(_mm_castps_, Vc_SUFFIX)( \
_mm_##op##ps(Vc_CAT2(Vc_CAT2(_mm_cast, Vc_SUFFIX), _ps)(a), \
Vc_CAT2(Vc_CAT2(_mm_cast, Vc_SUFFIX), _ps)(b))); \
}
// Element-wise min/max pair for the current suffix.
#define Vc_MINMAX \
static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b) { return Vc_CAT2(_mm_min_, Vc_SUFFIX)(a, b); } \
static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b) { return Vc_CAT2(_mm_max_, Vc_SUFFIX)(a, b); }
// Operations for double vectors: 2 lanes in one __m128d. Vc_SUFFIX routes the
// op-generating macros to the _pd intrinsic family.
template<> struct VectorHelper<double> {
typedef __m128d VectorType;
typedef double EntryType;
#define Vc_SUFFIX pd
Vc_OP_(or_) Vc_OP_(and_) Vc_OP_(xor_)
// Zero the lanes whose mask bits are clear (mask arrives as __m128; cast is bitwise).
static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_pd(mask), a); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const double a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const double a, const double b) { return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }
#ifdef Vc_IMPL_FMA4
// Hardware fused multiply-add: v1 = v1 * v2 + v3 with a single rounding.
static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) {
v1 = _mm_macc_pd(v1, v2, v3);
}
#else
// Software emulation of a fused multiply-add: each factor is split into a
// high part and a low remainder (Dekker-style), so the partial products are
// exact; they are then summed in order of increasing magnitude to keep the
// rounding error of the non-fused path small.
// NOTE(review): relies on c_general::highMaskDouble clearing the mantissa's
// low half — confirm against its definition elsewhere in the file.
static inline void fma(VectorType &v1, VectorType v2, VectorType v3) {
VectorType h1 = _mm_and_pd(v1, _mm_load_pd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));
VectorType h2 = _mm_and_pd(v2, _mm_load_pd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));
#if defined(Vc_GCC) && Vc_GCC < 0x40703
// Compiler barrier: keeps old GCC from folding the split away and changing
// the rounding behavior.
asm("":"+x"(h1), "+x"(h2));
#endif
const VectorType l1 = _mm_sub_pd(v1, h1);
const VectorType l2 = _mm_sub_pd(v2, h2);
const VectorType ll = mul(l1, l2);
const VectorType lh = add(mul(l1, h2), mul(h1, l2));
const VectorType hh = mul(h1, h2);
// Add the smaller of (lh, v3) first to minimize the rounding error.
const VectorType lh_lt_v3 = _mm_cmplt_pd(abs(lh), abs(v3));
const VectorType b = blendv_pd(v3, lh, lh_lt_v3);
const VectorType c = blendv_pd(lh, v3, lh_lt_v3);
v1 = add(add(ll, b), add(c, hh));
}
#endif
Vc_OP(add) Vc_OP(sub) Vc_OP(mul)
Vc_OP1(sqrt)
// There are no _pd estimate instructions (rsqrt/rcp); use exact division.
static Vc_ALWAYS_INLINE Vc_CONST VectorType rsqrt(VectorType x) {
return _mm_div_pd(one(), sqrt(x));
}
static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VectorType x) {
return _mm_div_pd(one(), x);
}
// NaN is the only value that compares unordered with itself.
static Vc_ALWAYS_INLINE Vc_CONST VectorType isNaN(VectorType x) {
return _mm_cmpunord_pd(x, x);
}
// 0 * x is NaN for ±inf (and NaN), so the ordered-compare is true exactly for
// finite lanes.
static Vc_ALWAYS_INLINE Vc_CONST VectorType isFinite(VectorType x) {
return _mm_cmpord_pd(x, _mm_mul_pd(zero(), x))
// Operations for float vectors: 4 lanes in one __m128. Vc_SUFFIX routes the
// op-generating macros to the _ps intrinsic family.
template<> struct VectorHelper<float> {
typedef float EntryType;
typedef __m128 VectorType;
#define Vc_SUFFIX ps
Vc_OP_(or_) Vc_OP_(and_) Vc_OP_(xor_)
// Zero the lanes whose mask bits are clear.
static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(mask, a); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a, const float b, const float c, const float d) { return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }
// Narrow two double vectors into one float vector (a -> low half, b -> high half).
static Vc_ALWAYS_INLINE Vc_CONST __m128 concat(__m128d a, __m128d b) { return _mm_movelh_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b)); }
#ifdef Vc_IMPL_FMA4
// Hardware fused multiply-add: v1 = v1 * v2 + v3 with a single rounding.
static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) {
v1 = _mm_macc_ps(v1, v2, v3);
}
#else
// fma emulated by widening to double per half-vector: the product of two
// floats is exact in double, so only the conversions back to float round.
static inline void fma(VectorType &v1, VectorType v2, VectorType v3) {
__m128d v1_0 = _mm_cvtps_pd(v1);
__m128d v1_1 = _mm_cvtps_pd(_mm_movehl_ps(v1, v1));
__m128d v2_0 = _mm_cvtps_pd(v2);
__m128d v2_1 = _mm_cvtps_pd(_mm_movehl_ps(v2, v2));
__m128d v3_0 = _mm_cvtps_pd(v3);
__m128d v3_1 = _mm_cvtps_pd(_mm_movehl_ps(v3, v3));
v1 = _mm_movelh_ps(
_mm_cvtpd_ps(_mm_add_pd(_mm_mul_pd(v1_0, v2_0), v3_0)),
_mm_cvtpd_ps(_mm_add_pd(_mm_mul_pd(v1_1, v2_1), v3_1)));
}
#endif
Vc_OP(add) Vc_OP(sub) Vc_OP(mul)
Vc_OP1(sqrt) Vc_OP1(rsqrt)
// NaN is the only value that compares unordered with itself.
static Vc_ALWAYS_INLINE Vc_CONST VectorType isNaN(VectorType x) {
return _mm_cmpunord_ps(x, x);
}
// 0 * x is NaN for ±inf (and NaN), so the ordered-compare is true exactly for
// finite lanes.
static Vc_ALWAYS_INLINE Vc_CONST VectorType isFinite(VectorType x) {
return _mm_cmpord_ps(x, _mm_mul_ps(zero(), x));
}
// Bit-exact compare of abs(x) against the stored +inf constant (c_log<float>::d(1)).
static Vc_ALWAYS_INLINE Vc_CONST VectorType isInfinite(VectorType x) {
return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(abs(x)), _mm_castps_si128(_mm_load_ps(c_log<float>::d(1)))));
}
// Hardware reciprocal estimate (not a full-precision division).
static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VectorType x) {
return _mm_rcp_ps(x);
}
// Clear the sign bits.
static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) {
return Vc_CAT2(_mm_and_, Vc_SUFFIX)(a, _mm_setabsmask_ps());
}
Vc_MINMAX
// Horizontal reductions: fold the high half onto the low half, then lane 1
// onto lane 0; the scalar result lives in lane 0.
static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
a = _mm_min_ps(a, _mm_movehl_ps(a, a));
a = _mm_min_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)));
return _mm_cvtss_f32(a);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
a = _mm_max_ps(a, _mm_movehl_ps(a, a));
a = _mm_max_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)));
return _mm_cvtss_f32(a);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
a = _mm_mul_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 2, 3)));
a = _mm_mul_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 0, 1)));
return _mm_cvtss_f32(a);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
a = _mm_add_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 2, 3)));
a = _mm_add_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 0, 1)));
return _mm_cvtss_f32(a);
}
#undef Vc_SUFFIX
// Round to nearest integral value; the SSE2 fallback round-trips through
// int32 and is therefore only exact inside the int32 range.
static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) {
#ifdef Vc_IMPL_SSE4_1
return _mm_round_ps(a, _MM_FROUND_NINT);
#else
return _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
#endif
}
};
// Operations for signed 32-bit integer vectors (4 lanes). Bitwise ops use the
// si128 suffix, arithmetic the epi32 family.
template<> struct VectorHelper<int> {
typedef int EntryType;
typedef __m128i VectorType;
#define Vc_SUFFIX si128
Vc_OP_(or_) Vc_OP_(and_) Vc_OP_(xor_)
static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
// Zero the lanes whose mask bits are clear (mask arrives as __m128; cast is bitwise).
static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_si128(mask), a); }
#undef Vc_SUFFIX
#define Vc_SUFFIX epi32
static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const int a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const int a, const int b, const int c, const int d) { return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d); }
// No integer fma instruction: plain multiply-then-add.
static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {
return Vc_CAT2(_mm_slli_, Vc_SUFFIX)(a, shift);
}
// Arithmetic (sign-extending) right shift for the signed element type.
static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {
return Vc_CAT2(_mm_srai_, Vc_SUFFIX)(a, shift);
}
// abs/min/max delegate to Vc wrappers (abs_epi32 etc., defined elsewhere).
static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) { return abs_epi32(a); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b) { return min_epi32(a, b); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b) { return max_epi32(a, b); }
// Horizontal reductions: the epi32 shuffle swaps the two 64-bit halves; the
// following shufflelo (imm _MM_SHUFFLE(1,0,3,2)) swaps the two 32-bit words
// within the low half. The scalar result ends up in lane 0.
static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
return _mm_cvtsi128_si32(a);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
return _mm_cvtsi128_si32(a);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
return _mm_cvtsi128_si32(a);
}
#ifdef Vc_IMPL_SSE4_1
static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(VectorType a, VectorType b) { return _mm_mullo_epi32(a, b); }
static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
return _mm_cvtsi128_si32(a);
}
#else
// SSE2 has no 32-bit lane-wise multiply: _mm_mul_epu32 multiplies lanes 0 and
// 2; shifting both inputs right by 4 bytes moves lanes 1 and 3 into position
// for a second multiply. The shuffles (imm 8 == _MM_SHUFFLE(0,0,2,0)) compact
// the low 32 bits of each 64-bit product, and the final unpack restores order.
static inline Vc_CONST VectorType mul(const VectorType a, const VectorType b) {
const VectorType aShift = _mm_srli_si128(a, 4);
const VectorType ab02 = _mm_mul_epu32(a, b);
const VectorType bShift = _mm_srli_si128(b, 4);
const VectorType ab13 = _mm_mul_epu32(aShift, bShift);
return _mm_unpacklo_epi32(_mm_shuffle_epi32(ab02, 8), _mm_shuffle_epi32(ab13, 8));
}
#endif
Vc_OP(add) Vc_OP(sub)
#undef Vc_SUFFIX
// Integers are already integral; rounding is the identity.
static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }
};
// Operations for unsigned 32-bit integer vectors (4 lanes). Bitwise ops are
// routed through the float domain (Vc_OP_CAST_); right shifts are logical.
template<> struct VectorHelper<unsigned int> {
typedef unsigned int EntryType;
typedef __m128i VectorType;
#define Vc_SUFFIX si128
Vc_OP_CAST_(or_) Vc_OP_CAST_(and_) Vc_OP_CAST_(xor_)
static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
// Zero the lanes whose mask bits are clear.
static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_si128(mask), a); }
#undef Vc_SUFFIX
#define Vc_SUFFIX epu32
static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }
// Unsigned min/max via Vc wrappers (defined elsewhere).
static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b) { return min_epu32(a, b); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b) { return max_epu32(a, b); }
// Horizontal reductions: swap 64-bit halves, then the two dwords of the low half.
static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
return _mm_cvtsi128_si32(a);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
return _mm_cvtsi128_si32(a);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
return _mm_cvtsi128_si32(a);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
return _mm_cvtsi128_si32(a);
}
// No integer fma instruction: multiply-then-add.
static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }
// The low 32 bits of a product are the same for signed and unsigned inputs,
// so the signed implementation is reused.
static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(const VectorType a, const VectorType b) {
return VectorHelper<int>::mul(a, b);
}
#undef Vc_SUFFIX
#define Vc_SUFFIX epi32
static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {
return Vc_CAT2(_mm_slli_, Vc_SUFFIX)(a, shift);
}
// Logical (zero-filling) right shift for the unsigned element type.
static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {
return Vc_CAT2(_mm_srli_, Vc_SUFFIX)(a, shift);
}
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const unsigned int a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const unsigned int a, const unsigned int b, const unsigned int c, const unsigned int d) { return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d); }
Vc_OP(add) Vc_OP(sub)
#undef Vc_SUFFIX
// Integers are already integral; rounding is the identity.
static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }
};
// Operations for signed 16-bit integer vectors (8 lanes).
template<> struct VectorHelper<signed short> {
typedef __m128i VectorType;
typedef signed short EntryType;
#define Vc_SUFFIX si128
Vc_OP_(or_) Vc_OP_(and_) Vc_OP_(xor_)
static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
// Zero the lanes whose mask bits are clear.
static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_si128(mask), a); }
// Narrow two int32 vectors into one int16 vector with signed saturation.
static Vc_ALWAYS_INLINE Vc_CONST __m128i concat(__m128i a, __m128i b) { return _mm_packs_epi32(a, b); }
// Sign-extend the low/high four lanes to int32: duplicate each 16-bit value
// into a 32-bit slot, then shift the copy down arithmetically.
static Vc_ALWAYS_INLINE Vc_CONST __m128i expand0(__m128i x) { return _mm_srai_epi32(_mm_unpacklo_epi16(x, x), 16); }
static Vc_ALWAYS_INLINE Vc_CONST __m128i expand1(__m128i x) { return _mm_srai_epi32(_mm_unpackhi_epi16(x, x), 16); }
#undef Vc_SUFFIX
#define Vc_SUFFIX epi16
static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {
return Vc_CAT2(_mm_slli_, Vc_SUFFIX)(a, shift);
}
// Arithmetic (sign-extending) right shift for the signed element type.
static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {
return Vc_CAT2(_mm_srai_, Vc_SUFFIX)(a, shift);
}
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a, const EntryType b, const EntryType c, const EntryType d,
const EntryType e, const EntryType f, const EntryType g, const EntryType h) {
return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d, e, f, g, h);
}
// No integer fma instruction: multiply-then-add.
static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) {
v1 = add(mul(v1, v2), v3); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) { return abs_epi16(a); }
Vc_OPx(mul, mullo)
Vc_OP(min) Vc_OP(max)
// Horizontal reductions over 8 lanes: fold 64-bit halves, then 32-bit words,
// then the last pair of 16-bit words; only the low 16 bits of the extracted
// int are meaningful (implicit truncation to EntryType).
static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
return _mm_cvtsi128_si32(a);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
return _mm_cvtsi128_si32(a);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
return _mm_cvtsi128_si32(a);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
return _mm_cvtsi128_si32(a);
}
Vc_OP(add) Vc_OP(sub)
#undef Vc_SUFFIX
// Integers are already integral; rounding is the identity.
static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }
};
- template<> struct VectorHelper<unsigned short> {
- typedef __m128i VectorType;
- typedef unsigned short EntryType;
- #define Vc_SUFFIX si128
- Vc_OP_CAST_(or_) Vc_OP_CAST_(and_) Vc_OP_CAST_(xor_)
- static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
- static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_si128(mask), a); }
- #ifdef Vc_IMPL_SSE4_1
- static Vc_ALWAYS_INLINE Vc_CONST __m128i concat(__m128i a, __m128i b) { return _mm_packus_epi32(a, b); }
- #else
- static Vc_ALWAYS_INLINE Vc_CONST __m128i concat(__m128i a, __m128i b) {
- auto tmp0 = _mm_unpacklo_epi16(a, b);
- auto tmp1 = _mm_unpackhi_epi16(a, b);
- auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);
- auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);
- return _mm_unpacklo_epi16(tmp2, tmp3);
- }
- #endif
- static Vc_ALWAYS_INLINE Vc_CONST __m128i expand0(__m128i x) { return _mm_unpacklo_epi16(x, _mm_setzero_si128()); }
- static Vc_ALWAYS_INLINE Vc_CONST __m128i expand1(__m128i x) { return _mm_unpackhi_epi16(x, _mm_setzero_si128()); }
- #undef Vc_SUFFIX
- #define Vc_SUFFIX epu16
- static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }
- #if !defined(USE_INCORRECT_UNSIGNED_COMPARE) || Vc_IMPL_SSE4_1
- static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b) { return min_epu16(a, b); }
- static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b) { return max_epu16(a, b); }
- #endif
- #undef Vc_SUFFIX
- #define Vc_SUFFIX epi16
- static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {
- return Vc_CAT2(_mm_slli_, Vc_SUFFIX)(a, shift);
- }
- static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {
- return Vc_CAT2(_mm_srli_, Vc_SUFFIX)(a, shift);
- }
- static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }
- Vc_OPx(mul, mullo)
- #if defined(USE_INCORRECT_UNSIGNED_COMPARE) && !defined(Vc_IMPL_SSE4_1)
- Vc_OP(min) Vc_OP(max)
- #endif
- static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
- a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
- a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
- a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
- return _mm_cvtsi128_si32(a);
- }
- static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
- a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
- a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
- a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
- return _mm_cvtsi128_si32(a);
- }
- static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
- a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
- a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
- a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
- return _mm_cvtsi128_si32(a);
- }
- static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
- a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
- a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
- a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
- return _mm_cvtsi128_si32(a);
- }
// Broadcast one value into all 8 lanes.
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
// Set all 8 lanes individually (argument order follows _mm_set_epi16:
// 'a' is the most significant lane).
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a, const EntryType b, const EntryType c,
const EntryType d, const EntryType e, const EntryType f,
const EntryType g, const EntryType h) {
return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d, e, f, g, h);
}
- Vc_OP(add) Vc_OP(sub)
- #undef Vc_SUFFIX
// Integer lanes are already integral values; rounding is the identity.
static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }
- };
- #undef Vc_OP1
- #undef Vc_OP
- #undef Vc_OP_
- #undef Vc_OPx
- #undef Vc_OP_CAST_
- #undef Vc_MINMAX
- }
- }
- #endif
- #ifndef VC_SSE_MASK_H_
- #define VC_SSE_MASK_H_
- #ifndef VC_SSE_DETAIL_H_
- #define VC_SSE_DETAIL_H_
- #ifndef VC_SSE_CASTS_H_
- #define VC_SSE_CASTS_H_
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace SSE
- {
- using uint = unsigned int;
- using ushort = unsigned short;
- using uchar = unsigned char;
- using schar = signed char;
// sse_cast: zero-cost bit reinterpretation between the three 128-bit SSE
// register types. The primary template covers the identity case; each
// specialization maps one cross-type pair onto the matching _mm_cast*
// intrinsic (no instructions emitted).
template <typename To, typename From> Vc_ALWAYS_INLINE Vc_CONST To sse_cast(From v)
{
return v;
}
template<> Vc_ALWAYS_INLINE Vc_CONST __m128i sse_cast<__m128i, __m128 >(__m128 v) { return _mm_castps_si128(v); }
template<> Vc_ALWAYS_INLINE Vc_CONST __m128i sse_cast<__m128i, __m128d>(__m128d v) { return _mm_castpd_si128(v); }
template<> Vc_ALWAYS_INLINE Vc_CONST __m128 sse_cast<__m128 , __m128d>(__m128d v) { return _mm_castpd_ps(v); }
template<> Vc_ALWAYS_INLINE Vc_CONST __m128 sse_cast<__m128 , __m128i>(__m128i v) { return _mm_castsi128_ps(v); }
template<> Vc_ALWAYS_INLINE Vc_CONST __m128d sse_cast<__m128d, __m128i>(__m128i v) { return _mm_castsi128_pd(v); }
template<> Vc_ALWAYS_INLINE Vc_CONST __m128d sse_cast<__m128d, __m128 >(__m128 v) { return _mm_castps_pd(v); }
// Empty tag carrying the (From, To) scalar types of a conversion; the
// convert() overloads below dispatch on it.
template <typename From, typename To> struct ConvertTag
{
};
// Generic entry point: forwards the register value to the tag-dispatched
// overload selected by <From, To>.
template <typename From, typename To>
Vc_INTRINSIC typename VectorTraits<To>::VectorType convert(
typename VectorTraits<From>::VectorType v)
{
return convert(v, ConvertTag<From, To>());
}
// Conversions to int lanes. Float/double use truncating cvtt*; 16-bit
// sources are widened from the low half of the register (SSE4.1 pmov*
// when available, otherwise emulated with unpack + shift).
Vc_INTRINSIC __m128i convert(__m128 v, ConvertTag<float , int >) { return _mm_cvttps_epi32(v); }
Vc_INTRINSIC __m128i convert(__m128d v, ConvertTag<double, int >) { return _mm_cvttpd_epi32(v); }
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<int , int >) { return v; }
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<uint , int >) { return v; }
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<short , int >) {
#ifdef Vc_IMPL_SSE4_1
return _mm_cvtepi16_epi32(v);
#else
// duplicate each 16-bit lane into a 32-bit lane, then arithmetic shift
// to sign-extend
return _mm_srai_epi32(_mm_unpacklo_epi16(v, v), 16);
#endif
}
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<ushort, int >) {
#ifdef Vc_IMPL_SSE4_1
return _mm_cvtepu16_epi32(v);
#else
// logical shift zero-extends the duplicated lanes
return _mm_srli_epi32(_mm_unpacklo_epi16(v, v), 16);
#endif
}
// float -> uint: cvttps only handles the signed range, so values >= 2^31 are
// converted as (v - 2^31) and the sign bit is xor-ed back in; blendv selects
// per lane using the v >= 2^31 mask.
Vc_INTRINSIC __m128i convert(__m128 v, ConvertTag<float , uint >) {
return _mm_castps_si128(
blendv_ps(_mm_castsi128_ps(_mm_cvttps_epi32(v)),
_mm_castsi128_ps(_mm_xor_si128(
_mm_cvttps_epi32(_mm_sub_ps(v, _mm_set1_ps(1u << 31))),
_mm_set1_epi32(1 << 31))),
_mm_cmpge_ps(v, _mm_set1_ps(1u << 31))));
}
// double -> uint: same 2^31 bias trick. The SSE4.1 path floors first and
// applies the bias unconditionally; the fallback blends biased and unbiased
// results on the v >= 2^31 mask.
Vc_INTRINSIC __m128i convert(__m128d v, ConvertTag<double, uint >) {
#ifdef Vc_IMPL_SSE4_1
return _mm_xor_si128(_mm_cvttpd_epi32(_mm_sub_pd(_mm_floor_pd(v), _mm_set1_pd(0x80000000u))),
_mm_cvtsi64_si128(0x8000000080000000ull))
;
#else
return blendv_epi8(_mm_cvttpd_epi32(v),
_mm_xor_si128(_mm_cvttpd_epi32(_mm_sub_pd(v, _mm_set1_pd(0x80000000u))),
_mm_cvtsi64_si128(0x8000000080000000ull)),
_mm_castpd_si128(_mm_cmpge_pd(v, _mm_set1_pd(0x80000000u))));
#endif
}
// int/uint share a register representation, so these are identity;
// 16-bit sources go through the -> int conversions above.
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<int , uint >) { return v; }
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<uint , uint >) { return v; }
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<short , uint >) { return convert(v, ConvertTag<short, int>()); }
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<ushort, uint >) { return convert(v, ConvertTag<ushort, int>()); }
Vc_INTRINSIC __m128 convert(__m128 v, ConvertTag<float , float >) { return v; }
Vc_INTRINSIC __m128 convert(__m128d v, ConvertTag<double, float >) { return _mm_cvtpd_ps(v); }
Vc_INTRINSIC __m128 convert(__m128i v, ConvertTag<int , float >) { return _mm_cvtepi32_ps(v); }
// uint -> float: lanes with the top bit set would convert as negative ints;
// for those, split the value into high (0x7ffffe00) and low (0x1ff) parts,
// convert separately and re-add the 2^31 bias.
Vc_INTRINSIC __m128 convert(__m128i v, ConvertTag<uint , float >) {
using namespace SSE;
return blendv_ps(_mm_cvtepi32_ps(v),
_mm_add_ps(_mm_cvtepi32_ps(_mm_and_si128(v, _mm_set1_epi32(0x7ffffe00))),
_mm_add_ps(_mm_set1_ps(1u << 31), _mm_cvtepi32_ps(_mm_and_si128(
v, _mm_set1_epi32(0x000001ff))))),
_mm_castsi128_ps(_mm_cmplt_epi32(v, _mm_setzero_si128())));
}
Vc_INTRINSIC __m128 convert(__m128i v, ConvertTag<short , float >) { return convert(convert(v, ConvertTag<short, int>()), ConvertTag<int, float>()); }
Vc_INTRINSIC __m128 convert(__m128i v, ConvertTag<ushort, float >) { return convert(convert(v, ConvertTag<ushort, int>()), ConvertTag<int, float>()); }
Vc_INTRINSIC __m128d convert(__m128 v, ConvertTag<float , double>) { return _mm_cvtps_pd(v); }
Vc_INTRINSIC __m128d convert(__m128d v, ConvertTag<double, double>) { return v; }
Vc_INTRINSIC __m128d convert(__m128i v, ConvertTag<int , double>) { return _mm_cvtepi32_pd(v); }
// uint -> double: flip the sign bit (subtract 2^31 in integer domain), then
// add the 2^31 bias back after the signed conversion.
Vc_INTRINSIC __m128d convert(__m128i v, ConvertTag<uint , double>) { return _mm_add_pd(_mm_cvtepi32_pd(_mm_xor_si128(v, setmin_epi32())), _mm_set1_pd(1u << 31)); }
Vc_INTRINSIC __m128d convert(__m128i v, ConvertTag<short , double>) { return convert(convert(v, ConvertTag<short, int>()), ConvertTag<int, double>()); }
Vc_INTRINSIC __m128d convert(__m128i v, ConvertTag<ushort, double>) { return convert(convert(v, ConvertTag<ushort, int>()), ConvertTag<int, double>()); }
// Conversions to 16-bit lanes. The signed paths use saturating packs; the
// int/uint -> ushort path below is a truncating (non-saturating) pack.
Vc_INTRINSIC __m128i convert(__m128 v, ConvertTag<float , short >) { return _mm_packs_epi32(_mm_cvttps_epi32(v), _mm_setzero_si128()); }
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<int , short >) { return _mm_packs_epi32(v, _mm_setzero_si128()); }
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<uint , short >) { return _mm_packs_epi32(v, _mm_setzero_si128()); }
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<short , short >) { return v; }
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<ushort, short >) { return v; }
Vc_INTRINSIC __m128i convert(__m128d v, ConvertTag<double, short >) { return convert(convert(v, ConvertTag<double, int>()), ConvertTag<int, short>()); }
// int -> ushort: the unpack sequence gathers the low 16 bits of each 32-bit
// lane into lanes 0..3 (truncation, no saturation); upper lanes become 0.
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<int , ushort>) {
auto tmp0 = _mm_unpacklo_epi16(v, _mm_setzero_si128());
auto tmp1 = _mm_unpackhi_epi16(v, _mm_setzero_si128());
auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);
auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);
return _mm_unpacklo_epi16(tmp2, tmp3);
}
// uint -> ushort: identical truncating pack.
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<uint , ushort>) {
auto tmp0 = _mm_unpacklo_epi16(v, _mm_setzero_si128());
auto tmp1 = _mm_unpackhi_epi16(v, _mm_setzero_si128());
auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);
auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);
return _mm_unpacklo_epi16(tmp2, tmp3);
}
Vc_INTRINSIC __m128i convert(__m128 v, ConvertTag<float , ushort>) { return convert(_mm_cvttps_epi32(v), ConvertTag<int, ushort>()); }
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<short , ushort>) { return v; }
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<ushort, ushort>) { return v; }
Vc_INTRINSIC __m128i convert(__m128d v, ConvertTag<double, ushort>) { return convert(convert(v, ConvertTag<double, int>()), ConvertTag<int, ushort>()); }
- }
- }
- #endif
- #ifdef Vc_IMPL_AVX
- #endif
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace Detail
- {
// Tag selecting a load overload by destination register type V and scalar
// destination type DstT.
template <typename V, typename DstT> struct LoadTag
{
};
// The when_* classes act as overload selectors for load16: a flags type F is
// implicitly convertible to when_aligned only if it declares the member type
// EnableIfAligned (and analogously for the other two). Overload resolution
// then picks the load variant matching the flag.
class when_aligned
{
public:
template <typename F> constexpr when_aligned(F, typename F::EnableIfAligned = nullptr)
{
}
};
class when_unaligned
{
public:
template <typename F>
constexpr when_unaligned(F, typename F::EnableIfUnaligned = nullptr)
{
}
};
class when_streaming
{
public:
template <typename F>
constexpr when_streaming(F, typename F::EnableIfStreaming = nullptr)
{
}
};
// 16-byte loads of float/double; the second parameter (one of the when_*
// selector classes above) picks aligned, unaligned, or streaming loads.
Vc_INTRINSIC __m128 load16(const float *mem, when_aligned)
{
return _mm_load_ps(mem);
}
Vc_INTRINSIC __m128 load16(const float *mem, when_unaligned)
{
return _mm_loadu_ps(mem);
}
Vc_INTRINSIC __m128 load16(const float *mem, when_streaming)
{
return SseIntrinsics::_mm_stream_load(mem);
}
Vc_INTRINSIC __m128d load16(const double *mem, when_aligned)
{
return _mm_load_pd(mem);
}
Vc_INTRINSIC __m128d load16(const double *mem, when_unaligned)
{
return _mm_loadu_pd(mem);
}
Vc_INTRINSIC __m128d load16(const double *mem, when_streaming)
{
return SseIntrinsics::_mm_stream_load(mem);
}
// 16-byte loads of any integral element type into __m128i; guarded by a
// static_assert so pointer-to-class misuse fails at compile time.
template <class T> Vc_INTRINSIC __m128i load16(const T *mem, when_aligned)
{
static_assert(std::is_integral<T>::value, "load16<T> is only intended for integral T");
return _mm_load_si128(reinterpret_cast<const __m128i *>(mem));
}
template <class T> Vc_INTRINSIC __m128i load16(const T *mem, when_unaligned)
{
static_assert(std::is_integral<T>::value, "load16<T> is only intended for integral T");
return _mm_loadu_si128(reinterpret_cast<const __m128i *>(mem));
}
template <class T> Vc_INTRINSIC __m128i load16(const T *mem, when_streaming)
{
static_assert(std::is_integral<T>::value, "load16<T> is only intended for integral T");
return SseIntrinsics::_mm_stream_load(mem);
}
#ifdef Vc_MSVC
// MSVC-only: non-converting load overloads with exact (pointer type,
// destination type) matches, each simply forwarding to load16. Compiled only
// under MSVC (the generic dispatcher below carries a matching extra
// constraint in its enable_if).
template <typename V, typename DstT, typename F>
Vc_INTRINSIC __m128d load(const double *mem, F f,
enable_if<(std::is_same<DstT, double>::value &&
std::is_same<V, __m128d>::value)> = nullarg)
{
return load16(mem, f);
}
template <typename V, typename DstT, typename F>
Vc_INTRINSIC __m128 load(const float *mem, F f,
enable_if<(std::is_same<DstT, float>::value &&
std::is_same<V, __m128>::value)> = nullarg)
{
return load16(mem, f);
}
template <typename V, typename DstT, typename F>
Vc_INTRINSIC __m128i load(const uint *mem, F f,
enable_if<(std::is_same<DstT, uint>::value &&
std::is_same<V, __m128i>::value)> = nullarg)
{
return load16(mem, f);
}
template <typename V, typename DstT, typename F>
Vc_INTRINSIC __m128i load(const int *mem, F f,
enable_if<(std::is_same<DstT, int>::value &&
std::is_same<V, __m128i>::value)> = nullarg)
{
return load16(mem, f);
}
template <typename V, typename DstT, typename F>
Vc_INTRINSIC __m128i load(const short *mem, F f,
enable_if<(std::is_same<DstT, short>::value &&
std::is_same<V, __m128i>::value)> = nullarg)
{
return load16(mem, f);
}
template <typename V, typename DstT, typename F>
Vc_INTRINSIC __m128i load(const ushort *mem, F f,
enable_if<(std::is_same<DstT, ushort>::value &&
std::is_same<V, __m128i>::value)> = nullarg)
{
return load16(mem, f);
}
#endif
// Generic load dispatcher: only enabled for non-narrowing integer
// combinations (sizeof(DstT) >= sizeof(SrcT)); tags the call with
// LoadTag<V, DstT> so the converting overloads below can be selected.
template <typename V, typename DstT, typename SrcT, typename Flags,
typename = enable_if<
#ifdef Vc_MSVC
!std::is_same<DstT, SrcT>::value &&
#endif
(!std::is_integral<DstT>::value || !std::is_integral<SrcT>::value ||
sizeof(DstT) >= sizeof(SrcT))>>
Vc_INTRINSIC V load(const SrcT *mem, Flags flags)
{
return load(mem, flags, LoadTag<V, DstT>());
}
// Non-converting case: memory type equals destination scalar type.
template <typename V, typename T, typename Flags>
Vc_INTRINSIC V
load(const T *mem, Flags, LoadTag<V, T>, enable_if<sizeof(V) == 16> = nullarg)
{
return SSE::VectorHelper<V>::template load<Flags>(mem);
}
// ushort memory into a short vector: same bit pattern, plain load.
template <typename Flags>
Vc_INTRINSIC __m128i load(const ushort *mem, Flags, LoadTag<__m128i, short>)
{
return SSE::VectorHelper<__m128i>::load<Flags>(mem);
}
// Converting loads into integer vectors: read only as many source elements
// as the destination has lanes (8 bytes via _mm_loadl_epi64, or 4 bytes via
// _mm_cvtsi32_si128), then widen with the cvtep* helpers.
template <typename Flags>
Vc_INTRINSIC __m128i load(const uchar *mem, Flags, LoadTag<__m128i, short>)
{
return SSE::cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128i load(const schar *mem, Flags, LoadTag<__m128i, short>)
{
return SSE::cvtepi8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128i load(const uchar *mem, Flags, LoadTag<__m128i, ushort>)
{
return SSE::cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}
// uint memory into an int vector: same bit pattern, plain load.
template <typename Flags>
Vc_INTRINSIC __m128i load(const uint *mem, Flags, LoadTag<__m128i, int>)
{
return SSE::VectorHelper<__m128i>::load<Flags>(mem);
}
template <typename Flags>
Vc_INTRINSIC __m128i load(const ushort *mem, Flags, LoadTag<__m128i, int>)
{
return SSE::cvtepu16_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128i load(const short *mem, Flags, LoadTag<__m128i, int>)
{
return SSE::cvtepi16_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}
// 4 chars fit in one 32-bit load; aliasing_cast avoids strict-aliasing UB.
template <typename Flags>
Vc_INTRINSIC __m128i load(const uchar *mem, Flags, LoadTag<__m128i, int>)
{
return SSE::cvtepu8_epi32(_mm_cvtsi32_si128(*aliasing_cast<int>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128i load(const schar *mem, Flags, LoadTag<__m128i, int>)
{
return SSE::cvtepi8_epi32(_mm_cvtsi32_si128(*aliasing_cast<int>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128i load(const ushort *mem, Flags, LoadTag<__m128i, uint>)
{
return SSE::cvtepu16_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128i load(const uchar *mem, Flags, LoadTag<__m128i, uint>)
{
return SSE::cvtepu8_epi32(_mm_cvtsi32_si128(*aliasing_cast<int>(mem)));
}
// Converting loads into double vectors (2 lanes): read exactly two source
// elements and widen via SSE::convert.
template <typename Flags>
Vc_INTRINSIC __m128d load(const float *mem, Flags, LoadTag<__m128d, double>)
{
return SSE::convert<float, double>(
_mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64 *>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128d load(const uint *mem, Flags, LoadTag<__m128d, double>)
{
return SSE::convert<uint, double>(
_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128d load(const int *mem, Flags, LoadTag<__m128d, double>)
{
return SSE::convert<int, double>(
_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}
// Two 16-bit values fit in one 32-bit load.
template <typename Flags>
Vc_INTRINSIC __m128d load(const ushort *mem, Flags, LoadTag<__m128d, double>)
{
return SSE::convert<ushort, double>(
_mm_cvtsi32_si128(*aliasing_cast<int>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128d load(const short *mem, Flags, LoadTag<__m128d, double>)
{
return SSE::convert<short, double>(
_mm_cvtsi32_si128(*aliasing_cast<int>(mem)));
}
// Two 8-bit values fit in one 16-bit read; set1 broadcasts the pair.
template <typename Flags>
Vc_INTRINSIC __m128d load(const uchar *mem, Flags, LoadTag<__m128d, double>)
{
return SSE::convert<uchar, double>(
_mm_set1_epi16(*aliasing_cast<short>(mem)));
}
- template <typename Flags>
- Vc_INTRINSIC __m128d load(const schar *mem, Flags, LoadTag<__m128d, double>)
- {
- return SSE::convert<char, double>(
- _mm_set1_epi16(*aliasing_cast<short>(mem)));
- }
// double memory -> float vector: with AVX a full 4-double load + one cvt;
// without AVX, two 2-double halves are converted and merged with movelh.
template <typename Flags>
Vc_INTRINSIC __m128 load(const double *mem, Flags, LoadTag<__m128, float>)
{
#ifdef Vc_IMPL_AVX
if (Flags::IsUnaligned) {
return _mm256_cvtpd_ps(_mm256_loadu_pd(mem));
} else if (Flags::IsStreaming) {
return _mm256_cvtpd_ps(AvxIntrinsics::stream_load<__m256d>(mem));
} else {
return _mm256_cvtpd_ps(_mm256_load_pd(mem));
}
#else
return _mm_movelh_ps(_mm_cvtpd_ps(SSE::VectorHelper<__m128d>::load<Flags>(&mem[0])),
_mm_cvtpd_ps(SSE::VectorHelper<__m128d>::load<Flags>(&mem[2])));
#endif
}
// uint needs the dedicated unsigned conversion (values >= 2^31).
template <typename Flags>
Vc_INTRINSIC __m128 load(const uint *mem, Flags f, LoadTag<__m128, float>)
{
return SSE::convert<uint, float>(load<__m128i, uint>(mem, f));
}
// All remaining integral sources go through a signed-int intermediate.
template <typename T, typename Flags,
typename = enable_if<!std::is_same<T, float>::value>>
Vc_INTRINSIC __m128 load(const T *mem, Flags f, LoadTag<__m128, float>)
{
return _mm_cvtepi32_ps(load<__m128i, int>(mem, f));
}
// shifted<amount>: shift the whole 128-bit register by `amount` bytes.
// Positive amounts shift toward lower addresses (srli), negative toward
// higher (slli); zero is the identity. The three overloads are mutually
// exclusive via enable_if on the sign of amount.
template <int amount, typename T>
Vc_INTRINSIC Vc_CONST enable_if<amount == 0, T> shifted(T k)
{
return k;
}
template <int amount, typename T>
Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 16 && amount > 0), T> shifted(T k)
{
return _mm_srli_si128(k, amount);
}
template <int amount, typename T>
Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 16 && amount < 0), T> shifted(T k)
{
return _mm_slli_si128(k, -amount);
}
- template <typename T, int Size> Vc_INTRINSIC Vc_CONST const T *IndexesFromZero()
- {
- if (Size == 4) {
- return reinterpret_cast<const T *>(SSE::_IndexesFromZero4);
- } else if (Size == 8) {
- return reinterpret_cast<const T *>(SSE::_IndexesFromZero8);
- } else if (Size == 16) {
- return reinterpret_cast<const T *>(SSE::_IndexesFromZero16);
- }
- return 0;
- }
// Population count of the low 4 bits. Hardware popcnt when available,
// otherwise the classic masked pairwise-add reduction.
Vc_INTRINSIC Vc_CONST unsigned int popcnt4(unsigned int n)
{
#ifdef Vc_IMPL_POPCNT
return _mm_popcnt_u32(n);
#else
n = (n & 0x5U) + ((n >> 1) & 0x5U);
n = (n & 0x3U) + ((n >> 2) & 0x3U);
return n;
#endif
}
// Population count of the low 8 bits.
Vc_INTRINSIC Vc_CONST unsigned int popcnt8(unsigned int n)
{
#ifdef Vc_IMPL_POPCNT
return _mm_popcnt_u32(n);
#else
n = (n & 0x55U) + ((n >> 1) & 0x55U);
n = (n & 0x33U) + ((n >> 2) & 0x33U);
n = (n & 0x0fU) + ((n >> 4) & 0x0fU);
return n;
#endif
}
// Population count of the low 16 bits.
Vc_INTRINSIC Vc_CONST unsigned int popcnt16(unsigned int n)
{
#ifdef Vc_IMPL_POPCNT
return _mm_popcnt_u32(n);
#else
n = (n & 0x5555U) + ((n >> 1) & 0x5555U);
n = (n & 0x3333U) + ((n >> 2) & 0x3333U);
n = (n & 0x0f0fU) + ((n >> 4) & 0x0f0fU);
n = (n & 0x00ffU) + ((n >> 8) & 0x00ffU);
return n;
#endif
}
- Vc_INTRINSIC Vc_CONST unsigned int popcnt32(unsigned int n)
- {
- #ifdef Vc_IMPL_POPCNT
- return _mm_popcnt_u32(n);
- #else
- n = (n & 0x55555555U) + ((n >> 1) & 0x55555555U);
- n = (n & 0x33333333U) + ((n >> 2) & 0x33333333U);
- n = (n & 0x0f0f0f0fU) + ((n >> 4) & 0x0f0f0f0fU);
- n = (n & 0x00ff00ffU) + ((n >> 8) & 0x00ff00ffU);
- n = (n & 0x0000ffffU) + ((n >>16) & 0x0000ffffU);
- return n;
- #endif
- }
// mask_cast<From, To>: reinterpret a mask of From all-ones/all-zero lanes as
// a mask of To lanes. Widening (From > To) duplicates lanes via unpack;
// narrowing (From < To) compresses lanes via saturating packs. The primary
// template only allows the trivial From == To case.
template<size_t From, size_t To, typename R> Vc_INTRINSIC Vc_CONST R mask_cast(__m128i k)
{
static_assert(From == To, "Incorrect mask cast.");
static_assert(std::is_same<R, __m128>::value, "Incorrect mask cast.");
return SSE::sse_cast<__m128>(k);
}
// 2 x 64-bit lanes -> 4 x 32-bit lanes.
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<2, 4, __m128>(__m128i k)
{
return SSE::sse_cast<__m128>(_mm_packs_epi16(k, _mm_setzero_si128()));
}
// 2 x 64-bit lanes -> 8 x 16-bit lanes (two pack steps).
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<2, 8, __m128>(__m128i k)
{
return SSE::sse_cast<__m128>(
_mm_packs_epi16(_mm_packs_epi16(k, _mm_setzero_si128()), _mm_setzero_si128()));
}
// 4 x 32-bit lanes -> 2 x 64-bit lanes.
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<4, 2, __m128>(__m128i k)
{
return SSE::sse_cast<__m128>(_mm_unpacklo_epi32(k, k));
}
// 4 x 32-bit lanes -> 8 x 16-bit lanes.
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<4, 8, __m128>(__m128i k)
{
return SSE::sse_cast<__m128>(_mm_packs_epi16(k, _mm_setzero_si128()));
}
// 8 x 16-bit lanes -> 2 x 64-bit lanes.
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<8, 2, __m128>(__m128i k)
{
const auto tmp = _mm_unpacklo_epi16(k, k);
return SSE::sse_cast<__m128>(_mm_unpacklo_epi32(tmp, tmp));
}
// 8 x 16-bit lanes -> 4 x 32-bit lanes.
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<8, 4, __m128>(__m128i k)
{
return SSE::sse_cast<__m128>(_mm_unpacklo_epi16(k, k));
}
// 16 x 8-bit lanes -> 8 x 16-bit lanes.
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<16, 8, __m128>(__m128i k)
{
return SSE::sse_cast<__m128>(_mm_unpacklo_epi8(k, k));
}
// 16 -> 4 and 16 -> 2 compose the widening steps above.
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<16, 4, __m128>(__m128i k)
{
const auto tmp = SSE::sse_cast<__m128i>(mask_cast<16, 8, __m128>(k));
return SSE::sse_cast<__m128>(_mm_unpacklo_epi16(tmp, tmp));
}
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<16, 2, __m128>(__m128i k)
{
const auto tmp = SSE::sse_cast<__m128i>(mask_cast<16, 4, __m128>(k));
return SSE::sse_cast<__m128>(_mm_unpacklo_epi32(tmp, tmp));
}
// allone<V>: a register with every bit set; zero<V>: a register of all zeros.
// Only declared generically; defined per SSE register type.
template <typename V> Vc_INTRINSIC_L Vc_CONST_L V allone() Vc_INTRINSIC_R Vc_CONST_R;
template<> Vc_INTRINSIC Vc_CONST __m128 allone<__m128 >() { return SSE::_mm_setallone_ps(); }
template<> Vc_INTRINSIC Vc_CONST __m128i allone<__m128i>() { return SSE::_mm_setallone_si128(); }
template<> Vc_INTRINSIC Vc_CONST __m128d allone<__m128d>() { return SSE::_mm_setallone_pd(); }
template <typename V> inline V zero();
template<> Vc_INTRINSIC Vc_CONST __m128 zero<__m128 >() { return _mm_setzero_ps(); }
template<> Vc_INTRINSIC Vc_CONST __m128i zero<__m128i>() { return _mm_setzero_si128(); }
template<> Vc_INTRINSIC Vc_CONST __m128d zero<__m128d>() { return _mm_setzero_pd(); }
// Lane-wise negation; the integral_constant argument encodes sizeof(lane).
// Floating point flips the sign bit; integers subtract from zero (or use
// psign with an all-ones mask on SSSE3).
Vc_ALWAYS_INLINE Vc_CONST __m128 negate(__m128 v, std::integral_constant<std::size_t, 4>)
{
return _mm_xor_ps(v, SSE::_mm_setsignmask_ps());
}
Vc_ALWAYS_INLINE Vc_CONST __m128d negate(__m128d v, std::integral_constant<std::size_t, 8>)
{
return _mm_xor_pd(v, SSE::_mm_setsignmask_pd());
}
Vc_ALWAYS_INLINE Vc_CONST __m128i negate(__m128i v, std::integral_constant<std::size_t, 4>)
{
#ifdef Vc_IMPL_SSSE3
return _mm_sign_epi32(v, allone<__m128i>());
#else
return _mm_sub_epi32(_mm_setzero_si128(), v);
#endif
}
Vc_ALWAYS_INLINE Vc_CONST __m128i negate(__m128i v, std::integral_constant<std::size_t, 2>)
{
#ifdef Vc_IMPL_SSSE3
return _mm_sign_epi16(v, allone<__m128i>());
#else
return _mm_sub_epi16(_mm_setzero_si128(), v);
#endif
}
// Bitwise operations, overloaded for each SSE register type. not_ is built
// from andnot with an all-ones second operand.
Vc_INTRINSIC __m128 xor_(__m128 a, __m128 b) { return _mm_xor_ps(a, b); }
Vc_INTRINSIC __m128d xor_(__m128d a, __m128d b) { return _mm_xor_pd(a, b); }
Vc_INTRINSIC __m128i xor_(__m128i a, __m128i b) { return _mm_xor_si128(a, b); }
Vc_INTRINSIC __m128 or_(__m128 a, __m128 b) { return _mm_or_ps(a, b); }
Vc_INTRINSIC __m128d or_(__m128d a, __m128d b) { return _mm_or_pd(a, b); }
Vc_INTRINSIC __m128i or_(__m128i a, __m128i b) { return _mm_or_si128(a, b); }
Vc_INTRINSIC __m128 and_(__m128 a, __m128 b) { return _mm_and_ps(a, b); }
Vc_INTRINSIC __m128d and_(__m128d a, __m128d b) { return _mm_and_pd(a, b); }
Vc_INTRINSIC __m128i and_(__m128i a, __m128i b) { return _mm_and_si128(a, b); }
Vc_INTRINSIC __m128 andnot_(__m128 a, __m128 b) { return _mm_andnot_ps(a, b); }
Vc_INTRINSIC __m128d andnot_(__m128d a, __m128d b) { return _mm_andnot_pd(a, b); }
Vc_INTRINSIC __m128i andnot_(__m128i a, __m128i b) { return _mm_andnot_si128(a, b); }
Vc_INTRINSIC __m128 not_(__m128 a) { return andnot_(a, allone<__m128 >()); }
Vc_INTRINSIC __m128d not_(__m128d a) { return andnot_(a, allone<__m128d>()); }
Vc_INTRINSIC __m128i not_(__m128i a) { return andnot_(a, allone<__m128i>()); }
// Lane-wise arithmetic and min/max. The trailing unnamed value parameter is
// a type tag selecting the scalar element type (it disambiguates the shared
// __m128i representation of int/uint/short/ushort/schar/uchar).
Vc_INTRINSIC __m128 add(__m128 a, __m128 b, float) { return _mm_add_ps(a, b); }
Vc_INTRINSIC __m128d add(__m128d a, __m128d b, double) { return _mm_add_pd(a, b); }
Vc_INTRINSIC __m128i add(__m128i a, __m128i b, int) { return _mm_add_epi32(a, b); }
Vc_INTRINSIC __m128i add(__m128i a, __m128i b, uint) { return _mm_add_epi32(a, b); }
Vc_INTRINSIC __m128i add(__m128i a, __m128i b, short) { return _mm_add_epi16(a, b); }
Vc_INTRINSIC __m128i add(__m128i a, __m128i b, ushort) { return _mm_add_epi16(a, b); }
Vc_INTRINSIC __m128i add(__m128i a, __m128i b, schar) { return _mm_add_epi8 (a, b); }
Vc_INTRINSIC __m128i add(__m128i a, __m128i b, uchar) { return _mm_add_epi8 (a, b); }
Vc_INTRINSIC __m128 sub(__m128 a, __m128 b, float) { return _mm_sub_ps(a, b); }
Vc_INTRINSIC __m128d sub(__m128d a, __m128d b, double) { return _mm_sub_pd(a, b); }
Vc_INTRINSIC __m128i sub(__m128i a, __m128i b, int) { return _mm_sub_epi32(a, b); }
Vc_INTRINSIC __m128i sub(__m128i a, __m128i b, uint) { return _mm_sub_epi32(a, b); }
Vc_INTRINSIC __m128i sub(__m128i a, __m128i b, short) { return _mm_sub_epi16(a, b); }
Vc_INTRINSIC __m128i sub(__m128i a, __m128i b, ushort) { return _mm_sub_epi16(a, b); }
Vc_INTRINSIC __m128i sub(__m128i a, __m128i b, schar) { return _mm_sub_epi8 (a, b); }
Vc_INTRINSIC __m128i sub(__m128i a, __m128i b, uchar) { return _mm_sub_epi8 (a, b); }
Vc_INTRINSIC __m128 mul(__m128 a, __m128 b, float) { return _mm_mul_ps(a, b); }
Vc_INTRINSIC __m128d mul(__m128d a, __m128d b, double) { return _mm_mul_pd(a, b); }
// 32-bit multiply: pmulld on SSE4.1; otherwise two widening pmuludq on the
// even/odd lanes, recombined via shuffles (low 32 bits are correct for both
// signed and unsigned operands).
Vc_INTRINSIC __m128i mul(__m128i a, __m128i b, int) {
#ifdef Vc_IMPL_SSE4_1
return _mm_mullo_epi32(a, b);
#else
const __m128i aShift = _mm_srli_si128(a, 4);
const __m128i ab02 = _mm_mul_epu32(a, b);
const __m128i bShift = _mm_srli_si128(b, 4);
const __m128i ab13 = _mm_mul_epu32(aShift, bShift);
return _mm_unpacklo_epi32(_mm_shuffle_epi32(ab02, 8), _mm_shuffle_epi32(ab13, 8));
#endif
}
Vc_INTRINSIC __m128i mul(__m128i a, __m128i b, uint) { return mul(a, b, int()); }
Vc_INTRINSIC __m128i mul(__m128i a, __m128i b, short) { return _mm_mullo_epi16(a, b); }
Vc_INTRINSIC __m128i mul(__m128i a, __m128i b, ushort) { return _mm_mullo_epi16(a, b); }
// 8-bit multiply: no SSE instruction exists; use the compiler's builtin
// vector types when possible, otherwise multiply even and odd bytes as
// 16-bit lanes and merge the low bytes.
Vc_INTRINSIC __m128i mul(__m128i a, __m128i b, schar) {
#ifdef Vc_USE_BUILTIN_VECTOR_TYPES
using B = Common::BuiltinType<schar, 16>;
const auto x = aliasing_cast<B>(a) * aliasing_cast<B>(b);
return reinterpret_cast<const __m128i &>(x);
#else
return or_(
and_(_mm_mullo_epi16(a, b), _mm_slli_epi16(allone<__m128i>(), 8)),
_mm_slli_epi16(_mm_mullo_epi16(_mm_srli_si128(a, 1), _mm_srli_si128(b, 1)), 8));
#endif
}
Vc_INTRINSIC __m128i mul(__m128i a, __m128i b, uchar) {
#ifdef Vc_USE_BUILTIN_VECTOR_TYPES
using B = Common::BuiltinType<uchar, 16>;
const auto x = aliasing_cast<B>(a) * aliasing_cast<B>(b);
return reinterpret_cast<const __m128i &>(x);
#else
return or_(
and_(_mm_mullo_epi16(a, b), _mm_slli_epi16(allone<__m128i>(), 8)),
_mm_slli_epi16(_mm_mullo_epi16(_mm_srli_si128(a, 1), _mm_srli_si128(b, 1)), 8));
#endif
}
Vc_INTRINSIC __m128 div(__m128 a, __m128 b, float) { return _mm_div_ps(a, b); }
Vc_INTRINSIC __m128d div(__m128d a, __m128d b, double) { return _mm_div_pd(a, b); }
// min/max dispatch to native instructions where SSE2 has them and to the
// SSE:: helpers (which emulate or use SSE4.1) elsewhere.
Vc_INTRINSIC __m128 min(__m128 a, __m128 b, float) { return _mm_min_ps(a, b); }
Vc_INTRINSIC __m128d min(__m128d a, __m128d b, double) { return _mm_min_pd(a, b); }
Vc_INTRINSIC __m128i min(__m128i a, __m128i b, int) { return SSE::min_epi32(a, b); }
Vc_INTRINSIC __m128i min(__m128i a, __m128i b, uint) { return SSE::min_epu32(a, b); }
Vc_INTRINSIC __m128i min(__m128i a, __m128i b, short) { return _mm_min_epi16(a, b); }
Vc_INTRINSIC __m128i min(__m128i a, __m128i b, ushort) { return SSE::min_epu16(a, b); }
Vc_INTRINSIC __m128i min(__m128i a, __m128i b, schar) { return SSE::min_epi8 (a, b); }
Vc_INTRINSIC __m128i min(__m128i a, __m128i b, uchar) { return _mm_min_epu8 (a, b); }
Vc_INTRINSIC __m128 max(__m128 a, __m128 b, float) { return _mm_max_ps(a, b); }
Vc_INTRINSIC __m128d max(__m128d a, __m128d b, double) { return _mm_max_pd(a, b); }
Vc_INTRINSIC __m128i max(__m128i a, __m128i b, int) { return SSE::max_epi32(a, b); }
Vc_INTRINSIC __m128i max(__m128i a, __m128i b, uint) { return SSE::max_epu32(a, b); }
Vc_INTRINSIC __m128i max(__m128i a, __m128i b, short) { return _mm_max_epi16(a, b); }
Vc_INTRINSIC __m128i max(__m128i a, __m128i b, ushort) { return SSE::max_epu16(a, b); }
Vc_INTRINSIC __m128i max(__m128i a, __m128i b, schar) { return SSE::max_epi8 (a, b); }
Vc_INTRINSIC __m128i max(__m128i a, __m128i b, uchar) { return _mm_max_epu8 (a, b); }
// Horizontal reductions (sum / product) over all lanes; the tag parameter
// selects element type. Each step folds the upper half onto the lower half;
// the scalar result is extracted from lane 0.
Vc_INTRINSIC float add(__m128 a, float) {
a = _mm_add_ps(a, _mm_movehl_ps(a, a));
a = _mm_add_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)));
return _mm_cvtss_f32(a);
}
Vc_INTRINSIC double add(__m128d a, double) {
a = _mm_add_sd(a, _mm_unpackhi_pd(a, a));
return _mm_cvtsd_f64(a);
}
Vc_INTRINSIC int add(__m128i a, int) {
a = add(a, _mm_srli_si128(a, 8), int());
a = add(a, _mm_srli_si128(a, 4), int());
return _mm_cvtsi128_si32(a);
}
Vc_INTRINSIC uint add(__m128i a, uint) { return add(a, int()); }
Vc_INTRINSIC short add(__m128i a, short) {
a = add(a, _mm_srli_si128(a, 8), short());
a = add(a, _mm_srli_si128(a, 4), short());
a = add(a, _mm_srli_si128(a, 2), short());
return _mm_cvtsi128_si32(a);
}
Vc_INTRINSIC ushort add(__m128i a, ushort) { return add(a, short()); }
Vc_INTRINSIC schar add(__m128i a, schar) {
a = add(a, _mm_srli_si128(a, 8), schar());
a = add(a, _mm_srli_si128(a, 4), schar());
a = add(a, _mm_srli_si128(a, 2), schar());
a = add(a, _mm_srli_si128(a, 1), schar());
return _mm_cvtsi128_si32(a);
}
Vc_INTRINSIC uchar add(__m128i a, uchar) { return add(a, schar()); }
Vc_INTRINSIC float mul(__m128 a, float) {
a = _mm_mul_ps(a, _mm_movehl_ps(a, a));
a = _mm_mul_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)));
return _mm_cvtss_f32(a);
}
Vc_INTRINSIC double mul(__m128d a, double) {
a = _mm_mul_sd(a, _mm_unpackhi_pd(a, a));
return _mm_cvtsd_f64(a);
}
Vc_INTRINSIC int mul(__m128i a, int) {
a = mul(a, _mm_srli_si128(a, 8), int());
a = mul(a, _mm_srli_si128(a, 4), int());
return _mm_cvtsi128_si32(a);
}
Vc_INTRINSIC uint mul(__m128i a, uint) { return mul(a, int()); }
Vc_INTRINSIC short mul(__m128i a, short) {
a = mul(a, _mm_srli_si128(a, 8), short());
a = mul(a, _mm_srli_si128(a, 4), short());
a = mul(a, _mm_srli_si128(a, 2), short());
return _mm_cvtsi128_si32(a);
}
Vc_INTRINSIC ushort mul(__m128i a, ushort) { return mul(a, short()); }
// NOTE(review): this schar product reduction (shift/mask into two 16-bit
// halves, then two short reductions) is non-obvious — verify against the
// upstream implementation before relying on exact overflow behavior.
Vc_INTRINSIC schar mul(__m128i a, schar) {
const __m128i s0 = _mm_srai_epi16(a, 1);
const __m128i s1 = Detail::and_(a, _mm_set1_epi32(0x0f0f0f0f));
return mul(mul(s0, s1, short()), short());
}
Vc_INTRINSIC uchar mul(__m128i a, uchar) { return mul(a, schar()); }
// Horizontal minimum over all lanes; each step folds the upper half onto the
// lower half, the scalar result comes from lane 0 (the 8-bit variants finish
// the last fold in scalar code).
Vc_INTRINSIC float min(__m128 a, float) {
a = _mm_min_ps(a, _mm_movehl_ps(a, a));
a = _mm_min_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)));
return _mm_cvtss_f32(a);
}
Vc_INTRINSIC double min(__m128d a, double) {
a = _mm_min_sd(a, _mm_unpackhi_pd(a, a));
return _mm_cvtsd_f64(a);
}
Vc_INTRINSIC int min(__m128i a, int) {
a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), int());
a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), int());
return _mm_cvtsi128_si32(a);
}
Vc_INTRINSIC uint min(__m128i a, uint) {
a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), uint());
a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), uint());
return _mm_cvtsi128_si32(a);
}
Vc_INTRINSIC short min(__m128i a, short) {
a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), short());
a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), short());
a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), short());
return _mm_cvtsi128_si32(a);
}
Vc_INTRINSIC ushort min(__m128i a, ushort) {
a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), ushort());
a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), ushort());
a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), ushort());
return _mm_cvtsi128_si32(a);
}
Vc_INTRINSIC schar min(__m128i a, schar) {
a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), schar());
a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), schar());
a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), schar());
// two candidate bytes remain in lane 0; resolve in scalar code
return std::min(schar(_mm_cvtsi128_si32(a) >> 8), schar(_mm_cvtsi128_si32(a)));
}
- Vc_INTRINSIC uchar min(__m128i a, uchar) {
- a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), schar());
- a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), schar());
- a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), schar());
- return std::min((_mm_cvtsi128_si32(a) >> 8) & 0xff, _mm_cvtsi128_si32(a) & 0xff);
- }
// Horizontal maximum over all lanes; mirrors the min reductions above.
Vc_INTRINSIC float max(__m128 a, float) {
a = _mm_max_ps(a, _mm_movehl_ps(a, a));
a = _mm_max_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)));
return _mm_cvtss_f32(a);
}
Vc_INTRINSIC double max(__m128d a, double) {
a = _mm_max_sd(a, _mm_unpackhi_pd(a, a));
return _mm_cvtsd_f64(a);
}
Vc_INTRINSIC int max(__m128i a, int) {
a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), int());
a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), int());
return _mm_cvtsi128_si32(a);
}
Vc_INTRINSIC uint max(__m128i a, uint) {
a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), uint());
a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), uint());
return _mm_cvtsi128_si32(a);
}
Vc_INTRINSIC short max(__m128i a, short) {
a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), short());
a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), short());
a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), short());
return _mm_cvtsi128_si32(a);
}
Vc_INTRINSIC ushort max(__m128i a, ushort) {
a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), ushort());
a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), ushort());
a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), ushort());
return _mm_cvtsi128_si32(a);
}
Vc_INTRINSIC schar max(__m128i a, schar) {
a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), schar());
a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), schar());
a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), schar());
// two candidate bytes remain in lane 0; resolve in scalar code
return std::max(schar(_mm_cvtsi128_si32(a) >> 8), schar(_mm_cvtsi128_si32(a)));
}
- Vc_INTRINSIC uchar max(__m128i a, uchar) {
- a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), schar());
- a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), schar());
- a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), schar());
- return std::max((_mm_cvtsi128_si32(a) >> 8) & 0xff, _mm_cvtsi128_si32(a) & 0xff);
- }
- template <Vc::Implementation, typename T>
- Vc_CONST_L SSE::Vector<T> sorted(SSE::Vector<T> x) Vc_CONST_R;
- template <typename T> Vc_INTRINSIC Vc_CONST SSE::Vector<T> sorted(SSE::Vector<T> x)
- {
- static_assert(!CurrentImplementation::is(ScalarImpl),
- "Detail::sorted can only be instantiated if a non-Scalar "
- "implementation is selected.");
- return sorted < CurrentImplementation::is_between(SSE2Impl, SSSE3Impl)
- ? SSE2Impl
- : CurrentImplementation::is_between(SSE41Impl, SSE42Impl)
- ? SSE41Impl
- : CurrentImplementation::current() > (x);
- }
- template <typename V> constexpr int sanitize(int n)
- {
- return (n >= int(sizeof(V)) || n <= -int(sizeof(V))) ? 0 : n;
- }
- template <typename T, size_t N, typename V>
- static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 16), V> rotated(V v, int amount)
- {
- using namespace SSE;
- switch (static_cast<unsigned int>(amount) % N) {
- case 0:
- return v;
- case 1:
- return sse_cast<V>(_mm_alignr_epi8(v, v, sanitize<V>(1 * sizeof(T))));
- case 2:
- return sse_cast<V>(_mm_alignr_epi8(v, v, sanitize<V>(2 * sizeof(T))));
- case 3:
- return sse_cast<V>(_mm_alignr_epi8(v, v, sanitize<V>(3 * sizeof(T))));
- case 4:
- return sse_cast<V>(_mm_alignr_epi8(v, v, sanitize<V>(4 * sizeof(T))));
- case 5:
- return sse_cast<V>(_mm_alignr_epi8(v, v, sanitize<V>(5 * sizeof(T))));
- case 6:
- return sse_cast<V>(_mm_alignr_epi8(v, v, sanitize<V>(6 * sizeof(T))));
- case 7:
- return sse_cast<V>(_mm_alignr_epi8(v, v, sanitize<V>(7 * sizeof(T))));
- }
- return sse_cast<V>(_mm_setzero_si128());
- }
- template<typename V, size_t Size, size_t VSize> struct InterleaveImpl;
- template<typename V> struct InterleaveImpl<V, 8, 16> {
- template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,
- const typename V::AsArg v0, const typename V::AsArg v1)
- {
- const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v1.data());
- const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v1.data());
- #ifdef __x86_64__
- const long long tmp00 = _mm_cvtsi128_si64(tmp0);
- const long long tmp01 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(tmp0, tmp0));
- const long long tmp10 = _mm_cvtsi128_si64(tmp1);
- const long long tmp11 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(tmp1, tmp1));
- aliasing_cast<int>(data[i[0]]) = tmp00;
- aliasing_cast<int>(data[i[1]]) = tmp00 >> 32;
- aliasing_cast<int>(data[i[2]]) = tmp01;
- aliasing_cast<int>(data[i[3]]) = tmp01 >> 32;
- aliasing_cast<int>(data[i[4]]) = tmp10;
- aliasing_cast<int>(data[i[5]]) = tmp10 >> 32;
- aliasing_cast<int>(data[i[6]]) = tmp11;
- aliasing_cast<int>(data[i[7]]) = tmp11 >> 32;
- #elif defined(Vc_IMPL_SSE4_1)
- using namespace SseIntrinsics;
- aliasing_cast<int>(data[i[0]]) = _mm_cvtsi128_si32(tmp0);
- aliasing_cast<int>(data[i[1]]) = extract_epi32<1>(tmp0);
- aliasing_cast<int>(data[i[2]]) = extract_epi32<2>(tmp0);
- aliasing_cast<int>(data[i[3]]) = extract_epi32<3>(tmp0);
- aliasing_cast<int>(data[i[4]]) = _mm_cvtsi128_si32(tmp1);
- aliasing_cast<int>(data[i[5]]) = extract_epi32<1>(tmp1);
- aliasing_cast<int>(data[i[6]]) = extract_epi32<2>(tmp1);
- aliasing_cast<int>(data[i[7]]) = extract_epi32<3>(tmp1);
- #else
- aliasing_cast<int>(data[i[0]]) = _mm_cvtsi128_si32(tmp0);
- aliasing_cast<int>(data[i[1]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp0, 4));
- aliasing_cast<int>(data[i[2]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp0, 8));
- aliasing_cast<int>(data[i[3]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp0, 12));
- aliasing_cast<int>(data[i[4]]) = _mm_cvtsi128_si32(tmp1);
- aliasing_cast<int>(data[i[5]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp1, 4));
- aliasing_cast<int>(data[i[6]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp1, 8));
- aliasing_cast<int>(data[i[7]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp1, 12));
- #endif
- }
- static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<2> &i,
- const typename V::AsArg v0, const typename V::AsArg v1)
- {
- const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v1.data());
- const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v1.data());
- V(tmp0).store(&data[i[0]], Vc::Unaligned);
- V(tmp1).store(&data[i[4]], Vc::Unaligned);
- }
- template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,
- const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2)
- {
- #if defined Vc_USE_MASKMOV_SCATTER && !defined Vc_MSVC
- const __m64 mask = _mm_set_pi16(0, -1, -1, -1);
- const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v2.data());
- const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v2.data());
- const __m128i tmp2 = _mm_unpacklo_epi16(v1.data(), v1.data());
- const __m128i tmp3 = _mm_unpackhi_epi16(v1.data(), v1.data());
- const __m128i tmp4 = _mm_unpacklo_epi16(tmp0, tmp2);
- const __m128i tmp5 = _mm_unpackhi_epi16(tmp0, tmp2);
- const __m128i tmp6 = _mm_unpacklo_epi16(tmp1, tmp3);
- const __m128i tmp7 = _mm_unpackhi_epi16(tmp1, tmp3);
- _mm_maskmove_si64(_mm_movepi64_pi64(tmp4), mask, reinterpret_cast<char *>(&data[i[0]]));
- _mm_maskmove_si64(_mm_movepi64_pi64(_mm_srli_si128(tmp4, 8)), mask, reinterpret_cast<char *>(&data[i[1]]));
- _mm_maskmove_si64(_mm_movepi64_pi64(tmp5), mask, reinterpret_cast<char *>(&data[i[2]]));
- _mm_maskmove_si64(_mm_movepi64_pi64(_mm_srli_si128(tmp5, 8)), mask, reinterpret_cast<char *>(&data[i[3]]));
- _mm_maskmove_si64(_mm_movepi64_pi64(tmp6), mask, reinterpret_cast<char *>(&data[i[4]]));
- _mm_maskmove_si64(_mm_movepi64_pi64(_mm_srli_si128(tmp6, 8)), mask, reinterpret_cast<char *>(&data[i[5]]));
- _mm_maskmove_si64(_mm_movepi64_pi64(tmp7), mask, reinterpret_cast<char *>(&data[i[6]]));
- _mm_maskmove_si64(_mm_movepi64_pi64(_mm_srli_si128(tmp7, 8)), mask, reinterpret_cast<char *>(&data[i[7]]));
- _mm_empty();
- #else
- interleave(data, i, v0, v1);
- v2.scatter(data + 2, i);
- #endif
- }
- template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,
- const typename V::AsArg v0, const typename V::AsArg v1,
- const typename V::AsArg v2, const typename V::AsArg v3)
- {
- const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v2.data());
- const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v2.data());
- const __m128i tmp2 = _mm_unpacklo_epi16(v1.data(), v3.data());
- const __m128i tmp3 = _mm_unpackhi_epi16(v1.data(), v3.data());
- const __m128i tmp4 = _mm_unpacklo_epi16(tmp0, tmp2);
- const __m128i tmp5 = _mm_unpackhi_epi16(tmp0, tmp2);
- const __m128i tmp6 = _mm_unpacklo_epi16(tmp1, tmp3);
- const __m128i tmp7 = _mm_unpackhi_epi16(tmp1, tmp3);
- _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[0]]), tmp4);
- _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[2]]), tmp5);
- _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[4]]), tmp6);
- _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[6]]), tmp7);
- _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), _mm_castsi128_ps(tmp4));
- _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), _mm_castsi128_ps(tmp5));
- _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[5]]), _mm_castsi128_ps(tmp6));
- _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[7]]), _mm_castsi128_ps(tmp7));
- }
- static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<4> &i,
- const typename V::AsArg v0, const typename V::AsArg v1,
- const typename V::AsArg v2, const typename V::AsArg v3)
- {
- const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v2.data());
- const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v2.data());
- const __m128i tmp2 = _mm_unpacklo_epi16(v1.data(), v3.data());
- const __m128i tmp3 = _mm_unpackhi_epi16(v1.data(), v3.data());
- const __m128i tmp4 = _mm_unpacklo_epi16(tmp0, tmp2);
- const __m128i tmp5 = _mm_unpackhi_epi16(tmp0, tmp2);
- const __m128i tmp6 = _mm_unpacklo_epi16(tmp1, tmp3);
- const __m128i tmp7 = _mm_unpackhi_epi16(tmp1, tmp3);
- V(tmp4).store(&data[i[0]], ::Vc::Unaligned);
- V(tmp5).store(&data[i[2]], ::Vc::Unaligned);
- V(tmp6).store(&data[i[4]], ::Vc::Unaligned);
- V(tmp7).store(&data[i[6]], ::Vc::Unaligned);
- }
- template <typename I>
- static inline void interleave(typename V::EntryType *const data, const I &i,
- const typename V::AsArg v0, const typename V::AsArg v1,
- const typename V::AsArg v2, const typename V::AsArg v3,
- const typename V::AsArg v4)
- {
- interleave(data, i, v0, v1, v2, v3);
- v4.scatter(data + 4, i);
- }
- template <typename I>
- static inline void interleave(typename V::EntryType *const data, const I &i,
- const typename V::AsArg v0, const typename V::AsArg v1,
- const typename V::AsArg v2, const typename V::AsArg v3,
- const typename V::AsArg v4, const typename V::AsArg v5)
- {
- interleave(data, i, v0, v1, v2, v3);
- interleave(data + 4, i, v4, v5);
- }
- template <typename I>
- static inline void interleave(typename V::EntryType *const data, const I &i,
- const typename V::AsArg v0, const typename V::AsArg v1,
- const typename V::AsArg v2, const typename V::AsArg v3,
- const typename V::AsArg v4, const typename V::AsArg v5,
- const typename V::AsArg v6)
- {
- interleave(data, i, v0, v1, v2, v3);
- interleave(data + 4, i, v4, v5, v6);
- }
- template <typename I>
- static inline void interleave(typename V::EntryType *const data, const I &i,
- const typename V::AsArg v0, const typename V::AsArg v1,
- const typename V::AsArg v2, const typename V::AsArg v3,
- const typename V::AsArg v4, const typename V::AsArg v5,
- const typename V::AsArg v6, const typename V::AsArg v7)
- {
- interleave(data, i, v0, v1, v2, v3);
- interleave(data + 4, i, v4, v5, v6, v7);
- }
- template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
- const I &i, V &v0, V &v1)
- {
- const __m128i a = _mm_cvtsi32_si128(*aliasing_cast<int>(&data[i[0]]));
- const __m128i b = _mm_cvtsi32_si128(*aliasing_cast<int>(&data[i[1]]));
- const __m128i c = _mm_cvtsi32_si128(*aliasing_cast<int>(&data[i[2]]));
- const __m128i d = _mm_cvtsi32_si128(*aliasing_cast<int>(&data[i[3]]));
- const __m128i e = _mm_cvtsi32_si128(*aliasing_cast<int>(&data[i[4]]));
- const __m128i f = _mm_cvtsi32_si128(*aliasing_cast<int>(&data[i[5]]));
- const __m128i g = _mm_cvtsi32_si128(*aliasing_cast<int>(&data[i[6]]));
- const __m128i h = _mm_cvtsi32_si128(*aliasing_cast<int>(&data[i[7]]));
- const __m128i tmp2 = _mm_unpacklo_epi16(a, e);
- const __m128i tmp3 = _mm_unpacklo_epi16(c, g);
- const __m128i tmp4 = _mm_unpacklo_epi16(b, f);
- const __m128i tmp5 = _mm_unpacklo_epi16(d, h);
- const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3);
- const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5);
- v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
- v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
- }
- template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
- const I &i, V &v0, V &v1, V &v2)
- {
- const __m128i a = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[0]]));
- const __m128i b = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[1]]));
- const __m128i c = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[2]]));
- const __m128i d = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[3]]));
- const __m128i e = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[4]]));
- const __m128i f = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[5]]));
- const __m128i g = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[6]]));
- const __m128i h = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[7]]));
- const __m128i tmp2 = _mm_unpacklo_epi16(a, e);
- const __m128i tmp4 = _mm_unpacklo_epi16(b, f);
- const __m128i tmp3 = _mm_unpacklo_epi16(c, g);
- const __m128i tmp5 = _mm_unpacklo_epi16(d, h);
- const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3);
- const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5);
- const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3);
- const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5);
- v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
- v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
- v2.data() = _mm_unpacklo_epi16(tmp6, tmp7);
- }
- template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
- const I &i, V &v0, V &v1, V &v2, V &v3)
- {
- const __m128i a = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[0]]));
- const __m128i b = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[1]]));
- const __m128i c = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[2]]));
- const __m128i d = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[3]]));
- const __m128i e = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[4]]));
- const __m128i f = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[5]]));
- const __m128i g = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[6]]));
- const __m128i h = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[7]]));
- const __m128i tmp2 = _mm_unpacklo_epi16(a, e);
- const __m128i tmp4 = _mm_unpacklo_epi16(b, f);
- const __m128i tmp3 = _mm_unpacklo_epi16(c, g);
- const __m128i tmp5 = _mm_unpacklo_epi16(d, h);
- const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3);
- const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5);
- const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3);
- const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5);
- v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
- v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
- v2.data() = _mm_unpacklo_epi16(tmp6, tmp7);
- v3.data() = _mm_unpackhi_epi16(tmp6, tmp7);
- }
- template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
- const I &i, V &v0, V &v1, V &v2, V &v3, V &v4)
- {
- const __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]]));
- const __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]]));
- const __m128i c = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]]));
- const __m128i d = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]]));
- const __m128i e = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]]));
- const __m128i f = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]]));
- const __m128i g = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]]));
- const __m128i h = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]]));
- const __m128i tmp2 = _mm_unpacklo_epi16(a, e);
- const __m128i tmp4 = _mm_unpacklo_epi16(b, f);
- const __m128i tmp3 = _mm_unpacklo_epi16(c, g);
- const __m128i tmp5 = _mm_unpacklo_epi16(d, h);
- const __m128i tmp10 = _mm_unpackhi_epi16(a, e);
- const __m128i tmp11 = _mm_unpackhi_epi16(c, g);
- const __m128i tmp12 = _mm_unpackhi_epi16(b, f);
- const __m128i tmp13 = _mm_unpackhi_epi16(d, h);
- const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3);
- const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5);
- const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3);
- const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5);
- const __m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11);
- const __m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13);
- v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
- v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
- v2.data() = _mm_unpacklo_epi16(tmp6, tmp7);
- v3.data() = _mm_unpackhi_epi16(tmp6, tmp7);
- v4.data() = _mm_unpacklo_epi16(tmp8, tmp9);
- }
- template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
- const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
- {
- const __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]]));
- const __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]]));
- const __m128i c = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]]));
- const __m128i d = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]]));
- const __m128i e = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]]));
- const __m128i f = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]]));
- const __m128i g = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]]));
- const __m128i h = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]]));
- const __m128i tmp2 = _mm_unpacklo_epi16(a, e);
- const __m128i tmp4 = _mm_unpacklo_epi16(b, f);
- const __m128i tmp3 = _mm_unpacklo_epi16(c, g);
- const __m128i tmp5 = _mm_unpacklo_epi16(d, h);
- const __m128i tmp10 = _mm_unpackhi_epi16(a, e);
- const __m128i tmp11 = _mm_unpackhi_epi16(c, g);
- const __m128i tmp12 = _mm_unpackhi_epi16(b, f);
- const __m128i tmp13 = _mm_unpackhi_epi16(d, h);
- const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3);
- const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5);
- const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3);
- const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5);
- const __m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11);
- const __m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13);
- v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
- v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
- v2.data() = _mm_unpacklo_epi16(tmp6, tmp7);
- v3.data() = _mm_unpackhi_epi16(tmp6, tmp7);
- v4.data() = _mm_unpacklo_epi16(tmp8, tmp9);
- v5.data() = _mm_unpackhi_epi16(tmp8, tmp9);
- }
- template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
- const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6)
- {
- const __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]]));
- const __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]]));
- const __m128i c = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]]));
- const __m128i d = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]]));
- const __m128i e = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]]));
- const __m128i f = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]]));
- const __m128i g = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]]));
- const __m128i h = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]]));
- const __m128i tmp2 = _mm_unpacklo_epi16(a, e);
- const __m128i tmp4 = _mm_unpacklo_epi16(b, f);
- const __m128i tmp3 = _mm_unpacklo_epi16(c, g);
- const __m128i tmp5 = _mm_unpacklo_epi16(d, h);
- const __m128i tmp10 = _mm_unpackhi_epi16(a, e);
- const __m128i tmp11 = _mm_unpackhi_epi16(c, g);
- const __m128i tmp12 = _mm_unpackhi_epi16(b, f);
- const __m128i tmp13 = _mm_unpackhi_epi16(d, h);
- const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3);
- const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5);
- const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3);
- const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5);
- const __m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11);
- const __m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13);
- const __m128i tmp14 = _mm_unpackhi_epi16(tmp10, tmp11);
- const __m128i tmp15 = _mm_unpackhi_epi16(tmp12, tmp13);
- v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
- v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
- v2.data() = _mm_unpacklo_epi16(tmp6, tmp7);
- v3.data() = _mm_unpackhi_epi16(tmp6, tmp7);
- v4.data() = _mm_unpacklo_epi16(tmp8, tmp9);
- v5.data() = _mm_unpackhi_epi16(tmp8, tmp9);
- v6.data() = _mm_unpacklo_epi16(tmp14, tmp15);
- }
- template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
- const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7)
- {
- const __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]]));
- const __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]]));
- const __m128i c = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]]));
- const __m128i d = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]]));
- const __m128i e = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]]));
- const __m128i f = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]]));
- const __m128i g = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]]));
- const __m128i h = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]]));
- const __m128i tmp2 = _mm_unpacklo_epi16(a, e);
- const __m128i tmp4 = _mm_unpacklo_epi16(b, f);
- const __m128i tmp3 = _mm_unpacklo_epi16(c, g);
- const __m128i tmp5 = _mm_unpacklo_epi16(d, h);
- const __m128i tmp10 = _mm_unpackhi_epi16(a, e);
- const __m128i tmp11 = _mm_unpackhi_epi16(c, g);
- const __m128i tmp12 = _mm_unpackhi_epi16(b, f);
- const __m128i tmp13 = _mm_unpackhi_epi16(d, h);
- const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3);
- const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5);
- const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3);
- const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5);
- const __m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11);
- const __m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13);
- const __m128i tmp14 = _mm_unpackhi_epi16(tmp10, tmp11);
- const __m128i tmp15 = _mm_unpackhi_epi16(tmp12, tmp13);
- v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
- v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
- v2.data() = _mm_unpacklo_epi16(tmp6, tmp7);
- v3.data() = _mm_unpackhi_epi16(tmp6, tmp7);
- v4.data() = _mm_unpacklo_epi16(tmp8, tmp9);
- v5.data() = _mm_unpackhi_epi16(tmp8, tmp9);
- v6.data() = _mm_unpacklo_epi16(tmp14, tmp15);
- v7.data() = _mm_unpackhi_epi16(tmp14, tmp15);
- }
- };
- template<typename V> struct InterleaveImpl<V, 4, 16> {
- static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<2> &i,
- const typename V::AsArg v0, const typename V::AsArg v1)
- {
- const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
- const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
- _mm_storeu_ps(aliasing_cast<float>(&data[i[0]]), tmp0);
- _mm_storeu_ps(aliasing_cast<float>(&data[i[2]]), tmp1);
- }
- template <typename I>
- static inline void interleave(typename V::EntryType *const data, const I &i,
- const typename V::AsArg v0, const typename V::AsArg v1)
- {
- const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
- const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
- _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[0]]), tmp0);
- _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), tmp0);
- _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[2]]), tmp1);
- _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), tmp1);
- }
- template <typename I>
- static inline void interleave(typename V::EntryType *const data, const I &i,
- const typename V::AsArg v0, const typename V::AsArg v1,
- const typename V::AsArg v2)
- {
- #ifdef Vc_USE_MASKMOV_SCATTER
- const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()), SSE::sse_cast<__m128>(v1.data()));
- const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()), SSE::sse_cast<__m128>(v1.data()));
- const __m128 tmp2 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v2.data()), SSE::sse_cast<__m128>(v2.data()));
- const __m128 tmp3 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v2.data()), SSE::sse_cast<__m128>(v2.data()));
- const __m128i mask = _mm_set_epi32(0, -1, -1, -1);
- _mm_maskmoveu_si128(_mm_castps_si128(_mm_movelh_ps(tmp0, tmp2)), mask, reinterpret_cast<char *>(&data[i[0]]));
- _mm_maskmoveu_si128(_mm_castps_si128(_mm_movehl_ps(tmp2, tmp0)), mask, reinterpret_cast<char *>(&data[i[1]]));
- _mm_maskmoveu_si128(_mm_castps_si128(_mm_movelh_ps(tmp1, tmp3)), mask, reinterpret_cast<char *>(&data[i[2]]));
- _mm_maskmoveu_si128(_mm_castps_si128(_mm_movehl_ps(tmp3, tmp1)), mask, reinterpret_cast<char *>(&data[i[3]]));
- #else
- const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
- const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
- _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[0]]), tmp0);
- _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), tmp0);
- _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[2]]), tmp1);
- _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), tmp1);
- v2.scatter(data + 2, i);
- #endif
- }
- template <typename I>
- static inline void interleave(typename V::EntryType *const data, const I &i,
- const typename V::AsArg v0, const typename V::AsArg v1,
- const typename V::AsArg v2, const typename V::AsArg v3)
- {
- const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
- const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
- const __m128 tmp2 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v2.data()),SSE::sse_cast<__m128>(v3.data()));
- const __m128 tmp3 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v2.data()),SSE::sse_cast<__m128>(v3.data()));
- _mm_storeu_ps(aliasing_cast<float>(&data[i[0]]), _mm_movelh_ps(tmp0, tmp2));
- _mm_storeu_ps(aliasing_cast<float>(&data[i[1]]), _mm_movehl_ps(tmp2, tmp0));
- _mm_storeu_ps(aliasing_cast<float>(&data[i[2]]), _mm_movelh_ps(tmp1, tmp3));
- _mm_storeu_ps(aliasing_cast<float>(&data[i[3]]), _mm_movehl_ps(tmp3, tmp1));
- }
- template <typename I>
- static inline void interleave(typename V::EntryType *const data, const I &i,
- const typename V::AsArg v0, const typename V::AsArg v1,
- const typename V::AsArg v2, const typename V::AsArg v3,
- const typename V::AsArg v4)
- {
- interleave(data, i, v0, v1, v2, v3);
- v4.scatter(data + 4, i);
- }
- template <typename I>
- static inline void interleave(typename V::EntryType *const data, const I &i,
- const typename V::AsArg v0, const typename V::AsArg v1,
- const typename V::AsArg v2, const typename V::AsArg v3,
- const typename V::AsArg v4, const typename V::AsArg v5)
- {
- interleave(data, i, v0, v1, v2, v3);
- interleave(data + 4, i, v4, v5);
- }
- template <typename I>
- static inline void interleave(typename V::EntryType *const data, const I &i,
- const typename V::AsArg v0, const typename V::AsArg v1,
- const typename V::AsArg v2, const typename V::AsArg v3,
- const typename V::AsArg v4, const typename V::AsArg v5,
- const typename V::AsArg v6)
- {
- interleave(data, i, v0, v1, v2, v3);
- interleave(data + 4, i, v4, v5, v6);
- }
- template <typename I>
- static inline void interleave(typename V::EntryType *const data, const I &i,
- const typename V::AsArg v0, const typename V::AsArg v1,
- const typename V::AsArg v2, const typename V::AsArg v3,
- const typename V::AsArg v4, const typename V::AsArg v5,
- const typename V::AsArg v6, const typename V::AsArg v7)
- {
- interleave(data, i, v0, v1, v2, v3);
- interleave(data + 4, i, v4, v5, v6, v7);
- }
- template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
- const I &i, V &v0, V &v1)
- {
- const __m128 a = _mm_castpd_ps(_mm_load_sd(aliasing_cast<double>(&data[i[0]])));
- const __m128 b = _mm_castpd_ps(_mm_load_sd(aliasing_cast<double>(&data[i[1]])));
- const __m128 c = _mm_castpd_ps(_mm_load_sd(aliasing_cast<double>(&data[i[2]])));
- const __m128 d = _mm_castpd_ps(_mm_load_sd(aliasing_cast<double>(&data[i[3]])));
- const __m128 tmp0 = _mm_unpacklo_ps(a, b);
- const __m128 tmp1 = _mm_unpacklo_ps(c, d);
- v0.data() = SSE::sse_cast<typename V::VectorType>(_mm_movelh_ps(tmp0, tmp1));
- v1.data() = SSE::sse_cast<typename V::VectorType>(_mm_movehl_ps(tmp1, tmp0));
- }
- template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
- const I &i, V &v0, V &v1, V &v2)
- {
- const __m128 a = _mm_loadu_ps(aliasing_cast<float>(&data[i[0]]));
- const __m128 b = _mm_loadu_ps(aliasing_cast<float>(&data[i[1]]));
- const __m128 c = _mm_loadu_ps(aliasing_cast<float>(&data[i[2]]));
- const __m128 d = _mm_loadu_ps(aliasing_cast<float>(&data[i[3]]));
- const __m128 tmp0 = _mm_unpacklo_ps(a, b);
- const __m128 tmp1 = _mm_unpacklo_ps(c, d);
- const __m128 tmp2 = _mm_unpackhi_ps(a, b);
- const __m128 tmp3 = _mm_unpackhi_ps(c, d);
- v0.data() = SSE::sse_cast<typename V::VectorType>(_mm_movelh_ps(tmp0, tmp1));
- v1.data() = SSE::sse_cast<typename V::VectorType>(_mm_movehl_ps(tmp1, tmp0));
- v2.data() = SSE::sse_cast<typename V::VectorType>(_mm_movelh_ps(tmp2, tmp3));
- }
- template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
- const I &i, V &v0, V &v1, V &v2, V &v3)
- {
- const __m128 a = _mm_loadu_ps(aliasing_cast<float>(&data[i[0]]));
- const __m128 b = _mm_loadu_ps(aliasing_cast<float>(&data[i[1]]));
- const __m128 c = _mm_loadu_ps(aliasing_cast<float>(&data[i[2]]));
- const __m128 d = _mm_loadu_ps(aliasing_cast<float>(&data[i[3]]));
- const __m128 tmp0 = _mm_unpacklo_ps(a, b);
- const __m128 tmp1 = _mm_unpacklo_ps(c, d);
- const __m128 tmp2 = _mm_unpackhi_ps(a, b);
- const __m128 tmp3 = _mm_unpackhi_ps(c, d);
- v0.data() = SSE::sse_cast<typename V::VectorType>(_mm_movelh_ps(tmp0, tmp1));
- v1.data() = SSE::sse_cast<typename V::VectorType>(_mm_movehl_ps(tmp1, tmp0));
- v2.data() = SSE::sse_cast<typename V::VectorType>(_mm_movelh_ps(tmp2, tmp3));
- v3.data() = SSE::sse_cast<typename V::VectorType>(_mm_movehl_ps(tmp3, tmp2));
- }
- template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
- const I &i, V &v0, V &v1, V &v2, V &v3, V &v4)
- {
- deinterleave(data, i, v0, v1, v2, v3);
- v4.gather(data + 4, i);
- }
- template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
- const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
- {
- deinterleave(data, i, v0, v1, v2, v3);
- deinterleave(data + 4, i, v4, v5);
- }
- template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
- const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6)
- {
- deinterleave(data, i, v0, v1, v2, v3);
- deinterleave(data + 4, i, v4, v5, v6);
- }
- template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
- const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7)
- {
- deinterleave(data, i, v0, v1, v2, v3);
- deinterleave(data + 4, i, v4, v5, v6, v7);
- }
- };
// AoS <-> SoA conversion for vectors with 2 lanes in a 16-byte register
// (i.e. double on SSE). interleave() scatters the members of each lane's
// record to data[i[k]]...; deinterleave() gathers them back into one vector
// per member. `i` supplies the per-record base indexes.
template<typename V> struct InterleaveImpl<V, 2, 16> {
    // Two members: unpacklo/unpackhi pair lane k of v0 and v1 into one
    // 128-bit record, stored unaligned at data[i[k]].
    template <typename I>
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1)
    {
        const __m128d tmp0 = _mm_unpacklo_pd(v0.data(), v1.data());
        const __m128d tmp1 = _mm_unpackhi_pd(v0.data(), v1.data());
        _mm_storeu_pd(&data[i[0]], tmp0);
        _mm_storeu_pd(&data[i[1]], tmp1);
    }
    // Three members: members 0/1 as a pair, member 2 via scatter at offset 2.
    template <typename I>
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2)
    {
        interleave(data, i, v0, v1);
        v2.scatter(data + 2, i);
    }
    // Four members: two independent pairs, the second at offset 2.
    template <typename I>
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3)
    {
        interleave(data, i, v0, v1);
        interleave(data + 2, i, v2, v3);
    }
    // Five members: four paired, one trailing scatter at offset 4.
    template <typename I>
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4)
    {
        interleave(data, i, v0, v1, v2, v3);
        v4.scatter(data + 4, i);
    }
    // Six members: 4 + 2.
    template <typename I>
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4, const typename V::AsArg v5)
    {
        interleave(data, i, v0, v1, v2, v3);
        interleave(data + 4, i, v4, v5);
    }
    // Seven members: 4 + 3.
    template <typename I>
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4, const typename V::AsArg v5,
                                  const typename V::AsArg v6)
    {
        interleave(data, i, v0, v1, v2, v3);
        interleave(data + 4, i, v4, v5, v6);
    }
    // Eight members: 4 + 4.
    template <typename I>
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4, const typename V::AsArg v5,
                                  const typename V::AsArg v6, const typename V::AsArg v7)
    {
        interleave(data, i, v0, v1, v2, v3);
        interleave(data + 4, i, v4, v5, v6, v7);
    }
    // Two members: load both records unaligned, then transpose with
    // unpacklo/unpackhi so each output vector holds one member column.
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
        const I &i, V &v0, V &v1)
    {
        const __m128d a = _mm_loadu_pd(&data[i[0]]);
        const __m128d b = _mm_loadu_pd(&data[i[1]]);
        v0.data() = _mm_unpacklo_pd(a, b);
        v1.data() = _mm_unpackhi_pd(a, b);
    }
    // Three members: gather member 2, then transpose members 0/1.
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
        const I &i, V &v0, V &v1, V &v2)
    {
        v2.gather(data + 2, i);
        deinterleave(data, i, v0, v1);
    }
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
        const I &i, V &v0, V &v1, V &v2, V &v3)
    {
        deinterleave(data, i, v0, v1);
        deinterleave(data + 2, i, v2, v3);
    }
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
        const I &i, V &v0, V &v1, V &v2, V &v3, V &v4)
    {
        deinterleave(data, i, v0, v1);
        deinterleave(data + 2, i, v2, v3);
        v4.gather(data + 4, i);
    }
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
        const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
    {
        deinterleave(data, i, v0, v1);
        deinterleave(data + 2, i, v2, v3);
        deinterleave(data + 4, i, v4, v5);
    }
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
        const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6)
    {
        deinterleave(data, i, v0, v1);
        deinterleave(data + 2, i, v2, v3);
        deinterleave(data + 4, i, v4, v5);
        v6.gather(data + 6, i);
    }
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
        const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7)
    {
        deinterleave(data, i, v0, v1);
        deinterleave(data + 2, i, v2, v3);
        deinterleave(data + 4, i, v4, v5);
        deinterleave(data + 6, i, v6, v7);
    }
};
- }
- }
- #endif
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace Detail
- {
// Forward declarations of the mask helpers; each is specialized below on the
// lane count Size (2, 4, 8, 16) of the 128-bit register.
template <size_t Size>
Vc_INTRINSIC_L Vc_CONST_L int mask_count(__m128i) Vc_INTRINSIC_R Vc_CONST_R;
template <size_t Size>
Vc_INTRINSIC_L Vc_CONST_L int mask_to_int(__m128i) Vc_INTRINSIC_R Vc_CONST_R;
template <size_t Size>
Vc_INTRINSIC_L Vc_CONST_L bool is_equal(__m128, __m128) Vc_INTRINSIC_R Vc_CONST_R;
template <size_t Size>
Vc_INTRINSIC_L Vc_CONST_L bool is_not_equal(__m128, __m128) Vc_INTRINSIC_R Vc_CONST_R;
- }
- using SSE::sse_cast;
// SSE mask type: one boolean per lane of SSE::Vector<T>, kept in a 128-bit
// register where a true lane is all-ones and a false lane is all-zeros.
template <typename T> class Mask<T, VectorAbi::Sse>
{
    using abi = VectorAbi::Sse;
    // All SSE mask instantiations are mutual friends so the converting
    // constructor below may read rhs.dataI() across element types.
    friend class Mask< double, abi>;
    friend class Mask< float, abi>;
    friend class Mask< int32_t, abi>;
    friend class Mask<uint32_t, abi>;
    friend class Mask< int16_t, abi>;
    friend class Mask<uint16_t, abi>;
    typedef Common::MaskBool<sizeof(T)> MaskBool;
    typedef Common::Storage<T, SSE::VectorTraits<T>::Size> Storage;
public:
    // Externally a mask entry reads/writes as bool; internally each lane is a
    // sizeof(T)-wide MaskBool (all-ones/all-zeros).
    typedef bool EntryType;
    using value_type = EntryType;
    using EntryReference = Detail::ElementReference<Mask>;
    using reference = EntryReference;
    typedef MaskBool VectorEntryType;
    using VectorType = typename Storage::VectorType;
    using Vector = SSE::Vector<T>;
public:
    Vc_FREE_STORE_OPERATORS_ALIGNED(16);
    static constexpr size_t Size = SSE::VectorTraits<T>::Size;
    // Alignment requirement (in bools) for aligned load/store of bool arrays.
    static constexpr size_t MemoryAlignment = Size;
    static constexpr std::size_t size() { return Size; }
#if defined Vc_MSVC && defined _WIN32
    typedef const Mask &Argument;
#else
    typedef Mask Argument;
#endif
    Vc_INTRINSIC Mask() = default;
    Vc_INTRINSIC Mask(const Mask &) = default;
    Vc_INTRINSIC Mask &operator=(const Mask &) = default;
    // Implicit construction from any 128-bit register flavor; reinterprets bits.
    Vc_INTRINSIC Mask(const __m128 &x) : d(sse_cast<VectorType>(x)) {}
    Vc_INTRINSIC Mask(const __m128d &x) : d(sse_cast<VectorType>(x)) {}
    Vc_INTRINSIC Mask(const __m128i &x) : d(sse_cast<VectorType>(x)) {}
    Vc_INTRINSIC explicit Mask(VectorSpecialInitializerZero) : Mask(_mm_setzero_ps()) {}
    Vc_INTRINSIC explicit Mask(VectorSpecialInitializerOne) : Mask(SSE::_mm_setallone_ps()) {}
    // Broadcast: all lanes true or all lanes false.
    Vc_INTRINSIC explicit Mask(bool b) : Mask(b ? SSE::_mm_setallone_ps() : _mm_setzero_ps()) {}
    Vc_INTRINSIC static Mask Zero() { return Mask{Vc::Zero}; }
    Vc_INTRINSIC static Mask One() { return Mask{Vc::One}; }
    // Implicit conversion from masks allowed by the implicit-conversion trait;
    // the lane pattern is re-expressed via Detail::mask_cast.
    template <typename U>
    Vc_INTRINSIC Mask(
        U &&rhs, Common::enable_if_mask_converts_implicitly<Mask, T, U> = nullarg)
        : d(sse_cast<VectorType>(
              Detail::mask_cast<Traits::simd_vector_size<U>::value, Size, __m128>(
                  rhs.dataI())))
    {
    }
#if Vc_IS_VERSION_1
    template <typename U>
    Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between "
                  "mask types") Vc_INTRINSIC
        explicit Mask(U &&rhs,
                      Common::enable_if_mask_converts_explicitly<T, U> = nullarg);
#endif
    // Load/store to arrays of bool (one byte each).
    Vc_ALWAYS_INLINE explicit Mask(const bool *mem) { load(mem); }
    template<typename Flags> Vc_ALWAYS_INLINE explicit Mask(const bool *mem, Flags f) { load(mem, f); }
    Vc_ALWAYS_INLINE_L void load(const bool *mem) Vc_ALWAYS_INLINE_R;
    template<typename Flags> Vc_ALWAYS_INLINE void load(const bool *mem, Flags) { load(mem); }
    Vc_ALWAYS_INLINE_L void store(bool *) const Vc_ALWAYS_INLINE_R;
    template<typename Flags> Vc_ALWAYS_INLINE void store(bool *mem, Flags) const { store(mem); }
    // Whole-mask comparison (scalar bool result, not lanewise).
    Vc_ALWAYS_INLINE Vc_PURE bool operator==(const Mask &rhs) const
    {
        return Detail::is_equal<Size>(dataF(), rhs.dataF());
    }
    Vc_ALWAYS_INLINE Vc_PURE bool operator!=(const Mask &rhs) const
    {
        return Detail::is_not_equal<Size>(dataF(), rhs.dataF());
    }
    // Lanewise negation.
    Vc_ALWAYS_INLINE Vc_PURE Mask operator!() const
    {
#ifdef Vc_GCC
        return ~dataI();
#else
        return _mm_andnot_si128(dataI(), SSE::_mm_setallone_si128());
#endif
    }
    Vc_ALWAYS_INLINE Mask &operator&=(const Mask &rhs) { d.v() = SSE::sse_cast<VectorType>(_mm_and_ps(dataF(), rhs.dataF())); return *this; }
    Vc_ALWAYS_INLINE Mask &operator|=(const Mask &rhs) { d.v() = SSE::sse_cast<VectorType>(_mm_or_ps (dataF(), rhs.dataF())); return *this; }
    Vc_ALWAYS_INLINE Mask &operator^=(const Mask &rhs) { d.v() = SSE::sse_cast<VectorType>(_mm_xor_ps(dataF(), rhs.dataF())); return *this; }
    Vc_ALWAYS_INLINE Vc_PURE Mask operator&(const Mask &rhs) const { return _mm_and_ps(dataF(), rhs.dataF()); }
    Vc_ALWAYS_INLINE Vc_PURE Mask operator|(const Mask &rhs) const { return _mm_or_ps (dataF(), rhs.dataF()); }
    Vc_ALWAYS_INLINE Vc_PURE Mask operator^(const Mask &rhs) const { return _mm_xor_ps(dataF(), rhs.dataF()); }
    // NOTE: && and || are lanewise as well — no short-circuit semantics.
    Vc_ALWAYS_INLINE Vc_PURE Mask operator&&(const Mask &rhs) const { return _mm_and_ps(dataF(), rhs.dataF()); }
    Vc_ALWAYS_INLINE Vc_PURE Mask operator||(const Mask &rhs) const { return _mm_or_ps (dataF(), rhs.dataF()); }
    // Reductions; the Vc_USE_PTEST path uses SSE4.1 PTEST, the fallback
    // compresses sign bits with movemask and compares the bit pattern.
    Vc_ALWAYS_INLINE Vc_PURE bool isFull () const { return
#ifdef Vc_USE_PTEST
        _mm_testc_si128(dataI(), SSE::_mm_setallone_si128());
#else
        _mm_movemask_epi8(dataI()) == 0xffff;
#endif
    }
    Vc_ALWAYS_INLINE Vc_PURE bool isNotEmpty() const { return
#ifdef Vc_USE_PTEST
        0 == _mm_testz_si128(dataI(), dataI());
#else
        _mm_movemask_epi8(dataI()) != 0x0000;
#endif
    }
    Vc_ALWAYS_INLINE Vc_PURE bool isEmpty() const { return
#ifdef Vc_USE_PTEST
        0 != _mm_testz_si128(dataI(), dataI());
#else
        _mm_movemask_epi8(dataI()) == 0x0000;
#endif
    }
    // True iff the mask has both true and false lanes.
    Vc_ALWAYS_INLINE Vc_PURE bool isMix() const {
#ifdef Vc_USE_PTEST
        return _mm_test_mix_ones_zeros(dataI(), SSE::_mm_setallone_si128());
#else
        const int tmp = _mm_movemask_epi8(dataI());
        return tmp != 0 && (tmp ^ 0xffff) != 0;
#endif
    }
    // Raw per-byte movemask (one bit per byte, i.e. sizeof(T) bits per lane).
    Vc_ALWAYS_INLINE Vc_PURE int shiftMask() const { return _mm_movemask_epi8(dataI()); }
    // One bit per lane (see Detail::mask_to_int specializations).
    Vc_ALWAYS_INLINE Vc_PURE int toInt() const { return Detail::mask_to_int<Size>(dataI()); }
    // Register accessors in the native and reinterpreted flavors.
    Vc_ALWAYS_INLINE Vc_PURE VectorType data() const { return d.v(); }
    Vc_ALWAYS_INLINE Vc_PURE __m128 dataF() const { return SSE::sse_cast<__m128 >(d.v()); }
    Vc_ALWAYS_INLINE Vc_PURE __m128i dataI() const { return SSE::sse_cast<__m128i>(d.v()); }
    Vc_ALWAYS_INLINE Vc_PURE __m128d dataD() const { return SSE::sse_cast<__m128d>(d.v()); }
private:
    // get/set back the ElementReference proxy returned by operator[].
    friend reference;
    static Vc_INTRINSIC Vc_PURE value_type get(const Mask &m, int i) noexcept
    {
        return MaskBool(m.d.m(i));
    }
    template <typename U>
    static Vc_INTRINSIC void set(Mask &m, int i,
                                 U &&v) noexcept(noexcept(MaskBool(std::declval<U>())))
    {
        m.d.set(i, MaskBool(std::forward<U>(v)));
    }
public:
    Vc_ALWAYS_INLINE reference operator[](size_t index) noexcept
    {
        return {*this, int(index)};
    }
    Vc_ALWAYS_INLINE Vc_PURE value_type operator[](size_t index) const noexcept
    {
        return get(*this, index);
    }
    // Number of true lanes.
    Vc_ALWAYS_INLINE Vc_PURE int count() const
    {
        return Detail::mask_count<Size>(dataI());
    }
    Vc_ALWAYS_INLINE_L Vc_PURE_L int firstOne() const Vc_ALWAYS_INLINE_R Vc_PURE_R;
    template <typename G> static Vc_INTRINSIC_L Mask generate(G &&gen) Vc_INTRINSIC_R;
    Vc_INTRINSIC_L Vc_PURE_L Mask shifted(int amount) const Vc_INTRINSIC_R Vc_PURE_R;
private:
#ifdef Vc_COMPILE_BENCHMARKS
public:
#endif
    Storage d;
};
// Out-of-line definitions of the static constexpr members (needed when they
// are ODR-used, pre-C++17 inline-variable rules).
template <typename T> constexpr size_t Mask<T, VectorAbi::Sse>::Size;
template <typename T> constexpr size_t Mask<T, VectorAbi::Sse>::MemoryAlignment;
- }
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace Detail
- {
- template<> Vc_INTRINSIC Vc_CONST int mask_count<2>(__m128i k)
- {
- int mask = _mm_movemask_pd(_mm_castsi128_pd(k));
- return (mask & 1) + (mask >> 1);
- }
// Count of true lanes in a 4-lane (32-bit element) mask.
template<> Vc_INTRINSIC Vc_CONST int mask_count<4>(__m128i k)
{
#ifdef Vc_IMPL_POPCNT
    // Compress the four sign bits and popcount them.
    return _mm_popcnt_u32(_mm_movemask_ps(_mm_castsi128_ps(k)));
#else
    // No POPCNT: shift each lane's sign bit down to 0/1, then horizontally
    // add with two shuffle+add steps; lane 0 ends up holding the total.
    auto x = _mm_srli_epi32(k, 31);
    x = _mm_add_epi32(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(0, 1, 2, 3)));
    x = _mm_add_epi32(x, _mm_shufflelo_epi16(x, _MM_SHUFFLE(1, 0, 3, 2)));
    return _mm_cvtsi128_si32(x);
#endif
}
// Count of true lanes in an 8-lane (16-bit element) mask.
template<> Vc_INTRINSIC Vc_CONST int mask_count<8>(__m128i k)
{
#ifdef Vc_IMPL_POPCNT
    // movemask_epi8 yields two identical bits per 16-bit lane, hence / 2.
    return _mm_popcnt_u32(_mm_movemask_epi8(k)) / 2;
#else
    // Reduce the per-lane 0/1 values with three shuffle+add steps and read
    // the sum from the lowest 16-bit lane.
    auto x = _mm_srli_epi16(k, 15);
    x = _mm_add_epi16(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(0, 1, 2, 3)));
    x = _mm_add_epi16(x, _mm_shufflelo_epi16(x, _MM_SHUFFLE(0, 1, 2, 3)));
    x = _mm_add_epi16(x, _mm_shufflelo_epi16(x, _MM_SHUFFLE(2, 3, 0, 1)));
    return _mm_extract_epi16(x, 0);
#endif
}
// Count of true lanes in a 16-lane (8-bit element) mask.
template<> Vc_INTRINSIC Vc_CONST int mask_count<16>(__m128i k)
{
    return Detail::popcnt16(_mm_movemask_epi8(k));
}
// Compress a mask into an int with exactly one bit per lane (bit i == lane i).
template<> Vc_INTRINSIC Vc_CONST int mask_to_int<2>(__m128i k)
{
    return _mm_movemask_pd(_mm_castsi128_pd(k));
}
template<> Vc_INTRINSIC Vc_CONST int mask_to_int<4>(__m128i k)
{
    return _mm_movemask_ps(_mm_castsi128_ps(k));
}
template<> Vc_INTRINSIC Vc_CONST int mask_to_int<8>(__m128i k)
{
    // Narrow the 16-bit lanes to bytes first so the per-byte movemask yields
    // a single bit per lane.
    return _mm_movemask_epi8(_mm_packs_epi16(k, _mm_setzero_si128()));
}
template<> Vc_INTRINSIC Vc_CONST int mask_to_int<16>(__m128i k)
{
    return _mm_movemask_epi8(k);
}
// Store a mask as an array of bool (one byte per lane, value 0 or 1).
template <size_t> Vc_ALWAYS_INLINE void mask_store(__m128i k, bool *mem);
template <> Vc_ALWAYS_INLINE void mask_store<16>(__m128i k, bool *mem)
{
    // Lanes are 0x00/0xff; AND with 1 maps them to valid bool bytes.
    // Aligned store — mem must satisfy Mask::MemoryAlignment.
    _mm_store_si128(reinterpret_cast<__m128i *>(mem), _mm_and_si128(k, _mm_set1_epi8(1)));
}
template <> Vc_ALWAYS_INLINE void mask_store<8>(__m128i k, bool *mem)
{
    // Shift each 16-bit lane's sign bit down to 0/1, pack into 8 bytes, and
    // write the low 64 bits in one store.
    k = _mm_srli_epi16(k, 15);
    const auto k2 = _mm_packs_epi16(k, _mm_setzero_si128());
#ifdef __x86_64__
    *aliasing_cast<int64_t>(mem) = _mm_cvtsi128_si64(k2);
#else
    _mm_store_sd(aliasing_cast<double>(mem), _mm_castsi128_pd(k2));
#endif
}
template <> Vc_ALWAYS_INLINE void mask_store<4>(__m128i k, bool *mem)
{
    // Pack the 32-bit lanes down to bytes (32->16->8 bit) with the 0/1
    // normalization in between, then write all four bools as one int32.
    *aliasing_cast<int32_t>(mem) = _mm_cvtsi128_si32(
        _mm_packs_epi16(_mm_srli_epi16(_mm_packs_epi32(k, _mm_setzero_si128()), 15),
                        _mm_setzero_si128()));
}
template <> Vc_ALWAYS_INLINE void mask_store<2>(__m128i k, bool *mem)
{
    // 64-bit lanes: the upper 32-bit half of each lane is 0 or -1; negating
    // turns that into the bool values 0/1.
    mem[0] = -SseIntrinsics::extract_epi32<1>(k);
    mem[1] = -SseIntrinsics::extract_epi32<3>(k);
}
// Load an array of bool into register form (a lane becomes all-ones iff the
// corresponding bool was nonzero). Inverse of mask_store.
template<size_t> Vc_ALWAYS_INLINE __m128 mask_load(const bool *mem);
template<> Vc_ALWAYS_INLINE __m128 mask_load<16>(const bool *mem)
{
    // Any nonzero byte compares greater than 0 and becomes 0xff.
    return sse_cast<__m128>(_mm_cmpgt_epi8(
        _mm_load_si128(reinterpret_cast<const __m128i *>(mem)), _mm_setzero_si128()));
}
template<> Vc_ALWAYS_INLINE __m128 mask_load<8>(const bool *mem)
{
    // Load 8 bool bytes, widen to 16 bit by self-unpacking, then compare.
#ifdef __x86_64__
    __m128i k = _mm_cvtsi64_si128(*reinterpret_cast<const int64_t *>(mem));
#else
    __m128i k = _mm_castpd_si128(_mm_load_sd(reinterpret_cast<const double *>(mem)));
#endif
    return sse_cast<__m128>(_mm_cmpgt_epi16(_mm_unpacklo_epi8(k, k), _mm_setzero_si128()));
}
template<> Vc_ALWAYS_INLINE __m128 mask_load<4>(const bool *mem)
{
    // Widen 4 bool bytes to 16 bit, compare, then widen the result to 32 bit.
    __m128i k = _mm_cvtsi32_si128(*reinterpret_cast<const int *>(mem));
    k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(k, k), _mm_setzero_si128());
    return sse_cast<__m128>(_mm_unpacklo_epi16(k, k));
}
template<> Vc_ALWAYS_INLINE __m128 mask_load<2>(const bool *mem)
{
    // Only two lanes: build the -1/0 pattern per 64-bit lane directly.
    return sse_cast<__m128>(
        _mm_set_epi32(-int(mem[1]), -int(mem[1]), -int(mem[0]), -int(mem[0])));
}
// Whole-mask (in)equality for 2-lane masks: compare the movemask-compressed
// sign-bit patterns.
template <> Vc_INTRINSIC Vc_CONST bool is_equal<2>(__m128 k1, __m128 k2)
{
    return _mm_movemask_pd(_mm_castps_pd(k1)) == _mm_movemask_pd(_mm_castps_pd(k2));
}
template <> Vc_INTRINSIC Vc_CONST bool is_not_equal<2>(__m128 k1, __m128 k2)
{
    return _mm_movemask_pd(_mm_castps_pd(k1)) != _mm_movemask_pd(_mm_castps_pd(k2));
}
- template <> Vc_INTRINSIC Vc_CONST bool is_equal<4>(__m128 k1, __m128 k2)
- {
- return _mm_movemask_ps(k1) == _mm_movemask_ps(k2);
- }
- template <> Vc_INTRINSIC Vc_CONST bool is_not_equal<4>(__m128 k1, __m128 k2)
- {
- return _mm_movemask_ps(k1) != _mm_movemask_ps(k2);
- }
// 8- and 16-lane masks compare via the per-byte movemask. For 8 lanes each
// 16-bit lane contributes two identical bits, which does not change the
// (in)equality result.
template <> Vc_INTRINSIC Vc_CONST bool is_equal<8>(__m128 k1, __m128 k2)
{
    return _mm_movemask_epi8(_mm_castps_si128(k1)) ==
           _mm_movemask_epi8(_mm_castps_si128(k2));
}
template <> Vc_INTRINSIC Vc_CONST bool is_not_equal<8>(__m128 k1, __m128 k2)
{
    return _mm_movemask_epi8(_mm_castps_si128(k1)) !=
           _mm_movemask_epi8(_mm_castps_si128(k2));
}
template <> Vc_INTRINSIC Vc_CONST bool is_equal<16>(__m128 k1, __m128 k2)
{
    return _mm_movemask_epi8(_mm_castps_si128(k1)) ==
           _mm_movemask_epi8(_mm_castps_si128(k2));
}
template <> Vc_INTRINSIC Vc_CONST bool is_not_equal<16>(__m128 k1, __m128 k2)
{
    return _mm_movemask_epi8(_mm_castps_si128(k1)) !=
           _mm_movemask_epi8(_mm_castps_si128(k2));
}
- }
// double_m stores just two bools: movemask_epi8 yields 8 identical bits per
// 64-bit lane; masking with 0x0101 keeps bit 0 (lane 0) and bit 8 (lane 1),
// which land in the two bytes of one little-endian 16-bit store.
template<> Vc_ALWAYS_INLINE void SSE::double_m::store(bool *mem) const
{
    *aliasing_cast<uint16_t>(mem) = _mm_movemask_epi8(dataI()) & 0x0101;
}
// Generic bool-array store: dispatch to the size-specialized helper.
template<typename T> Vc_ALWAYS_INLINE void Mask<T, VectorAbi::Sse>::store(bool *mem) const
{
    Detail::mask_store<Size>(dataI(), mem);
}
- template<> Vc_ALWAYS_INLINE void SSE::double_m::load(const bool *mem)
- {
- d.set(0, MaskBool(mem[0]));
- d.set(1, MaskBool(mem[1]));
- }
// Generic bool-array load: dispatch to the size-specialized helper.
template <typename T> Vc_ALWAYS_INLINE void Mask<T, VectorAbi::Sse>::load(const bool *mem)
{
    d.v() = sse_cast<VectorType>(Detail::mask_load<Size>(mem));
}
// Element access for 16-bit-lane masks: shiftMask() is the per-byte
// movemask, so each lane owns two bits; testing bit 2*index reads the
// lane's low byte.
template <>
Vc_INTRINSIC Vc_PURE bool SSE::short_m::get(const SSE::short_m &m, int index) noexcept
{
    return m.shiftMask() & (1 << 2 * index);
}
template <>
Vc_INTRINSIC Vc_PURE bool SSE::ushort_m::get(const SSE::ushort_m &m, int index) noexcept
{
    return m.shiftMask() & (1 << 2 * index);
}
// Index of the lowest true lane.
// NOTE(review): for an empty mask the result is undefined — BSF leaves its
// destination undefined when the source is 0, and `bit` is not initialized
// on the MSVC path either. Callers are expected to check for emptiness first.
template<typename T> Vc_ALWAYS_INLINE Vc_PURE int Mask<T, VectorAbi::Sse>::firstOne() const
{
    const int mask = toInt();
#ifdef _MSC_VER
    unsigned long bit;
    _BitScanForward(&bit, mask);
#else
    int bit;
    __asm__("bsf %1,%0" : "=&r"(bit) : "r"(mask));
#endif
    return bit;
}
// Helpers for Mask::generate: evaluate gen(i) once per lane and materialize
// the all-ones/all-zeros lane pattern. Note that _mm_set_epi64x takes its
// arguments high-lane first, so gen(1) is evaluated before gen(0) here;
// the setr variants below take lanes in index order.
template <typename M, typename G>
Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 2>)
{
    return _mm_set_epi64x(gen(1) ? 0xffffffffffffffffull : 0,
                          gen(0) ? 0xffffffffffffffffull : 0);
}
template <typename M, typename G>
Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 4>)
{
    return _mm_setr_epi32(gen(0) ? 0xfffffffful : 0, gen(1) ? 0xfffffffful : 0,
                          gen(2) ? 0xfffffffful : 0, gen(3) ? 0xfffffffful : 0);
}
template <typename M, typename G>
Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 8>)
{
    return _mm_setr_epi16(gen(0) ? 0xffffu : 0, gen(1) ? 0xffffu : 0,
                          gen(2) ? 0xffffu : 0, gen(3) ? 0xffffu : 0,
                          gen(4) ? 0xffffu : 0, gen(5) ? 0xffffu : 0,
                          gen(6) ? 0xffffu : 0, gen(7) ? 0xffffu : 0);
}
// Build a mask from a per-lane predicate: tag-dispatch on the lane count to
// the matching generate_impl overload above.
template <typename T>
template <typename G>
Vc_INTRINSIC Mask<T, VectorAbi::Sse> Mask<T, VectorAbi::Sse>::generate(G &&gen)
{
    return generate_impl<Mask<T, VectorAbi::Sse>>(std::forward<G>(gen),
                                                  std::integral_constant<int, Size>());
}
// Shift the mask by `amount` elements. The element count is converted to a
// byte offset (elements are sizeof(VectorEntryType) bytes wide) and handled
// by a compile-time-specialized byte shift; any offset whose magnitude
// exceeds the 16-byte register width falls through to an all-false mask.
// NOTE(review): shift direction/semantics are those of Detail::shifted,
// which is defined elsewhere — confirm against its implementation.
template <typename T> Vc_INTRINSIC Vc_PURE Mask<T, VectorAbi::Sse> Mask<T, VectorAbi::Sse>::shifted(int amount) const
{
    switch (amount * int(sizeof(VectorEntryType))) {
    case 0: return *this;
    case 1: return Detail::shifted< 1>(dataI());
    case 2: return Detail::shifted< 2>(dataI());
    case 3: return Detail::shifted< 3>(dataI());
    case 4: return Detail::shifted< 4>(dataI());
    case 5: return Detail::shifted< 5>(dataI());
    case 6: return Detail::shifted< 6>(dataI());
    case 7: return Detail::shifted< 7>(dataI());
    case 8: return Detail::shifted< 8>(dataI());
    case 9: return Detail::shifted< 9>(dataI());
    case 10: return Detail::shifted< 10>(dataI());
    case 11: return Detail::shifted< 11>(dataI());
    case 12: return Detail::shifted< 12>(dataI());
    case 13: return Detail::shifted< 13>(dataI());
    case 14: return Detail::shifted< 14>(dataI());
    case 15: return Detail::shifted< 15>(dataI());
    case 16: return Detail::shifted< 16>(dataI());
    case -1: return Detail::shifted< -1>(dataI());
    case -2: return Detail::shifted< -2>(dataI());
    case -3: return Detail::shifted< -3>(dataI());
    case -4: return Detail::shifted< -4>(dataI());
    case -5: return Detail::shifted< -5>(dataI());
    case -6: return Detail::shifted< -6>(dataI());
    case -7: return Detail::shifted< -7>(dataI());
    case -8: return Detail::shifted< -8>(dataI());
    case -9: return Detail::shifted< -9>(dataI());
    case -10: return Detail::shifted<-10>(dataI());
    case -11: return Detail::shifted<-11>(dataI());
    case -12: return Detail::shifted<-12>(dataI());
    case -13: return Detail::shifted<-13>(dataI());
    case -14: return Detail::shifted<-14>(dataI());
    case -15: return Detail::shifted<-15>(dataI());
    case -16: return Detail::shifted<-16>(dataI());
    }
    return Zero();
}
- }
- #endif
- #include <algorithm>
- #include <cmath>
- #ifdef isfinite
- #undef isfinite
- #endif
- #ifdef isnan
- #undef isnan
- #endif
- namespace Vc_VERSIONED_NAMESPACE
- {
- #define Vc_CURRENT_CLASS_NAME Vector
- template <typename T> class Vector<T, VectorAbi::Sse>
- {
- static_assert(std::is_arithmetic<T>::value,
- "Vector<T> only accepts arithmetic builtin types as template parameter T.");
- protected:
- #ifdef Vc_COMPILE_BENCHMARKS
- public:
- #endif
- typedef typename SSE::VectorTraits<T>::StorageType StorageType;
- StorageType d;
- typedef typename SSE::VectorTraits<T>::GatherMaskType GatherMask;
- typedef SSE::VectorHelper<typename SSE::VectorTraits<T>::VectorType> HV;
- typedef SSE::VectorHelper<T> HT;
- public:
- Vc_FREE_STORE_OPERATORS_ALIGNED(16);
- typedef typename SSE::VectorTraits<T>::VectorType VectorType;
- using vector_type = VectorType;
- static constexpr size_t Size = SSE::VectorTraits<T>::Size;
- static constexpr size_t MemoryAlignment = alignof(VectorType);
- typedef typename SSE::VectorTraits<T>::EntryType EntryType;
- using value_type = EntryType;
- using VectorEntryType = EntryType;
- using IndexType = fixed_size_simd<int, Size>;
- typedef typename SSE::VectorTraits<T>::MaskType Mask;
- using MaskType = Mask;
- using mask_type = Mask;
- typedef typename Mask::Argument MaskArg;
- typedef typename Mask::Argument MaskArgument;
- typedef const Vector AsArg;
- using abi = VectorAbi::Sse;
- using WriteMaskedVector = Common::WriteMaskedVector<Vector, Mask>;
- template <typename U> using V = Vector<U, abi>;
- using reference = Detail::ElementReference<Vector>;
- public:
- Vc_INTRINSIC Vector() = default;
- static constexpr std::size_t size() { return Size; }
- explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerZero) Vc_INTRINSIC_R;
- explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerOne) Vc_INTRINSIC_R;
- explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerIndexesFromZero) Vc_INTRINSIC_R;
- static Vc_INTRINSIC Vc_CONST Vector Zero() { return Vector(Vc::Zero); }
- static Vc_INTRINSIC Vc_CONST Vector One() { return Vector(Vc::One); }
- static Vc_INTRINSIC Vc_CONST Vector IndexesFromZero()
- {
- return Vector(Vc::IndexesFromZero);
- }
- template <class G, int = 0,
- class = typename std::enable_if<std::is_convertible<
- decltype(std::declval<G>()(size_t())), value_type>::value>::type>
- explicit Vector(G &&g) : Vector(generate(std::forward<G>(g)))
- {
- }
- static Vc_INTRINSIC_L Vector Random() Vc_INTRINSIC_R;
- Vc_ALWAYS_INLINE Vector(VectorType x) : d(x) {}
- template <typename U>
- Vc_INTRINSIC Vector(
- V<U> x, typename std::enable_if<Traits::is_implicit_cast_allowed<U, T>::value,
- void *>::type = nullptr)
- : d(SSE::convert<U, T>(x.data()))
- {
- }
- #if Vc_IS_VERSION_1
- template <typename U>
- Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between "
- "vector types") Vc_INTRINSIC
- explicit Vector(
- V<U> x,
- typename std::enable_if<!Traits::is_implicit_cast_allowed<U, T>::value,
- void *>::type = nullptr)
- : d(SSE::convert<U, T>(x.data()))
- {
- }
- #endif
- Vc_INTRINSIC Vector(EntryType a) : d(HT::set(a)) {}
- template <typename U>
- Vc_INTRINSIC Vector(U a,
- typename std::enable_if<std::is_same<U, int>::value &&
- !std::is_same<U, EntryType>::value,
- void *>::type = nullptr)
- : Vector(static_cast<EntryType>(a))
- {
- }
- explicit Vc_INTRINSIC Vector(const EntryType *mem)
- {
- load(mem);
- }
- template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
- explicit Vc_INTRINSIC Vector(const EntryType *mem, Flags flags)
- {
- load(mem, flags);
- }
- template <typename U, typename Flags = DefaultLoadTag,
- typename = enable_if<
- (!std::is_integral<U>::value || !std::is_integral<EntryType>::value ||
- sizeof(EntryType) >= sizeof(U)) &&
- std::is_arithmetic<U>::value &&Traits::is_load_store_flag<Flags>::value>>
- explicit Vc_INTRINSIC Vector(const U *x, Flags flags = Flags())
- {
- load<U, Flags>(x, flags);
- }
- Vc_INTRINSIC void load(const EntryType *mem)
- {
- load(mem, DefaultLoadTag());
- }
- template <typename Flags>
- Vc_INTRINSIC enable_if<Traits::is_load_store_flag<Flags>::value, void>
- load(const EntryType *mem, Flags flags)
- {
- load<EntryType, Flags>(mem, flags);
- }
- private:
- template <typename U, typename Flags>
- struct load_concept : public std::enable_if<
- (!std::is_integral<U>::value || !std::is_integral<EntryType>::value ||
- sizeof(EntryType) >= sizeof(U)) &&
- std::is_arithmetic<U>::value && Traits::is_load_store_flag<Flags>::value, void>
- {};
- public:
- template <typename U, typename Flags = DefaultLoadTag>
- Vc_INTRINSIC_L typename load_concept<U, Flags>::type load(const U *mem, Flags = Flags()) Vc_INTRINSIC_R;
- template <
- typename U,
- typename Flags = DefaultStoreTag,
- typename = enable_if<std::is_arithmetic<U>::value &&Traits::is_load_store_flag<Flags>::value>>
- Vc_INTRINSIC_L void store(U *mem, Flags flags = Flags()) const Vc_INTRINSIC_R;
- template <
- typename U,
- typename Flags = DefaultStoreTag,
- typename = enable_if<std::is_arithmetic<U>::value &&Traits::is_load_store_flag<Flags>::value>>
- Vc_INTRINSIC_L void Vc_VDECL store(U *mem, MaskType mask, Flags flags = Flags()) const Vc_INTRINSIC_R;
- Vc_INTRINSIC void store(EntryType *mem) const
- {
- store<EntryType, DefaultStoreTag>(mem, DefaultStoreTag());
- }
- template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
- Vc_INTRINSIC void store(EntryType *mem, Flags flags) const
- {
- store<EntryType, Flags>(mem, flags);
- }
- Vc_INTRINSIC void Vc_VDECL store(EntryType *mem, MaskType mask) const
- {
- store<EntryType, DefaultStoreTag>(mem, mask, DefaultStoreTag());
- }
- template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
- Vc_INTRINSIC void Vc_VDECL store(EntryType *mem, MaskType mask, Flags flags) const
- {
- store<EntryType, Flags>(mem, mask, flags);
- }
- Vc_INTRINSIC_L void setZero() Vc_INTRINSIC_R;
- Vc_INTRINSIC_L void setZero(const Mask &k) Vc_INTRINSIC_R;
- Vc_INTRINSIC_L void setZeroInverted(const Mask &k) Vc_INTRINSIC_R;
- Vc_INTRINSIC_L void setQnan() Vc_INTRINSIC_R;
- Vc_INTRINSIC_L void setQnan(const Mask &k) Vc_INTRINSIC_R;
- #ifndef Vc_CURRENT_CLASS_NAME
- #error "incorrect use of common/gatherinterface.h: Vc_CURRENT_CLASS_NAME must be defined to the current class name for declaring constructors."
- #endif
- private:
- template <class MT, class IT, int Scale = 1>
- inline void gatherImplementation(const Common::GatherArguments<MT, IT, Scale> &);
- template <class MT, class IT, int Scale = 1>
- inline void gatherImplementation(const Common::GatherArguments<MT, IT, Scale> &,
- MaskArgument mask);
- public:
- #define Vc_ASSERT_GATHER_PARAMETER_TYPES_ \
- static_assert( \
- std::is_convertible<MT, EntryType>::value, \
- "The memory pointer needs to point to a type that can be converted to the " \
- "EntryType of this SIMD vector type."); \
- static_assert( \
- Vc::Traits::has_subscript_operator<IT>::value, \
- "The indexes argument must be a type that implements the subscript operator."); \
- static_assert( \
- !Traits::is_simd_vector<IT>::value || \
- Traits::simd_vector_size<IT>::value >= Size, \
- "If you use a SIMD vector for the indexes parameter, the index vector must " \
- "have at least as many entries as this SIMD vector."); \
- static_assert( \
- !std::is_array<T>::value || \
- (std::rank<T>::value == 1 && \
- (std::extent<T>::value == 0 || std::extent<T>::value >= Size)), \
- "If you use a simple array for the indexes parameter, the array must have " \
- "at least as many entries as this SIMD vector.")
- template <typename MT, typename IT,
- typename = enable_if<Traits::has_subscript_operator<IT>::value>>
- Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes)
- {
- Vc_ASSERT_GATHER_PARAMETER_TYPES_;
- gatherImplementation(
- Common::make_gather<1>(mem, Common::convertIndexVector(indexes)));
- }
- template <class MT, class IT, int Scale>
- Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments<MT, IT, Scale> &args)
- {
- Vc_ASSERT_GATHER_PARAMETER_TYPES_;
- gatherImplementation(args);
- }
- template <typename MT, typename IT,
- typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
- Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes,
- MaskArgument mask)
- {
- Vc_ASSERT_GATHER_PARAMETER_TYPES_;
- gatherImplementation(
- Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask);
- }
- template <class MT, class IT, int Scale>
- Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments<MT, IT, Scale> &args,
- MaskArgument mask)
- {
- Vc_ASSERT_GATHER_PARAMETER_TYPES_;
- gatherImplementation(args, mask);
- }
- template <typename MT, typename IT,
- typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
- Vc_INTRINSIC void gather(const MT *mem, const IT &indexes)
- {
- Vc_ASSERT_GATHER_PARAMETER_TYPES_;
- gatherImplementation(
- Common::make_gather<1>(mem, Common::convertIndexVector(indexes)));
- }
- template <typename MT, typename IT,
- typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
- Vc_INTRINSIC void gather(const MT *mem, const IT &indexes, MaskArgument mask)
- {
- Vc_ASSERT_GATHER_PARAMETER_TYPES_;
- gatherImplementation(
- Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask);
- }
- template <class MT, class IT, int Scale>
- Vc_INTRINSIC void gather(const Common::GatherArguments<MT, IT, Scale> &args)
- {
- Vc_ASSERT_GATHER_PARAMETER_TYPES_;
- gatherImplementation(args);
- }
- template <class MT, class IT, int Scale>
- Vc_INTRINSIC void gather(const Common::GatherArguments<MT, IT, Scale> &args,
- MaskArgument mask)
- {
- Vc_ASSERT_GATHER_PARAMETER_TYPES_;
- gatherImplementation(args, mask);
- }
- #undef Vc_ASSERT_GATHER_PARAMETER_TYPES_
- private:
- template <typename MT, typename IT>
- inline void scatterImplementation(MT *mem, IT &&indexes) const;
- template <typename MT, typename IT>
- inline void scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const;
- public:
- #define Vc_ASSERT_SCATTER_PARAMETER_TYPES_ \
- static_assert( \
- std::is_convertible<EntryType, MT>::value, \
- "The memory pointer needs to point to a type that the EntryType of this " \
- "SIMD vector type can be converted to."); \
- static_assert( \
- Vc::Traits::has_subscript_operator<IT>::value, \
- "The indexes argument must be a type that implements the subscript operator."); \
- static_assert( \
- !Traits::is_simd_vector<IT>::value || \
- Traits::simd_vector_size<IT>::value >= Size, \
- "If you use a SIMD vector for the indexes parameter, the index vector must " \
- "have at least as many entries as this SIMD vector."); \
- static_assert( \
- !std::is_array<T>::value || \
- (std::rank<T>::value == 1 && \
- (std::extent<T>::value == 0 || std::extent<T>::value >= Size)), \
- "If you use a simple array for the indexes parameter, the array must have " \
- "at least as many entries as this SIMD vector.")
- template <typename MT,
- typename IT,
- typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
- Vc_INTRINSIC void scatter(MT *mem, IT &&indexes) const
- {
- Vc_ASSERT_SCATTER_PARAMETER_TYPES_;
- scatterImplementation(mem, std::forward<IT>(indexes));
- }
- template <typename MT,
- typename IT,
- typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
- Vc_INTRINSIC void scatter(MT *mem, IT &&indexes, MaskArgument mask) const
- {
- Vc_ASSERT_SCATTER_PARAMETER_TYPES_;
- scatterImplementation(mem, std::forward<IT>(indexes), mask);
- }
- template <typename MT, typename IT>
- Vc_INTRINSIC void scatter(const Common::ScatterArguments<MT, IT> &args) const
- {
- scatter(args.address, args.indexes);
- }
- template <typename MT, typename IT>
- Vc_INTRINSIC void scatter(const Common::ScatterArguments<MT, IT> &args, MaskArgument mask) const
- {
- scatter(args.address, args.indexes, mask);
- }
- #undef Vc_ASSERT_SCATTER_PARAMETER_TYPES_
- #if defined Vc_IMPL_AVX2 && !defined Vc_MSVC
    // AVX2 hardware gather for element types of at least 4 bytes: one
    // SSE::gather with a byte scale of sizeof(T) * Scale, indexes converted to
    // SSE::int_v.
    template <class U, class A, int Scale, int N = Vector<U, A>::size(),
              class = enable_if<(Vector<U, A>::size() >= size() && sizeof(T) >= 4)>>
    Vc_INTRINSIC void gatherImplementation(
        const Common::GatherArguments<T, Vector<U, A>, Scale> &args)
    {
        d.v() = SSE::gather<sizeof(T) * Scale>(
            args.address, simd_cast<SSE::int_v>(args.indexes).data());
    }
    // Masked AVX2 hardware gather (>= 4-byte elements): lanes with a cleared
    // mask bit keep their current value (d.v() is passed as the source).
    template <class U, class A, int Scale, int N = Vector<U, A>::size(),
              class = enable_if<(Vector<U, A>::size() >= size() && sizeof(T) >= 4)>>
    Vc_INTRINSIC void gatherImplementation(
        const Common::GatherArguments<T, Vector<U, A>, Scale> &args, MaskArgument k)
    {
        d.v() = SSE::gather<sizeof(T) * Scale>(
            d.v(), k.data(), args.address,
            simd_cast<SSE::int_v>(args.indexes).data());
    }
    // AVX2 gather for 16-bit EntryType loading from 8- or 16-bit integral
    // memory: gathers as 32-bit via AVX::gather, narrows, then fixes up 8-bit
    // sources — sign-extend via shift pair or zero-extend via & 0xff.
    template <
        class MT, class U, class A, int Scale,
        class = enable_if<(sizeof(T) == 2 && std::is_integral<MT>::value &&
                           (sizeof(MT) <= 2) && Vector<U, A>::size() >= size())>>
    Vc_INTRINSIC void gatherImplementation(
        const Common::GatherArguments<MT, Vector<U, A>, Scale> &args)
    {
        using AVX2::int_v;
        const auto idx = simd_cast<int_v>(args.indexes).data();
        *this = simd_cast<Vector>(int_v(
            AVX::gather<sizeof(MT) * Scale>(aliasing_cast<int>(args.address), idx)));
        if (sizeof(MT) == 1) {
            if (std::is_signed<MT>::value) {
                // sign-extend the low byte of each 16-bit lane
                d.v() = _mm_srai_epi16(_mm_slli_epi16(d.v(), 8), 8);
            } else {
                // zero-extend: keep only the low byte
                *this &= 0xff;
            }
        }
    }
    // Masked variant of the 16-bit/narrow-integral AVX2 gather: gathers into a
    // temporary (inactive lanes read as zero from the zeroed source), applies
    // the same 8-bit extension fix-up, then blends the active lanes into *this.
    template <
        class MT, class U, class A, int Scale,
        class = enable_if<(sizeof(T) == 2 && std::is_integral<MT>::value &&
                           (sizeof(MT) <= 2) && Vector<U, A>::size() >= size())>>
    Vc_INTRINSIC void gatherImplementation(
        const Common::GatherArguments<MT, Vector<U, A>, Scale> &args, MaskArgument k)
    {
        using AVX2::int_v;
        auto v = simd_cast<Vector>(int_v(AVX::gather<sizeof(MT) * Scale>(
            _mm256_setzero_si256(), simd_cast<AVX2::int_m>(k).data(),
            aliasing_cast<int>(args.address),
            simd_cast<int_v>(args.indexes).data())));
        if (sizeof(MT) == 1) {
            if (std::is_signed<MT>::value) {
                v.data() = _mm_srai_epi16(_mm_slli_epi16(v.data(), 8), 8);
            } else {
                v &= 0xff;
            }
        }
        assign(v, k);
    }
    // Converting fallback for memory types the cases above do not cover:
    // gather into a fixed_size_simd<MT> first, then convert to this Vector.
    template <class MT, class U, class A, int Scale>
    Vc_INTRINSIC enable_if<((sizeof(T) != 2 || sizeof(MT) > 2) &&
                            Traits::is_valid_vector_argument<MT>::value &&
                            !std::is_same<MT, T>::value &&
                            Vector<U, A>::size() >= size()),
                           void>
    gatherImplementation(const Common::GatherArguments<MT, Vector<U, A>, Scale> &args)
    {
        *this = simd_cast<Vector>(fixed_size_simd<MT, Size>(args));
    }
    // Masked converting fallback: gather into fixed_size_simd<MT> under the
    // mask, convert, and blend the active lanes into *this.
    template <class MT, class U, class A, int Scale>
    Vc_INTRINSIC enable_if<((sizeof(T) != 2 || sizeof(MT) > 2) &&
                            Traits::is_valid_vector_argument<MT>::value &&
                            !std::is_same<MT, T>::value &&
                            Vector<U, A>::size() >= size()),
                           void>
    gatherImplementation(const Common::GatherArguments<MT, Vector<U, A>, Scale> &args,
                         MaskArgument k)
    {
        assign(simd_cast<Vector>(fixed_size_simd<MT, Size>(args, k)), k);
    }
- #endif
    // Pre-/post-increment and -decrement: add or subtract one in every lane
    // (element-wise via HT::add/HT::sub with HT::one()).
    Vc_INTRINSIC Vector &operator++() { data() = HT::add(data(), HT::one()); return *this; }
    Vc_INTRINSIC Vector &operator--() { data() = HT::sub(data(), HT::one()); return *this; }
    Vc_INTRINSIC Vector operator++(int) { const Vector r = *this; data() = HT::add(data(), HT::one()); return r; }
    Vc_INTRINSIC Vector operator--(int) { const Vector r = *this; data() = HT::sub(data(), HT::one()); return r; }
private:
    friend reference;
    // Lane accessors used by the smart-reference proxy type (`reference`):
    // get() reads lane i, set() writes it through d.set() so the update is
    // reflected in the underlying register storage.
    Vc_INTRINSIC static value_type get(const Vector &o, int i) noexcept
    {
        return o.d.m(i);
    }
    template <typename U>
    Vc_INTRINSIC static void set(Vector &o, int i, U &&v) noexcept(
        noexcept(std::declval<value_type &>() = v))
    {
        o.d.set(i, v);
    }
public:
    /// Mutable lane access via the smart-reference proxy; writes go through
    /// set() above.
    Vc_ALWAYS_INLINE reference operator[](size_t index) noexcept
    {
        static_assert(noexcept(reference{std::declval<Vector &>(), int()}), "");
        return {*this, int(index)};
    }
    /// Read-only lane access.
    Vc_ALWAYS_INLINE value_type operator[](size_t index) const noexcept
    {
        return d.m(index);
    }
    // Permutation overload: index with a vector of lane indexes (defined out
    // of line).
    Vc_INTRINSIC_L Vector Vc_VDECL operator[](const SSE::int_v &perm) const Vc_INTRINSIC_R;
    /// Logical negation: mask of lanes that compare equal to zero.
    Vc_INTRINSIC Vc_PURE Mask operator!() const
    {
        return *this == Zero();
    }
    /// Bitwise complement of every lane; restricted to integral T unless
    /// Vc_ENABLE_FLOAT_BIT_OPERATORS is defined.
    Vc_INTRINSIC Vc_PURE Vector operator~() const
    {
#ifndef Vc_ENABLE_FLOAT_BIT_OPERATORS
        static_assert(std::is_integral<T>::value,
                      "bit-complement can only be used with Vectors of integral type");
#endif
        // ~x computed as andnot(x, all-ones)
        return Detail::andnot_(data(), HV::allone());
    }
    // Unary minus (defined out of line) and unary plus (identity).
    Vc_ALWAYS_INLINE_L Vc_PURE_L Vector operator-() const Vc_ALWAYS_INLINE_R Vc_PURE_R;
    Vc_INTRINSIC Vc_PURE Vector operator+() const { return *this; }
    // Per-lane shifts by a vector of shift amounts, implemented as a scalar
    // generate() loop; the scalar-count overloads are defined out of line.
    Vc_ALWAYS_INLINE Vector Vc_VDECL operator<< (AsArg shift) const { return generate([&](int i) { return get(*this, i) << get(shift, i); }); }
    Vc_ALWAYS_INLINE Vector Vc_VDECL operator>> (AsArg shift) const { return generate([&](int i) { return get(*this, i) >> get(shift, i); }); }
    Vc_ALWAYS_INLINE Vector &Vc_VDECL operator<<=(AsArg shift) { return *this = *this << shift; }
    Vc_ALWAYS_INLINE Vector &Vc_VDECL operator>>=(AsArg shift) { return *this = *this >> shift; }
    Vc_INTRINSIC_L Vector &Vc_VDECL operator<<=( int shift) Vc_INTRINSIC_R;
    Vc_INTRINSIC_L Vector Vc_VDECL operator<< ( int shift) const Vc_INTRINSIC_R;
    Vc_INTRINSIC_L Vector &Vc_VDECL operator>>=( int shift) Vc_INTRINSIC_R;
    Vc_INTRINSIC_L Vector Vc_VDECL operator>> ( int shift) const Vc_INTRINSIC_R;
    /// \deprecated Use the free function isnegative(x) instead.
    Vc_DEPRECATED("use isnegative(x) instead") Vc_INTRINSIC Vc_PURE Mask
        isNegative() const
    {
        return Vc::isnegative(*this);
    }
    /// Masked assignment: lanes selected by \p mask take the value from \p v,
    /// the rest keep their current value (blend).
    Vc_ALWAYS_INLINE void assign(const Vector &v, const Mask &mask)
    {
        data() = HV::blend(data(), v.data(), mask.data());
    }
    /// \deprecated Use simd_cast. Value-converting cast to another vector type.
    template <typename V2>
    Vc_DEPRECATED("Use simd_cast instead of Vector::staticCast")
        Vc_ALWAYS_INLINE Vc_PURE V2 staticCast() const
    {
        return SSE::convert<T, typename V2::EntryType>(data());
    }
    /// \deprecated Use reinterpret_components_cast. Bit-reinterpreting cast of
    /// the underlying register to another vector type.
    template <typename V2>
    Vc_DEPRECATED("use reinterpret_components_cast instead")
        Vc_ALWAYS_INLINE Vc_PURE V2 reinterpretCast() const
    {
        return SSE::sse_cast<typename V2::VectorType>(data());
    }
    /// Write-masked view: `v(mask) = x` assigns only the selected lanes.
    Vc_INTRINSIC WriteMaskedVector operator()(const Mask &k) { return {*this, k}; }
    /// Direct access to the underlying SIMD register.
    Vc_ALWAYS_INLINE Vc_PURE VectorType &data() { return d.v(); }
    Vc_ALWAYS_INLINE Vc_PURE const VectorType &data() const { return d.v(); }
    // Broadcast of a single lane to all lanes (defined out of line).
    template<int Index>
    Vc_INTRINSIC_L Vector broadcast() const Vc_INTRINSIC_R;
    // Horizontal reductions over all lanes, delegated to the VectorHelper.
    Vc_INTRINSIC EntryType min() const { return HT::min(data()); }
    Vc_INTRINSIC EntryType max() const { return HT::max(data()); }
    Vc_INTRINSIC EntryType product() const { return HT::mul(data()); }
    Vc_INTRINSIC EntryType sum() const { return HT::add(data()); }
    Vc_INTRINSIC_L Vector partialSum() const Vc_INTRINSIC_R;
    // Masked reductions and lane-rearranging operations (defined out of line).
    Vc_INTRINSIC_L EntryType min(MaskArg m) const Vc_INTRINSIC_R;
    Vc_INTRINSIC_L EntryType max(MaskArg m) const Vc_INTRINSIC_R;
    Vc_INTRINSIC_L EntryType product(MaskArg m) const Vc_INTRINSIC_R;
    Vc_INTRINSIC_L EntryType sum(MaskArg m) const Vc_INTRINSIC_R;
    Vc_INTRINSIC_L Vector shifted(int amount, Vector shiftIn) const Vc_INTRINSIC_R;
    Vc_INTRINSIC_L Vector shifted(int amount) const Vc_INTRINSIC_R;
    Vc_INTRINSIC_L Vector rotated(int amount) const Vc_INTRINSIC_R;
    Vc_INTRINSIC_L Vc_PURE_L Vector reversed() const Vc_INTRINSIC_R Vc_PURE_R;
    Vc_ALWAYS_INLINE_L Vc_PURE_L Vector sorted() const Vc_ALWAYS_INLINE_R Vc_PURE_R;
- template <typename F> void callWithValuesSorted(F &&f)
- {
- EntryType value = d.m(0);
- f(value);
- for (std::size_t i = 1; i < Size; ++i) {
- if (d.m(i) != value) {
- value = d.m(i);
- f(value);
- }
- }
- }
- template <typename F> Vc_INTRINSIC void call(F &&f) const
- {
- Common::for_all_vector_entries<Size>([&](size_t i) { f(EntryType(d.m(i))); });
- }
- template <typename F> Vc_INTRINSIC void call(F &&f, const Mask &mask) const
- {
- for(size_t i : where(mask)) {
- f(EntryType(d.m(i)));
- }
- }
- template <typename F> Vc_INTRINSIC Vector apply(F &&f) const
- {
- Vector r;
- Common::for_all_vector_entries<Size>(
- [&](size_t i) { r.d.set(i, f(EntryType(d.m(i)))); });
- return r;
- }
- template <typename F> Vc_INTRINSIC Vector apply(F &&f, const Mask &mask) const
- {
- Vector r(*this);
- for (size_t i : where(mask)) {
- r.d.set(i, f(EntryType(r.d.m(i))));
- }
- return r;
- }
- template<typename IndexT> Vc_INTRINSIC void fill(EntryType (&f)(IndexT)) {
- Common::for_all_vector_entries<Size>([&](size_t i) { d.set(i, f(i)); });
- }
- Vc_INTRINSIC void fill(EntryType (&f)()) {
- Common::for_all_vector_entries<Size>([&](size_t i) { d.set(i, f()); });
- }
    // Builds a vector by calling gen(i) for each lane (defined out of line).
    template <typename G> static Vc_INTRINSIC_L Vector generate(G gen) Vc_INTRINSIC_R;
    /// \deprecated Use the free function copysign(x, y) instead.
    Vc_DEPRECATED("use copysign(x, y) instead") Vc_INTRINSIC Vector
        copySign(AsArg x) const
    {
        return Vc::copysign(*this, x);
    }
    /// \deprecated Use the free function exponent(x) instead.
    Vc_DEPRECATED("use exponent(x) instead") Vc_INTRINSIC Vector exponent() const
    {
        return Vc::exponent(*this);
    }
    // Interleaves the lanes of *this with those of x (defined out of line).
    Vc_INTRINSIC_L Vector interleaveLow(Vector x) const Vc_INTRINSIC_R;
    Vc_INTRINSIC_L Vector interleaveHigh(Vector x) const Vc_INTRINSIC_R;
- };
- #undef Vc_CURRENT_CLASS_NAME
- template <typename T> constexpr size_t Vector<T, VectorAbi::Sse>::Size;
- template <typename T> constexpr size_t Vector<T, VectorAbi::Sse>::MemoryAlignment;
// Element-wise min/max overloads for the SSE vector types, each mapping
// directly to the corresponding SSE intrinsic (or the Vc wrapper where the
// instruction set lacks a direct form, e.g. unsigned comparisons).
static Vc_ALWAYS_INLINE Vc_PURE SSE::int_v min(const SSE::int_v &x, const SSE::int_v &y) { return SSE::min_epi32(x.data(), y.data()); }
static Vc_ALWAYS_INLINE Vc_PURE SSE::uint_v min(const SSE::uint_v &x, const SSE::uint_v &y) { return SSE::min_epu32(x.data(), y.data()); }
static Vc_ALWAYS_INLINE Vc_PURE SSE::short_v min(const SSE::short_v &x, const SSE::short_v &y) { return _mm_min_epi16(x.data(), y.data()); }
static Vc_ALWAYS_INLINE Vc_PURE SSE::ushort_v min(const SSE::ushort_v &x, const SSE::ushort_v &y) { return SSE::min_epu16(x.data(), y.data()); }
static Vc_ALWAYS_INLINE Vc_PURE SSE::float_v min(const SSE::float_v &x, const SSE::float_v &y) { return _mm_min_ps(x.data(), y.data()); }
static Vc_ALWAYS_INLINE Vc_PURE SSE::double_v min(const SSE::double_v &x, const SSE::double_v &y) { return _mm_min_pd(x.data(), y.data()); }
static Vc_ALWAYS_INLINE Vc_PURE SSE::int_v max(const SSE::int_v &x, const SSE::int_v &y) { return SSE::max_epi32(x.data(), y.data()); }
static Vc_ALWAYS_INLINE Vc_PURE SSE::uint_v max(const SSE::uint_v &x, const SSE::uint_v &y) { return SSE::max_epu32(x.data(), y.data()); }
static Vc_ALWAYS_INLINE Vc_PURE SSE::short_v max(const SSE::short_v &x, const SSE::short_v &y) { return _mm_max_epi16(x.data(), y.data()); }
static Vc_ALWAYS_INLINE Vc_PURE SSE::ushort_v max(const SSE::ushort_v &x, const SSE::ushort_v &y) { return SSE::max_epu16(x.data(), y.data()); }
static Vc_ALWAYS_INLINE Vc_PURE SSE::float_v max(const SSE::float_v &x, const SSE::float_v &y) { return _mm_max_ps(x.data(), y.data()); }
static Vc_ALWAYS_INLINE Vc_PURE SSE::double_v max(const SSE::double_v &x, const SSE::double_v &y) { return _mm_max_pd(x.data(), y.data()); }
// Element-wise absolute value, restricted (via enable_if) to the element
// types for which SSE::VectorHelper provides an abs implementation.
template <typename T,
          typename = enable_if<std::is_same<T, double>::value || std::is_same<T, float>::value ||
                               std::is_same<T, short>::value ||
                               std::is_same<T, int>::value>>
Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> abs(Vector<T, VectorAbi::Sse> x)
{
    return SSE::VectorHelper<T>::abs(x.data());
}
// Element-wise math and classification functions for the SSE ABI, all
// delegating to SSE::VectorHelper<T>.
template<typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> sqrt (const Vector<T, VectorAbi::Sse> &x) { return SSE::VectorHelper<T>::sqrt(x.data()); }
template<typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> rsqrt(const Vector<T, VectorAbi::Sse> &x) { return SSE::VectorHelper<T>::rsqrt(x.data()); }
template<typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> reciprocal(const Vector<T, VectorAbi::Sse> &x) { return SSE::VectorHelper<T>::reciprocal(x.data()); }
template<typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> round(const Vector<T, VectorAbi::Sse> &x) { return SSE::VectorHelper<T>::round(x.data()); }
template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T, VectorAbi::Sse>::Mask isfinite(const Vector<T, VectorAbi::Sse> &x) { return SSE::VectorHelper<T>::isFinite(x.data()); }
template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T, VectorAbi::Sse>::Mask isinf(const Vector<T, VectorAbi::Sse> &x) { return SSE::VectorHelper<T>::isInfinite(x.data()); }
template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T, VectorAbi::Sse>::Mask isnan(const Vector<T, VectorAbi::Sse> &x) { return SSE::VectorHelper<T>::isNaN(x.data()); }
// Generates the conditional_assign overloads that dispatch on the Operator
// tag of a compound assignment: each expands to `lhs(mask) op_ rhs`, i.e. the
// operation is applied only to the mask-selected lanes.
#define Vc_CONDITIONAL_ASSIGN(name_,op_) \
template <Operator O, typename T, typename M, typename U> \
Vc_INTRINSIC enable_if<O == Operator::name_, void> conditional_assign( \
Vector<T, VectorAbi::Sse> &lhs, M &&mask, U &&rhs) \
{ \
lhs(mask) op_ rhs; \
} \
Vc_NOTHING_EXPECTING_SEMICOLON
Vc_CONDITIONAL_ASSIGN( Assign, =);
Vc_CONDITIONAL_ASSIGN( PlusAssign, +=);
Vc_CONDITIONAL_ASSIGN( MinusAssign, -=);
Vc_CONDITIONAL_ASSIGN( MultiplyAssign, *=);
Vc_CONDITIONAL_ASSIGN( DivideAssign, /=);
Vc_CONDITIONAL_ASSIGN( RemainderAssign, %=);
Vc_CONDITIONAL_ASSIGN( XorAssign, ^=);
Vc_CONDITIONAL_ASSIGN( AndAssign, &=);
Vc_CONDITIONAL_ASSIGN( OrAssign, |=);
Vc_CONDITIONAL_ASSIGN( LeftShiftAssign,<<=);
Vc_CONDITIONAL_ASSIGN(RightShiftAssign,>>=);
#undef Vc_CONDITIONAL_ASSIGN
// Second form of the macro: masked increment/decrement overloads. Each
// expands to the given expression on the write-masked view and returns the
// resulting Vector.
#define Vc_CONDITIONAL_ASSIGN(name_,expr_) \
template <Operator O, typename T, typename M> \
Vc_INTRINSIC enable_if<O == Operator::name_, Vector<T, VectorAbi::Sse>> \
conditional_assign(Vector<T, VectorAbi::Sse> &lhs, M &&mask) \
{ \
return expr_; \
} \
Vc_NOTHING_EXPECTING_SEMICOLON
Vc_CONDITIONAL_ASSIGN(PostIncrement, lhs(mask)++);
Vc_CONDITIONAL_ASSIGN( PreIncrement, ++lhs(mask));
Vc_CONDITIONAL_ASSIGN(PostDecrement, lhs(mask)--);
Vc_CONDITIONAL_ASSIGN( PreDecrement, --lhs(mask));
#undef Vc_CONDITIONAL_ASSIGN
- }
- #ifndef VC_COMMON_X86_PREFETCHES_H_
- #define VC_COMMON_X86_PREFETCHES_H_
- #include <xmmintrin.h>
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace Common
- {
- static constexpr int exclusive_hint = 0;
- template <typename ExclusiveOrShared = Vc::Shared>
- Vc_INTRINSIC void prefetchForOneRead(const void *addr)
- {
- if (std::is_same<ExclusiveOrShared, Vc::Shared>::value) {
- _mm_prefetch(static_cast<char *>(const_cast<void *>(addr)), _MM_HINT_NTA);
- } else {
- _mm_prefetch(static_cast<char *>(const_cast<void *>(addr)),
- static_cast<decltype(_MM_HINT_NTA)>(_MM_HINT_NTA | exclusive_hint));
- }
- }
- template <typename ExclusiveOrShared = Vc::Shared>
- Vc_INTRINSIC void prefetchClose(const void *addr)
- {
- if (std::is_same<ExclusiveOrShared, Vc::Shared>::value) {
- _mm_prefetch(static_cast<char *>(const_cast<void *>(addr)), _MM_HINT_T0);
- } else {
- _mm_prefetch(static_cast<char *>(const_cast<void *>(addr)),
- static_cast<decltype(_MM_HINT_T0)>(_MM_HINT_T0 | exclusive_hint));
- }
- }
- template <typename ExclusiveOrShared = Vc::Shared>
- Vc_INTRINSIC void prefetchMid(const void *addr)
- {
- if (std::is_same<ExclusiveOrShared, Vc::Shared>::value) {
- _mm_prefetch(static_cast<char *>(const_cast<void *>(addr)), _MM_HINT_T1);
- } else {
- _mm_prefetch(static_cast<char *>(const_cast<void *>(addr)),
- static_cast<decltype(_MM_HINT_T1)>(_MM_HINT_T1 | exclusive_hint));
- }
- }
- template <typename ExclusiveOrShared = Vc::Shared>
- Vc_INTRINSIC void prefetchFar(const void *addr)
- {
- if (std::is_same<ExclusiveOrShared, Vc::Shared>::value) {
- _mm_prefetch(static_cast<char *>(const_cast<void *>(addr)), _MM_HINT_T2);
- } else {
- _mm_prefetch(static_cast<char *>(const_cast<void *>(addr)),
- static_cast<decltype(_MM_HINT_T2)>(_MM_HINT_T2 | exclusive_hint));
- }
- }
- namespace
- {
- template<size_t L1, size_t L2, bool UseExclusivePrefetch> Vc_INTRINSIC void handlePrefetch(const void *addr_, typename std::enable_if<L1 != 0 && L2 != 0, void *>::type = nullptr)
- {
- const char *addr = static_cast<const char *>(addr_);
- prefetchClose<typename std::conditional<UseExclusivePrefetch, Vc::Exclusive, Vc::Shared>::type>(addr + L1);
- prefetchMid <typename std::conditional<UseExclusivePrefetch, Vc::Exclusive, Vc::Shared>::type>(addr + L2);
- }
- template<size_t L1, size_t L2, bool UseExclusivePrefetch> Vc_INTRINSIC void handlePrefetch(const void *addr_, typename std::enable_if<L1 == 0 && L2 != 0, void *>::type = nullptr)
- {
- const char *addr = static_cast<const char *>(addr_);
- prefetchMid <typename std::conditional<UseExclusivePrefetch, Vc::Exclusive, Vc::Shared>::type>(addr + L2);
- }
- template<size_t L1, size_t L2, bool UseExclusivePrefetch> Vc_INTRINSIC void handlePrefetch(const void *addr_, typename std::enable_if<L1 != 0 && L2 == 0, void *>::type = nullptr)
- {
- const char *addr = static_cast<const char *>(addr_);
- prefetchClose<typename std::conditional<UseExclusivePrefetch, Vc::Exclusive, Vc::Shared>::type>(addr + L1);
- }
// No-op overload: selected when neither an L1 nor an L2 prefetch stride is
// configured.
template<size_t L1, size_t L2, bool UseExclusivePrefetch> Vc_INTRINSIC void handlePrefetch(const void *, typename std::enable_if<L1 == 0 && L2 == 0, void *>::type = nullptr)
{
}
// Flag-based dispatch: the Prefetch-enabled overloads forward the configured
// L1/L2 strides to handlePrefetch, the others compile to nothing. Loads
// default to exclusive prefetch only when the flag requests it; stores
// prefetch exclusively unless SharedPrefetch is set.
template<typename Flags> Vc_INTRINSIC void handleLoadPrefetches(const void * , Flags, typename Flags::EnableIfNotPrefetch = nullptr) {}
template<typename Flags> Vc_INTRINSIC void handleLoadPrefetches(const void *addr, Flags, typename Flags::EnableIfPrefetch = nullptr)
{
    handlePrefetch<Flags::L1Stride, Flags::L2Stride, Flags::IsExclusivePrefetch>(addr);
}
template<typename Flags> Vc_INTRINSIC void handleStorePrefetches(const void * , Flags, typename Flags::EnableIfNotPrefetch = nullptr) {}
template<typename Flags> Vc_INTRINSIC void handleStorePrefetches(const void *addr, Flags, typename Flags::EnableIfPrefetch = nullptr)
{
    handlePrefetch<Flags::L1Stride, Flags::L2Stride, !Flags::IsSharedPrefetch>(addr);
}
- }
- }
- using Common::prefetchForOneRead;
- using Common::prefetchClose;
- using Common::prefetchMid;
- using Common::prefetchFar;
- }
- #endif
- #ifndef VC_SSE_LIMITS_H_
- #define VC_SSE_LIMITS_H_
- namespace std
- {
// numeric_limits for SSE::ushort_v: scalar traits inherited from
// numeric_limits<unsigned short>; vector-valued queries return broadcast
// vectors (max = all bits set, min/epsilon/etc. = zero).
template<> struct numeric_limits< ::Vc::SSE::ushort_v> : public numeric_limits<unsigned short>
{
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v max() Vc_NOEXCEPT { return ::Vc::SSE::_mm_setallone_si128(); }
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v min() Vc_NOEXCEPT { return ::Vc::SSE::ushort_v::Zero(); }
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v lowest() Vc_NOEXCEPT { return min(); }
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v epsilon() Vc_NOEXCEPT { return ::Vc::SSE::ushort_v::Zero(); }
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v round_error() Vc_NOEXCEPT { return ::Vc::SSE::ushort_v::Zero(); }
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v infinity() Vc_NOEXCEPT { return ::Vc::SSE::ushort_v::Zero(); }
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v quiet_NaN() Vc_NOEXCEPT { return ::Vc::SSE::ushort_v::Zero(); }
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v signaling_NaN() Vc_NOEXCEPT { return ::Vc::SSE::ushort_v::Zero(); }
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v denorm_min() Vc_NOEXCEPT { return ::Vc::SSE::ushort_v::Zero(); }
};
// numeric_limits for SSE::short_v: max = 0x7fff broadcast (all-ones shifted
// right by one), min = setmin_epi16 (0x8000 broadcast).
template<> struct numeric_limits< ::Vc::SSE::short_v> : public numeric_limits<short>
{
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v max() Vc_NOEXCEPT { return _mm_srli_epi16(::Vc::SSE::_mm_setallone_si128(), 1); }
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v min() Vc_NOEXCEPT { return ::Vc::SSE::setmin_epi16(); }
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v lowest() Vc_NOEXCEPT { return min(); }
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v epsilon() Vc_NOEXCEPT { return ::Vc::SSE::short_v::Zero(); }
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v round_error() Vc_NOEXCEPT { return ::Vc::SSE::short_v::Zero(); }
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v infinity() Vc_NOEXCEPT { return ::Vc::SSE::short_v::Zero(); }
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v quiet_NaN() Vc_NOEXCEPT { return ::Vc::SSE::short_v::Zero(); }
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v signaling_NaN() Vc_NOEXCEPT { return ::Vc::SSE::short_v::Zero(); }
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v denorm_min() Vc_NOEXCEPT { return ::Vc::SSE::short_v::Zero(); }
};
// numeric_limits for SSE::uint_v: max = all bits set, min and the
// floating-point-only queries are zero.
template<> struct numeric_limits< ::Vc::SSE::uint_v> : public numeric_limits<unsigned int>
{
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v max() Vc_NOEXCEPT { return ::Vc::SSE::_mm_setallone_si128(); }
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v min() Vc_NOEXCEPT { return ::Vc::SSE::uint_v::Zero(); }
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v lowest() Vc_NOEXCEPT { return min(); }
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v epsilon() Vc_NOEXCEPT { return ::Vc::SSE::uint_v::Zero(); }
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v round_error() Vc_NOEXCEPT { return ::Vc::SSE::uint_v::Zero(); }
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v infinity() Vc_NOEXCEPT { return ::Vc::SSE::uint_v::Zero(); }
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v quiet_NaN() Vc_NOEXCEPT { return ::Vc::SSE::uint_v::Zero(); }
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v signaling_NaN() Vc_NOEXCEPT { return ::Vc::SSE::uint_v::Zero(); }
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v denorm_min() Vc_NOEXCEPT { return ::Vc::SSE::uint_v::Zero(); }
};
// numeric_limits for SSE::int_v: max = 0x7fffffff broadcast (all-ones shifted
// right by one), min = setmin_epi32 (0x80000000 broadcast).
template<> struct numeric_limits< ::Vc::SSE::int_v> : public numeric_limits<int>
{
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v max() Vc_NOEXCEPT { return _mm_srli_epi32(::Vc::SSE::_mm_setallone_si128(), 1); }
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v min() Vc_NOEXCEPT { return ::Vc::SSE::setmin_epi32(); }
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v lowest() Vc_NOEXCEPT { return min(); }
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v epsilon() Vc_NOEXCEPT { return ::Vc::SSE::int_v::Zero(); }
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v round_error() Vc_NOEXCEPT { return ::Vc::SSE::int_v::Zero(); }
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v infinity() Vc_NOEXCEPT { return ::Vc::SSE::int_v::Zero(); }
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v quiet_NaN() Vc_NOEXCEPT { return ::Vc::SSE::int_v::Zero(); }
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v signaling_NaN() Vc_NOEXCEPT { return ::Vc::SSE::int_v::Zero(); }
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v denorm_min() Vc_NOEXCEPT { return ::Vc::SSE::int_v::Zero(); }
};
- }
- #endif
- #ifndef VC_COMMON_BITSCANINTRINSICS_H_
- #define VC_COMMON_BITSCANINTRINSICS_H_
- #if defined(Vc_GCC) || defined(Vc_CLANG) || defined(Vc_APPLECLANG)
- # if Vc_GCC >= 0x40500
- # include <x86intrin.h>
- # else
- #define _bit_scan_forward(x) __builtin_ctz(x)
- static Vc_ALWAYS_INLINE Vc_CONST int _Vc_bit_scan_reverse_asm(unsigned int x) {
- int r;
- __asm__("bsr %1,%0" : "=r"(r) : "X"(x));
- return r;
- }
- #define _bit_scan_reverse(x) _Vc_bit_scan_reverse_asm(x)
- # endif
- #elif defined(_WIN32)
- #include <intrin.h>
// Win32 shim mapping the Intel _bit_scan_forward name onto _BitScanForward.
// NOTE(review): for x == 0, _BitScanForward leaves `index` unset — callers
// must not pass 0 (matches the undefined-for-zero contract of the Intel
// intrinsic).
static inline __forceinline unsigned long _bit_scan_forward(unsigned long x) {
    unsigned long index;
    _BitScanForward(&index, x);
    return index;
}
// Win32 shim for _bit_scan_reverse via _BitScanReverse; same
// undefined-for-zero caveat as _bit_scan_forward above.
static inline __forceinline unsigned long _bit_scan_reverse(unsigned long x) {
    unsigned long index;
    _BitScanReverse(&index, x);
    return index;
}
- #elif defined(Vc_ICC)
- #else
- #endif
- #endif
- #ifndef VC_COMMON_SET_H_
- #define VC_COMMON_SET_H_
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace
- {
// Builds an __m128i from eight 16-bit values. The inline-asm paths pack pairs
// of shorts into 32-bit integers and insert them with (v)movd/(v)pinsrd to
// avoid the store/reload sequence some compilers emit for _mm_setr_epi16; the
// portable fallback packs the same way and uses _mm_setr_epi32.
static Vc_INTRINSIC Vc_CONST __m128i set(unsigned short x0, unsigned short x1, unsigned short x2, unsigned short x3,
                                         unsigned short x4, unsigned short x5, unsigned short x6, unsigned short x7)
{
#if defined(Vc_GNU_ASM)
#if 0
    // Disabled 64-bit variant: two qword inserts instead of four dword ones.
    __m128i r;
    unsigned long long tmp0 = x3; tmp0 = (tmp0 << 16) | x2;
    unsigned long long tmp1 = x1; tmp1 = (tmp1 << 16) | x0;
    asm("vmovq %1,%0" : "=x"(r) : "r"((tmp0 << 32) | tmp1));
    unsigned long long tmp2 = x7; tmp2 = (tmp2 << 16) | x6;
    unsigned long long tmp3 = x5; tmp3 = (tmp3 << 16) | x4;
    asm("vpinsrq $1,%1,%0,%0" : "+x"(r) : "r"((tmp2 << 32) | tmp3));
    return r;
#elif defined(Vc_USE_VEX_CODING)
    // VEX-encoded (AVX) forms of the same insert sequence.
    __m128i r0, r1;
    unsigned int tmp0 = x1; tmp0 = (tmp0 << 16) | x0;
    unsigned int tmp1 = x3; tmp1 = (tmp1 << 16) | x2;
    unsigned int tmp2 = x5; tmp2 = (tmp2 << 16) | x4;
    unsigned int tmp3 = x7; tmp3 = (tmp3 << 16) | x6;
    asm("vmovd %1,%0" : "=x"(r0) : "r"(tmp0));
    asm("vpinsrd $1,%1,%0,%0" : "+x"(r0) : "r"(tmp1));
    asm("vmovd %1,%0" : "=x"(r1) : "r"(tmp2));
    asm("vpinsrd $1,%1,%0,%0" : "+x"(r1) : "r"(tmp3));
    asm("vpunpcklqdq %1,%0,%0" : "+x"(r0) : "x"(r1));
    return r0;
#else
    // Legacy-SSE encoding.
    __m128i r0, r1;
    unsigned int tmp0 = x1; tmp0 = (tmp0 << 16) | x0;
    unsigned int tmp1 = x3; tmp1 = (tmp1 << 16) | x2;
    unsigned int tmp2 = x5; tmp2 = (tmp2 << 16) | x4;
    unsigned int tmp3 = x7; tmp3 = (tmp3 << 16) | x6;
    asm("movd %1,%0" : "=x"(r0) : "r"(tmp0));
    asm("pinsrd $1,%1,%0" : "+x"(r0) : "r"(tmp1));
    asm("movd %1,%0" : "=x"(r1) : "r"(tmp2));
    asm("pinsrd $1,%1,%0" : "+x"(r1) : "r"(tmp3));
    asm("punpcklqdq %1,%0" : "+x"(r0) : "x"(r1));
    return r0;
#endif
#else
    // Portable fallback without inline asm.
    unsigned int tmp0 = x1; tmp0 = (tmp0 << 16) | x0;
    unsigned int tmp1 = x3; tmp1 = (tmp1 << 16) | x2;
    unsigned int tmp2 = x5; tmp2 = (tmp2 << 16) | x4;
    unsigned int tmp3 = x7; tmp3 = (tmp3 << 16) | x6;
    return _mm_setr_epi32(tmp0, tmp1, tmp2, tmp3);
#endif
}
- static Vc_INTRINSIC Vc_CONST __m128i set(short x0, short x1, short x2, short x3, short x4, short x5, short x6, short x7)
- {
- return set(static_cast<unsigned short>(x0), static_cast<unsigned short>(x1), static_cast<unsigned short>(x2),
- static_cast<unsigned short>(x3), static_cast<unsigned short>(x4), static_cast<unsigned short>(x5),
- static_cast<unsigned short>(x6), static_cast<unsigned short>(x7));
- }
- }
- }
- #endif
- #ifndef VC_COMMON_GATHERIMPLEMENTATION_H_
- #define VC_COMMON_GATHERIMPLEMENTATION_H_
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace Common
- {
- enum class GatherScatterImplementation : int {
- SimpleLoop,
- SetIndexZero,
- BitScanLoop,
- PopcntSwitch
- };
- using SimpleLoopT = std::integral_constant<GatherScatterImplementation, GatherScatterImplementation::SimpleLoop>;
- using SetIndexZeroT = std::integral_constant<GatherScatterImplementation, GatherScatterImplementation::SetIndexZero>;
- using BitScanLoopT = std::integral_constant<GatherScatterImplementation, GatherScatterImplementation::BitScanLoop>;
- using PopcntSwitchT = std::integral_constant<GatherScatterImplementation, GatherScatterImplementation::PopcntSwitch>;
- template <typename V, typename MT, typename IT>
- Vc_ALWAYS_INLINE void executeGather(SetIndexZeroT,
- V &v,
- const MT *mem,
- IT &&indexes_,
- typename V::MaskArgument mask)
- {
- auto indexes = std::forward<IT>(indexes_);
- indexes.setZeroInverted(static_cast<decltype(!indexes)>(mask));
- const V tmp(mem, indexes);
- where(mask) | v = tmp;
- }
// SimpleLoop gather strategy: scalar loop over all lanes, loading only where
// the mask is set; an empty mask returns without touching memory.
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeGather(SimpleLoopT, V &v, const MT *mem, const IT &indexes,
                                    const typename V::MaskArgument mask)
{
    if (Vc_IS_UNLIKELY(mask.isEmpty())) {
        return;
    }
#if defined Vc_GCC && Vc_GCC >= 0x40900
    // GCC >= 4.9: route the per-lane writes through a gnu::vector_size
    // builtin so the value can stay in a vector register.
    constexpr std::size_t Sizeof = sizeof(V);
    using Builtin [[gnu::vector_size(Sizeof)]] = typename V::value_type;
    Builtin tmp = reinterpret_cast<Builtin>(v.data());
    Common::unrolled_loop<std::size_t, 0, V::Size>([&](std::size_t i) {
        if (mask[i]) {
            tmp[i] = mem[indexes[i]];
        }
    });
    v.data() = reinterpret_cast<typename V::VectorType>(tmp);
#else
    Common::unrolled_loop<std::size_t, 0, V::Size>([&](std::size_t i) {
        if (mask[i])
            v[i] = mem[indexes[i]];
    });
#endif
}
// BitScanLoop gather strategy. GNU-asm path: each iteration peels the lowest
// (bsf) and highest (bsr) set mask bit and loads both lanes; when only one
// bit remains, i == j and the lane is simply loaded twice (harmless).
// Portable path: clear the lowest set bit per iteration via bits &= bits - 1.
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeGather(BitScanLoopT,
                                    V &v,
                                    const MT *mem,
                                    const IT &indexes,
                                    typename V::MaskArgument mask)
{
#ifdef Vc_GNU_ASM
    size_t bits = mask.toInt();
    while (Vc_IS_LIKELY(bits > 0)) {
        size_t i, j;
        asm("bsf %[bits],%[i]\n\t"
            "bsr %[bits],%[j]\n\t"
            "btr %[i],%[bits]\n\t"
            "btr %[j],%[bits]\n\t"
            : [i] "=r"(i), [j] "=r"(j), [bits] "+r"(bits));
        v[i] = mem[indexes[i]];
        v[j] = mem[indexes[j]];
    }
#else
    int bits = mask.toInt();
    while (bits) {
        const int i = _bit_scan_forward(bits);
        bits &= bits - 1;
        v[i] = mem[indexes[i]];
    }
#endif
}
// PopcntSwitch gather strategy for 16-lane vectors: switch on the number of
// set mask bits and peel set bits pairwise — lowest via _bit_scan_forward,
// highest via _bit_scan_reverse. Every non-default case deliberately falls
// through to the next lower count; a full mask uses the unmasked gather.
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT,
                                    V &v,
                                    const MT *mem,
                                    const IT &indexes,
                                    typename V::MaskArgument mask,
                                    enable_if<V::Size == 16> = nullarg)
{
    unsigned int bits = mask.toInt();
    unsigned int low, high = 0;
    switch (Vc::Detail::popcnt16(bits)) {
    case 16:
        v.gather(mem, indexes);
        break;
    case 15:
        low = _bit_scan_forward(bits);
        bits ^= 1 << low;
        v[low] = mem[indexes[low]];
        // deliberate fall-through (applies to all cases below)
    case 14:
        high = _bit_scan_reverse(bits);
        v[high] = mem[indexes[high]];
        high = (1 << high);
    case 13:
        low = _bit_scan_forward(bits);
        bits ^= high | (1 << low);
        v[low] = mem[indexes[low]];
    case 12:
        high = _bit_scan_reverse(bits);
        v[high] = mem[indexes[high]];
        high = (1 << high);
    case 11:
        low = _bit_scan_forward(bits);
        bits ^= high | (1 << low);
        v[low] = mem[indexes[low]];
    case 10:
        high = _bit_scan_reverse(bits);
        v[high] = mem[indexes[high]];
        high = (1 << high);
    case 9:
        low = _bit_scan_forward(bits);
        bits ^= high | (1 << low);
        v[low] = mem[indexes[low]];
    case 8:
        high = _bit_scan_reverse(bits);
        v[high] = mem[indexes[high]];
        high = (1 << high);
    case 7:
        low = _bit_scan_forward(bits);
        bits ^= high | (1 << low);
        v[low] = mem[indexes[low]];
    case 6:
        high = _bit_scan_reverse(bits);
        v[high] = mem[indexes[high]];
        high = (1 << high);
    case 5:
        low = _bit_scan_forward(bits);
        bits ^= high | (1 << low);
        v[low] = mem[indexes[low]];
    case 4:
        high = _bit_scan_reverse(bits);
        v[high] = mem[indexes[high]];
        high = (1 << high);
    case 3:
        low = _bit_scan_forward(bits);
        bits ^= high | (1 << low);
        v[low] = mem[indexes[low]];
    case 2:
        high = _bit_scan_reverse(bits);
        v[high] = mem[indexes[high]];
    case 1:
        low = _bit_scan_forward(bits);
        v[low] = mem[indexes[low]];
    case 0:
        break;
    }
}
// PopcntSwitch gather for 8-lane vectors; same pairwise bit-peeling scheme as
// the 16-lane version, all cases deliberately fall through.
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT,
                                    V &v,
                                    const MT *mem,
                                    const IT &indexes,
                                    typename V::MaskArgument mask,
                                    enable_if<V::Size == 8> = nullarg)
{
    unsigned int bits = mask.toInt();
    unsigned int low, high = 0;
    switch (Vc::Detail::popcnt8(bits)) {
    case 8:
        v.gather(mem, indexes);
        break;
    case 7:
        low = _bit_scan_forward(bits);
        bits ^= 1 << low;
        v[low] = mem[indexes[low]];
    case 6:
        high = _bit_scan_reverse(bits);
        v[high] = mem[indexes[high]];
        high = (1 << high);
    case 5:
        low = _bit_scan_forward(bits);
        bits ^= high | (1 << low);
        v[low] = mem[indexes[low]];
    case 4:
        high = _bit_scan_reverse(bits);
        v[high] = mem[indexes[high]];
        high = (1 << high);
    case 3:
        low = _bit_scan_forward(bits);
        bits ^= high | (1 << low);
        v[low] = mem[indexes[low]];
    case 2:
        high = _bit_scan_reverse(bits);
        v[high] = mem[indexes[high]];
    case 1:
        low = _bit_scan_forward(bits);
        v[low] = mem[indexes[low]];
    case 0:
        break;
    }
}
// PopcntSwitch gather for 4-lane vectors; all cases deliberately fall through.
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT,
                                    V &v,
                                    const MT *mem,
                                    const IT &indexes,
                                    typename V::MaskArgument mask,
                                    enable_if<V::Size == 4> = nullarg)
{
    unsigned int bits = mask.toInt();
    unsigned int low, high = 0;
    switch (Vc::Detail::popcnt4(bits)) {
    case 4:
        v.gather(mem, indexes);
        break;
    case 3:
        low = _bit_scan_forward(bits);
        bits ^= 1 << low;
        v[low] = mem[indexes[low]];
        // fall through
    case 2:
        high = _bit_scan_reverse(bits);
        v[high] = mem[indexes[high]];
        // fall through
    case 1:
        low = _bit_scan_forward(bits);
        v[low] = mem[indexes[low]];
        // fall through
    case 0:
        break;
    }
}
// PopcntSwitch gather for 2-lane vectors. popcnt4 is used (it counts 4 bits),
// which is sufficient since at most 2 bits can be set here.
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT,
                                    V &v,
                                    const MT *mem,
                                    const IT &indexes,
                                    typename V::MaskArgument mask,
                                    enable_if<V::Size == 2> = nullarg)
{
    unsigned int bits = mask.toInt();
    unsigned int low;
    switch (Vc::Detail::popcnt4(bits)) {
    case 2:
        v.gather(mem, indexes);
        break;
    case 1:
        low = _bit_scan_forward(bits);
        v[low] = mem[indexes[low]];
        // fall through
    case 0:
        break;
    }
}
- }
- }
- #endif
- #ifndef VC_COMMON_SCATTERIMPLEMENTATION_H_
- #define VC_COMMON_SCATTERIMPLEMENTATION_H_
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace Common
- {
// SetIndexZero "scatter" strategy.
// NOTE(review): this body performs a gather — it reads mem into v and blends
// — and is identical to executeGather(SetIndexZeroT). A scatter would write
// v's active lanes to mem. Looks like a copy/paste from the gather
// implementation; confirm against callers before relying on this path.
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeScatter(SetIndexZeroT,
                                     V &v,
                                     MT *mem,
                                     IT indexes,
                                     typename V::MaskArgument mask)
{
    indexes.setZeroInverted(static_cast<typename IT::Mask>(mask));
    const V tmp(mem, indexes);
    where(mask) | v = tmp;
}
- template <typename V, typename MT, typename IT>
- Vc_ALWAYS_INLINE void executeScatter(SimpleLoopT,
- V &v,
- MT *mem,
- const IT &indexes,
- typename V::MaskArgument mask)
- {
- if (Vc_IS_UNLIKELY(mask.isEmpty())) {
- return;
- }
- Common::unrolled_loop<std::size_t, 0, V::Size>([&](std::size_t i) {
- if (mask[i])
- mem[indexes[i]] = v[i];
- });
- }
// Scatter using x86 bit-scan instructions: each iteration extracts the
// lowest (bsf) and highest (bsr) set mask bit, clears both via btr, and
// stores the two corresponding lanes.  When only one bit remains, bsf
// and bsr yield the same position and the element is stored twice,
// which is harmless.
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeScatter(BitScanLoopT,
                                     V &v,
                                     MT *mem,
                                     const IT &indexes,
                                     typename V::MaskArgument mask)
{
    size_t bits = mask.toInt();
    while (Vc_IS_LIKELY(bits > 0)) {
        size_t i, j;
        // i = position of the lowest set bit, j = highest; both are
        // removed from the working mask before the loop repeats.
        asm("bsf %[bits],%[i]\n\t"
            "bsr %[bits],%[j]\n\t"
            "btr %[i],%[bits]\n\t"
            "btr %[j],%[bits]\n\t"
            : [i] "=r"(i), [j] "=r"(j), [bits] "+r"(bits));
        mem[indexes[i]] = v[i];
        mem[indexes[j]] = v[j];
    }
}
// Masked scatter for 16-wide vectors using the popcount-switch strategy.
// The case labels fall through INTENTIONALLY, peeling one lane per case:
// odd cases take the lowest set bit (bsf), even cases the highest (bsr).
// Clearing the bsr bit is deferred: "high" carries it into the next odd
// case, where "bits ^= high | (1 << low)" removes both bits at once.
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeScatter(PopcntSwitchT,
                                     V &v,
                                     MT *mem,
                                     const IT &indexes,
                                     typename V::MaskArgument mask,
                                     enable_if<V::Size == 16> = nullarg)
{
    unsigned int bits = mask.toInt();
    unsigned int low, high = 0;
    switch (Vc::Detail::popcnt16(bits)) {
    case 16:
        // All lanes active: unmasked full-vector scatter.
        v.scatter(mem, indexes);
        break;
    case 15:
        low = _bit_scan_forward(bits);
        bits ^= 1 << low;
        mem[indexes[low]] = v[low];
    case 14:
        high = _bit_scan_reverse(bits);
        mem[indexes[high]] = v[high];
        high = (1 << high);
    case 13:
        low = _bit_scan_forward(bits);
        bits ^= high | (1 << low);
        mem[indexes[low]] = v[low];
    case 12:
        high = _bit_scan_reverse(bits);
        mem[indexes[high]] = v[high];
        high = (1 << high);
    case 11:
        low = _bit_scan_forward(bits);
        bits ^= high | (1 << low);
        mem[indexes[low]] = v[low];
    case 10:
        high = _bit_scan_reverse(bits);
        mem[indexes[high]] = v[high];
        high = (1 << high);
    case 9:
        low = _bit_scan_forward(bits);
        bits ^= high | (1 << low);
        mem[indexes[low]] = v[low];
    case 8:
        high = _bit_scan_reverse(bits);
        mem[indexes[high]] = v[high];
        high = (1 << high);
    case 7:
        low = _bit_scan_forward(bits);
        bits ^= high | (1 << low);
        mem[indexes[low]] = v[low];
    case 6:
        high = _bit_scan_reverse(bits);
        mem[indexes[high]] = v[high];
        high = (1 << high);
    case 5:
        low = _bit_scan_forward(bits);
        bits ^= high | (1 << low);
        mem[indexes[low]] = v[low];
    case 4:
        high = _bit_scan_reverse(bits);
        mem[indexes[high]] = v[high];
        high = (1 << high);
    case 3:
        low = _bit_scan_forward(bits);
        bits ^= high | (1 << low);
        mem[indexes[low]] = v[low];
    case 2:
        // Last two lanes: bsr/bsf still see distinct bits, so no
        // clearing is required from here on.
        high = _bit_scan_reverse(bits);
        mem[indexes[high]] = v[high];
    case 1:
        low = _bit_scan_forward(bits);
        mem[indexes[low]] = v[low];
    case 0:
        break;
    }
}
// Masked scatter for 8-wide vectors; same intentional-fallthrough scheme
// as the 16-wide variant: odd cases peel the lowest set bit, even cases
// the highest, with the bsr bit cleared lazily in the following case.
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeScatter(PopcntSwitchT,
                                     V &v,
                                     MT *mem,
                                     const IT &indexes,
                                     typename V::MaskArgument mask,
                                     enable_if<V::Size == 8> = nullarg)
{
    unsigned int bits = mask.toInt();
    unsigned int low, high = 0;
    switch (Vc::Detail::popcnt8(bits)) {
    case 8:
        // All lanes active: unmasked full-vector scatter.
        v.scatter(mem, indexes);
        break;
    case 7:
        low = _bit_scan_forward(bits);
        bits ^= 1 << low;
        mem[indexes[low]] = v[low];
    case 6:
        high = _bit_scan_reverse(bits);
        mem[indexes[high]] = v[high];
        high = (1 << high);
    case 5:
        low = _bit_scan_forward(bits);
        bits ^= high | (1 << low);
        mem[indexes[low]] = v[low];
    case 4:
        high = _bit_scan_reverse(bits);
        mem[indexes[high]] = v[high];
        high = (1 << high);
    case 3:
        low = _bit_scan_forward(bits);
        bits ^= high | (1 << low);
        mem[indexes[low]] = v[low];
    case 2:
        high = _bit_scan_reverse(bits);
        mem[indexes[high]] = v[high];
    case 1:
        low = _bit_scan_forward(bits);
        mem[indexes[low]] = v[low];
    case 0:
        break;
    }
}
// Masked scatter for 4-wide vectors; intentional fallthrough peels one
// lane per case (lowest bit, then highest, then the remaining one).
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeScatter(PopcntSwitchT,
                                     V &v,
                                     MT *mem,
                                     const IT &indexes,
                                     typename V::MaskArgument mask,
                                     enable_if<V::Size == 4> = nullarg)
{
    unsigned int bits = mask.toInt();
    unsigned int low, high = 0;
    switch (Vc::Detail::popcnt4(bits)) {
    case 4:
        // All lanes active: unmasked full-vector scatter.
        v.scatter(mem, indexes);
        break;
    case 3:
        low = _bit_scan_forward(bits);
        bits ^= 1 << low;
        mem[indexes[low]] = v[low];
    case 2:
        high = _bit_scan_reverse(bits);
        mem[indexes[high]] = v[high];
    case 1:
        low = _bit_scan_forward(bits);
        mem[indexes[low]] = v[low];
    case 0:
        break;
    }
}
- template <typename V, typename MT, typename IT>
- Vc_ALWAYS_INLINE void executeScatter(PopcntSwitchT,
- V &v,
- MT *mem,
- const IT &indexes,
- typename V::MaskArgument mask,
- enable_if<V::Size == 2> = nullarg)
- {
- unsigned int bits = mask.toInt();
- unsigned int low;
- switch (Vc::Detail::popcnt4(bits)) {
- case 2:
- v.scatter(mem, indexes);
- break;
- case 1:
- low = _bit_scan_forward(bits);
- mem[indexes[low]] = v[low];
- case 0:
- break;
- }
- }
- }
- }
- #endif
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace Detail
- {
// Element-wise equality / inequality for the SSE vector types.  The
// floating-point types have dedicated cmpeq/cmpneq instructions; the
// integer types have only cmpeq, so operator!= inverts its result.
Vc_INTRINSIC SSE::double_m operator==(SSE::double_v a, SSE::double_v b) { return _mm_cmpeq_pd(a.data(), b.data()); }
Vc_INTRINSIC SSE:: float_m operator==(SSE:: float_v a, SSE:: float_v b) { return _mm_cmpeq_ps(a.data(), b.data()); }
Vc_INTRINSIC SSE:: int_m operator==(SSE:: int_v a, SSE:: int_v b) { return _mm_cmpeq_epi32(a.data(), b.data()); }
Vc_INTRINSIC SSE:: uint_m operator==(SSE:: uint_v a, SSE:: uint_v b) { return _mm_cmpeq_epi32(a.data(), b.data()); }
Vc_INTRINSIC SSE:: short_m operator==(SSE:: short_v a, SSE:: short_v b) { return _mm_cmpeq_epi16(a.data(), b.data()); }
Vc_INTRINSIC SSE::ushort_m operator==(SSE::ushort_v a, SSE::ushort_v b) { return _mm_cmpeq_epi16(a.data(), b.data()); }
Vc_INTRINSIC SSE::double_m operator!=(SSE::double_v a, SSE::double_v b) { return _mm_cmpneq_pd(a.data(), b.data()); }
Vc_INTRINSIC SSE:: float_m operator!=(SSE:: float_v a, SSE:: float_v b) { return _mm_cmpneq_ps(a.data(), b.data()); }
Vc_INTRINSIC SSE:: int_m operator!=(SSE:: int_v a, SSE:: int_v b) { return not_(_mm_cmpeq_epi32(a.data(), b.data())); }
Vc_INTRINSIC SSE:: uint_m operator!=(SSE:: uint_v a, SSE:: uint_v b) { return not_(_mm_cmpeq_epi32(a.data(), b.data())); }
Vc_INTRINSIC SSE:: short_m operator!=(SSE:: short_v a, SSE:: short_v b) { return not_(_mm_cmpeq_epi16(a.data(), b.data())); }
Vc_INTRINSIC SSE::ushort_m operator!=(SSE::ushort_v a, SSE::ushort_v b) { return not_(_mm_cmpeq_epi16(a.data(), b.data())); }
// Element-wise ordering (>, <).  SSE only provides *signed* integer
// compares; for the unsigned types the SSE::cmpgt_epu32/cmplt_epu32 and
// _epu16 helpers implement a correct unsigned comparison.  Defining
// USE_INCORRECT_UNSIGNED_COMPARE instead falls back to the raw signed
// instruction, which gives wrong answers when the sign bits of the
// operands differ (hence the macro's name).
Vc_INTRINSIC SSE::double_m operator> (SSE::double_v a, SSE::double_v b) { return _mm_cmpgt_pd(a.data(), b.data()); }
Vc_INTRINSIC SSE:: float_m operator> (SSE:: float_v a, SSE:: float_v b) { return _mm_cmpgt_ps(a.data(), b.data()); }
Vc_INTRINSIC SSE:: int_m operator> (SSE:: int_v a, SSE:: int_v b) { return _mm_cmpgt_epi32(a.data(), b.data()); }
Vc_INTRINSIC SSE:: uint_m operator> (SSE:: uint_v a, SSE:: uint_v b) {
#ifndef USE_INCORRECT_UNSIGNED_COMPARE
    return SSE::cmpgt_epu32(a.data(), b.data());
#else
    return _mm_cmpgt_epi32(a.data(), b.data());
#endif
}
Vc_INTRINSIC SSE:: short_m operator> (SSE:: short_v a, SSE:: short_v b) { return _mm_cmpgt_epi16(a.data(), b.data()); }
Vc_INTRINSIC SSE::ushort_m operator> (SSE::ushort_v a, SSE::ushort_v b) {
#ifndef USE_INCORRECT_UNSIGNED_COMPARE
    return SSE::cmpgt_epu16(a.data(), b.data());
#else
    return _mm_cmpgt_epi16(a.data(), b.data());
#endif
}
Vc_INTRINSIC SSE::double_m operator< (SSE::double_v a, SSE::double_v b) { return _mm_cmplt_pd(a.data(), b.data()); }
Vc_INTRINSIC SSE:: float_m operator< (SSE:: float_v a, SSE:: float_v b) { return _mm_cmplt_ps(a.data(), b.data()); }
Vc_INTRINSIC SSE:: int_m operator< (SSE:: int_v a, SSE:: int_v b) { return _mm_cmplt_epi32(a.data(), b.data()); }
Vc_INTRINSIC SSE:: uint_m operator< (SSE:: uint_v a, SSE:: uint_v b) {
#ifndef USE_INCORRECT_UNSIGNED_COMPARE
    return SSE::cmplt_epu32(a.data(), b.data());
#else
    return _mm_cmplt_epi32(a.data(), b.data());
#endif
}
Vc_INTRINSIC SSE:: short_m operator< (SSE:: short_v a, SSE:: short_v b) { return _mm_cmplt_epi16(a.data(), b.data()); }
Vc_INTRINSIC SSE::ushort_m operator< (SSE::ushort_v a, SSE::ushort_v b) {
#ifndef USE_INCORRECT_UNSIGNED_COMPARE
    return SSE::cmplt_epu16(a.data(), b.data());
#else
    return _mm_cmplt_epi16(a.data(), b.data());
#endif
}
// Element-wise >= / <=.  Floating point uses the dedicated
// not-less-than / less-or-equal compares; the integer types have no
// native ge/le instruction, so the negated strict comparison is used.
Vc_INTRINSIC SSE::double_m operator>=(SSE::double_v a, SSE::double_v b) { return _mm_cmpnlt_pd(a.data(), b.data()); }
Vc_INTRINSIC SSE:: float_m operator>=(SSE:: float_v a, SSE:: float_v b) { return _mm_cmpnlt_ps(a.data(), b.data()); }
Vc_INTRINSIC SSE:: int_m operator>=(SSE:: int_v a, SSE:: int_v b) { return !(a < b); }
Vc_INTRINSIC SSE:: uint_m operator>=(SSE:: uint_v a, SSE:: uint_v b) { return !(a < b); }
Vc_INTRINSIC SSE:: short_m operator>=(SSE:: short_v a, SSE:: short_v b) { return !(a < b); }
Vc_INTRINSIC SSE::ushort_m operator>=(SSE::ushort_v a, SSE::ushort_v b) { return !(a < b); }
Vc_INTRINSIC SSE::double_m operator<=(SSE::double_v a, SSE::double_v b) { return _mm_cmple_pd(a.data(), b.data()); }
Vc_INTRINSIC SSE:: float_m operator<=(SSE:: float_v a, SSE:: float_v b) { return _mm_cmple_ps(a.data(), b.data()); }
Vc_INTRINSIC SSE:: int_m operator<=(SSE:: int_v a, SSE:: int_v b) { return !(a > b); }
Vc_INTRINSIC SSE:: uint_m operator<=(SSE:: uint_v a, SSE:: uint_v b) { return !(a > b); }
Vc_INTRINSIC SSE:: short_m operator<=(SSE:: short_v a, SSE:: short_v b) { return !(a > b); }
Vc_INTRINSIC SSE::ushort_m operator<=(SSE::ushort_v a, SSE::ushort_v b) { return !(a > b); }
// Element-wise bitwise operators, delegating to the type-agnostic
// register helpers (xor_/and_/or_).
template <typename T>
Vc_INTRINSIC SSE::Vector<T> operator^(SSE::Vector<T> a, SSE::Vector<T> b)
{
    return xor_(a.data(), b.data());
}
template <typename T>
Vc_INTRINSIC SSE::Vector<T> operator&(SSE::Vector<T> a, SSE::Vector<T> b)
{
    return and_(a.data(), b.data());
}
template <typename T>
Vc_INTRINSIC SSE::Vector<T> operator|(SSE::Vector<T> a, SSE::Vector<T> b)
{
    return or_(a.data(), b.data());
}
// Element-wise arithmetic; the trailing T() argument tag-dispatches the
// add/sub/mul helper to the instruction for the right element type.
template <typename T>
Vc_INTRINSIC SSE::Vector<T> operator+(SSE::Vector<T> a, SSE::Vector<T> b)
{
    return add(a.data(), b.data(), T());
}
template <typename T>
Vc_INTRINSIC SSE::Vector<T> operator-(SSE::Vector<T> a, SSE::Vector<T> b)
{
    return sub(a.data(), b.data(), T());
}
template <typename T>
Vc_INTRINSIC SSE::Vector<T> operator*(SSE::Vector<T> a, SSE::Vector<T> b)
{
    return mul(a.data(), b.data(), T());
}
// Element-wise division for floating-point types: a native divps/divpd
// via the div helper.
template <typename T>
Vc_INTRINSIC enable_if<std::is_floating_point<T>::value, SSE::Vector<T>> operator/(
    SSE::Vector<T> a, SSE::Vector<T> b)
{
    return div(a.data(), b.data(), T());
}
// 32-bit integer division: SSE has no integer divide instruction, so
// each element is divided with scalar code via generate().
template <typename T>
Vc_INTRINSIC
enable_if<std::is_same<int, T>::value || std::is_same<uint, T>::value, SSE::Vector<T>>
operator/(SSE::Vector<T> a, SSE::Vector<T> b)
{
    return SSE::Vector<T>::generate([&](int i) { return a[i] / b[i]; });
}
// 16-bit integer division, done in single-precision float: every 16-bit
// value is exactly representable as a float, so widening each half to
// epi32, converting to float, dividing, and truncating back yields the
// exact integer quotient without scalar fallback.
template <typename T>
Vc_INTRINSIC enable_if<std::is_same<short, T>::value || std::is_same<ushort, T>::value,
                       SSE::Vector<T>>
operator/(SSE::Vector<T> a, SSE::Vector<T> b)
{
    using HT = SSE::VectorHelper<T>;
    // expand0/expand1 widen the low/high four elements to 32 bits.
    __m128 lo = _mm_cvtepi32_ps(HT::expand0(a.data()));
    __m128 hi = _mm_cvtepi32_ps(HT::expand1(a.data()));
    lo = _mm_div_ps(lo, _mm_cvtepi32_ps(HT::expand0(b.data())));
    hi = _mm_div_ps(hi, _mm_cvtepi32_ps(HT::expand1(b.data())));
    // cvttps truncates toward zero, matching C++ integer division.
    return HT::concat(_mm_cvttps_epi32(lo), _mm_cvttps_epi32(hi));
}
- template <typename T>
- Vc_INTRINSIC enable_if<std::is_integral<T>::value, SSE::Vector<T>> operator%(
- SSE::Vector<T> a, SSE::Vector<T> b)
- {
- return a - a / b * b;
- }
- }
// Special-initializer constructors: an all-zero vector and an all-ones
// (value 1) vector, produced by the helper classes' zero()/one().
template<typename T> Vc_INTRINSIC Vector<T, VectorAbi::Sse>::Vector(VectorSpecialInitializerZero)
    : d(HV::zero())
{
}
template<typename T> Vc_INTRINSIC Vector<T, VectorAbi::Sse>::Vector(VectorSpecialInitializerOne)
    : d(HT::one())
{
}
// IndexesFromZero constructor: loads the precomputed {0, 1, 2, ...}
// table for the entry type.
template <typename T>
Vc_INTRINSIC Vector<T, VectorAbi::Sse>::Vector(VectorSpecialInitializerIndexesFromZero)
    : d(Detail::load16(Detail::IndexesFromZero<EntryType, Size>(), Aligned))
{
#if defined Vc_GCC && Vc_GCC < 0x40903 && defined Vc_IMPL_AVX2
    // Empty asm acts as an optimization barrier; works around a
    // code-generation problem with short vectors on GCC < 4.9.3 when
    // AVX2 is enabled (exact GCC bug not documented here — see the
    // version check above).
    if (std::is_same<T, short>::value) {
        asm("" ::"x"(d.v()));
    }
#endif
}
// float/double have no index table; they convert the int index vector.
template <>
Vc_INTRINSIC Vector<float, VectorAbi::Sse>::Vector(VectorSpecialInitializerIndexesFromZero)
    : d(SSE::convert<int, float>(SSE::int_v::IndexesFromZero().data()))
{
}
template <>
Vc_INTRINSIC Vector<double, VectorAbi::Sse>::Vector(VectorSpecialInitializerIndexesFromZero)
    : d(SSE::convert<int, double>(SSE::int_v::IndexesFromZero().data()))
{
}
// Load from memory of (possibly different) element type SrcT, applying
// any prefetch hints encoded in Flags first.  MSVC rejects the
// dependent-template disambiguator in this position, hence the #ifndef.
template <typename DstT>
template <typename SrcT, typename Flags>
Vc_INTRINSIC typename Vector<DstT, VectorAbi::Sse>::
#ifndef Vc_MSVC
    template
#endif
    load_concept<SrcT, Flags>::type Vector<DstT, VectorAbi::Sse>::load(const SrcT *mem, Flags flags)
{
    Common::handleLoadPrefetches(mem, flags);
    d.v() = Detail::load<VectorType, DstT>(mem, flags);
}
// Zero the whole vector.
template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Sse>::setZero()
{
    data() = HV::zero();
}
// Zero only the lanes where the mask is set: andnot_(k, data) keeps the
// lanes where k is false.
template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Sse>::setZero(const Mask &k)
{
    data() = Detail::andnot_(k.data(), data());
}
// Zero the lanes where the mask is NOT set: and_(k, data) keeps only
// the masked lanes.
template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Sse>::setZeroInverted(const Mask &k)
{
    data() = Detail::and_(k.data(), data());
}
// Fill with quiet NaN: the all-ones bit pattern has the exponent all set
// and a non-zero mantissa, which is a QNaN for IEEE-754 floats/doubles.
template<> Vc_INTRINSIC void SSE::double_v::setQnan()
{
    data() = SSE::_mm_setallone_pd();
}
// Masked variant: OR-ing all-ones into the selected lanes turns exactly
// those lanes into QNaN while leaving the rest untouched.
template<> Vc_INTRINSIC void Vector<double, VectorAbi::Sse>::setQnan(const Mask &k)
{
    data() = _mm_or_pd(data(), k.dataD());
}
template<> Vc_INTRINSIC void SSE::float_v::setQnan()
{
    data() = SSE::_mm_setallone_ps();
}
template<> Vc_INTRINSIC void Vector<float, VectorAbi::Sse>::setQnan(const Mask &k)
{
    data() = _mm_or_ps(data(), k.dataF());
}
// Store to memory of (possibly different) element type U, applying any
// prefetch hints encoded in Flags first.
template <typename T>
template <typename U, typename Flags, typename>
Vc_INTRINSIC void Vector<T, VectorAbi::Sse>::store(U *mem, Flags flags) const
{
    Common::handleStorePrefetches(mem, flags);
    HV::template store<Flags>(mem, data());
}
// Masked store: only lanes selected by the mask are written.
template <typename T>
template <typename U, typename Flags, typename>
Vc_INTRINSIC void Vector<T, VectorAbi::Sse>::store(U *mem, Mask mask, Flags flags) const
{
    Common::handleStorePrefetches(mem, flags);
    HV::template store<Flags>(mem, data(), mask.data());
}
// Unary minus; the integral-constant tag dispatches Detail::negate to
// the implementation for this element width.
template<typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::operator-() const
{
    return Detail::negate(d.v(), std::integral_constant<std::size_t, sizeof(T)>());
}
// Per-element variable shifts.
// XOP: _mm_sha_* is the arithmetic, _mm_shl_* the logical variable
// shift; both shift right for negative counts, so operator>> is simply
// operator<< with a negated shift vector.
// AVX2: dedicated variable-shift instructions (sllv/srav/srlv) exist
// for 32-bit elements only, hence no short/ushort overloads here.
#ifdef Vc_IMPL_XOP
template <> Vc_ALWAYS_INLINE SSE::int_v SSE::int_v::operator<<(const SSE::int_v shift) const { return _mm_sha_epi32(d.v(), shift.d.v()); }
template <> Vc_ALWAYS_INLINE SSE::uint_v SSE::uint_v::operator<<(const SSE::uint_v shift) const { return _mm_shl_epi32(d.v(), shift.d.v()); }
template <> Vc_ALWAYS_INLINE SSE::short_v SSE::short_v::operator<<(const SSE::short_v shift) const { return _mm_sha_epi16(d.v(), shift.d.v()); }
template <> Vc_ALWAYS_INLINE SSE::ushort_v SSE::ushort_v::operator<<(const SSE::ushort_v shift) const { return _mm_shl_epi16(d.v(), shift.d.v()); }
template <> Vc_ALWAYS_INLINE SSE::int_v SSE::int_v::operator>>(const SSE::int_v shift) const { return operator<<(-shift); }
template <> Vc_ALWAYS_INLINE SSE::uint_v SSE::uint_v::operator>>(const SSE::uint_v shift) const { return operator<<(-shift); }
template <> Vc_ALWAYS_INLINE SSE::short_v SSE::short_v::operator>>(const SSE::short_v shift) const { return operator<<(-shift); }
template <> Vc_ALWAYS_INLINE SSE::ushort_v SSE::ushort_v::operator>>(const SSE::ushort_v shift) const { return operator<<(-shift); }
#elif defined Vc_IMPL_AVX2
template <> Vc_ALWAYS_INLINE SSE::Vector< int> Vector< int, VectorAbi::Sse>::operator<<(const SSE::Vector< int> x) const { return _mm_sllv_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE SSE::Vector< uint> Vector< uint, VectorAbi::Sse>::operator<<(const SSE::Vector< uint> x) const { return _mm_sllv_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE SSE::Vector< int> Vector< int, VectorAbi::Sse>::operator>>(const SSE::Vector< int> x) const { return _mm_srav_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE SSE::Vector< uint> Vector< uint, VectorAbi::Sse>::operator>>(const SSE::Vector< uint> x) const { return _mm_srlv_epi32(d.v(), x.d.v()); }
#endif
// Shifts by a uniform scalar count; all four forms delegate to the
// per-type helper's shiftRight/shiftLeft.
template<typename T> Vc_ALWAYS_INLINE Vector<T, VectorAbi::Sse> &Vector<T, VectorAbi::Sse>::operator>>=(int shift) {
    d.v() = HT::shiftRight(d.v(), shift);
    return *this;
}
template<typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::operator>>(int shift) const {
    return HT::shiftRight(d.v(), shift);
}
template<typename T> Vc_ALWAYS_INLINE Vector<T, VectorAbi::Sse> &Vector<T, VectorAbi::Sse>::operator<<=(int shift) {
    d.v() = HT::shiftLeft(d.v(), shift);
    return *this;
}
template<typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::operator<<(int shift) const {
    return HT::shiftLeft(d.v(), shift);
}
// Sign-bit test: isolate the IEEE sign bit, then arithmetic-shift it
// right by 31 so it is broadcast into a full 32-bit lane mask
// (all-ones if negative, zero otherwise).  This also classifies -0.0
// and negative NaNs as negative.
Vc_INTRINSIC Vc_CONST SSE::float_m isnegative(SSE::float_v x)
{
    return sse_cast<__m128>(_mm_srai_epi32(
        sse_cast<__m128i>(_mm_and_ps(SSE::_mm_setsignmask_ps(), x.data())), 31));
}
// double: the sign bit lives in the upper 32-bit half of each 64-bit
// lane; after the shift, permute<X1,X1,X3,X3> duplicates that half into
// both 32-bit halves to form a full 64-bit lane mask.
Vc_INTRINSIC Vc_CONST SSE::double_m isnegative(SSE::double_v x)
{
    return Mem::permute<X1, X1, X3, X3>(sse_cast<__m128>(_mm_srai_epi32(
        sse_cast<__m128i>(_mm_and_pd(SSE::_mm_setsignmask_pd(), x.data())), 31)));
}
// Unmasked gathers: Vc_GATHER_IMPL expands to the specialization header
// and Vc_M(i) reads element i through the scaled index
// (args.address[Scale * args.indexes[i]]).  Each element type fills the
// register with the matching setr/set intrinsic.
#define Vc_GATHER_IMPL(V_) \
    template <> \
    template <class MT, class IT, int Scale> \
    inline void SSE::V_::gatherImplementation( \
        const Common::GatherArguments<MT, IT, Scale> &args)
#define Vc_M(i_) static_cast<value_type>(args.address[Scale * args.indexes[i_]])
Vc_GATHER_IMPL(double_v) { d.v() = _mm_setr_pd(Vc_M(0), Vc_M(1)); }
Vc_GATHER_IMPL(float_v) { d.v() = _mm_setr_ps(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3)); }
Vc_GATHER_IMPL(int_v) { d.v() = _mm_setr_epi32(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3)); }
Vc_GATHER_IMPL(uint_v) { d.v() = _mm_setr_epi32(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3)); }
Vc_GATHER_IMPL(short_v)
{
    d.v() =
        Vc::set(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5), Vc_M(6), Vc_M(7));
}
Vc_GATHER_IMPL(ushort_v)
{
    d.v() =
        Vc::set(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5), Vc_M(6), Vc_M(7));
}
#undef Vc_M
#undef Vc_GATHER_IMPL
// Masked gather: the strategy is chosen at compile time from the
// Vc_USE_* configuration macros (default: SimpleLoop) and dispatched by
// tag to the Common::executeGather overloads above.
template <typename T>
template <class MT, class IT, int Scale>
inline void Vector<T, VectorAbi::Sse>::gatherImplementation(
    const Common::GatherArguments<MT, IT, Scale> &args, MaskArgument mask)
{
    const auto *mem = args.address;
    // Apply the element-stride scale up front so the strategies can use
    // the indexes directly.
    const auto indexes = Scale * args.indexes;
    using Selector = std::integral_constant < Common::GatherScatterImplementation,
#ifdef Vc_USE_SET_GATHERS
          Traits::is_simd_vector<IT>::value ? Common::GatherScatterImplementation::SetIndexZero :
#endif
#ifdef Vc_USE_BSF_GATHERS
          Common::GatherScatterImplementation::BitScanLoop
#elif defined Vc_USE_POPCNT_BSF_GATHERS
          Common::GatherScatterImplementation::PopcntSwitch
#else
          Common::GatherScatterImplementation::SimpleLoop
#endif
          > ;
    Common::executeGather(Selector(), *this, mem, indexes, mask);
}
- template <typename T>
- template <typename MT, typename IT>
- inline void Vector<T, VectorAbi::Sse>::scatterImplementation(MT *mem, IT &&indexes) const
- {
- Common::unrolled_loop<std::size_t, 0, Size>([&](std::size_t i) { mem[indexes[i]] = d.m(i); });
- }
// Masked scatter: same compile-time strategy selection as the masked
// gather (the Vc_USE_*_GATHERS macros govern both directions).
template <typename T>
template <typename MT, typename IT>
inline void Vector<T, VectorAbi::Sse>::scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const
{
    using Selector = std::integral_constant < Common::GatherScatterImplementation,
#ifdef Vc_USE_SET_GATHERS
          Traits::is_simd_vector<IT>::value ? Common::GatherScatterImplementation::SetIndexZero :
#endif
#ifdef Vc_USE_BSF_GATHERS
          Common::GatherScatterImplementation::BitScanLoop
#elif defined Vc_USE_POPCNT_BSF_GATHERS
          Common::GatherScatterImplementation::PopcntSwitch
#else
          Common::GatherScatterImplementation::SimpleLoop
#endif
          > ;
    Common::executeScatter(Selector(), *this, mem, indexes, mask);
}
- template<typename T> Vc_ALWAYS_INLINE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::partialSum() const
- {
- Vector<T, VectorAbi::Sse> tmp = *this;
- if (Size > 1) tmp += tmp.shifted(-1);
- if (Size > 2) tmp += tmp.shifted(-2);
- if (Size > 4) tmp += tmp.shifted(-4);
- if (Size > 8) tmp += tmp.shifted(-8);
- if (Size > 16) tmp += tmp.shifted(-16);
- return tmp;
- }
// Without SSE4.1 there is no 32-bit packed multiply (pmulld), so the
// horizontal product of the four elements is computed with scalar
// multiplications instead.
#ifndef Vc_IMPL_SSE4_1
template<> Vc_INTRINSIC Vc_PURE int SSE::int_v::product() const
{
    return (d.m(0) * d.m(1)) * (d.m(2) * d.m(3));
}
template<> Vc_INTRINSIC Vc_PURE unsigned int SSE::uint_v::product() const
{
    return (d.m(0) * d.m(1)) * (d.m(2) * d.m(3));
}
#endif
- template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T, VectorAbi::Sse>::EntryType Vector<T, VectorAbi::Sse>::min(MaskArg m) const
- {
- Vector<T, VectorAbi::Sse> tmp = std::numeric_limits<Vector<T, VectorAbi::Sse> >::max();
- tmp(m) = *this;
- return tmp.min();
- }
- template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T, VectorAbi::Sse>::EntryType Vector<T, VectorAbi::Sse>::max(MaskArg m) const
- {
- Vector<T, VectorAbi::Sse> tmp = std::numeric_limits<Vector<T, VectorAbi::Sse> >::min();
- tmp(m) = *this;
- return tmp.max();
- }
- template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T, VectorAbi::Sse>::EntryType Vector<T, VectorAbi::Sse>::product(MaskArg m) const
- {
- Vector<T, VectorAbi::Sse> tmp(Vc::One);
- tmp(m) = *this;
- return tmp.product();
- }
- template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T, VectorAbi::Sse>::EntryType Vector<T, VectorAbi::Sse>::sum(MaskArg m) const
- {
- Vector<T, VectorAbi::Sse> tmp(Vc::Zero);
- tmp(m) = *this;
- return tmp.sum();
- }
namespace Detail
{
// Extract the unbiased IEEE-754 exponent of each element: shift the
// mantissa bits out, subtract the exponent bias, convert to the
// floating-point type.  Only valid for positive, finite inputs (the
// public wrappers below assert this).
Vc_INTRINSIC Vc_CONST __m128 exponent(__m128 v)
{
    // float: 23 mantissa bits, bias 127 (0x7f).
    __m128i tmp = _mm_srli_epi32(_mm_castps_si128(v), 23);
    tmp = _mm_sub_epi32(tmp, _mm_set1_epi32(0x7f));
    return _mm_cvtepi32_ps(tmp);
}
Vc_INTRINSIC Vc_CONST __m128d exponent(__m128d v)
{
    // double: 52 mantissa bits, bias 1023 (0x3ff).  The shuffle (0x08)
    // packs the two 32-bit results into the low lanes for cvtepi32_pd.
    __m128i tmp = _mm_srli_epi64(_mm_castpd_si128(v), 52);
    tmp = _mm_sub_epi32(tmp, _mm_set1_epi32(0x3ff));
    return _mm_cvtepi32_pd(_mm_shuffle_epi32(tmp, 0x08));
}
}
// Public exponent(): asserts all elements are >= 0 (the bit-field trick
// in Detail::exponent is not meaningful for negative inputs), then
// delegates to the register-level implementation.
Vc_INTRINSIC Vc_CONST SSE::float_v exponent(SSE::float_v x)
{
    using Detail::operator>=;
    Vc_ASSERT((x >= x.Zero()).isFull());
    return Detail::exponent(x.data());
}
Vc_INTRINSIC Vc_CONST SSE::double_v exponent(SSE::double_v x)
{
    using Detail::operator>=;
    Vc_ASSERT((x >= x.Zero()).isFull());
    return Detail::exponent(x.data());
}
// Advance the global RNG state by one step.  Two 4-wide linear
// congruential generators (x = x * 0xdeece66d + 11 — presumably the
// rand48 constants truncated to 32 bits; confirm against the RNG design
// docs) are stepped; state1's new value is stored back directly, while
// the value written to state0's slot is additionally XORed with the
// high halves of state1 before storing.
static void _doRandomStep(SSE::uint_v &state0,
                          SSE::uint_v &state1)
{
    using SSE::uint_v;
    using Detail::operator+;
    using Detail::operator*;
    state0.load(&Common::RandomState[0]);
    state1.load(&Common::RandomState[uint_v::Size]);
    (state1 * uint_v(0xdeece66du) + uint_v(11)).store(&Common::RandomState[uint_v::Size]);
    uint_v(_mm_xor_si128((state0 * uint_v(0xdeece66du) + uint_v(11)).data(),
                         _mm_srli_epi32(state1.data(), 16)))
        .store(&Common::RandomState[0]);
}
// Random(): integer types reinterpret the raw 32-bit RNG state.
template<typename T> Vc_ALWAYS_INLINE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::Random()
{
    SSE::uint_v state0, state1;
    _doRandomStep(state0, state1);
    return state0.data();
}
// float: shift random bits into the mantissa, OR with 1.0f to force the
// exponent of the [1, 2) range, then subtract 1.0f to map to [0, 1).
template<> Vc_ALWAYS_INLINE SSE::float_v SSE::float_v::Random()
{
    SSE::uint_v state0, state1;
    _doRandomStep(state0, state1);
    return _mm_sub_ps(_mm_or_ps(_mm_castsi128_ps(_mm_srli_epi32(state0.data(), 2)), HT::one()), HT::one());
}
// double: steps two 64-bit LCGs (full 0x5deece66d multiplier) stored in
// RandomState[8..11], then shifts 12 random bits out of the exponent
// field so the OR with 1.0 yields a value in [1, 2); subtracting 1.0
// maps it to [0, 1).  The Vc_MAY_ALIAS typedef sanctions accessing the
// uint32 state array as uint64.
template<> Vc_ALWAYS_INLINE SSE::double_v SSE::double_v::Random()
{
    typedef unsigned long long uint64 Vc_MAY_ALIAS;
    uint64 state0 = *reinterpret_cast<const uint64 *>(&Common::RandomState[8]);
    uint64 state1 = *reinterpret_cast<const uint64 *>(&Common::RandomState[10]);
    // Read the pre-update state as a vector before storing the update.
    const __m128i state = _mm_load_si128(reinterpret_cast<const __m128i *>(&Common::RandomState[8]));
    *reinterpret_cast<uint64 *>(&Common::RandomState[ 8]) = (state0 * 0x5deece66dull + 11);
    *reinterpret_cast<uint64 *>(&Common::RandomState[10]) = (state1 * 0x5deece66dull + 11);
    return _mm_sub_pd(_mm_or_pd(_mm_castsi128_pd(_mm_srli_epi64(state, 12)), HT::one()), HT::one());
}
// Shift the vector by `amount` element positions, filling with zeros.
// Positive amounts use a whole-register byte shift toward lower indexes
// (srli: element i receives element i + amount); negative amounts shift
// toward higher indexes (slli).  The intrinsics need an immediate byte
// count, hence the switch; any amount outside [-8, 8] falls off the
// switch and returns an all-zero vector.
template<typename T> Vc_INTRINSIC Vc_PURE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::shifted(int amount) const
{
    enum {
        EntryTypeSizeof = sizeof(EntryType)
    };
    switch (amount) {
    case 0: return *this;
    case 1: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 1 * EntryTypeSizeof));
    case 2: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 2 * EntryTypeSizeof));
    case 3: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 3 * EntryTypeSizeof));
    case 4: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 4 * EntryTypeSizeof));
    case 5: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 5 * EntryTypeSizeof));
    case 6: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 6 * EntryTypeSizeof));
    case 7: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 7 * EntryTypeSizeof));
    case 8: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 8 * EntryTypeSizeof));
    case -1: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 1 * EntryTypeSizeof));
    case -2: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 2 * EntryTypeSizeof));
    case -3: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 3 * EntryTypeSizeof));
    case -4: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 4 * EntryTypeSizeof));
    case -5: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 5 * EntryTypeSizeof));
    case -6: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 6 * EntryTypeSizeof));
    case -7: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 7 * EntryTypeSizeof));
    case -8: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 8 * EntryTypeSizeof));
    }
    return Zero();
}
// Shift by `amount` elements, with the vacated positions filled from
// `shiftIn` instead of zeros.  Implemented with palignr (alignr_epi8),
// which concatenates two registers and extracts a 16-byte window at an
// immediate byte offset — hence one case label per possible amount.
// Amounts below -Size delegate to shiftIn.shifted() so the fill comes
// out right.
template<typename T> Vc_INTRINSIC Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::shifted(int amount, Vector shiftIn) const
{
    if (amount >= -int(size())) {
        constexpr int VectorWidth = int(size());
        constexpr int EntryTypeSizeof = sizeof(EntryType);
        const __m128i v0 = sse_cast<__m128i>(d.v());
        const __m128i v1 = sse_cast<__m128i>(shiftIn.d.v());
        // fixup converts the integer result back to this VectorType.
        auto &&fixup = sse_cast<VectorType, __m128i>;
        switch (amount) {
        case 0: return *this;
        case -1: return fixup(SSE::alignr_epi8<(VectorWidth - 1) * EntryTypeSizeof>(v0, v1));
        case -2: return fixup(SSE::alignr_epi8<(VectorWidth - 2) * EntryTypeSizeof>(v0, v1));
        case -3: return fixup(SSE::alignr_epi8<(VectorWidth - 3) * EntryTypeSizeof>(v0, v1));
        case -4: return fixup(SSE::alignr_epi8<(VectorWidth - 4) * EntryTypeSizeof>(v0, v1));
        case -5: return fixup(SSE::alignr_epi8<(VectorWidth - 5) * EntryTypeSizeof>(v0, v1));
        case -6: return fixup(SSE::alignr_epi8<(VectorWidth - 6) * EntryTypeSizeof>(v0, v1));
        case -7: return fixup(SSE::alignr_epi8<(VectorWidth - 7) * EntryTypeSizeof>(v0, v1));
        case -8: return fixup(SSE::alignr_epi8<(VectorWidth - 8) * EntryTypeSizeof>(v0, v1));
        case -9: return fixup(SSE::alignr_epi8<(VectorWidth - 9) * EntryTypeSizeof>(v0, v1));
        case-10: return fixup(SSE::alignr_epi8<(VectorWidth -10) * EntryTypeSizeof>(v0, v1));
        case-11: return fixup(SSE::alignr_epi8<(VectorWidth -11) * EntryTypeSizeof>(v0, v1));
        case-12: return fixup(SSE::alignr_epi8<(VectorWidth -12) * EntryTypeSizeof>(v0, v1));
        case-13: return fixup(SSE::alignr_epi8<(VectorWidth -13) * EntryTypeSizeof>(v0, v1));
        case-14: return fixup(SSE::alignr_epi8<(VectorWidth -14) * EntryTypeSizeof>(v0, v1));
        case-15: return fixup(SSE::alignr_epi8<(VectorWidth -15) * EntryTypeSizeof>(v0, v1));
        case 1: return fixup(SSE::alignr_epi8< 1 * EntryTypeSizeof>(v1, v0));
        case 2: return fixup(SSE::alignr_epi8< 2 * EntryTypeSizeof>(v1, v0));
        case 3: return fixup(SSE::alignr_epi8< 3 * EntryTypeSizeof>(v1, v0));
        case 4: return fixup(SSE::alignr_epi8< 4 * EntryTypeSizeof>(v1, v0));
        case 5: return fixup(SSE::alignr_epi8< 5 * EntryTypeSizeof>(v1, v0));
        case 6: return fixup(SSE::alignr_epi8< 6 * EntryTypeSizeof>(v1, v0));
        case 7: return fixup(SSE::alignr_epi8< 7 * EntryTypeSizeof>(v1, v0));
        case 8: return fixup(SSE::alignr_epi8< 8 * EntryTypeSizeof>(v1, v0));
        case 9: return fixup(SSE::alignr_epi8< 9 * EntryTypeSizeof>(v1, v0));
        case 10: return fixup(SSE::alignr_epi8<10 * EntryTypeSizeof>(v1, v0));
        case 11: return fixup(SSE::alignr_epi8<11 * EntryTypeSizeof>(v1, v0));
        case 12: return fixup(SSE::alignr_epi8<12 * EntryTypeSizeof>(v1, v0));
        case 13: return fixup(SSE::alignr_epi8<13 * EntryTypeSizeof>(v1, v0));
        case 14: return fixup(SSE::alignr_epi8<14 * EntryTypeSizeof>(v1, v0));
        case 15: return fixup(SSE::alignr_epi8<15 * EntryTypeSizeof>(v1, v0));
        }
    }
    return shiftIn.shifted(int(size()) + amount);
}
// Rotate by `amount` element positions: alignr of the register with
// itself performs the rotation.  The amount is reduced modulo Size, so
// cases 1..7 cover every rotation for SSE vectors (at most 8 elements);
// the trailing return Zero() is presumably unreachable for valid sizes.
template<typename T> Vc_INTRINSIC Vc_PURE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::rotated(int amount) const
{
    enum {
        EntryTypeSizeof = sizeof(EntryType)
    };
    const __m128i v = SSE::sse_cast<__m128i>(d.v());
    switch (static_cast<unsigned int>(amount) % Size) {
    case 0: return *this;
    case 1: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<1 * EntryTypeSizeof>(v, v));
    case 2: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<2 * EntryTypeSizeof>(v, v));
    case 3: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<3 * EntryTypeSizeof>(v, v));
    case 4: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<4 * EntryTypeSizeof>(v, v));
    case 5: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<5 * EntryTypeSizeof>(v, v));
    case 6: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<6 * EntryTypeSizeof>(v, v));
    case 7: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<7 * EntryTypeSizeof>(v, v));
    }
    return Zero();
}
namespace Detail
{
// Sort the two lanes of a double vector: swap the halves, then place
// min(x, y) in the low lane and max(x, y) in the high lane.
inline Vc_CONST SSE::double_v sorted(SSE::double_v x_)
{
    const __m128d x = x_.data();
    const __m128d y = _mm_shuffle_pd(x, x, _MM_SHUFFLE2(0, 1));
    return _mm_unpacklo_pd(_mm_min_sd(x, y), _mm_max_sd(x, y));
}
}
// Public sorted(): dispatches to the Detail::sorted overload for the
// concrete vector type (overloads for types other than double_v are
// defined elsewhere in the library).
template <typename T>
Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::sorted()
    const
{
    return Detail::sorted(*this);
}
// interleaveLow/High: unpack the low/high halves of *this with x, i.e.
// interleaveLow yields {a0, x0, a1, x1, ...} from the lower elements
// and interleaveHigh the same pattern from the upper elements.
template <> Vc_INTRINSIC SSE::double_v SSE::double_v::interleaveLow (SSE::double_v x) const { return _mm_unpacklo_pd(data(), x.data()); }
template <> Vc_INTRINSIC SSE::double_v SSE::double_v::interleaveHigh(SSE::double_v x) const { return _mm_unpackhi_pd(data(), x.data()); }
template <> Vc_INTRINSIC SSE::float_v SSE::float_v::interleaveLow ( SSE::float_v x) const { return _mm_unpacklo_ps(data(), x.data()); }
template <> Vc_INTRINSIC SSE::float_v SSE::float_v::interleaveHigh( SSE::float_v x) const { return _mm_unpackhi_ps(data(), x.data()); }
template <> Vc_INTRINSIC SSE::int_v SSE::int_v::interleaveLow ( SSE::int_v x) const { return _mm_unpacklo_epi32(data(), x.data()); }
template <> Vc_INTRINSIC SSE::int_v SSE::int_v::interleaveHigh( SSE::int_v x) const { return _mm_unpackhi_epi32(data(), x.data()); }
template <> Vc_INTRINSIC SSE::uint_v SSE::uint_v::interleaveLow ( SSE::uint_v x) const { return _mm_unpacklo_epi32(data(), x.data()); }
template <> Vc_INTRINSIC SSE::uint_v SSE::uint_v::interleaveHigh( SSE::uint_v x) const { return _mm_unpackhi_epi32(data(), x.data()); }
template <> Vc_INTRINSIC SSE::short_v SSE::short_v::interleaveLow ( SSE::short_v x) const { return _mm_unpacklo_epi16(data(), x.data()); }
template <> Vc_INTRINSIC SSE::short_v SSE::short_v::interleaveHigh( SSE::short_v x) const { return _mm_unpackhi_epi16(data(), x.data()); }
template <> Vc_INTRINSIC SSE::ushort_v SSE::ushort_v::interleaveLow (SSE::ushort_v x) const { return _mm_unpacklo_epi16(data(), x.data()); }
template <> Vc_INTRINSIC SSE::ushort_v SSE::ushort_v::interleaveHigh(SSE::ushort_v x) const { return _mm_unpackhi_epi16(data(), x.data()); }
// generate(gen): builds a vector whose i-th element is gen(i).
// gen is evaluated for every index in ascending order into locals first so
// that the evaluation order is well defined before the setr intrinsic
// assembles the vector.
template <> template <typename G> Vc_INTRINSIC SSE::double_v SSE::double_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    return _mm_setr_pd(tmp0, tmp1);
}
template <> template <typename G> Vc_INTRINSIC SSE::float_v SSE::float_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    const auto tmp2 = gen(2);
    const auto tmp3 = gen(3);
    return _mm_setr_ps(tmp0, tmp1, tmp2, tmp3);
}
template <> template <typename G> Vc_INTRINSIC SSE::int_v SSE::int_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    const auto tmp2 = gen(2);
    const auto tmp3 = gen(3);
    return _mm_setr_epi32(tmp0, tmp1, tmp2, tmp3);
}
template <> template <typename G> Vc_INTRINSIC SSE::uint_v SSE::uint_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    const auto tmp2 = gen(2);
    const auto tmp3 = gen(3);
    return _mm_setr_epi32(tmp0, tmp1, tmp2, tmp3);
}
template <> template <typename G> Vc_INTRINSIC SSE::short_v SSE::short_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    const auto tmp2 = gen(2);
    const auto tmp3 = gen(3);
    const auto tmp4 = gen(4);
    const auto tmp5 = gen(5);
    const auto tmp6 = gen(6);
    const auto tmp7 = gen(7);
    return _mm_setr_epi16(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
}
template <> template <typename G> Vc_INTRINSIC SSE::ushort_v SSE::ushort_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    const auto tmp2 = gen(2);
    const auto tmp3 = gen(3);
    const auto tmp4 = gen(4);
    const auto tmp5 = gen(5);
    const auto tmp6 = gen(6);
    const auto tmp7 = gen(7);
    return _mm_setr_epi16(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
}
// reversed(): returns the vector with its elements in reverse order.
// For 2- and 4-element vectors a single full-width permute suffices.
template <> Vc_INTRINSIC Vc_PURE SSE::double_v SSE::double_v::reversed() const
{
    return Mem::permute<X1, X0>(d.v());
}
template <> Vc_INTRINSIC Vc_PURE SSE::float_v SSE::float_v::reversed() const
{
    return Mem::permute<X3, X2, X1, X0>(d.v());
}
template <> Vc_INTRINSIC Vc_PURE SSE::int_v SSE::int_v::reversed() const
{
    return Mem::permute<X3, X2, X1, X0>(d.v());
}
template <> Vc_INTRINSIC Vc_PURE SSE::uint_v SSE::uint_v::reversed() const
{
    return Mem::permute<X3, X2, X1, X0>(d.v());
}
// reversed() for 8 x 16-bit elements: reverse each 4-element half in place
// (permuteHi reverses the upper four, permuteLo the lower four), then swap
// the two 64-bit halves with a double-precision shuffle.
template <> Vc_INTRINSIC Vc_PURE SSE::short_v SSE::short_v::reversed() const
{
    return sse_cast<__m128i>(
        Mem::shuffle<X1, Y0>(sse_cast<__m128d>(Mem::permuteHi<X7, X6, X5, X4>(d.v())),
                             sse_cast<__m128d>(Mem::permuteLo<X3, X2, X1, X0>(d.v()))));
}
template <> Vc_INTRINSIC Vc_PURE SSE::ushort_v SSE::ushort_v::reversed() const
{
    return sse_cast<__m128i>(
        Mem::shuffle<X1, Y0>(sse_cast<__m128d>(Mem::permuteHi<X7, X6, X5, X4>(d.v())),
                             sse_cast<__m128d>(Mem::permuteLo<X3, X2, X1, X0>(d.v()))));
}
// Dynamic per-element permute: result[i] = (*this)[perm[i]].
// Requires the AVX vpermilps instruction; the parameter name is only
// declared under Vc_IMPL_AVX to avoid an unused-parameter warning in the
// fallback build.
// NOTE(review): without Vc_IMPL_AVX this returns *this unpermuted, i.e. the
// permutation is silently ignored -- confirm callers only use this with AVX.
template <>
Vc_INTRINSIC SSE::float_v SSE::float_v::operator[](const SSE::int_v &
#ifdef Vc_IMPL_AVX
                                                       perm
#endif
                                                   ) const
{
#ifdef Vc_IMPL_AVX
    return _mm_permutevar_ps(d.v(), perm.data());
#else
    return *this;
#endif
}
// broadcast<Index>(): replicates element Index into every lane.
// The Index is masked to the valid element range (0..3 for float, 0..1 for
// double) before being used as the permute selector.
template <> template <int Index> Vc_INTRINSIC SSE::float_v SSE::float_v::broadcast() const
{
    constexpr VecPos Inner = static_cast<VecPos>(Index & 0x3);
    return Mem::permute<Inner, Inner, Inner, Inner>(d.v());
}
template <> template <int Index> Vc_INTRINSIC SSE::double_v SSE::double_v::broadcast() const
{
    constexpr VecPos Inner = static_cast<VecPos>(Index & 0x1);
    return Mem::permute<Inner, Inner>(d.v());
}
namespace Common
{
// 4x4 float matrix transpose: rows come in via proxy.in, the transposed
// rows are written through r[0..3]. Implemented with the classic two-level
// unpacklo/unpackhi network (note the first level interleaves rows 0/2 and
// 1/3 so the second level directly yields the transposed rows).
Vc_ALWAYS_INLINE void transpose_impl(
    TransposeTag<4, 4>, SSE::float_v *Vc_RESTRICT r[],
    const TransposeProxy<SSE::float_v, SSE::float_v, SSE::float_v, SSE::float_v> &proxy)
{
    const auto in0 = std::get<0>(proxy.in).data();
    const auto in1 = std::get<1>(proxy.in).data();
    const auto in2 = std::get<2>(proxy.in).data();
    const auto in3 = std::get<3>(proxy.in).data();
    const auto tmp0 = _mm_unpacklo_ps(in0, in2);  // (00,20,01,21)
    const auto tmp1 = _mm_unpacklo_ps(in1, in3);  // (10,30,11,31)
    const auto tmp2 = _mm_unpackhi_ps(in0, in2);  // (02,22,03,23)
    const auto tmp3 = _mm_unpackhi_ps(in1, in3);  // (12,32,13,33)
    *r[0] = _mm_unpacklo_ps(tmp0, tmp1);          // column 0
    *r[1] = _mm_unpackhi_ps(tmp0, tmp1);          // column 1
    *r[2] = _mm_unpacklo_ps(tmp2, tmp3);          // column 2
    *r[3] = _mm_unpackhi_ps(tmp2, tmp3);          // column 3
}
}
- }
- #ifndef VC_SSE_SIMD_CAST_H_
- #define VC_SSE_SIMD_CAST_H_
- #ifdef Vc_IMPL_AVX
- #ifndef VC_AVX_CASTS_H_
- #define VC_AVX_CASTS_H_
- #ifndef VC_AVX_SHUFFLE_H_
- #define VC_AVX_SHUFFLE_H_
- namespace Vc_VERSIONED_NAMESPACE
- {
namespace Detail
{
// Tag type carrying a compile-time permutation pattern.
template <int... Dst> struct Permutation {};
// Tag type carrying per-element 0/1 blend selectors.
template <uint8_t... Sel> struct Mask {};
#ifdef Vc_IMPL_AVX2
// 16 x 16-bit blend: element i is taken from a when Sel_i == 0 and from b
// when Sel_i == 1.
template <uint8_t Sel0, uint8_t Sel1, uint8_t Sel2, uint8_t Sel3, uint8_t Sel4,
          uint8_t Sel5, uint8_t Sel6, uint8_t Sel7, uint8_t Sel8, uint8_t Sel9,
          uint8_t Sel10, uint8_t Sel11, uint8_t Sel12, uint8_t Sel13, uint8_t Sel14,
          uint8_t Sel15>
Vc_INTRINSIC Vc_CONST __m256i
blend(__m256i a, __m256i b, Mask<Sel0, Sel1, Sel2, Sel3, Sel4, Sel5, Sel6, Sel7, Sel8,
                                 Sel9, Sel10, Sel11, Sel12, Sel13, Sel14, Sel15>)
{
    static_assert((Sel0 == 0 || Sel0 == 1) && (Sel1 == 0 || Sel1 == 1) &&
                      (Sel2 == 0 || Sel2 == 1) && (Sel3 == 0 || Sel3 == 1) &&
                      (Sel4 == 0 || Sel4 == 1) && (Sel5 == 0 || Sel5 == 1) &&
                      (Sel6 == 0 || Sel6 == 1) && (Sel7 == 0 || Sel7 == 1) &&
                      (Sel8 == 0 || Sel8 == 1) && (Sel9 == 0 || Sel9 == 1) &&
                      (Sel10 == 0 || Sel10 == 1) && (Sel11 == 0 || Sel11 == 1) &&
                      (Sel12 == 0 || Sel12 == 1) && (Sel13 == 0 || Sel13 == 1) &&
                      (Sel14 == 0 || Sel14 == 1) && (Sel15 == 0 || Sel15 == 1),
                  "Selectors must be 0 or 1 to select the value from a or b");
    // NOTE(review): the cast to uint8_t truncates bits 8-15, so Sel8..Sel15
    // never reach the immediate. _mm256_blend_epi16 applies its 8-bit mask to
    // both 128-bit lanes identically (per the Intel Intrinsics Guide), so
    // callers must pass Sel8..Sel15 equal to Sel0..Sel7 for the expected
    // result -- TODO confirm and consider enforcing with a static_assert.
    constexpr uint8_t mask = static_cast<uint8_t>(
        (Sel0 << 0 ) | (Sel1 << 1 ) | (Sel2 << 2 ) | (Sel3 << 3 ) |
        (Sel4 << 4 ) | (Sel5 << 5 ) | (Sel6 << 6 ) | (Sel7 << 7 ) |
        (Sel8 << 8 ) | (Sel9 << 9 ) | (Sel10 << 10) | (Sel11 << 11) |
        (Sel12 << 12) | (Sel13 << 13) | (Sel14 << 14) | (Sel15 << 15));
    return _mm256_blend_epi16(a, b, mask);
}
#endif
}
- namespace Mem
- {
#ifdef Vc_IMPL_AVX2
// permuteLo/permuteHi on 16 x 16-bit elements: permute the lower (X0..X3)
// resp. upper (X4..X7) four 16-bit elements of each 128-bit lane, leaving
// the other half of each lane untouched. The selector values are packed two
// bits apiece into the shufflelo/shufflehi immediate.
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256i Vc_CONST permuteLo(__m256i x) {
    static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
    static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
    return _mm256_shufflelo_epi16(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
}
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256i Vc_CONST permuteHi(__m256i x) {
    static_assert(Dst0 >= X4 && Dst1 >= X4 && Dst2 >= X4 && Dst3 >= X4, "Incorrect_Range");
    static_assert(Dst0 <= X7 && Dst1 <= X7 && Dst2 <= X7 && Dst3 <= X7, "Incorrect_Range");
    // Selectors are rebased from X4..X7 to 0..3 for the immediate.
    return _mm256_shufflehi_epi16(x, (Dst0 - X4) + (Dst1 - X4) * 4 + (Dst2 - X4) * 16 + (Dst3 - X4) * 64);
}
#endif
// permute128<L, H>(x): rearranges the two 128-bit lanes of x. L selects the
// lower result lane, H the upper; Const0 yields a zeroed lane (vperm2f128
// zeroes a lane when bit 3 of its selector nibble is set, hence 0x8 / 0x80).
template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256 Vc_CONST permute128(__m256 x) {
    static_assert((L >= X0 && L <= X1) || L == Const0, "Incorrect_Range");
    static_assert((H >= X0 && H <= X1) || H == Const0, "Incorrect_Range");
    return _mm256_permute2f128_ps(
        x, x, (L == Const0 ? 0x8 : L) + (H == Const0 ? 0x80 : H * (1 << 4)));
}
template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256d Vc_CONST permute128(__m256d x) {
    static_assert((L >= X0 && L <= X1) || L == Const0, "Incorrect_Range");
    static_assert((H >= X0 && H <= X1) || H == Const0, "Incorrect_Range");
    return _mm256_permute2f128_pd(
        x, x, (L == Const0 ? 0x8 : L) + (H == Const0 ? 0x80 : H * (1 << 4)));
}
template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256i Vc_CONST permute128(__m256i x) {
    static_assert((L >= X0 && L <= X1) || L == Const0, "Incorrect_Range");
    static_assert((H >= X0 && H <= X1) || H == Const0, "Incorrect_Range");
#ifdef Vc_IMPL_AVX2
    // AVX2 has an integer-domain variant; prefer it to avoid domain crossing.
    return _mm256_permute2x128_si256(
        x, x, (L == Const0 ? 0x8 : L) + (H == Const0 ? 0x80 : H * (1 << 4)));
#else
    return _mm256_permute2f128_si256(
        x, x, (L == Const0 ? 0x8 : L) + (H == Const0 ? 0x80 : H * (1 << 4)));
#endif
}
// shuffle128<L, H>(x, y): builds a 256-bit value from two 128-bit lanes
// picked out of x (X0/X1) and y (Y0/Y1). Y-selectors map to vperm2f128
// source codes 2/3 (the second operand's lanes).
template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256 Vc_CONST shuffle128(__m256 x, __m256 y) {
    static_assert(L >= X0 && H >= X0, "Incorrect_Range");
    static_assert(L <= Y1 && H <= Y1, "Incorrect_Range");
    return _mm256_permute2f128_ps(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
}
template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256i Vc_CONST shuffle128(__m256i x, __m256i y) {
    static_assert(L >= X0 && H >= X0, "Incorrect_Range");
    static_assert(L <= Y1 && H <= Y1, "Incorrect_Range");
#ifdef Vc_IMPL_AVX2
    // Integer-domain variant under AVX2.
    return _mm256_permute2x128_si256(
        x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
#else
    return _mm256_permute2f128_si256(
        x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
#endif
}
template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256d Vc_CONST shuffle128(__m256d x, __m256d y) {
    static_assert(L >= X0 && H >= X0, "Incorrect_Range");
    static_assert(L <= Y1 && H <= Y1, "Incorrect_Range");
    return _mm256_permute2f128_pd(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
}
// In-lane element permutes of a 256-bit vector.
// For __m256d the selectors address within each 128-bit lane (Dst0/Dst1 for
// the lower lane, Dst2/Dst3 - rebased by X2 - for the upper lane, one
// immediate bit each). For __m256 the same 4-element pattern is applied to
// both lanes.
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256d Vc_CONST permute(__m256d x) {
    static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X2 && Dst3 >= X2, "Incorrect_Range");
    static_assert(Dst0 <= X1 && Dst1 <= X1 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
    return _mm256_permute_pd(x, Dst0 + Dst1 * 2 + (Dst2 - X2) * 4 + (Dst3 - X2) * 8);
}
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256 Vc_CONST permute(__m256 x) {
    static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
    static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
    return _mm256_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
}
// Integer version reuses the float permute through bitwise casts.
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256i Vc_CONST permute(__m256i x) {
    return _mm256_castps_si256(permute<Dst0, Dst1, Dst2, Dst3>(_mm256_castsi256_ps(x)));
}
#ifdef Vc_IMPL_AVX2
// Cross-lane permute of the four 64-bit elements (AVX2 vpermq); unlike
// permute() above, selectors may move data across the 128-bit lane boundary.
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256i Vc_CONST permute4x64(__m256i x) {
    static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
    static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
    return _mm256_permute4x64_epi64(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
}
#endif
// Two-source in-lane shuffles mirroring the SSE shufpd/shufps element
// ordering, applied to both 128-bit lanes: X-selectors pick from x,
// Y-selectors (rebased by Y0) pick from y.
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256d Vc_CONST shuffle(__m256d x, __m256d y) {
    static_assert(Dst0 >= X0 && Dst1 >= Y0 && Dst2 >= X2 && Dst3 >= Y2, "Incorrect_Range");
    static_assert(Dst0 <= X1 && Dst1 <= Y1 && Dst2 <= X3 && Dst3 <= Y3, "Incorrect_Range");
    return _mm256_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2 + (Dst2 - X2) * 4 + (Dst3 - Y2) * 8);
}
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256 Vc_CONST shuffle(__m256 x, __m256 y) {
    static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, "Incorrect_Range");
    static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, "Incorrect_Range");
    return _mm256_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64);
}
// 8-element blend: each result position i keeps x (Dst_i == X_i) or takes y
// (Dst_i == Y_i). Dst_i / Y_i yields the 0/1 immediate bit for position i
// (it is 1 exactly when Dst_i is the Y selector).
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
static Vc_ALWAYS_INLINE __m256 Vc_CONST blend(__m256 x, __m256 y) {
    static_assert(Dst0 == X0 || Dst0 == Y0, "Incorrect_Range");
    static_assert(Dst1 == X1 || Dst1 == Y1, "Incorrect_Range");
    static_assert(Dst2 == X2 || Dst2 == Y2, "Incorrect_Range");
    static_assert(Dst3 == X3 || Dst3 == Y3, "Incorrect_Range");
    static_assert(Dst4 == X4 || Dst4 == Y4, "Incorrect_Range");
    static_assert(Dst5 == X5 || Dst5 == Y5, "Incorrect_Range");
    static_assert(Dst6 == X6 || Dst6 == Y6, "Incorrect_Range");
    static_assert(Dst7 == X7 || Dst7 == Y7, "Incorrect_Range");
    return _mm256_blend_ps(x, y,
                           (Dst0 / Y0) * 1 + (Dst1 / Y1) * 2 +
                           (Dst2 / Y2) * 4 + (Dst3 / Y3) * 8 +
                           (Dst4 / Y4) * 16 + (Dst5 / Y5) * 32 +
                           (Dst6 / Y6) * 64 + (Dst7 / Y7) *128
                           );
}
// Integer version reuses the float blend through bitwise casts.
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
static Vc_ALWAYS_INLINE __m256i Vc_CONST blend(__m256i x, __m256i y) {
    return _mm256_castps_si256(blend<Dst0, Dst1, Dst2, Dst3, Dst4, Dst5, Dst6, Dst7>(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
}
- template<VecPos Dst> struct ScaleForBlend { enum { Value = Dst >= X4 ? Dst - X4 + Y0 : Dst }; };
- template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
- static Vc_ALWAYS_INLINE __m256 Vc_CONST permute(__m256 x) {
- static_assert(Dst0 >= X0 && Dst0 <= X7, "Incorrect_Range");
- static_assert(Dst1 >= X0 && Dst1 <= X7, "Incorrect_Range");
- static_assert(Dst2 >= X0 && Dst2 <= X7, "Incorrect_Range");
- static_assert(Dst3 >= X0 && Dst3 <= X7, "Incorrect_Range");
- static_assert(Dst4 >= X0 && Dst4 <= X7, "Incorrect_Range");
- static_assert(Dst5 >= X0 && Dst5 <= X7, "Incorrect_Range");
- static_assert(Dst6 >= X0 && Dst6 <= X7, "Incorrect_Range");
- static_assert(Dst7 >= X0 && Dst7 <= X7, "Incorrect_Range");
- if (Dst0 + X4 == Dst4 && Dst1 + X4 == Dst5 && Dst2 + X4 == Dst6 && Dst3 + X4 == Dst7) {
- return permute<Dst0, Dst1, Dst2, Dst3>(x);
- }
- const __m128 loIn = _mm256_castps256_ps128(x);
- const __m128 hiIn = _mm256_extractf128_ps(x, 1);
- __m128 lo, hi;
- if (Dst0 < X4 && Dst1 < X4 && Dst2 < X4 && Dst3 < X4) {
- lo = _mm_permute_ps(loIn, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
- } else if (Dst0 >= X4 && Dst1 >= X4 && Dst2 >= X4 && Dst3 >= X4) {
- lo = _mm_permute_ps(hiIn, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
- } else if (Dst0 < X4 && Dst1 < X4 && Dst2 >= X4 && Dst3 >= X4) {
- lo = shuffle<Dst0, Dst1, Dst2 - X4 + Y0, Dst3 - X4 + Y0>(loIn, hiIn);
- } else if (Dst0 >= X4 && Dst1 >= X4 && Dst2 < X4 && Dst3 < X4) {
- lo = shuffle<Dst0 - X4, Dst1 - X4, Dst2 + Y0, Dst3 + Y0>(hiIn, loIn);
- } else if (Dst0 == X0 && Dst1 == X4 && Dst2 == X1 && Dst3 == X5) {
- lo = _mm_unpacklo_ps(loIn, hiIn);
- } else if (Dst0 == X4 && Dst1 == X0 && Dst2 == X5 && Dst3 == X1) {
- lo = _mm_unpacklo_ps(hiIn, loIn);
- } else if (Dst0 == X2 && Dst1 == X6 && Dst2 == X3 && Dst3 == X7) {
- lo = _mm_unpackhi_ps(loIn, hiIn);
- } else if (Dst0 == X6 && Dst1 == X2 && Dst2 == X7 && Dst3 == X3) {
- lo = _mm_unpackhi_ps(hiIn, loIn);
- } else if (Dst0 % X4 == 0 && Dst1 % X4 == 1 && Dst2 % X4 == 2 && Dst3 % X4 == 3) {
- lo = blend<ScaleForBlend<Dst0>::Value, ScaleForBlend<Dst1>::Value,
- ScaleForBlend<Dst2>::Value, ScaleForBlend<Dst3>::Value>(loIn, hiIn);
- }
- if (Dst4 >= X4 && Dst5 >= X4 && Dst6 >= X4 && Dst7 >= X4) {
- hi = _mm_permute_ps(hiIn, (Dst4 - X4) + (Dst5 - X4) * 4 + (Dst6 - X4) * 16 + (Dst7 - X4) * 64);
- } else if (Dst4 < X4 && Dst5 < X4 && Dst6 < X4 && Dst7 < X4) {
- hi = _mm_permute_ps(loIn, (Dst4 - X4) + (Dst5 - X4) * 4 + (Dst6 - X4) * 16 + (Dst7 - X4) * 64);
- } else if (Dst4 < X4 && Dst5 < X4 && Dst6 >= X4 && Dst7 >= X4) {
- hi = shuffle<Dst4, Dst5, Dst6 - X4 + Y0, Dst7 - X4 + Y0>(loIn, hiIn);
- } else if (Dst4 >= X4 && Dst5 >= X4 && Dst6 < X4 && Dst7 < X4) {
- hi = shuffle<Dst4 - X4, Dst5 - X4, Dst6 + Y0, Dst7 + Y0>(hiIn, loIn);
- } else if (Dst4 == X0 && Dst5 == X4 && Dst6 == X1 && Dst7 == X5) {
- hi = _mm_unpacklo_ps(loIn, hiIn);
- } else if (Dst4 == X4 && Dst5 == X0 && Dst6 == X5 && Dst7 == X1) {
- hi = _mm_unpacklo_ps(hiIn, loIn);
- } else if (Dst4 == X2 && Dst5 == X6 && Dst6 == X3 && Dst7 == X7) {
- hi = _mm_unpackhi_ps(loIn, hiIn);
- } else if (Dst4 == X6 && Dst5 == X2 && Dst6 == X7 && Dst7 == X3) {
- hi = _mm_unpackhi_ps(hiIn, loIn);
- } else if (Dst4 % X4 == 0 && Dst5 % X4 == 1 && Dst6 % X4 == 2 && Dst7 % X4 == 3) {
- hi = blend<ScaleForBlend<Dst4>::Value, ScaleForBlend<Dst5>::Value,
- ScaleForBlend<Dst6>::Value, ScaleForBlend<Dst7>::Value>(loIn, hiIn);
- }
- return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1);
- }
- }
- }
namespace Vc_VERSIONED_NAMESPACE
{
// Reg: same shuffle/permute primitives as Mem, but with the template
// arguments given in reversed (high-to-low) order, matching register
// notation instead of memory order.
namespace Reg
{
// 128-bit lane selection from two sources (cf. Mem::shuffle128).
template<VecPos H, VecPos L> static Vc_ALWAYS_INLINE __m256 Vc_CONST permute128(__m256 x, __m256 y) {
    static_assert(L >= X0 && H >= X0, "Incorrect_Range");
    static_assert(L <= Y1 && H <= Y1, "Incorrect_Range");
    return _mm256_permute2f128_ps(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
}
template<VecPos H, VecPos L> static Vc_ALWAYS_INLINE __m256i Vc_CONST permute128(__m256i x, __m256i y) {
    static_assert(L >= X0 && H >= X0, "Incorrect_Range");
    static_assert(L <= Y1 && H <= Y1, "Incorrect_Range");
#ifdef Vc_IMPL_AVX2
    // Integer-domain variant under AVX2.
    return _mm256_permute2x128_si256(
        x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
#else
    return _mm256_permute2f128_si256(
        x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
#endif
}
template<VecPos H, VecPos L> static Vc_ALWAYS_INLINE __m256d Vc_CONST permute128(__m256d x, __m256d y) {
    static_assert(L >= X0 && H >= X0, "Incorrect_Range");
    static_assert(L <= Y1 && H <= Y1, "Incorrect_Range");
    return _mm256_permute2f128_pd(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
}
// In-lane element permutes (cf. Mem::permute), reversed argument order.
template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m256d Vc_CONST permute(__m256d x) {
    static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X2 && Dst3 >= X2, "Incorrect_Range");
    static_assert(Dst0 <= X1 && Dst1 <= X1 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
    return _mm256_permute_pd(x, Dst0 + Dst1 * 2 + (Dst2 - X2) * 4 + (Dst3 - X2) * 8);
}
template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m256 Vc_CONST permute(__m256 x) {
    static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
    static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
    return _mm256_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
}
// 128-bit (SSE register) element permutes.
template<VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128d Vc_CONST permute(__m128d x) {
    static_assert(Dst0 >= X0 && Dst1 >= X0, "Incorrect_Range");
    static_assert(Dst0 <= X1 && Dst1 <= X1, "Incorrect_Range");
    return _mm_permute_pd(x, Dst0 + Dst1 * 2);
}
template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128 Vc_CONST permute(__m128 x) {
    static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
    static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
    return _mm_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
}
// Two-source in-lane shuffles (cf. Mem::shuffle), reversed argument order.
template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m256d Vc_CONST shuffle(__m256d x, __m256d y) {
    static_assert(Dst0 >= X0 && Dst1 >= Y0 && Dst2 >= X2 && Dst3 >= Y2, "Incorrect_Range");
    static_assert(Dst0 <= X1 && Dst1 <= Y1 && Dst2 <= X3 && Dst3 <= Y3, "Incorrect_Range");
    return _mm256_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2 + (Dst2 - X2) * 4 + (Dst3 - Y2) * 8);
}
template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m256 Vc_CONST shuffle(__m256 x, __m256 y) {
    static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, "Incorrect_Range");
    static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, "Incorrect_Range");
    return _mm256_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64);
}
}
}
- #endif
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace AVX
- {
- namespace Casts
- {
// avx_cast<T>(v): zero-cost bitwise reinterpretation between the SSE/AVX
// register types; no instructions, just type changes.
template<typename T> Vc_INTRINSIC_L T avx_cast(__m128 v) Vc_INTRINSIC_R;
template<typename T> Vc_INTRINSIC_L T avx_cast(__m128i v) Vc_INTRINSIC_R;
template<typename T> Vc_INTRINSIC_L T avx_cast(__m128d v) Vc_INTRINSIC_R;
template<typename T> Vc_INTRINSIC_L T avx_cast(__m256 v) Vc_INTRINSIC_R;
template<typename T> Vc_INTRINSIC_L T avx_cast(__m256i v) Vc_INTRINSIC_R;
template<typename T> Vc_INTRINSIC_L T avx_cast(__m256d v) Vc_INTRINSIC_R;
// 128 -> 128 casts.
template<> Vc_INTRINSIC __m128 avx_cast(__m128 v) { return v; }
template<> Vc_INTRINSIC __m128 avx_cast(__m128i v) { return _mm_castsi128_ps(v); }
template<> Vc_INTRINSIC __m128 avx_cast(__m128d v) { return _mm_castpd_ps(v); }
template<> Vc_INTRINSIC __m128i avx_cast(__m128 v) { return _mm_castps_si128(v); }
template<> Vc_INTRINSIC __m128i avx_cast(__m128i v) { return v; }
template<> Vc_INTRINSIC __m128i avx_cast(__m128d v) { return _mm_castpd_si128(v); }
template<> Vc_INTRINSIC __m128d avx_cast(__m128 v) { return _mm_castps_pd(v); }
template<> Vc_INTRINSIC __m128d avx_cast(__m128i v) { return _mm_castsi128_pd(v); }
template<> Vc_INTRINSIC __m128d avx_cast(__m128d v) { return v; }
// 128 -> 256 casts. Per the cast intrinsics' documentation the upper
// 128 bits of the result are undefined; use zeroExtend() below when the
// upper half must be zero.
template<> Vc_INTRINSIC __m256 avx_cast(__m128 v) { return _mm256_castps128_ps256(v); }
template<> Vc_INTRINSIC __m256 avx_cast(__m128i v) { return _mm256_castps128_ps256(_mm_castsi128_ps(v)); }
template<> Vc_INTRINSIC __m256 avx_cast(__m128d v) { return _mm256_castps128_ps256(_mm_castpd_ps(v)); }
template<> Vc_INTRINSIC __m256i avx_cast(__m128 v) { return _mm256_castsi128_si256(_mm_castps_si128(v)); }
template<> Vc_INTRINSIC __m256i avx_cast(__m128i v) { return _mm256_castsi128_si256(v); }
template<> Vc_INTRINSIC __m256i avx_cast(__m128d v) { return _mm256_castsi128_si256(_mm_castpd_si128(v)); }
template<> Vc_INTRINSIC __m256d avx_cast(__m128 v) { return _mm256_castpd128_pd256(_mm_castps_pd(v)); }
template<> Vc_INTRINSIC __m256d avx_cast(__m128i v) { return _mm256_castpd128_pd256(_mm_castsi128_pd(v)); }
template<> Vc_INTRINSIC __m256d avx_cast(__m128d v) { return _mm256_castpd128_pd256(v); }
// zeroExtend: widens a 128-bit register to 256 bits with the upper half
// guaranteed zero. On MSVC/clang the plain cast intrinsics do not guarantee
// zeroed upper bits, so a vperm2f128 with selector 0x80 (upper lane = zero)
// forces it; on other compilers the cast alone is sufficient.
#if defined Vc_MSVC || defined Vc_CLANG || defined Vc_APPLECLANG
static Vc_INTRINSIC Vc_CONST __m256 zeroExtend(__m128 v) { return _mm256_permute2f128_ps (_mm256_castps128_ps256(v), _mm256_castps128_ps256(v), 0x80); }
static Vc_INTRINSIC Vc_CONST __m256i zeroExtend(__m128i v) { return _mm256_permute2f128_si256(_mm256_castsi128_si256(v), _mm256_castsi128_si256(v), 0x80); }
static Vc_INTRINSIC Vc_CONST __m256d zeroExtend(__m128d v) { return _mm256_permute2f128_pd (_mm256_castpd128_pd256(v), _mm256_castpd128_pd256(v), 0x80); }
#else
static Vc_INTRINSIC Vc_CONST __m256 zeroExtend(__m128 v) { return _mm256_castps128_ps256(v); }
static Vc_INTRINSIC Vc_CONST __m256i zeroExtend(__m128i v) { return _mm256_castsi128_si256(v); }
static Vc_INTRINSIC Vc_CONST __m256d zeroExtend(__m128d v) { return _mm256_castpd128_pd256(v); }
#endif
// 256 -> 128 casts: keep the lower 128 bits, reinterpret as requested.
template<> Vc_INTRINSIC __m128 avx_cast(__m256 v) { return _mm256_castps256_ps128(v); }
template<> Vc_INTRINSIC __m128 avx_cast(__m256i v) { return _mm256_castps256_ps128(_mm256_castsi256_ps(v)); }
template<> Vc_INTRINSIC __m128 avx_cast(__m256d v) { return _mm256_castps256_ps128(_mm256_castpd_ps(v)); }
template<> Vc_INTRINSIC __m128i avx_cast(__m256 v) { return _mm256_castsi256_si128(_mm256_castps_si256(v)); }
template<> Vc_INTRINSIC __m128i avx_cast(__m256i v) { return _mm256_castsi256_si128(v); }
template<> Vc_INTRINSIC __m128i avx_cast(__m256d v) { return _mm256_castsi256_si128(_mm256_castpd_si256(v)); }
template<> Vc_INTRINSIC __m128d avx_cast(__m256 v) { return _mm256_castpd256_pd128(_mm256_castps_pd(v)); }
template<> Vc_INTRINSIC __m128d avx_cast(__m256i v) { return _mm256_castpd256_pd128(_mm256_castsi256_pd(v)); }
template<> Vc_INTRINSIC __m128d avx_cast(__m256d v) { return _mm256_castpd256_pd128(v); }
// 256 -> 256 casts.
template<> Vc_INTRINSIC __m256 avx_cast(__m256 v) { return v; }
template<> Vc_INTRINSIC __m256 avx_cast(__m256i v) { return _mm256_castsi256_ps(v); }
template<> Vc_INTRINSIC __m256 avx_cast(__m256d v) { return _mm256_castpd_ps(v); }
template<> Vc_INTRINSIC __m256i avx_cast(__m256 v) { return _mm256_castps_si256(v); }
template<> Vc_INTRINSIC __m256i avx_cast(__m256i v) { return v; }
template<> Vc_INTRINSIC __m256i avx_cast(__m256d v) { return _mm256_castpd_si256(v); }
template<> Vc_INTRINSIC __m256d avx_cast(__m256 v) { return _mm256_castps_pd(v); }
template<> Vc_INTRINSIC __m256d avx_cast(__m256i v) { return _mm256_castsi256_pd(v); }
template<> Vc_INTRINSIC __m256d avx_cast(__m256d v) { return v; }
// lo128/hi128: extract the lower/upper 128-bit half of a 256-bit register.
Vc_INTRINSIC Vc_CONST __m128 lo128(__m256 v) { return avx_cast<__m128>(v); }
Vc_INTRINSIC Vc_CONST __m128d lo128(__m256d v) { return avx_cast<__m128d>(v); }
Vc_INTRINSIC Vc_CONST __m128i lo128(__m256i v) { return avx_cast<__m128i>(v); }
Vc_INTRINSIC Vc_CONST __m128 hi128(__m256 v) { return extract128<1>(v); }
Vc_INTRINSIC Vc_CONST __m128d hi128(__m256d v) { return extract128<1>(v); }
Vc_INTRINSIC Vc_CONST __m128i hi128(__m256i v) { return extract128<1>(v); }
// concat: build a 256-bit register from two 128-bit halves (a = lower half).
Vc_INTRINSIC Vc_CONST __m256 concat(__m128 a, __m128 b) { return insert128<1>(avx_cast<__m256 >(a), b); }
Vc_INTRINSIC Vc_CONST __m256d concat(__m128d a, __m128d b) { return insert128<1>(avx_cast<__m256d>(a), b); }
Vc_INTRINSIC Vc_CONST __m256i concat(__m128i a, __m128i b) { return insert128<1>(avx_cast<__m256i>(a), b); }
- }
- using namespace Casts;
- }
// The AVX2 namespace reuses the AVX cast helpers unchanged.
namespace AVX2
{
using namespace AVX::Casts;
}
- namespace AVX
- {
// Tag used to select a concrete element-type conversion overload.
template <typename From, typename To> struct ConvertTag {};
// Conversions to int (truncating for float/double).
Vc_INTRINSIC __m256i convert(__m256 v, ConvertTag<float , int>) { return _mm256_cvttps_epi32(v); }
Vc_INTRINSIC __m128i convert(__m256d v, ConvertTag<double, int>) { return _mm256_cvttpd_epi32(v); }
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<int , int>) { return v; }
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<uint , int>) { return v; }
Vc_INTRINSIC __m256i convert(__m128i v, ConvertTag<short , int>) {
#ifdef Vc_IMPL_AVX2
    return _mm256_cvtepi16_epi32(v);
#else
    // Sign-extend without AVX2: duplicate each 16-bit value into a 32-bit
    // slot via unpack, then arithmetic-shift right to propagate the sign.
    return AVX::srai_epi32<16>(
        concat(_mm_unpacklo_epi16(v, v), _mm_unpackhi_epi16(v, v)));
#endif
}
Vc_INTRINSIC __m256i convert(__m128i v, ConvertTag<ushort, int>) {
#ifdef Vc_IMPL_AVX2
    return _mm256_cvtepu16_epi32(v);
#else
    // Zero-extend: same unpack trick, but a logical shift clears the high bits.
    return AVX::srli_epi32<16>(
        concat(_mm_unpacklo_epi16(v, v), _mm_unpackhi_epi16(v, v)));
#endif
}
// float -> uint: cvttps only covers the signed range, so values >= 2^31 are
// first shifted down by 2^31, converted, and the 2^31 offset is re-added in
// the integer domain; the blend selects that path where v >= 2^31.
Vc_INTRINSIC __m256i convert(__m256 v, ConvertTag<float , uint>) {
    using namespace AVX;
    return _mm256_castps_si256(_mm256_blendv_ps(
        _mm256_castsi256_ps(_mm256_cvttps_epi32(v)),
        _mm256_castsi256_ps(add_epi32(_mm256_cvttps_epi32(_mm256_sub_ps(v, set2power31_ps())),
                                      set2power31_epu32())),
        cmpge_ps(v, set2power31_ps())));
}
// double -> uint: floor, shift down by 2^31 in the double domain, convert,
// then xor with 0x80000000 to undo the offset (flips the sign bit back).
Vc_INTRINSIC __m128i convert(__m256d v, ConvertTag<double, uint>) {
    using namespace AVX;
    return _mm_xor_si128(
        _mm256_cvttpd_epi32(_mm256_sub_pd(_mm256_floor_pd(v), set1_pd(0x80000000u))),
        _mm_set2power31_epu32());
}
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<int , uint>) { return v; }
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<uint , uint>) { return v; }
// short -> uint reuses sign extension (C conversion semantics), ushort ->
// uint zero extension, exactly as in the int conversions above.
Vc_INTRINSIC __m256i convert(__m128i v, ConvertTag<short , uint>) {
#ifdef Vc_IMPL_AVX2
    return _mm256_cvtepi16_epi32(v);
#else
    return AVX::srai_epi32<16>(
        concat(_mm_unpacklo_epi16(v, v), _mm_unpackhi_epi16(v, v)));
#endif
}
Vc_INTRINSIC __m256i convert(__m128i v, ConvertTag<ushort, uint>) {
#ifdef Vc_IMPL_AVX2
    return _mm256_cvtepu16_epi32(v);
#else
    return AVX::srli_epi32<16>(
        concat(_mm_unpacklo_epi16(v, v), _mm_unpackhi_epi16(v, v)));
#endif
}
// Conversions to float.
Vc_INTRINSIC __m256 convert(__m256 v, ConvertTag<float , float>) { return v; }
Vc_INTRINSIC __m128 convert(__m256d v, ConvertTag<double, float>) { return _mm256_cvtpd_ps(v); }
Vc_INTRINSIC __m256 convert(__m256i v, ConvertTag<int , float>) { return _mm256_cvtepi32_ps(v); }
// uint -> float: cvtepi32 treats inputs as signed, so for values with the
// sign bit set (v < 0 as signed) the value is split into its high bits
// (rounded to a multiple of 512 that converts exactly after adding 2^31)
// and its low 9 bits, converted separately and summed.
Vc_INTRINSIC __m256 convert(__m256i v, ConvertTag<uint , float>) {
    using namespace AVX;
    return _mm256_blendv_ps(
        _mm256_cvtepi32_ps(v),
        _mm256_add_ps(_mm256_cvtepi32_ps(and_si256(v, set1_epi32(0x7ffffe00))),
                      _mm256_add_ps(set2power31_ps(), _mm256_cvtepi32_ps(and_si256(
                                                          v, set1_epi32(0x000001ff))))),
        _mm256_castsi256_ps(cmplt_epi32(v, _mm256_setzero_si256())));
}
// short/ushort -> float go through the int conversions above.
Vc_INTRINSIC __m256 convert(__m128i v, ConvertTag<short , float>) { return _mm256_cvtepi32_ps(convert(v, ConvertTag< short, int>())); }
Vc_INTRINSIC __m256 convert(__m128i v, ConvertTag<ushort, float>) { return _mm256_cvtepi32_ps(convert(v, ConvertTag<ushort, int>())); }
// Conversions to double.
Vc_INTRINSIC __m256d convert(__m128 v, ConvertTag<float , double>) { return _mm256_cvtps_pd(v); }
Vc_INTRINSIC __m256d convert(__m256d v, ConvertTag<double, double>) { return v; }
Vc_INTRINSIC __m256d convert(__m128i v, ConvertTag<int , double>) { return _mm256_cvtepi32_pd(v); }
// uint -> double: xor with the minimum int (0x80000000) maps the unsigned
// range onto the signed range for cvtepi32_pd; adding 2^31 afterwards
// restores the original value (exact in double).
Vc_INTRINSIC __m256d convert(__m128i v, ConvertTag<uint , double>) {
    using namespace AVX;
    return _mm256_add_pd(
        _mm256_cvtepi32_pd(_mm_xor_si128(v, _mm_setmin_epi32())),
        set1_pd(1u << 31)); }
// short/ushort -> double via the SSE int conversion, then int -> double.
Vc_INTRINSIC __m256d convert(__m128i v, ConvertTag<short , double>) { return convert(convert(v, SSE::ConvertTag< short, int>()), ConvertTag<int, double>()); }
Vc_INTRINSIC __m256d convert(__m128i v, ConvertTag<ushort, double>) { return convert(convert(v, SSE::ConvertTag<ushort, int>()), ConvertTag<int, double>()); }
// int -> short: truncating (non-saturating) narrowing of 8 x 32-bit to
// 8 x 16-bit.
Vc_INTRINSIC __m128i convert(__m256i v, ConvertTag<int , short>) {
#ifdef Vc_IMPL_AVX2
    // Gather the low 16 bits of each element into the low 8 bytes of each
    // lane (shuffle_epi8 works per 128-bit lane), then permute4x64 with
    // 0xf8 = (0,2,3,3) moves the two packed quadwords into the low half.
    auto a = _mm256_shuffle_epi8(
        v, _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80, -0x80, -0x80, -0x80,
                            -0x80, -0x80, -0x80, 0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80,
                            -0x80, -0x80, -0x80, -0x80, -0x80, -0x80));
    return lo128(_mm256_permute4x64_epi64(a, 0xf8));
#else
    // Interleave network that ends up selecting the low 16-bit halves of
    // all eight 32-bit elements in order.
    const auto tmp0 = _mm_unpacklo_epi16(lo128(v), hi128(v));
    const auto tmp1 = _mm_unpackhi_epi16(lo128(v), hi128(v));
    const auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);
    const auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);
    return _mm_unpacklo_epi16(tmp2, tmp3);
#endif
}
// uint -> short: same bit pattern as int -> short.
Vc_INTRINSIC __m128i convert(__m256i v, ConvertTag<uint , short>) { return convert(v, ConvertTag<int, short>()); }
Vc_INTRINSIC __m128i convert(__m256 v, ConvertTag<float , short>) { return convert(convert(v, ConvertTag<float, int>()), ConvertTag<int, short>()); }
Vc_INTRINSIC __m128i convert(__m256d v, ConvertTag<double, short>) { return convert(convert(v, ConvertTag<double, int>()), SSE::ConvertTag<int, short>()); }
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<short , short>) { return v; }
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<ushort, short>) { return v; }
// int/uint -> ushort: truncating narrowing via the same unpack network as
// the non-AVX2 int -> short path above.
Vc_INTRINSIC __m128i convert(__m256i v, ConvertTag<int , ushort>) {
    auto tmp0 = _mm_unpacklo_epi16(lo128(v), hi128(v));
    auto tmp1 = _mm_unpackhi_epi16(lo128(v), hi128(v));
    auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);
    auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);
    return _mm_unpacklo_epi16(tmp2, tmp3);
}
Vc_INTRINSIC __m128i convert(__m256i v, ConvertTag<uint , ushort>) {
    auto tmp0 = _mm_unpacklo_epi16(lo128(v), hi128(v));
    auto tmp1 = _mm_unpackhi_epi16(lo128(v), hi128(v));
    auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);
    auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);
    return _mm_unpacklo_epi16(tmp2, tmp3);
}
// float/double -> ushort go through the unsigned integer conversions.
Vc_INTRINSIC __m128i convert(__m256 v, ConvertTag<float , ushort>) { return convert(convert(v, ConvertTag<float, uint>()), ConvertTag<uint, ushort>()); }
Vc_INTRINSIC __m128i convert(__m256d v, ConvertTag<double, ushort>) { return convert(convert(v, ConvertTag<double, uint>()), SSE::ConvertTag<uint, ushort>()); }
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<short , ushort>) { return v; }
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<ushort, ushort>) { return v; }
// Generic entry point: picks the SSE-sized input register when the element
// type widens (fewer source elements fit the destination vector), otherwise
// the full AVX-sized register, and dispatches via ConvertTag.
template <typename From, typename To>
Vc_INTRINSIC auto convert(
    typename std::conditional<(sizeof(From) < sizeof(To)),
                              typename SSE::VectorTraits<From>::VectorType,
                              typename AVX::VectorTypeHelper<From>::Type>::type v)
    -> decltype(convert(v, ConvertTag<From, To>()))
{
    return convert(v, ConvertTag<From, To>());
}
// Overload for widening conversions given a full AVX register: only the
// lower 128 bits contain the source elements that fit the destination.
template <typename From, typename To, typename = enable_if<(sizeof(From) < sizeof(To))>>
Vc_INTRINSIC auto convert(typename AVX::VectorTypeHelper<From>::Type v)
    -> decltype(convert(lo128(v), ConvertTag<From, To>()))
{
    return convert(lo128(v), ConvertTag<From, To>());
}
- }
- }
- #endif
- #endif
- #ifndef VC_SSE_VECTOR_H_
- #error "Vc/sse/vector.h needs to be included before Vc/sse/simd_cast.h"
- #endif
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace SSE
- {
- // Declaration helpers: Vc_SIMD_CAST_N(from_, to_) expands to the signature
- // of a simd_cast overload taking N vectors of type from_ and returning to_.
- // The enable_if default argument restricts the overload to the requested
- // destination type. These variants (with "= nullarg") are used for the
- // declarations below; they are #undef'ed and redefined without the default
- // argument for the definitions further down.
- #define Vc_SIMD_CAST_1(from_,to_) \
- template <typename To> \
- Vc_INTRINSIC Vc_CONST To simd_cast( \
- from_ x, enable_if<std::is_same<To, to_>::value> = nullarg)
- #define Vc_SIMD_CAST_2(from_,to_) \
- template <typename To> \
- Vc_INTRINSIC Vc_CONST To simd_cast( \
- from_ x0, from_ x1, enable_if<std::is_same<To, to_>::value> = nullarg)
- #define Vc_SIMD_CAST_4(from_,to_) \
- template <typename To> \
- Vc_INTRINSIC Vc_CONST To simd_cast( \
- from_ x0, from_ x1, from_ x2, from_ x3, \
- enable_if<std::is_same<To, to_>::value> = nullarg)
- #define Vc_SIMD_CAST_8(from_,to_) \
- template <typename To> \
- Vc_INTRINSIC Vc_CONST To simd_cast( \
- from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, from_ x5, from_ x6, from_ x7, \
- enable_if<std::is_same<To, to_>::value> = nullarg)
- // One-input casts between all SSE vector types.
- Vc_SIMD_CAST_1( float_v, int_v);
- Vc_SIMD_CAST_1(double_v, int_v);
- Vc_SIMD_CAST_1( uint_v, int_v);
- Vc_SIMD_CAST_1( short_v, int_v);
- Vc_SIMD_CAST_1(ushort_v, int_v);
- Vc_SIMD_CAST_1( float_v, uint_v);
- Vc_SIMD_CAST_1(double_v, uint_v);
- Vc_SIMD_CAST_1( int_v, uint_v);
- Vc_SIMD_CAST_1( short_v, uint_v);
- Vc_SIMD_CAST_1(ushort_v, uint_v);
- Vc_SIMD_CAST_1(double_v, float_v);
- Vc_SIMD_CAST_1( int_v, float_v);
- Vc_SIMD_CAST_1( uint_v, float_v);
- Vc_SIMD_CAST_1( short_v, float_v);
- Vc_SIMD_CAST_1(ushort_v, float_v);
- Vc_SIMD_CAST_1( float_v, double_v);
- Vc_SIMD_CAST_1( int_v, double_v);
- Vc_SIMD_CAST_1( uint_v, double_v);
- Vc_SIMD_CAST_1( short_v, double_v);
- Vc_SIMD_CAST_1(ushort_v, double_v);
- Vc_SIMD_CAST_1( int_v, short_v);
- Vc_SIMD_CAST_1( uint_v, short_v);
- Vc_SIMD_CAST_1( float_v, short_v);
- Vc_SIMD_CAST_1(double_v, short_v);
- Vc_SIMD_CAST_1(ushort_v, short_v);
- Vc_SIMD_CAST_1( int_v, ushort_v);
- Vc_SIMD_CAST_1( uint_v, ushort_v);
- Vc_SIMD_CAST_1( float_v, ushort_v);
- Vc_SIMD_CAST_1(double_v, ushort_v);
- Vc_SIMD_CAST_1( short_v, ushort_v);
- // Two-input casts (the destination holds at least twice as many elements).
- Vc_SIMD_CAST_2(double_v, int_v);
- Vc_SIMD_CAST_2(double_v, uint_v);
- Vc_SIMD_CAST_2(double_v, float_v);
- Vc_SIMD_CAST_2( int_v, short_v);
- Vc_SIMD_CAST_2( uint_v, short_v);
- Vc_SIMD_CAST_2( float_v, short_v);
- Vc_SIMD_CAST_2(double_v, short_v);
- Vc_SIMD_CAST_2( int_v, ushort_v);
- Vc_SIMD_CAST_2( uint_v, ushort_v);
- Vc_SIMD_CAST_2( float_v, ushort_v);
- Vc_SIMD_CAST_2(double_v, ushort_v);
- // Three- and four-input casts: double_v (2 lanes) into 8-lane (u)short_v.
- // Vc_CAST_ only spells the return type; the signature follows explicitly.
- #define Vc_CAST_(To_) \
- template <typename Return> \
- Vc_INTRINSIC Vc_CONST enable_if<std::is_same<Return, To_>::value, Return>
- Vc_CAST_(short_v) simd_cast(double_v a, double_v b, double_v c);
- Vc_CAST_(ushort_v) simd_cast(double_v a, double_v b, double_v c);
- Vc_SIMD_CAST_4(double_v, short_v);
- Vc_SIMD_CAST_4(double_v, ushort_v);
- }
- using SSE::simd_cast;
- // Forward declarations: build an SSE vector from 1..8 Scalar::Vector<T>
- // values (one element each). The definitions below pack the inputs into the
- // low lanes and zero the remaining lanes.
- // NOTE(review): the 3-argument declarations name their last parameter x3
- // (skipping x2) while the definitions use x2 — harmless for declarations,
- // but inconsistent.
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x,
- enable_if<std::is_same<Return, SSE::double_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x,
- enable_if<std::is_same<Return, SSE::float_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x,
- enable_if<std::is_same<Return, SSE::int_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x,
- enable_if<std::is_same<Return, SSE::uint_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x,
- enable_if<std::is_same<Return, SSE::short_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x,
- enable_if<std::is_same<Return, SSE::ushort_v>::value> = nullarg);
- // Two scalar inputs.
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
- enable_if<std::is_same<Return, SSE::double_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
- enable_if<std::is_same<Return, SSE::float_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
- enable_if<std::is_same<Return, SSE::int_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
- enable_if<std::is_same<Return, SSE::uint_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
- enable_if<std::is_same<Return, SSE::short_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
- enable_if<std::is_same<Return, SSE::ushort_v>::value> = nullarg);
- // Three scalar inputs (destination must hold at least 3 lanes, so no
- // double_v overload).
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x3,
- enable_if<std::is_same<Return, SSE::float_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x3,
- enable_if<std::is_same<Return, SSE::int_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x3,
- enable_if<std::is_same<Return, SSE::uint_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x3,
- enable_if<std::is_same<Return, SSE::short_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x3,
- enable_if<std::is_same<Return, SSE::ushort_v>::value> = nullarg);
- // Four scalar inputs.
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3,
- enable_if<std::is_same<Return, SSE::float_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3,
- enable_if<std::is_same<Return, SSE::int_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3,
- enable_if<std::is_same<Return, SSE::uint_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3,
- enable_if<std::is_same<Return, SSE::short_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3,
- enable_if<std::is_same<Return, SSE::ushort_v>::value> = nullarg);
- // Five to eight scalar inputs: only the 8-lane (u)short_v can hold them.
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4,
- enable_if<std::is_same<Return, SSE::short_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4,
- enable_if<std::is_same<Return, SSE::ushort_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- enable_if<std::is_same<Return, SSE::short_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- enable_if<std::is_same<Return, SSE::ushort_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6,
- enable_if<std::is_same<Return, SSE::short_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6,
- enable_if<std::is_same<Return, SSE::ushort_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7,
- enable_if<std::is_same<Return, SSE::short_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7,
- enable_if<std::is_same<Return, SSE::ushort_v>::value> = nullarg);
- // SSE vector -> scalar vector (takes element 0).
- template <typename To, typename FromT>
- Vc_INTRINSIC Vc_CONST To
- simd_cast(SSE::Vector<FromT> x, enable_if<Scalar::is_vector<To>::value> = nullarg);
- // Redefine the Vc_SIMD_CAST_N helpers for the *definitions*: identical
- // signatures but without the "= nullarg" default argument (a default may
- // only appear once, on the declaration).
- #undef Vc_SIMD_CAST_1
- #undef Vc_SIMD_CAST_2
- #undef Vc_SIMD_CAST_4
- #undef Vc_SIMD_CAST_8
- #define Vc_SIMD_CAST_1(from_,to_) \
- template <typename To> \
- Vc_INTRINSIC Vc_CONST To simd_cast(from_ x, enable_if<std::is_same<To, to_>::value>)
- #define Vc_SIMD_CAST_2(from_,to_) \
- template <typename To> \
- Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, \
- enable_if<std::is_same<To, to_>::value>)
- #define Vc_SIMD_CAST_4(from_,to_) \
- template <typename To> \
- Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, from_ x3, \
- enable_if<std::is_same<To, to_>::value>)
- #define Vc_SIMD_CAST_8(from_,to_) \
- template <typename To> \
- Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, \
- from_ x5, from_ x6, from_ x7, \
- enable_if<std::is_same<To, to_>::value>)
- namespace SSE
- {
- // Narrows two vectors of four 32-bit integers (a = elements 0..3,
- // b = elements 4..7) into one vector of eight 16-bit integers. Three rounds
- // of 16-bit interleaves move the low 16-bit half of every 32-bit lane into
- // ascending order, i.e. a truncating (modulo 2^16, non-saturating) narrow.
- Vc_INTRINSIC __m128i convert_int32_to_int16(__m128i a, __m128i b)
- {
- const __m128i ab_lo = _mm_unpacklo_epi16(a, b);
- const __m128i ab_hi = _mm_unpackhi_epi16(a, b);
- const __m128i halves_lo = _mm_unpacklo_epi16(ab_lo, ab_hi);
- const __m128i halves_hi = _mm_unpackhi_epi16(ab_lo, ab_hi);
- return _mm_unpacklo_epi16(halves_lo, halves_hi);
- }
- // Definitions of the SSE simd_cast overloads declared above.
- // One input: plain element-wise conversion via convert<From, To>().
- Vc_SIMD_CAST_1( float_v, int_v) { return convert< float, int>(x.data()); }
- Vc_SIMD_CAST_1(double_v, int_v) { return convert<double, int>(x.data()); }
- Vc_SIMD_CAST_1( uint_v, int_v) { return convert< uint, int>(x.data()); }
- Vc_SIMD_CAST_1( short_v, int_v) { return convert< short, int>(x.data()); }
- Vc_SIMD_CAST_1(ushort_v, int_v) { return convert<ushort, int>(x.data()); }
- Vc_SIMD_CAST_1( float_v, uint_v) { return convert< float, uint>(x.data()); }
- Vc_SIMD_CAST_1(double_v, uint_v) { return convert<double, uint>(x.data()); }
- Vc_SIMD_CAST_1( int_v, uint_v) { return convert< int, uint>(x.data()); }
- Vc_SIMD_CAST_1( short_v, uint_v) { return convert< short, uint>(x.data()); }
- Vc_SIMD_CAST_1(ushort_v, uint_v) { return convert<ushort, uint>(x.data()); }
- Vc_SIMD_CAST_1(double_v, float_v) { return convert<double, float>(x.data()); }
- Vc_SIMD_CAST_1( int_v, float_v) { return convert< int, float>(x.data()); }
- Vc_SIMD_CAST_1( uint_v, float_v) { return convert< uint, float>(x.data()); }
- Vc_SIMD_CAST_1( short_v, float_v) { return convert< short, float>(x.data()); }
- Vc_SIMD_CAST_1(ushort_v, float_v) { return convert<ushort, float>(x.data()); }
- Vc_SIMD_CAST_1( float_v, double_v) { return convert< float, double>(x.data()); }
- Vc_SIMD_CAST_1( int_v, double_v) { return convert< int, double>(x.data()); }
- Vc_SIMD_CAST_1( uint_v, double_v) { return convert< uint, double>(x.data()); }
- Vc_SIMD_CAST_1( short_v, double_v) { return convert< short, double>(x.data()); }
- Vc_SIMD_CAST_1(ushort_v, double_v) { return convert<ushort, double>(x.data()); }
- // Narrowing to (u)short: the missing upper lanes are filled with zeros.
- // NOTE(review): the (u)int paths truncate (convert_int32_to_int16) while the
- // float/double -> short paths saturate (_mm_packs_epi32) — presumably
- // intentional, but worth confirming against the Vc conversion semantics.
- Vc_SIMD_CAST_1( int_v, short_v) { return SSE::convert_int32_to_int16(x.data(), _mm_setzero_si128()); }
- Vc_SIMD_CAST_1( uint_v, short_v) { return SSE::convert_int32_to_int16(x.data(), _mm_setzero_si128()); }
- Vc_SIMD_CAST_1( float_v, short_v) { return _mm_packs_epi32(simd_cast<SSE::int_v>(x).data(), _mm_setzero_si128()); }
- Vc_SIMD_CAST_1(double_v, short_v) { return _mm_packs_epi32(simd_cast<SSE::int_v>(x).data(), _mm_setzero_si128()); }
- Vc_SIMD_CAST_1(ushort_v, short_v) { return x.data(); }
- Vc_SIMD_CAST_1( int_v, ushort_v) { return SSE::convert_int32_to_int16(x.data(), _mm_setzero_si128()); }
- Vc_SIMD_CAST_1( uint_v, ushort_v) { return SSE::convert_int32_to_int16(x.data(), _mm_setzero_si128()); }
- Vc_SIMD_CAST_1( float_v, ushort_v) { return simd_cast<SSE::ushort_v>(simd_cast<SSE::int_v>(x)); }
- Vc_SIMD_CAST_1(double_v, ushort_v) { return simd_cast<SSE::ushort_v>(simd_cast<SSE::int_v>(x)); }
- Vc_SIMD_CAST_1( short_v, ushort_v) { return x.data(); }
- // Two double_v inputs (2 lanes each) -> one 4-lane vector. When AVX is
- // available the two registers are concatenated and converted in one step.
- Vc_SIMD_CAST_2(double_v, int_v) {
- #ifdef Vc_IMPL_AVX
- return AVX::convert<double, int>(AVX::concat(x0.data(), x1.data()));
- #else
- return _mm_unpacklo_epi64(convert<double, int>(x0.data()), convert<double, int>(x1.data()));
- #endif
- }
- Vc_SIMD_CAST_2(double_v, uint_v) {
- #ifdef Vc_IMPL_AVX
- return AVX::convert<double, uint>(AVX::concat(x0.data(), x1.data()));
- #else
- return _mm_unpacklo_epi64(convert<double, uint>(x0.data()), convert<double, uint>(x1.data()));
- #endif
- }
- Vc_SIMD_CAST_2(double_v, float_v) {
- #ifdef Vc_IMPL_AVX
- return _mm256_cvtpd_ps(AVX::concat(x0.data(), x1.data()));
- #else
- return _mm_movelh_ps(_mm_cvtpd_ps(x0.data()), _mm_cvtpd_ps(x1.data()));
- #endif
- }
- // Two 4-lane inputs -> one 8-lane (u)short_v.
- Vc_SIMD_CAST_2( int_v, short_v) { return SSE::convert_int32_to_int16(x0.data(), x1.data()); }
- Vc_SIMD_CAST_2( uint_v, short_v) { return SSE::convert_int32_to_int16(x0.data(), x1.data()); }
- Vc_SIMD_CAST_2( float_v, short_v) { return _mm_packs_epi32(simd_cast<SSE::int_v>(x0).data(), simd_cast<SSE::int_v>(x1).data()); }
- Vc_SIMD_CAST_2(double_v, short_v) { return _mm_packs_epi32(simd_cast<SSE::int_v>(x0, x1).data(), _mm_setzero_si128()); }
- Vc_SIMD_CAST_2( int_v, ushort_v) { return SSE::convert_int32_to_int16(x0.data(), x1.data()); }
- Vc_SIMD_CAST_2( uint_v, ushort_v) { return SSE::convert_int32_to_int16(x0.data(), x1.data()); }
- Vc_SIMD_CAST_2( float_v, ushort_v) { return simd_cast<SSE::ushort_v>(simd_cast<SSE::int_v>(x0), simd_cast<SSE::int_v>(x1)); }
- Vc_SIMD_CAST_2(double_v, ushort_v) { return simd_cast<SSE::ushort_v>(simd_cast<SSE::int_v>(x0, x1)); }
- // Three/four double_v inputs -> 8-lane (u)short_v via intermediate int_v.
- Vc_CAST_(short_v) simd_cast(double_v a, double_v b, double_v c)
- {
- return simd_cast<short_v>(simd_cast<int_v>(a, b), simd_cast<int_v>(c));
- }
- Vc_CAST_(ushort_v) simd_cast(double_v a, double_v b, double_v c)
- {
- return simd_cast<ushort_v>(simd_cast<int_v>(a, b), simd_cast<int_v>(c));
- }
- #undef Vc_CAST_
- Vc_SIMD_CAST_4(double_v, short_v) { return _mm_packs_epi32(simd_cast<SSE::int_v>(x0, x1).data(), simd_cast<SSE::int_v>(x2, x3).data()); }
- Vc_SIMD_CAST_4(double_v, ushort_v) { return simd_cast<SSE::ushort_v>(simd_cast<SSE::int_v>(x0, x1), simd_cast<SSE::int_v>(x2, x3)); }
- }
- // Definitions: pack 1 or 2 scalar values into the low lanes of an SSE
- // vector; remaining lanes are zero-filled.
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x,
- enable_if<std::is_same<Return, SSE::double_v>::value> )
- {
- return _mm_setr_pd(x.data(), 0.);
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x,
- enable_if<std::is_same<Return, SSE::float_v>::value> )
- {
- return _mm_setr_ps(x.data(), 0.f, 0.f, 0.f);
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x,
- enable_if<std::is_same<Return, SSE::int_v>::value> )
- {
- return _mm_setr_epi32(x.data(), 0, 0, 0);
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x,
- enable_if<std::is_same<Return, SSE::uint_v>::value> )
- {
- // explicit uint() conversion because _mm_setr_epi32 takes signed ints
- return _mm_setr_epi32(uint(x.data()), 0, 0, 0);
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x,
- enable_if<std::is_same<Return, SSE::short_v>::value> )
- {
- return _mm_setr_epi16(
- x.data(), 0, 0, 0, 0, 0, 0, 0);
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x,
- enable_if<std::is_same<Return, SSE::ushort_v>::value> )
- {
- return _mm_setr_epi16(
- x.data(), 0, 0, 0, 0, 0, 0, 0);
- }
- // Two scalar inputs.
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0,
- Scalar::Vector<T> x1,
- enable_if<std::is_same<Return, SSE::double_v>::value> )
- {
- return _mm_setr_pd(x0.data(), x1.data());
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0,
- Scalar::Vector<T> x1,
- enable_if<std::is_same<Return, SSE::float_v>::value> )
- {
- return _mm_setr_ps(x0.data(), x1.data(), 0.f, 0.f);
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0,
- Scalar::Vector<T> x1,
- enable_if<std::is_same<Return, SSE::int_v>::value> )
- {
- return _mm_setr_epi32(x0.data(), x1.data(), 0, 0);
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0,
- Scalar::Vector<T> x1,
- enable_if<std::is_same<Return, SSE::uint_v>::value> )
- {
- return _mm_setr_epi32(uint(x0.data()), uint(x1.data()), 0,
- 0);
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0,
- Scalar::Vector<T> x1,
- enable_if<std::is_same<Return, SSE::short_v>::value> )
- {
- return _mm_setr_epi16(
- x0.data(), x1.data(), 0, 0, 0, 0, 0, 0);
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0,
- Scalar::Vector<T> x1,
- enable_if<std::is_same<Return, SSE::ushort_v>::value> )
- {
- return _mm_setr_epi16(
- x0.data(), x1.data(), 0, 0, 0, 0, 0, 0);
- }
- // Three scalar inputs -> low 3 lanes, remainder zeroed.
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- enable_if<std::is_same<Return, SSE::float_v>::value>)
- {
- return _mm_setr_ps(x0.data(), x1.data(), x2.data(), 0.f);
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- enable_if<std::is_same<Return, SSE::int_v>::value>)
- {
- return _mm_setr_epi32(x0.data(), x1.data(), x2.data(), 0);
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- enable_if<std::is_same<Return, SSE::uint_v>::value>)
- {
- return _mm_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()),
- 0);
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- enable_if<std::is_same<Return, SSE::short_v>::value>)
- {
- return _mm_setr_epi16(x0.data(), x1.data(), x2.data(), 0, 0, 0, 0, 0);
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- enable_if<std::is_same<Return, SSE::ushort_v>::value>)
- {
- return _mm_setr_epi16(x0.data(), x1.data(), x2.data(), 0, 0, 0, 0, 0);
- }
- // Four scalar inputs -> low 4 lanes (fills float/int/uint completely).
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0,
- Scalar::Vector<T> x1,
- Scalar::Vector<T> x2,
- Scalar::Vector<T> x3,
- enable_if<std::is_same<Return, SSE::float_v>::value> )
- {
- return _mm_setr_ps(
- x0.data(), x1.data(), x2.data(), x3.data());
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0,
- Scalar::Vector<T> x1,
- Scalar::Vector<T> x2,
- Scalar::Vector<T> x3,
- enable_if<std::is_same<Return, SSE::int_v>::value> )
- {
- return _mm_setr_epi32(
- x0.data(), x1.data(), x2.data(), x3.data());
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0,
- Scalar::Vector<T> x1,
- Scalar::Vector<T> x2,
- Scalar::Vector<T> x3,
- enable_if<std::is_same<Return, SSE::uint_v>::value> )
- {
- return _mm_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()),
- uint(x3.data()));
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0,
- Scalar::Vector<T> x1,
- Scalar::Vector<T> x2,
- Scalar::Vector<T> x3,
- enable_if<std::is_same<Return, SSE::short_v>::value> )
- {
- return _mm_setr_epi16(
- x0.data(), x1.data(), x2.data(), x3.data(), 0, 0, 0, 0);
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0,
- Scalar::Vector<T> x1,
- Scalar::Vector<T> x2,
- Scalar::Vector<T> x3,
- enable_if<std::is_same<Return, SSE::ushort_v>::value> )
- {
- return _mm_setr_epi16(
- x0.data(), x1.data(), x2.data(), x3.data(), 0, 0, 0, 0);
- }
- // Five to eight scalar inputs: only 8-lane (u)short_v targets; unset lanes
- // are zero-filled.
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4,
- enable_if<std::is_same<Return, SSE::short_v>::value>)
- {
- return _mm_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), 0, 0, 0);
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4,
- enable_if<std::is_same<Return, SSE::ushort_v>::value>)
- {
- return _mm_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), 0, 0, 0);
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- enable_if<std::is_same<Return, SSE::short_v>::value>)
- {
- return _mm_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
- x5.data(), 0, 0);
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- enable_if<std::is_same<Return, SSE::ushort_v>::value>)
- {
- return _mm_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
- x5.data(), 0, 0);
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, enable_if<std::is_same<Return, SSE::short_v>::value>)
- {
- return _mm_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
- x5.data(), x6.data(), 0);
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, enable_if<std::is_same<Return, SSE::ushort_v>::value>)
- {
- return _mm_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
- x5.data(), x6.data(), 0);
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0,
- Scalar::Vector<T> x1,
- Scalar::Vector<T> x2,
- Scalar::Vector<T> x3,
- Scalar::Vector<T> x4,
- Scalar::Vector<T> x5,
- Scalar::Vector<T> x6,
- Scalar::Vector<T> x7,
- enable_if<std::is_same<Return, SSE::short_v>::value> )
- {
- return _mm_setr_epi16(x0.data(),
- x1.data(),
- x2.data(),
- x3.data(),
- x4.data(),
- x5.data(),
- x6.data(),
- x7.data());
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0,
- Scalar::Vector<T> x1,
- Scalar::Vector<T> x2,
- Scalar::Vector<T> x3,
- Scalar::Vector<T> x4,
- Scalar::Vector<T> x5,
- Scalar::Vector<T> x6,
- Scalar::Vector<T> x7,
- enable_if<std::is_same<Return, SSE::ushort_v>::value> )
- {
- return _mm_setr_epi16(x0.data(),
- x1.data(),
- x2.data(),
- x3.data(),
- x4.data(),
- x5.data(),
- x6.data(),
- x7.data());
- }
- // SSE vector -> scalar vector: extract element 0 and convert.
- template <typename To, typename FromT>
- Vc_INTRINSIC Vc_CONST To
- simd_cast(SSE::Vector<FromT> x, enable_if<Scalar::is_vector<To>::value> )
- {
- return static_cast<To>(x[0]);
- }
- // Mask casts. SSE masks are full-width lane masks (all-ones / all-zeros per
- // lane), so size changes are lane-width changes implemented with pack
- // shuffles.
- // One SSE mask -> SSE mask of a (possibly) different lane count.
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(SSE::Mask<T> x, enable_if<SSE::is_mask<Return>::value> = nullarg)
- {
- using M = SSE::Mask<T>;
- return {Detail::mask_cast<M::Size, Return::Size, __m128>(x.dataI())};
- }
- // Two masks -> one mask with twice the lanes: one saturating pack halves
- // the lane width (mask lanes are 0 or ~0, so saturation preserves them).
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return simd_cast(
- SSE::Mask<T> x0,
- SSE::Mask<T> x1,
- enable_if<SSE::is_mask<Return>::value && Mask<T, VectorAbi::Sse>::Size * 2 == Return::Size> = nullarg)
- {
- return SSE::sse_cast<__m128>(_mm_packs_epi16(x0.dataI(), x1.dataI()));
- }
- // Two masks -> one mask with four times the lanes: two pack levels, upper
- // half zero-filled (i.e. false).
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return simd_cast(
- SSE::Mask<T> x0,
- SSE::Mask<T> x1,
- enable_if<SSE::is_mask<Return>::value && Mask<T, VectorAbi::Sse>::Size * 4 == Return::Size> = nullarg)
- {
- return SSE::sse_cast<__m128>(
- _mm_packs_epi16(_mm_packs_epi16(x0.dataI(), x1.dataI()), _mm_setzero_si128()));
- }
- // Four masks -> one mask with four times the lanes.
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return simd_cast(
- SSE::Mask<T> x0,
- SSE::Mask<T> x1,
- SSE::Mask<T> x2,
- SSE::Mask<T> x3,
- enable_if<SSE::is_mask<Return>::value && Mask<T, VectorAbi::Sse>::Size * 4 == Return::Size> = nullarg)
- {
- return SSE::sse_cast<__m128>(_mm_packs_epi16(_mm_packs_epi16(x0.dataI(), x1.dataI()),
- _mm_packs_epi16(x2.dataI(), x3.dataI())));
- }
- // Scalar masks -> SSE mask: set the low lanes element-wise, rest false.
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Mask<T> x, enable_if<SSE::is_mask<Return>::value> = nullarg)
- {
- Return m(false);
- m[0] = x[0];
- return m;
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Mask<T> x0, Scalar::Mask<T> x1, enable_if<SSE::is_mask<Return>::value> = nullarg)
- {
- Return m(false);
- m[0] = x0[0];
- m[1] = x1[0];
- return m;
- }
- // Inputs beyond the destination's lane count are silently dropped (the
- // Return::Size checks below guard against writing past the mask).
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Mask<T> x0,
- Scalar::Mask<T> x1,
- Scalar::Mask<T> x2,
- Scalar::Mask<T> x3,
- enable_if<SSE::is_mask<Return>::value> = nullarg)
- {
- Return m(false);
- m[0] = x0[0];
- m[1] = x1[0];
- if (Return::Size >= 4) {
- m[2] = x2[0];
- m[3] = x3[0];
- }
- return m;
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Mask<T> x0,
- Scalar::Mask<T> x1,
- Scalar::Mask<T> x2,
- Scalar::Mask<T> x3,
- Scalar::Mask<T> x4,
- Scalar::Mask<T> x5,
- Scalar::Mask<T> x6,
- Scalar::Mask<T> x7,
- enable_if<SSE::is_mask<Return>::value> = nullarg)
- {
- Return m(false);
- m[0] = x0[0];
- m[1] = x1[0];
- if (Return::Size >= 4) {
- m[2] = x2[0];
- m[3] = x3[0];
- }
- if (Return::Size >= 8) {
- m[4] = x4[0];
- m[5] = x5[0];
- m[6] = x6[0];
- m[7] = x7[0];
- }
- return m;
- }
- // SSE mask -> scalar mask: take entry 0.
- template <typename To, typename FromT>
- Vc_INTRINSIC Vc_CONST To
- simd_cast(SSE::Mask<FromT> x, enable_if<Scalar::is_mask<To>::value> = nullarg)
- {
- return static_cast<To>(x[0]);
- }
- // Offset variants: simd_cast<Return, offset>(x) casts starting at element
- // offset * Return::Size of x. offset == 0 degenerates to the plain cast.
- template <typename Return, int offset, typename V>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(V &&x, enable_if<offset == 0 && ((SSE::is_vector<Traits::decay<V>>::value &&
- SSE::is_vector<Return>::value) ||
- (SSE::is_mask<Traits::decay<V>>::value &&
- SSE::is_mask<Return>::value))> = nullarg)
- {
- return simd_cast<Return>(x);
- }
- template <typename Return, int offset, typename V>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(V &&x,
- enable_if<offset == 0 && ((Scalar::is_vector<Traits::decay<V>>::value &&
- SSE::is_vector<Return>::value) ||
- (Scalar::is_mask<Traits::decay<V>>::value &&
- SSE::is_mask<Return>::value))> = nullarg)
- {
- return simd_cast<Return>(x);
- }
- // offset != 0: shift the requested elements down to position 0 with a byte
- // shift, then do the plain cast. shift is the byte offset of element
- // offset * Return::Size (sizeof(V)/V::Size is the per-element byte width).
- // NOTE(review): the "& 0xff" is redundant given the static_assert
- // (0 < shift < 16) — presumably kept to silence immediate-range warnings;
- // confirm before removing.
- template <typename Return, int offset, typename V>
- Vc_INTRINSIC Vc_CONST Return simd_cast(
- V x,
- enable_if<offset != 0 && (SSE::is_vector<Return>::value && SSE::is_vector<V>::value)> = nullarg)
- {
- constexpr int shift = (sizeof(V) / V::Size) * offset * Return::Size;
- static_assert(shift > 0 && shift < 16, "");
- return simd_cast<Return>(V{SSE::sse_cast<typename V::VectorType>(
- _mm_srli_si128(SSE::sse_cast<__m128i>(x.data()), shift & 0xff))});
- }
- // offset != 0 into a scalar vector: plain element access.
- template <typename Return, int offset, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(SSE::Vector<T> x,
- enable_if<offset != 0 && Scalar::is_vector<Return>::value> = nullarg)
- {
- return static_cast<typename Return::EntryType>(x[offset]);
- }
- // Same byte-shift approach for masks.
- template <typename Return, int offset, typename V>
- Vc_INTRINSIC Vc_CONST Return simd_cast(
- V x,
- enable_if<offset != 0 && (SSE::is_mask<Return>::value && SSE::is_mask<V>::value)> = nullarg)
- {
- constexpr int shift = (sizeof(V) / V::Size) * offset * Return::Size;
- static_assert(shift > 0 && shift < 16, "");
- return simd_cast<Return>(V{SSE::sse_cast<typename V::VectorType>(
- _mm_srli_si128(SSE::sse_cast<__m128i>(x.data()), shift & 0xff))});
- }
- #undef Vc_SIMD_CAST_1
- #undef Vc_SIMD_CAST_2
- #undef Vc_SIMD_CAST_4
- #undef Vc_SIMD_CAST_8
- }
- #endif
- #endif
- #endif
- #ifdef Vc_IMPL_AVX
- #ifndef VC_AVX_VECTOR_H_
- #define VC_AVX_VECTOR_H_
- #ifndef VC_AVX_VECTORHELPER_H_
- #define VC_AVX_VECTORHELPER_H_
- #include <limits>
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace AVX
- {
- // VectorHelper<__m256>: store operations for AVX float vectors. The extra
- // Flags-dependent parameter selects the aligned / unaligned / streaming
- // (non-temporal) / masked variant via SFINAE; it is never read.
- template<> struct VectorHelper<__m256>
- {
- typedef __m256 VectorType;
- typedef const VectorType VTArg;
- template<typename Flags> static Vc_ALWAYS_INLINE void store(float *addr, VTArg v, typename Flags::EnableIfAligned = nullptr) { _mm256_store_ps(addr, v); }
- template<typename Flags> static Vc_ALWAYS_INLINE void store(float *addr, VTArg v, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm256_storeu_ps(addr, v); }
- template<typename Flags> static Vc_ALWAYS_INLINE void store(float *addr, VTArg v, typename Flags::EnableIfStreaming = nullptr) { _mm256_stream_ps(addr, v); }
- template<typename Flags> static Vc_ALWAYS_INLINE void store(float *addr, VTArg v, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { AvxIntrinsics::stream_store(addr, v, setallone_ps()); }
- // masked variants: only lanes selected by the mask are written
- template<typename Flags> static Vc_ALWAYS_INLINE void store(float *addr, VTArg v, VTArg mask, typename std::enable_if<!Flags::IsStreaming, void *>::type = nullptr) { _mm256_maskstore(addr, mask, v); }
- template<typename Flags> static Vc_ALWAYS_INLINE void store(float *addr, VTArg v, VTArg mask, typename std::enable_if< Flags::IsStreaming, void *>::type = nullptr) { AvxIntrinsics::stream_store(addr, v, mask); }
- };
- // VectorHelper<__m256d>: store operations for AVX double vectors; same
- // Flags-based SFINAE dispatch as the float specialization above.
- template<> struct VectorHelper<__m256d>
- {
- typedef __m256d VectorType;
- typedef const VectorType VTArg;
- template<typename Flags> static Vc_ALWAYS_INLINE void store(double *addr, VTArg v, typename Flags::EnableIfAligned = nullptr) { _mm256_store_pd(addr, v); }
- template<typename Flags> static Vc_ALWAYS_INLINE void store(double *addr, VTArg v, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm256_storeu_pd(addr, v); }
- template<typename Flags> static Vc_ALWAYS_INLINE void store(double *addr, VTArg v, typename Flags::EnableIfStreaming = nullptr) { _mm256_stream_pd(addr, v); }
- template<typename Flags> static Vc_ALWAYS_INLINE void store(double *addr, VTArg v, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { AvxIntrinsics::stream_store(addr, v, setallone_pd()); }
- // masked variants: only lanes selected by the mask are written
- template<typename Flags> static Vc_ALWAYS_INLINE void store(double *addr, VTArg v, VTArg mask, typename std::enable_if<!Flags::IsStreaming, void *>::type = nullptr) { _mm256_maskstore(addr, mask, v); }
- template<typename Flags> static Vc_ALWAYS_INLINE void store(double *addr, VTArg v, VTArg mask, typename std::enable_if< Flags::IsStreaming, void *>::type = nullptr) { AvxIntrinsics::stream_store(addr, v, mask); }
- };
// Store-only helper for AVX integer vectors (__m256i).
// Additionally templated on the scalar type T of the destination memory,
// since one __m256i serves all integral element widths.
template<> struct VectorHelper<__m256i>
{
typedef __m256i VectorType;
typedef const VectorType VTArg;
// Aligned store: mem must be 32-byte aligned.
template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, typename Flags::EnableIfAligned = nullptr) { _mm256_store_si256(reinterpret_cast<__m256i *>(mem), x); }
// Unaligned, non-streaming store.
template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm256_storeu_si256(reinterpret_cast<__m256i *>(mem), x); }
// Aligned non-temporal store (bypasses the cache).
template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, typename Flags::EnableIfStreaming = nullptr) { _mm256_stream_si256(reinterpret_cast<__m256i *>(mem), x); }
// Unaligned streaming store via the Vc helper with an all-ones mask.
template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { AvxIntrinsics::stream_store(mem, x, setallone_si256()); }
// Masked stores: only lanes selected by m are written.
template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, VTArg m, typename std::enable_if<!Flags::IsStreaming, void *>::type = nullptr) { _mm256_maskstore(mem, m, x); }
template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, VTArg m, typename std::enable_if< Flags::IsStreaming, void *>::type = nullptr) { AvxIntrinsics::stream_store(mem, x, m); }
};
// Name-pasting helpers used inside the VectorHelper<double/float>
// specializations below. Vc_SUFFIX (pd or ps) is (re)defined per
// specialization and appended to build the concrete function name.
// Vc_OP1: unary op  -> _mm256_<op>_<suffix>(a)
#define Vc_OP1(op) \
static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a) { return Vc_CAT2(_mm256_##op##_, Vc_SUFFIX)(a); }
// Vc_OP: binary op -> <op>_<suffix>(a, b) (no _mm256_ prefix: resolves to a Vc wrapper)
#define Vc_OP(op) \
static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return Vc_CAT2(op##_ , Vc_SUFFIX)(a, b); }
// Vc_OP_: binary op -> _mm256_<op><suffix>(a, b) (op already ends in '_')
#define Vc_OP_(op) \
static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return Vc_CAT2(_mm256_##op , Vc_SUFFIX)(a, b); }
// Vc_OPx: binary op named 'op' implemented by intrinsic 'op2'
#define Vc_OPx(op,op2) \
static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return Vc_CAT2(_mm256_##op2##_, Vc_SUFFIX)(a, b); }
// Arithmetic helper for 4x double vectors: constants, fused multiply-add
// emulation, element-wise ops, and horizontal reductions (min/max/mul/add).
template<> struct VectorHelper<double> {
typedef __m256d VectorType;
typedef const VectorType VTArg;
typedef double EntryType;
#define Vc_SUFFIX pd
// Zeroes all lanes of a whose mask lane is clear (mask is reinterpreted from float lanes).
static Vc_ALWAYS_INLINE VectorType notMaskedToZero(VTArg a, __m256 mask) { return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(_mm256_castps_pd(mask), a); }
// Broadcast a single scalar into all four lanes.
static Vc_ALWAYS_INLINE VectorType set(const double a) { return Vc_CAT2(_mm256_set1_, Vc_SUFFIX)(a); }
// Set the four lanes individually (a is the highest lane, per _mm256_set_pd ordering).
static Vc_ALWAYS_INLINE VectorType set(const double a, const double b, const double c, const double d) {
return Vc_CAT2(_mm256_set_, Vc_SUFFIX)(a, b, c, d);
}
static Vc_ALWAYS_INLINE VectorType zero() { return Vc_CAT2(_mm256_setzero_, Vc_SUFFIX)(); }
static Vc_ALWAYS_INLINE VectorType one() { return Vc_CAT2(setone_, Vc_SUFFIX)(); }
// v1 = v1 * v2 + v3, correctly rounded as a single fused operation.
// Without FMA4 hardware this is emulated: each factor is split into a
// high and a low half (via highMaskDouble, which keeps the upper mantissa
// bits) so the partial products are exact, then the pieces are summed in
// ascending-magnitude order to reproduce the fused result.
static inline void fma(VectorType &v1, VTArg v2, VTArg v3) {
#ifdef Vc_IMPL_FMA4
v1 = _mm256_macc_pd(v1, v2, v3);
#else
VectorType h1 = _mm256_and_pd(v1, _mm256_broadcast_sd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));
VectorType h2 = _mm256_and_pd(v2, _mm256_broadcast_sd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));
#if defined(Vc_GCC) && Vc_GCC < 0x40703
// compiler barrier: keeps old GCC from re-fusing/reordering the split values
asm("":"+x"(h1), "+x"(h2));
#endif
const VectorType l1 = _mm256_sub_pd(v1, h1);
const VectorType l2 = _mm256_sub_pd(v2, h2);
const VectorType ll = mul(l1, l2);
const VectorType lh = add(mul(l1, h2), mul(h1, l2));
const VectorType hh = mul(h1, h2);
// sum the smaller-magnitude of (lh, v3) first to minimize rounding error
const VectorType lh_lt_v3 = cmplt_pd(abs(lh), abs(v3));
const VectorType b = _mm256_blendv_pd(v3, lh, lh_lt_v3);
const VectorType c = _mm256_blendv_pd(lh, v3, lh_lt_v3);
v1 = add(add(ll, b), add(c, hh));
#endif
}
static Vc_INTRINSIC VectorType Vc_CONST add(VTArg a, VTArg b) { return _mm256_add_pd(a,b); }
static Vc_INTRINSIC VectorType Vc_CONST sub(VTArg a, VTArg b) { return _mm256_sub_pd(a,b); }
static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm256_mul_pd(a,b); }
Vc_OP1(sqrt)
// No double-precision rsqrt/rcp instructions exist; compute via full division.
static Vc_ALWAYS_INLINE Vc_CONST VectorType rsqrt(VTArg x) {
return _mm256_div_pd(one(), sqrt(x));
}
static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VTArg x) {
return _mm256_div_pd(one(), x);
}
// |a|: clear the sign bit of every lane.
static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(VTArg a) {
return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(a, setabsmask_pd());
}
static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return _mm256_min_pd(a, b); }
static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { return _mm256_max_pd(a, b); }
// Horizontal reductions: fold 256 -> 128 bits, then 2 lanes -> 1.
static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VTArg a) {
__m128d b = _mm_min_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
b = _mm_min_sd(b, _mm_unpackhi_pd(b, b));
return _mm_cvtsd_f64(b);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VTArg a) {
__m128d b = _mm_max_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
b = _mm_max_sd(b, _mm_unpackhi_pd(b, b));
return _mm_cvtsd_f64(b);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VTArg a) {
__m128d b = _mm_mul_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
b = _mm_mul_sd(b, _mm_shuffle_pd(b, b, _MM_SHUFFLE2(0, 1)));
return _mm_cvtsd_f64(b);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VTArg a) {
__m128d b = _mm_add_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
b = _mm_hadd_pd(b, b);
return _mm_cvtsd_f64(b);
}
#undef Vc_SUFFIX
// Round to nearest integer (ties per the _MM_FROUND_NINT rounding mode).
static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) {
return _mm256_round_pd(a, _MM_FROUND_NINT);
}
};
// Arithmetic helper for 8x float vectors; mirrors VectorHelper<double>.
template<> struct VectorHelper<float> {
typedef float EntryType;
typedef __m256 VectorType;
typedef const VectorType VTArg;
#define Vc_SUFFIX ps
// Zeroes all lanes of a whose mask lane is clear.
static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VTArg a, __m256 mask) { return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(mask, a); }
// Broadcast a single scalar into all eight lanes.
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a) { return Vc_CAT2(_mm256_set1_, Vc_SUFFIX)(a); }
// Set the eight lanes individually (a is the highest lane, per _mm256_set_ps ordering).
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a, const float b, const float c, const float d,
const float e, const float f, const float g, const float h) {
return Vc_CAT2(_mm256_set_, Vc_SUFFIX)(a, b, c, d, e, f, g, h); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm256_setzero_, Vc_SUFFIX)(); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(setone_, Vc_SUFFIX)(); }
// Narrow two double vectors (4 lanes each) into one 8-lane float vector.
static Vc_ALWAYS_INLINE Vc_CONST __m256 concat(__m256d a, __m256d b) { return _mm256_insertf128_ps(avx_cast<__m256>(_mm256_cvtpd_ps(a)), _mm256_cvtpd_ps(b), 1); }
// v1 = v1 * v2 + v3 with single rounding. Without FMA4 the operands are
// widened to double (where the products and sum are exact enough to
// round once) and narrowed back.
static inline void fma(VectorType &v1, VTArg v2, VTArg v3) {
#ifdef Vc_IMPL_FMA4
v1 = _mm256_macc_ps(v1, v2, v3);
#else
__m256d v1_0 = _mm256_cvtps_pd(lo128(v1));
__m256d v1_1 = _mm256_cvtps_pd(hi128(v1));
__m256d v2_0 = _mm256_cvtps_pd(lo128(v2));
__m256d v2_1 = _mm256_cvtps_pd(hi128(v2));
__m256d v3_0 = _mm256_cvtps_pd(lo128(v3));
__m256d v3_1 = _mm256_cvtps_pd(hi128(v3));
v1 = AVX::concat(
_mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_0, v2_0), v3_0)),
_mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_1, v2_1), v3_1)));
#endif
}
static Vc_INTRINSIC VectorType Vc_CONST add(VTArg a, VTArg b) { return _mm256_add_ps(a, b); }
static Vc_INTRINSIC VectorType Vc_CONST sub(VTArg a, VTArg b) { return _mm256_sub_ps(a, b); }
static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm256_mul_ps(a, b); }
Vc_OP1(sqrt) Vc_OP1(rsqrt)
// Fast approximate reciprocal (hardware rcp, ~12 bits of precision).
static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VTArg x) {
return _mm256_rcp_ps(x);
}
// |a|: clear the sign bit of every lane.
static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(VTArg a) {
return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(a, setabsmask_ps());
}
static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return _mm256_min_ps(a, b); }
static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { return _mm256_max_ps(a, b); }
// Horizontal reductions: fold 256 -> 128 bits, then 4 lanes -> 2 -> 1.
static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VTArg a) {
__m128 b = _mm_min_ps(lo128(a), hi128(a));
b = _mm_min_ps(b, _mm_movehl_ps(b, b));
b = _mm_min_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1)));
return _mm_cvtss_f32(b);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VTArg a) {
__m128 b = _mm_max_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1));
b = _mm_max_ps(b, _mm_movehl_ps(b, b));
b = _mm_max_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1)));
return _mm_cvtss_f32(b);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VTArg a) {
__m128 b = _mm_mul_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1));
b = _mm_mul_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3)));
b = _mm_mul_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 2, 0, 1)));
return _mm_cvtss_f32(b);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VTArg a) {
__m128 b = _mm_add_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1));
b = _mm_add_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3)));
b = _mm_add_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 2, 0, 1)));
return _mm_cvtss_f32(b);
}
#undef Vc_SUFFIX
// Round to nearest integer (ties per the _MM_FROUND_NINT rounding mode).
static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) {
return _mm256_round_ps(a, _MM_FROUND_NINT);
}
};
- #undef Vc_OP1
- #undef Vc_OP
- #undef Vc_OP_
- #undef Vc_OPx
- }
- }
- #endif
- #ifndef VC_AVX_MASK_H_
- #define VC_AVX_MASK_H_
- #include <array>
- #ifndef VC_AVX_DETAIL_H_
- #define VC_AVX_DETAIL_H_
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace Detail
- {
// Same-type loads for float and double vectors. The Flags SFINAE selects
// one overload per load policy: aligned, unaligned, or streaming
// (non-temporal). The LoadTag parameter carries destination vector/scalar
// types for overload resolution only.
template <typename Flags>
Vc_INTRINSIC Vc_PURE __m256 load(const float *x, Flags, LoadTag<__m256, float>,
typename Flags::EnableIfAligned = nullptr)
{
return _mm256_load_ps(x);
}
template <typename Flags>
Vc_INTRINSIC Vc_PURE __m256 load(const float *x, Flags, LoadTag<__m256, float>,
typename Flags::EnableIfUnaligned = nullptr)
{
return _mm256_loadu_ps(x);
}
template <typename Flags>
Vc_INTRINSIC Vc_PURE __m256 load(const float *x, Flags, LoadTag<__m256, float>,
typename Flags::EnableIfStreaming = nullptr)
{
return AvxIntrinsics::stream_load<__m256>(x);
}
template <typename Flags>
Vc_INTRINSIC Vc_PURE __m256d load(const double *x, Flags, LoadTag<__m256d, double>,
typename Flags::EnableIfAligned = nullptr)
{
return _mm256_load_pd(x);
}
template <typename Flags>
Vc_INTRINSIC Vc_PURE __m256d load(const double *x, Flags, LoadTag<__m256d, double>,
typename Flags::EnableIfUnaligned = nullptr)
{
return _mm256_loadu_pd(x);
}
template <typename Flags>
Vc_INTRINSIC Vc_PURE __m256d load(const double *x, Flags, LoadTag<__m256d, double>,
typename Flags::EnableIfStreaming = nullptr)
{
return AvxIntrinsics::stream_load<__m256d>(x);
}
// Same-width loads into __m256i for any integral scalar type T; the three
// overloads cover aligned / unaligned / streaming policies.
template <typename Flags, typename T, typename = enable_if<std::is_integral<T>::value>>
Vc_INTRINSIC Vc_PURE __m256i
load(const T *x, Flags, LoadTag<__m256i, T>, typename Flags::EnableIfAligned = nullptr)
{
return _mm256_load_si256(reinterpret_cast<const __m256i *>(x));
}
template <typename Flags, typename T, typename = enable_if<std::is_integral<T>::value>>
Vc_INTRINSIC Vc_PURE __m256i
load(const T *x, Flags, LoadTag<__m256i, T>, typename Flags::EnableIfUnaligned = nullptr)
{
return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(x));
}
template <typename Flags, typename T, typename = enable_if<std::is_integral<T>::value>>
Vc_INTRINSIC Vc_PURE __m256i
load(const T *x, Flags, LoadTag<__m256i, T>, typename Flags::EnableIfStreaming = nullptr)
{
return AvxIntrinsics::stream_load<__m256i>(x);
}
// Raw 32-byte loads. The when_aligned / when_unaligned / when_streaming tag
// argument selects the policy; no type conversion is performed.
Vc_INTRINSIC __m256 load32(const float *mem, when_aligned)
{
return _mm256_load_ps(mem);
}
Vc_INTRINSIC __m256 load32(const float *mem, when_unaligned)
{
return _mm256_loadu_ps(mem);
}
Vc_INTRINSIC __m256 load32(const float *mem, when_streaming)
{
return AvxIntrinsics::stream_load<__m256>(mem);
}
Vc_INTRINSIC __m256d load32(const double *mem, when_aligned)
{
return _mm256_load_pd(mem);
}
Vc_INTRINSIC __m256d load32(const double *mem, when_unaligned)
{
return _mm256_loadu_pd(mem);
}
Vc_INTRINSIC __m256d load32(const double *mem, when_streaming)
{
return AvxIntrinsics::stream_load<__m256d>(mem);
}
// Integral variants: T only fixes the pointer type; 32 bytes are loaded verbatim.
template <class T> Vc_INTRINSIC __m256i load32(const T *mem, when_aligned)
{
static_assert(std::is_integral<T>::value, "load32<T> is only intended for integral T");
return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
}
template <class T> Vc_INTRINSIC __m256i load32(const T *mem, when_unaligned)
{
static_assert(std::is_integral<T>::value, "load32<T> is only intended for integral T");
return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mem));
}
template <class T> Vc_INTRINSIC __m256i load32(const T *mem, when_streaming)
{
static_assert(std::is_integral<T>::value, "load32<T> is only intended for integral T");
return AvxIntrinsics::stream_load<__m256i>(mem);
}
// MSVC-only overloads. NOTE(review): these appear to work around MSVC
// overload-resolution/SFINAE limitations with the generic load templates
// above by spelling out each (pointer type, policy, destination) case
// explicitly — confirm against the compiler versions still supported.
#ifdef Vc_MSVC
Vc_INTRINSIC __m256i load(const uint *mem, when_aligned, LoadTag<__m256i, int>)
{
return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
}
Vc_INTRINSIC __m256d load(const double *mem, when_unaligned, LoadTag<__m256d, double>)
{
return _mm256_loadu_pd(mem);
}
// float -> __m256, all three policies
template <typename V, typename DstT>
Vc_INTRINSIC __m256 load(const float *mem, when_aligned,
enable_if<(std::is_same<DstT, float>::value &&
std::is_same<V, __m256>::value)> = nullarg)
{
return _mm256_load_ps(mem);
}
template <typename V, typename DstT>
Vc_INTRINSIC __m256 load(const float *mem, when_unaligned,
enable_if<(std::is_same<DstT, float>::value &&
std::is_same<V, __m256>::value)> = nullarg)
{
return _mm256_loadu_ps(mem);
}
template <typename V, typename DstT>
Vc_INTRINSIC __m256 load(const float *mem, when_streaming,
enable_if<(std::is_same<DstT, float>::value &&
std::is_same<V, __m256>::value)> = nullarg)
{
return AvxIntrinsics::stream_load<__m256>(mem);
}
// double -> __m256d, all three policies
template <typename V, typename DstT>
Vc_INTRINSIC __m256d load(const double *mem, when_aligned,
enable_if<(std::is_same<DstT, double>::value &&
std::is_same<V, __m256d>::value)> = nullarg)
{
return _mm256_load_pd(mem);
}
template <typename V, typename DstT>
Vc_INTRINSIC __m256d load(const double *mem, when_unaligned,
enable_if<(std::is_same<DstT, double>::value &&
std::is_same<V, __m256d>::value)> = nullarg)
{
return _mm256_loadu_pd(mem);
}
template <typename V, typename DstT>
Vc_INTRINSIC __m256d load(const double *mem, when_streaming,
enable_if<(std::is_same<DstT, double>::value &&
std::is_same<V, __m256d>::value)> = nullarg)
{
return AvxIntrinsics::stream_load<__m256d>(mem);
}
// uint -> __m256i, all three policies
template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const uint *mem, when_aligned,
enable_if<(std::is_same<DstT, uint>::value &&
std::is_same<V, __m256i>::value)> = nullarg)
{
return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
}
template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const uint *mem, when_unaligned,
enable_if<(std::is_same<DstT, uint>::value &&
std::is_same<V, __m256i>::value)> = nullarg)
{
return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mem));
}
template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const uint *mem, when_streaming,
enable_if<(std::is_same<DstT, uint>::value &&
std::is_same<V, __m256i>::value)> = nullarg)
{
return AvxIntrinsics::stream_load<__m256i>(mem);
}
// int -> __m256i, all three policies
template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const int *mem, when_unaligned,
enable_if<(std::is_same<DstT, int>::value &&
std::is_same<V, __m256i>::value)> = nullarg)
{
return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mem));
}
template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const int *mem, when_aligned,
enable_if<(std::is_same<DstT, int>::value &&
std::is_same<V, __m256i>::value)> = nullarg)
{
return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
}
template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const int *mem, when_streaming,
enable_if<(std::is_same<DstT, int>::value &&
std::is_same<V, __m256i>::value)> = nullarg)
{
return AvxIntrinsics::stream_load<__m256i>(mem);
}
// short -> __m256i, all three policies
template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const short *mem, when_unaligned,
enable_if<(std::is_same<DstT, short>::value &&
std::is_same<V, __m256i>::value)> = nullarg)
{
return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mem));
}
template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const short *mem, when_aligned,
enable_if<(std::is_same<DstT, short>::value &&
std::is_same<V, __m256i>::value)> = nullarg)
{
return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
}
template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const short *mem, when_streaming,
enable_if<(std::is_same<DstT, short>::value &&
std::is_same<V, __m256i>::value)> = nullarg)
{
return AvxIntrinsics::stream_load<__m256i>(mem);
}
// ushort -> __m256i, all three policies
template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const ushort *mem, when_unaligned,
enable_if<(std::is_same<DstT, ushort>::value &&
std::is_same<V, __m256i>::value)> = nullarg)
{
return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mem));
}
template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const ushort *mem, when_aligned,
enable_if<(std::is_same<DstT, ushort>::value &&
std::is_same<V, __m256i>::value)> = nullarg)
{
return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
}
template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const ushort *mem, when_streaming,
enable_if<(std::is_same<DstT, ushort>::value &&
std::is_same<V, __m256i>::value)> = nullarg)
{
return AvxIntrinsics::stream_load<__m256i>(mem);
}
#endif
// Converting loads into integer vectors: the memory scalar type differs
// from the destination lane type, so each load widens (sign- or
// zero-extends) as needed. load16 reads 16 bytes, load32 reads 32 bytes.
template <typename Flags>
Vc_INTRINSIC __m256i load(const ushort *mem, Flags f, LoadTag<__m256i, short>)
{
// same width: plain 32-byte load, lanes reinterpreted as signed
return load32(mem, f);
}
template <typename Flags>
Vc_INTRINSIC __m256i load(const uchar *mem, Flags f, LoadTag<__m256i, short>)
{
return AVX::cvtepu8_epi16(load16(mem, f));
}
template <typename Flags>
Vc_INTRINSIC __m256i load(const schar *mem, Flags f, LoadTag<__m256i, short>)
{
return AVX::cvtepi8_epi16(load16(mem, f));
}
template <typename Flags>
Vc_INTRINSIC __m256i load(const uchar *mem, Flags f, LoadTag<__m256i, ushort>)
{
return AVX::cvtepu8_epi16(load16(mem, f));
}
template <typename Flags>
Vc_INTRINSIC __m256i load(const uint *mem, Flags f, LoadTag<__m256i, int>)
{
// same width: plain 32-byte load, lanes reinterpreted as signed
return load32(mem, f);
}
template <typename Flags>
Vc_INTRINSIC __m256i load(const ushort *mem, Flags f, LoadTag<__m256i, int>)
{
return AVX::cvtepu16_epi32(load16(mem, f));
}
template <typename Flags>
Vc_INTRINSIC __m256i load(const short *mem, Flags f, LoadTag<__m256i, int>)
{
return AVX::cvtepi16_epi32(load16(mem, f));
}
// 8-bit -> 32-bit needs only 8 source bytes, hence the 64-bit load.
template <typename Flags>
Vc_INTRINSIC __m256i load(const uchar *mem, Flags, LoadTag<__m256i, int>)
{
return AVX::cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m256i load(const schar *mem, Flags, LoadTag<__m256i, int>)
{
return AVX::cvtepi8_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m256i load(const ushort *mem, Flags f, LoadTag<__m256i, uint>)
{
return AVX::cvtepu16_epi32(load16(mem, f));
}
template <typename Flags>
Vc_INTRINSIC __m256i load(const uchar *mem, Flags, LoadTag<__m256i, uint>)
{
return AVX::cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}
// Converting loads into __m256d (4 doubles) from 32-bit sources:
// load16 reads 16 bytes = four source values, which convert<Src, double> widens.
template <typename Flags>
Vc_INTRINSIC __m256d load(const float *mem, Flags f, LoadTag<__m256d, double>)
{
return AVX::convert<float, double>(load16(mem, f));
}
template <typename Flags>
Vc_INTRINSIC __m256d load(const uint *mem, Flags f, LoadTag<__m256d, double>)
{
return AVX::convert<uint, double>(load16(mem, f));
}
template <typename Flags>
Vc_INTRINSIC __m256d load(const int *mem, Flags f, LoadTag<__m256d, double>)
{
return AVX::convert<int, double>(load16(mem, f));
}
- template <typename Flags>
- Vc_INTRINSIC __m256d load(const ushort *mem, Flags f, LoadTag<__m256d, double>)
- {
- return AVX::convert<int, double>(load16(mem, f));
- }
- template <typename Flags>
- Vc_INTRINSIC __m256d load(const short *mem, Flags f, LoadTag<__m256d, double>)
- {
- return AVX::convert<int, double>(load16(mem, f));
- }
- template <typename Flags>
- Vc_INTRINSIC __m256d load(const uchar *mem, Flags f, LoadTag<__m256d, double>)
- {
- return AVX::convert<int, double>(load16(mem, f));
- }
- template <typename Flags>
- Vc_INTRINSIC __m256d load(const schar *mem, Flags f, LoadTag<__m256d, double>)
- {
- return AVX::convert<int, double>(load16(mem, f));
- }
// Converting loads into __m256 (8 floats).
template <typename Flags>
Vc_INTRINSIC __m256 load(const double *mem, Flags f, LoadTag<__m256, float>)
{
// two 4-double loads, each narrowed to 4 floats, then concatenated
return AVX::concat(_mm256_cvtpd_ps(load32(&mem[0], f)),
_mm256_cvtpd_ps(load32(&mem[4], f)));
}
template <typename Flags>
Vc_INTRINSIC __m256 load(const uint *mem, Flags f, LoadTag<__m256, float>)
{
// _mm256_cvtepi32_ps is a SIGNED conversion; lanes with the top bit set
// are recomputed via (v - 2^31) + 2^31.0 and blended in where v < 0.
const auto v = load32(mem, f);
return _mm256_blendv_ps(
_mm256_cvtepi32_ps(v),
_mm256_add_ps(_mm256_cvtepi32_ps(AVX::sub_epi32(v, AVX::set2power31_epu32())),
AVX::set2power31_ps()),
_mm256_castsi256_ps(AVX::cmplt_epi32(v, _mm256_setzero_si256())));
}
template <typename Flags>
Vc_INTRINSIC __m256 load(const int *mem, Flags f, LoadTag<__m256, float>)
{
return AVX::convert<int, float>(load32(mem, f));
}
// Fallback for other integral sources: widen to int lanes first, then convert.
template <typename T, typename Flags,
typename = enable_if<!std::is_same<T, float>::value>>
Vc_INTRINSIC __m256 load(const T *mem, Flags f, LoadTag<__m256, float>)
{
return _mm256_cvtepi32_ps(load<__m256i, int>(mem, f));
}
template <typename Flags>
Vc_INTRINSIC __m256 load(const ushort *mem, Flags f, LoadTag<__m256, float>)
{
return AVX::convert<ushort, float>(load16(mem, f));
}
template <typename Flags>
Vc_INTRINSIC __m256 load(const short *mem, Flags f, LoadTag<__m256, float>)
{
return AVX::convert<short, float>(load16(mem, f));
}
// Shift a 32-byte vector by `amount` bytes toward lower addresses
// (positive amount) or higher addresses (negative amount), filling with
// zeros. AVX1 has no full-width byte shift, so the four cases combine
// 128-bit-lane shifts, alignr, and lane permutes.
// |amount| >= 16: only one 128-bit half survives; shift it and zero-extend.
template <int amount, typename T>
Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 32 && amount >= 16), T> shifted(T k)
{
return AVX::avx_cast<T>(AVX::zeroExtend(
_mm_srli_si128(AVX::hi128(AVX::avx_cast<__m256i>(k)), amount - 16)));
}
// 0 < amount < 16: bytes cross the lane boundary; alignr with the high
// lane moved down (and zeroed above) supplies them.
template <int amount, typename T>
Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 32 && amount > 0 && amount < 16), T>
shifted(T k)
{
return AVX::avx_cast<T>(
AVX::alignr<amount>(Mem::permute128<X1, Const0>(AVX::avx_cast<__m256i>(k)),
AVX::avx_cast<__m256i>(k)));
}
// amount <= -16: shift the low half up and move it into the high lane.
template <int amount, typename T>
Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 32 && amount <= -16), T> shifted(T k)
{
return AVX::avx_cast<T>(Mem::permute128<Const0, X0>(AVX::avx_cast<__m256i>(
_mm_slli_si128(AVX::lo128(AVX::avx_cast<__m256i>(k)), -16 - amount))));
}
// -16 < amount < 0: alignr against a zero-high copy shifted into place.
template <int amount, typename T>
Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 32 && amount > -16 && amount < 0), T>
shifted(T k)
{
return AVX::avx_cast<T>(
AVX::alignr<16 + amount>(AVX::avx_cast<__m256i>(k),
Mem::permute128<Const0, X0>(AVX::avx_cast<__m256i>(k))));
}
// mask_cast<From, To, R>: convert a mask of From entries to a mask of To
// entries, returned as register type R. Each specialization packs or
// unpacks the per-lane all-ones/all-zeros patterns accordingly. The
// primary template is the identity case (From == To, R == __m256).
template<size_t From, size_t To, typename R> Vc_INTRINSIC Vc_CONST R mask_cast(__m256i k)
{
static_assert(From == To, "Incorrect mask cast.");
static_assert(std::is_same<R, __m256>::value, "Incorrect mask cast.");
return AVX::avx_cast<__m256>(k);
}
// 4 x 64-bit lanes -> 4 x 32-bit lanes (narrow into a __m128)
template <> Vc_INTRINSIC Vc_CONST __m128 mask_cast<4, 4, __m128>(__m256i k)
{
return AVX::avx_cast<__m128>(_mm_packs_epi32(AVX::lo128(k), AVX::hi128(k)));
}
// 4 x 32-bit lanes -> 4 x 64-bit lanes (widen by duplicating each lane)
template <> Vc_INTRINSIC Vc_CONST __m256 mask_cast<4, 4, __m256>(__m128i k)
{
const auto kk = _mm_castsi128_ps(k);
return AVX::concat(_mm_unpacklo_ps(kk, kk), _mm_unpackhi_ps(kk, kk));
}
// 4-entry mask -> 8-entry mask: valid lanes first, upper lanes zeroed
template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<4, 8, __m256>(__m256i k)
{
return AVX::avx_cast<__m256>(AVX::concat(_mm_packs_epi32(AVX::lo128(k), AVX::hi128(k)),
_mm_setzero_si128()));
}
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<4, 8, __m128>(__m256i k)
{
return AVX::avx_cast<__m128>(_mm_packs_epi16(_mm_packs_epi32(AVX::lo128(k), AVX::hi128(k)), _mm_setzero_si128()));
}
template <> Vc_INTRINSIC Vc_CONST __m256 mask_cast<4, 8, __m256>(__m128i k)
{
return AVX::zeroExtend(AVX::avx_cast<__m128>(k));
}
template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<4, 16, __m256>(__m256i k)
{
return AVX::zeroExtend(mask_cast<4, 8, __m128>(k));
}
// 8-entry mask -> 4-entry mask: keep the low four entries, widen each
template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<8, 4, __m256>(__m256i k)
{
const auto lo = AVX::lo128(AVX::avx_cast<__m256>(k));
return AVX::concat(_mm_unpacklo_ps(lo, lo),
_mm_unpackhi_ps(lo, lo));
}
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<8, 4, __m128>(__m256i k)
{
return AVX::avx_cast<__m128>(AVX::lo128(k));
}
template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<8, 4, __m256>(__m128i k)
{
const auto tmp = _mm_unpacklo_epi16(k, k);
return AVX::avx_cast<__m256>(AVX::concat(_mm_unpacklo_epi32(tmp, tmp),
_mm_unpackhi_epi32(tmp, tmp)));
}
// 8 x 32-bit lanes <-> 8 x 16-bit lanes
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<8, 8, __m128>(__m256i k)
{
return AVX::avx_cast<__m128>(_mm_packs_epi16(AVX::lo128(k), AVX::hi128(k)));
}
template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<8, 8, __m256>(__m128i k)
{
return AVX::avx_cast<__m256>(AVX::concat(_mm_unpacklo_epi16(k, k),
_mm_unpackhi_epi16(k, k)));
}
template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<8, 16, __m256>(__m256i k)
{
return AVX::zeroExtend(mask_cast<8, 8, __m128>(k));
}
#ifdef Vc_IMPL_AVX2
// 16 x 16-bit lanes -> 8 x 32-bit lanes: reorder 64-bit quarters so the
// low 8 entries are contiguous, then widen by duplication.
template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<16, 8, __m256>(__m256i k)
{
const auto flipped = Mem::permute4x64<X0, X2, X1, X3>(k);
return _mm256_castsi256_ps(AVX::unpacklo_epi16(flipped, flipped));
}
#endif
// 16 x 16-bit lanes -> 4 x 64-bit lanes: widen the low four entries
template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<16, 4, __m256>(__m256i k)
{
const auto tmp = _mm_unpacklo_epi16(AVX::lo128(k), AVX::lo128(k));
return _mm256_castsi256_ps(AVX::concat(_mm_unpacklo_epi32(tmp, tmp), _mm_unpackhi_epi32(tmp, tmp)));
}
// Constant generators: all-bits-set, all-zero, and the value 1 in every
// lane for each element type (the scalar argument only selects the overload).
template<> Vc_INTRINSIC Vc_CONST __m256 allone<__m256 >() { return AVX::setallone_ps(); }
template<> Vc_INTRINSIC Vc_CONST __m256i allone<__m256i>() { return AVX::setallone_si256(); }
template<> Vc_INTRINSIC Vc_CONST __m256d allone<__m256d>() { return AVX::setallone_pd(); }
template<> Vc_INTRINSIC Vc_CONST __m256 zero<__m256 >() { return _mm256_setzero_ps(); }
template<> Vc_INTRINSIC Vc_CONST __m256i zero<__m256i>() { return _mm256_setzero_si256(); }
template<> Vc_INTRINSIC Vc_CONST __m256d zero<__m256d>() { return _mm256_setzero_pd(); }
Vc_INTRINSIC Vc_CONST __m256 one( float) { return AVX::setone_ps (); }
Vc_INTRINSIC Vc_CONST __m256d one(double) { return AVX::setone_pd (); }
Vc_INTRINSIC Vc_CONST __m256i one( int) { return AVX::setone_epi32(); }
Vc_INTRINSIC Vc_CONST __m256i one( uint) { return AVX::setone_epu32(); }
Vc_INTRINSIC Vc_CONST __m256i one( short) { return AVX::setone_epi16(); }
Vc_INTRINSIC Vc_CONST __m256i one(ushort) { return AVX::setone_epu16(); }
Vc_INTRINSIC Vc_CONST __m256i one( schar) { return AVX::setone_epi8 (); }
Vc_INTRINSIC Vc_CONST __m256i one( uchar) { return AVX::setone_epu8 (); }
// Lane-wise negation. The integral_constant argument carries sizeof(EntryType)
// to pick the lane width. Floats flip the sign bit; integers use sign_epi*
// with an all-ones (i.e. -1) second operand, which multiplies each lane by -1.
Vc_ALWAYS_INLINE Vc_CONST __m256 negate(__m256 v, std::integral_constant<std::size_t, 4>)
{
return _mm256_xor_ps(v, AVX::setsignmask_ps());
}
Vc_ALWAYS_INLINE Vc_CONST __m256d negate(__m256d v, std::integral_constant<std::size_t, 8>)
{
return _mm256_xor_pd(v, AVX::setsignmask_pd());
}
Vc_ALWAYS_INLINE Vc_CONST __m256i negate(__m256i v, std::integral_constant<std::size_t, 4>)
{
return AVX::sign_epi32(v, Detail::allone<__m256i>());
}
Vc_ALWAYS_INLINE Vc_CONST __m256i negate(__m256i v, std::integral_constant<std::size_t, 2>)
{
return AVX::sign_epi16(v, Detail::allone<__m256i>());
}
// Bitwise operations. AVX1 has no 256-bit integer logic instructions, so
// without AVX2 the __m256i variants bounce through the float domain
// (bit patterns are preserved; only the execution domain changes).
Vc_INTRINSIC __m256 xor_(__m256 a, __m256 b) { return _mm256_xor_ps(a, b); }
Vc_INTRINSIC __m256d xor_(__m256d a, __m256d b) { return _mm256_xor_pd(a, b); }
Vc_INTRINSIC __m256i xor_(__m256i a, __m256i b)
{
#ifdef Vc_IMPL_AVX2
return _mm256_xor_si256(a, b);
#else
return _mm256_castps_si256(
_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
#endif
}
Vc_INTRINSIC __m256 or_(__m256 a, __m256 b) { return _mm256_or_ps(a, b); }
Vc_INTRINSIC __m256d or_(__m256d a, __m256d b) { return _mm256_or_pd(a, b); }
Vc_INTRINSIC __m256i or_(__m256i a, __m256i b)
{
#ifdef Vc_IMPL_AVX2
return _mm256_or_si256(a, b);
#else
return _mm256_castps_si256(
_mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
#endif
}
Vc_INTRINSIC __m256 and_(__m256 a, __m256 b) { return _mm256_and_ps(a, b); }
Vc_INTRINSIC __m256d and_(__m256d a, __m256d b) { return _mm256_and_pd(a, b); }
Vc_INTRINSIC __m256i and_(__m256i a, __m256i b) {
#ifdef Vc_IMPL_AVX2
return _mm256_and_si256(a, b);
#else
return _mm256_castps_si256(
_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
#endif
}
// andnot_(a, b) computes (~a) & b, matching the andnot intrinsics.
Vc_INTRINSIC __m256 andnot_(__m256 a, __m256 b) { return _mm256_andnot_ps(a, b); }
Vc_INTRINSIC __m256d andnot_(__m256d a, __m256d b) { return _mm256_andnot_pd(a, b); }
Vc_INTRINSIC __m256i andnot_(__m256i a, __m256i b)
{
#ifdef Vc_IMPL_AVX2
return _mm256_andnot_si256(a, b);
#else
return _mm256_castps_si256(
_mm256_andnot_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
#endif
}
// not_(a) = ~a, via (~a) & all-ones
Vc_INTRINSIC __m256 not_(__m256 a) { return andnot_(a, allone<__m256 >()); }
Vc_INTRINSIC __m256d not_(__m256d a) { return andnot_(a, allone<__m256d>()); }
Vc_INTRINSIC __m256i not_(__m256i a) { return andnot_(a, allone<__m256i>()); }
// blend(a, b, c): per lane, select b where mask c is set, else a.
Vc_INTRINSIC __m256 blend(__m256 a, __m256 b, __m256 c) { return _mm256_blendv_ps(a, b, c); }
Vc_INTRINSIC __m256d blend(__m256d a, __m256d b, __m256d c) { return _mm256_blendv_pd(a, b, c); }
Vc_INTRINSIC __m256i blend(__m256i a, __m256i b, __m256i c) { return AVX::blendv_epi8(a, b, c); }
// abs: the trailing scalar argument only selects the lane type;
// unsigned types are returned unchanged.
Vc_INTRINSIC __m256 abs(__m256 a, float) { return and_(a, AVX::setabsmask_ps()); }
Vc_INTRINSIC __m256d abs(__m256d a, double) { return and_(a, AVX::setabsmask_pd()); }
Vc_INTRINSIC __m256i abs(__m256i a, int) { return AVX::abs_epi32(a); }
Vc_INTRINSIC __m256i abs(__m256i a, uint) { return a; }
Vc_INTRINSIC __m256i abs(__m256i a, short) { return AVX::abs_epi16(a); }
Vc_INTRINSIC __m256i abs(__m256i a, ushort) { return a; }
Vc_INTRINSIC __m256i abs(__m256i a, schar) { return AVX::abs_epi8 (a); }
Vc_INTRINSIC __m256i abs(__m256i a, uchar) { return a; }
// Element-wise add/sub/mul; the trailing scalar argument selects the lane
// type. Signed and unsigned share the same instruction (wrap-around
// semantics); 32-bit integer mul keeps only the low 32 bits (mullo).
Vc_INTRINSIC __m256 add(__m256 a, __m256 b, float) { return _mm256_add_ps(a, b); }
Vc_INTRINSIC __m256d add(__m256d a, __m256d b, double) { return _mm256_add_pd(a, b); }
Vc_INTRINSIC __m256i add(__m256i a, __m256i b, int) { return AVX::add_epi32(a, b); }
Vc_INTRINSIC __m256i add(__m256i a, __m256i b, uint) { return AVX::add_epi32(a, b); }
Vc_INTRINSIC __m256i add(__m256i a, __m256i b, short) { return AVX::add_epi16(a, b); }
Vc_INTRINSIC __m256i add(__m256i a, __m256i b, ushort) { return AVX::add_epi16(a, b); }
Vc_INTRINSIC __m256 sub(__m256 a, __m256 b, float) { return _mm256_sub_ps(a, b); }
Vc_INTRINSIC __m256d sub(__m256d a, __m256d b, double) { return _mm256_sub_pd(a, b); }
Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, int) { return AVX::sub_epi32(a, b); }
Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, uint) { return AVX::sub_epi32(a, b); }
Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, short) { return AVX::sub_epi16(a, b); }
Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, ushort) { return AVX::sub_epi16(a, b); }
Vc_INTRINSIC __m256 mul(__m256 a, __m256 b, float) { return _mm256_mul_ps(a, b); }
Vc_INTRINSIC __m256d mul(__m256d a, __m256d b, double) { return _mm256_mul_pd(a, b); }
Vc_INTRINSIC __m256i mul(__m256i a, __m256i b, int) { return AVX::mullo_epi32(a, b); }
Vc_INTRINSIC __m256i mul(__m256i a, __m256i b, uint) { return AVX::mullo_epi32(a, b); }
Vc_INTRINSIC __m256i mul(__m256i a, __m256i b, short) { return AVX::mullo_epi16(a, b); }
Vc_INTRINSIC __m256i mul(__m256i a, __m256i b, ushort) { return AVX::mullo_epi16(a, b); }
- Vc_INTRINSIC __m256 div(__m256 a, __m256 b, float) { return _mm256_div_ps(a, b); }
- Vc_INTRINSIC __m256d div(__m256d a, __m256d b, double) { return _mm256_div_pd(a, b); }
- Vc_INTRINSIC __m256i div(__m256i a, __m256i b, int) {
- using namespace AVX;
- const __m256d lo1 = _mm256_cvtepi32_pd(lo128(a));
- const __m256d lo2 = _mm256_cvtepi32_pd(lo128(b));
- const __m256d hi1 = _mm256_cvtepi32_pd(hi128(a));
- const __m256d hi2 = _mm256_cvtepi32_pd(hi128(b));
- return concat(_mm256_cvttpd_epi32(_mm256_div_pd(lo1, lo2)),
- _mm256_cvttpd_epi32(_mm256_div_pd(hi1, hi2)));
- }
- Vc_INTRINSIC __m256i div(__m256i a, __m256i b, uint) {
- using namespace AVX;
- const __m256i aa = add_epi32(a, set1_epi32(-2147483648));
- const __m256i bb = add_epi32(b, set1_epi32(-2147483648));
- const __m256d loa = _mm256_add_pd(_mm256_cvtepi32_pd(lo128(aa)), set1_pd(2147483648.));
- const __m256d hia = _mm256_add_pd(_mm256_cvtepi32_pd(hi128(aa)), set1_pd(2147483648.));
- const __m256d lob = _mm256_add_pd(_mm256_cvtepi32_pd(lo128(bb)), set1_pd(2147483648.));
- const __m256d hib = _mm256_add_pd(_mm256_cvtepi32_pd(hi128(bb)), set1_pd(2147483648.));
- return avx_cast<__m256i>(_mm256_blendv_ps(
- avx_cast<__m256>(concat(_mm256_cvttpd_epi32(_mm256_div_pd(loa, lob)),
- _mm256_cvttpd_epi32(_mm256_div_pd(hia, hib)))),
- avx_cast<__m256>(a),
- avx_cast<__m256>(cmpeq_epi32(b, setone_epi32()))));
- }
- Vc_INTRINSIC __m256i div(__m256i a, __m256i b, short) {
- using namespace AVX;
- const __m256 lo =
- _mm256_div_ps(convert<short, float>(lo128(a)), convert<short, float>(lo128(b)));
- const __m256 hi =
- _mm256_div_ps(convert<short, float>(hi128(a)), convert<short, float>(hi128(b)));
- return concat(convert<float, short>(lo), convert<float, short>(hi));
- }
- template <typename T> Vc_INTRINSIC T add(Common::IntrinsicType<T, 32 / sizeof(T)> a, T)
- {
- return {add(add(AVX::lo128(a), AVX::hi128(a), T()), T())};
- }
- template <typename T> Vc_INTRINSIC T mul(Common::IntrinsicType<T, 32 / sizeof(T)> a, T)
- {
- return {mul(mul(AVX::lo128(a), AVX::hi128(a), T()), T())};
- }
- template <typename T> Vc_INTRINSIC T min(Common::IntrinsicType<T, 32 / sizeof(T)> a, T)
- {
- return {min(min(AVX::lo128(a), AVX::hi128(a), T()), T())};
- }
- template <typename T> Vc_INTRINSIC T max(Common::IntrinsicType<T, 32 / sizeof(T)> a, T)
- {
- return {max(max(AVX::lo128(a), AVX::hi128(a), T()), T())};
- }
- Vc_INTRINSIC __m256 cmpeq(__m256 a, __m256 b, float) { return AvxIntrinsics::cmpeq_ps(a, b); }
- Vc_INTRINSIC __m256d cmpeq(__m256d a, __m256d b, double) { return AvxIntrinsics::cmpeq_pd(a, b); }
- Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b, int) { return AvxIntrinsics::cmpeq_epi32(a, b); }
- Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b, uint) { return AvxIntrinsics::cmpeq_epi32(a, b); }
- Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b, short) { return AvxIntrinsics::cmpeq_epi16(a, b); }
- Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b, ushort) { return AvxIntrinsics::cmpeq_epi16(a, b); }
- Vc_INTRINSIC __m256 cmpneq(__m256 a, __m256 b, float) { return AvxIntrinsics::cmpneq_ps(a, b); }
- Vc_INTRINSIC __m256d cmpneq(__m256d a, __m256d b, double) { return AvxIntrinsics::cmpneq_pd(a, b); }
- Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, int) { return not_(AvxIntrinsics::cmpeq_epi32(a, b)); }
- Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, uint) { return not_(AvxIntrinsics::cmpeq_epi32(a, b)); }
- Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, short) { return not_(AvxIntrinsics::cmpeq_epi16(a, b)); }
- Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, ushort) { return not_(AvxIntrinsics::cmpeq_epi16(a, b)); }
- Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, schar) { return not_(AvxIntrinsics::cmpeq_epi8 (a, b)); }
- Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, uchar) { return not_(AvxIntrinsics::cmpeq_epi8 (a, b)); }
- Vc_INTRINSIC __m256 cmpgt(__m256 a, __m256 b, float) { return AVX::cmpgt_ps(a, b); }
- Vc_INTRINSIC __m256d cmpgt(__m256d a, __m256d b, double) { return AVX::cmpgt_pd(a, b); }
- Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, int) { return AVX::cmpgt_epi32(a, b); }
- Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, uint) { return AVX::cmpgt_epu32(a, b); }
- Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, short) { return AVX::cmpgt_epi16(a, b); }
- Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, ushort) { return AVX::cmpgt_epu16(a, b); }
- Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, schar) { return AVX::cmpgt_epi8 (a, b); }
- Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, uchar) { return AVX::cmpgt_epu8 (a, b); }
- Vc_INTRINSIC __m256 cmpge(__m256 a, __m256 b, float) { return AVX::cmpge_ps(a, b); }
- Vc_INTRINSIC __m256d cmpge(__m256d a, __m256d b, double) { return AVX::cmpge_pd(a, b); }
- Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, int) { return not_(AVX::cmpgt_epi32(b, a)); }
- Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, uint) { return not_(AVX::cmpgt_epu32(b, a)); }
- Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, short) { return not_(AVX::cmpgt_epi16(b, a)); }
- Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, ushort) { return not_(AVX::cmpgt_epu16(b, a)); }
- Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, schar) { return not_(AVX::cmpgt_epi8 (b, a)); }
- Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, uchar) { return not_(AVX::cmpgt_epu8 (b, a)); }
- Vc_INTRINSIC __m256 cmple(__m256 a, __m256 b, float) { return AVX::cmple_ps(a, b); }
- Vc_INTRINSIC __m256d cmple(__m256d a, __m256d b, double) { return AVX::cmple_pd(a, b); }
- Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, int) { return not_(AVX::cmpgt_epi32(a, b)); }
- Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, uint) { return not_(AVX::cmpgt_epu32(a, b)); }
- Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, short) { return not_(AVX::cmpgt_epi16(a, b)); }
- Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, ushort) { return not_(AVX::cmpgt_epu16(a, b)); }
- Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, schar) { return not_(AVX::cmpgt_epi8 (a, b)); }
- Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, uchar) { return not_(AVX::cmpgt_epu8 (a, b)); }
- Vc_INTRINSIC __m256 cmplt(__m256 a, __m256 b, float) { return AVX::cmplt_ps(a, b); }
- Vc_INTRINSIC __m256d cmplt(__m256d a, __m256d b, double) { return AVX::cmplt_pd(a, b); }
- Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, int) { return AVX::cmpgt_epi32(b, a); }
- Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, uint) { return AVX::cmpgt_epu32(b, a); }
- Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, short) { return AVX::cmpgt_epi16(b, a); }
- Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, ushort) { return AVX::cmpgt_epu16(b, a); }
- Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, schar) { return AVX::cmpgt_epi8 (b, a); }
- Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, uchar) { return AVX::cmpgt_epu8 (b, a); }
- Vc_INTRINSIC __m256 fma(__m256 a, __m256 b, __m256 c, float) {
- #ifdef Vc_IMPL_FMA4
- return _mm256_macc_ps(a, b, c);
- #elif defined Vc_IMPL_FMA
- return _mm256_fmadd_ps(a, b, c);
- #else
- using namespace AVX;
- __m256d v1_0 = _mm256_cvtps_pd(lo128(a));
- __m256d v1_1 = _mm256_cvtps_pd(hi128(a));
- __m256d v2_0 = _mm256_cvtps_pd(lo128(b));
- __m256d v2_1 = _mm256_cvtps_pd(hi128(b));
- __m256d v3_0 = _mm256_cvtps_pd(lo128(c));
- __m256d v3_1 = _mm256_cvtps_pd(hi128(c));
- return concat(_mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_0, v2_0), v3_0)),
- _mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_1, v2_1), v3_1)));
- #endif
- }
- Vc_INTRINSIC __m256d fma(__m256d a, __m256d b, __m256d c, double)
- {
- #ifdef Vc_IMPL_FMA4
- return _mm256_macc_pd(a, b, c);
- #elif defined Vc_IMPL_FMA
- return _mm256_fmadd_pd(a, b, c);
- #else
- using namespace AVX;
- __m256d h1 = and_(a, _mm256_broadcast_sd(reinterpret_cast<const double *>(
- &c_general::highMaskDouble)));
- __m256d h2 = and_(b, _mm256_broadcast_sd(reinterpret_cast<const double *>(
- &c_general::highMaskDouble)));
- const __m256d l1 = _mm256_sub_pd(a, h1);
- const __m256d l2 = _mm256_sub_pd(b, h2);
- const __m256d ll = mul(l1, l2, double());
- const __m256d lh = add(mul(l1, h2, double()), mul(h1, l2, double()), double());
- const __m256d hh = mul(h1, h2, double());
- const __m256d lh_lt_v3 = cmplt(abs(lh, double()), abs(c, double()), double());
- const __m256d x = _mm256_blendv_pd(c, lh, lh_lt_v3);
- const __m256d y = _mm256_blendv_pd(lh, c, lh_lt_v3);
- return add(add(ll, x, double()), add(y, hh, double()), double());
- #endif
- }
- template <typename T> Vc_INTRINSIC __m256i fma(__m256i a, __m256i b, __m256i c, T)
- {
- return add(mul(a, b, T()), c, T());
- }
- template <int shift> Vc_INTRINSIC __m256i shiftRight(__m256i a, int) { return AVX::srai_epi32<shift>(a); }
- template <int shift> Vc_INTRINSIC __m256i shiftRight(__m256i a, uint) { return AVX::srli_epi32<shift>(a); }
- template <int shift> Vc_INTRINSIC __m256i shiftRight(__m256i a, short) { return AVX::srai_epi16<shift>(a); }
- template <int shift> Vc_INTRINSIC __m256i shiftRight(__m256i a, ushort) { return AVX::srli_epi16<shift>(a); }
- Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift, int) { return AVX::sra_epi32(a, _mm_cvtsi32_si128(shift)); }
- Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift, uint) { return AVX::srl_epi32(a, _mm_cvtsi32_si128(shift)); }
- Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift, short) { return AVX::sra_epi16(a, _mm_cvtsi32_si128(shift)); }
- Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift, ushort) { return AVX::srl_epi16(a, _mm_cvtsi32_si128(shift)); }
- template <int shift> Vc_INTRINSIC __m256i shiftLeft(__m256i a, int) { return AVX::slli_epi32<shift>(a); }
- template <int shift> Vc_INTRINSIC __m256i shiftLeft(__m256i a, uint) { return AVX::slli_epi32<shift>(a); }
- template <int shift> Vc_INTRINSIC __m256i shiftLeft(__m256i a, short) { return AVX::slli_epi16<shift>(a); }
- template <int shift> Vc_INTRINSIC __m256i shiftLeft(__m256i a, ushort) { return AVX::slli_epi16<shift>(a); }
- Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift, int) { return AVX::sll_epi32(a, _mm_cvtsi32_si128(shift)); }
- Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift, uint) { return AVX::sll_epi32(a, _mm_cvtsi32_si128(shift)); }
- Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift, short) { return AVX::sll_epi16(a, _mm_cvtsi32_si128(shift)); }
- Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift, ushort) { return AVX::sll_epi16(a, _mm_cvtsi32_si128(shift)); }
- Vc_INTRINSIC __m256 zeroExtendIfNeeded(__m256 x) { return x; }
- Vc_INTRINSIC __m256d zeroExtendIfNeeded(__m256d x) { return x; }
- Vc_INTRINSIC __m256i zeroExtendIfNeeded(__m256i x) { return x; }
- Vc_INTRINSIC __m256 zeroExtendIfNeeded(__m128 x) { return AVX::zeroExtend(x); }
- Vc_INTRINSIC __m256d zeroExtendIfNeeded(__m128d x) { return AVX::zeroExtend(x); }
- Vc_INTRINSIC __m256i zeroExtendIfNeeded(__m128i x) { return AVX::zeroExtend(x); }
- Vc_INTRINSIC __m256 avx_broadcast( float x) { return _mm256_set1_ps(x); }
- Vc_INTRINSIC __m256d avx_broadcast(double x) { return _mm256_set1_pd(x); }
- Vc_INTRINSIC __m256i avx_broadcast( int x) { return _mm256_set1_epi32(x); }
- Vc_INTRINSIC __m256i avx_broadcast( uint x) { return _mm256_set1_epi32(x); }
- Vc_INTRINSIC __m256i avx_broadcast( short x) { return _mm256_set1_epi16(x); }
- Vc_INTRINSIC __m256i avx_broadcast(ushort x) { return _mm256_set1_epi16(x); }
- Vc_INTRINSIC __m256i avx_broadcast( char x) { return _mm256_set1_epi8(x); }
- Vc_INTRINSIC __m256i avx_broadcast( schar x) { return _mm256_set1_epi8(x); }
- Vc_INTRINSIC __m256i avx_broadcast( uchar x) { return _mm256_set1_epi8(x); }
- template <Vc::Implementation Impl, typename T,
- typename = enable_if<(Impl >= AVXImpl && Impl <= AVX2Impl)>>
- Vc_CONST_L AVX2::Vector<T> sorted(AVX2::Vector<T> x) Vc_CONST_R;
- template <typename T> Vc_INTRINSIC Vc_CONST AVX2::Vector<T> sorted(AVX2::Vector<T> x)
- {
- return sorted<CurrentImplementation::current()>(x);
- }
- template <typename T, typename V>
- static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 32), V> shifted(V v, int amount)
- {
- using namespace AVX;
- constexpr int S = sizeof(T);
- switch (amount) {
- case 0: return v;
- case 1: return shifted<sanitize<V>( 1 * S)>(v);
- case 2: return shifted<sanitize<V>( 2 * S)>(v);
- case 3: return shifted<sanitize<V>( 3 * S)>(v);
- case -1: return shifted<sanitize<V>(-1 * S)>(v);
- case -2: return shifted<sanitize<V>(-2 * S)>(v);
- case -3: return shifted<sanitize<V>(-3 * S)>(v);
- }
- if (sizeof(T) <= 4) {
- switch (amount) {
- case 4: return shifted<sanitize<V>( 4 * S)>(v);
- case 5: return shifted<sanitize<V>( 5 * S)>(v);
- case 6: return shifted<sanitize<V>( 6 * S)>(v);
- case 7: return shifted<sanitize<V>( 7 * S)>(v);
- case -4: return shifted<sanitize<V>(-4 * S)>(v);
- case -5: return shifted<sanitize<V>(-5 * S)>(v);
- case -6: return shifted<sanitize<V>(-6 * S)>(v);
- case -7: return shifted<sanitize<V>(-7 * S)>(v);
- }
- if (sizeof(T) <= 2) {
- switch (amount) {
- case 8: return shifted<sanitize<V>( 8 * S)>(v);
- case 9: return shifted<sanitize<V>( 9 * S)>(v);
- case 10: return shifted<sanitize<V>( 10 * S)>(v);
- case 11: return shifted<sanitize<V>( 11 * S)>(v);
- case 12: return shifted<sanitize<V>( 12 * S)>(v);
- case 13: return shifted<sanitize<V>( 13 * S)>(v);
- case 14: return shifted<sanitize<V>( 14 * S)>(v);
- case 15: return shifted<sanitize<V>( 15 * S)>(v);
- case -8: return shifted<sanitize<V>(- 8 * S)>(v);
- case -9: return shifted<sanitize<V>(- 9 * S)>(v);
- case -10: return shifted<sanitize<V>(-10 * S)>(v);
- case -11: return shifted<sanitize<V>(-11 * S)>(v);
- case -12: return shifted<sanitize<V>(-12 * S)>(v);
- case -13: return shifted<sanitize<V>(-13 * S)>(v);
- case -14: return shifted<sanitize<V>(-14 * S)>(v);
- case -15: return shifted<sanitize<V>(-15 * S)>(v);
- }
- if (sizeof(T) == 1) {
- switch (amount) {
- case 16: return shifted<sanitize<V>( 16)>(v);
- case 17: return shifted<sanitize<V>( 17)>(v);
- case 18: return shifted<sanitize<V>( 18)>(v);
- case 19: return shifted<sanitize<V>( 19)>(v);
- case 20: return shifted<sanitize<V>( 20)>(v);
- case 21: return shifted<sanitize<V>( 21)>(v);
- case 22: return shifted<sanitize<V>( 22)>(v);
- case 23: return shifted<sanitize<V>( 23)>(v);
- case 24: return shifted<sanitize<V>( 24)>(v);
- case 25: return shifted<sanitize<V>( 25)>(v);
- case 26: return shifted<sanitize<V>( 26)>(v);
- case 27: return shifted<sanitize<V>( 27)>(v);
- case 28: return shifted<sanitize<V>( 28)>(v);
- case 29: return shifted<sanitize<V>( 29)>(v);
- case 30: return shifted<sanitize<V>( 30)>(v);
- case 31: return shifted<sanitize<V>( 31)>(v);
- case -16: return shifted<sanitize<V>(-16)>(v);
- case -17: return shifted<sanitize<V>(-17)>(v);
- case -18: return shifted<sanitize<V>(-18)>(v);
- case -19: return shifted<sanitize<V>(-19)>(v);
- case -20: return shifted<sanitize<V>(-20)>(v);
- case -21: return shifted<sanitize<V>(-21)>(v);
- case -22: return shifted<sanitize<V>(-22)>(v);
- case -23: return shifted<sanitize<V>(-23)>(v);
- case -24: return shifted<sanitize<V>(-24)>(v);
- case -25: return shifted<sanitize<V>(-25)>(v);
- case -26: return shifted<sanitize<V>(-26)>(v);
- case -27: return shifted<sanitize<V>(-27)>(v);
- case -28: return shifted<sanitize<V>(-28)>(v);
- case -29: return shifted<sanitize<V>(-29)>(v);
- case -30: return shifted<sanitize<V>(-30)>(v);
- case -31: return shifted<sanitize<V>(-31)>(v);
- }
- }
- }
- }
- return avx_cast<V>(_mm256_setzero_ps());
- }
- template <typename T, typename V>
- static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 16), V> shifted(V v, int amount)
- {
- using namespace AVX;
- switch (amount) {
- case 0: return v;
- case 1: return avx_cast<V>(_mm_srli_si128(avx_cast<__m128i>(v), sanitize<V>(1 * sizeof(T))));
- case 2: return avx_cast<V>(_mm_srli_si128(avx_cast<__m128i>(v), sanitize<V>(2 * sizeof(T))));
- case 3: return avx_cast<V>(_mm_srli_si128(avx_cast<__m128i>(v), sanitize<V>(3 * sizeof(T))));
- case -1: return avx_cast<V>(_mm_slli_si128(avx_cast<__m128i>(v), sanitize<V>(1 * sizeof(T))));
- case -2: return avx_cast<V>(_mm_slli_si128(avx_cast<__m128i>(v), sanitize<V>(2 * sizeof(T))));
- case -3: return avx_cast<V>(_mm_slli_si128(avx_cast<__m128i>(v), sanitize<V>(3 * sizeof(T))));
- }
- if (sizeof(T) <= 2) {
- switch (amount) {
- case 4: return avx_cast<V>(_mm_srli_si128(avx_cast<__m128i>(v), sanitize<V>(4 * sizeof(T))));
- case 5: return avx_cast<V>(_mm_srli_si128(avx_cast<__m128i>(v), sanitize<V>(5 * sizeof(T))));
- case 6: return avx_cast<V>(_mm_srli_si128(avx_cast<__m128i>(v), sanitize<V>(6 * sizeof(T))));
- case 7: return avx_cast<V>(_mm_srli_si128(avx_cast<__m128i>(v), sanitize<V>(7 * sizeof(T))));
- case -4: return avx_cast<V>(_mm_slli_si128(avx_cast<__m128i>(v), sanitize<V>(4 * sizeof(T))));
- case -5: return avx_cast<V>(_mm_slli_si128(avx_cast<__m128i>(v), sanitize<V>(5 * sizeof(T))));
- case -6: return avx_cast<V>(_mm_slli_si128(avx_cast<__m128i>(v), sanitize<V>(6 * sizeof(T))));
- case -7: return avx_cast<V>(_mm_slli_si128(avx_cast<__m128i>(v), sanitize<V>(7 * sizeof(T))));
- }
- }
- return avx_cast<V>(_mm_setzero_ps());
- }
- template <typename T, size_t N, typename V>
- static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 32 && N == 4), V> rotated(V v,
- int amount)
- {
- using namespace AVX;
- const __m128i vLo = avx_cast<__m128i>(lo128(v));
- const __m128i vHi = avx_cast<__m128i>(hi128(v));
- switch (static_cast<unsigned int>(amount) % N) {
- case 0:
- return v;
- case 1:
- return avx_cast<V>(concat(SSE::alignr_epi8<sizeof(T)>(vHi, vLo),
- SSE::alignr_epi8<sizeof(T)>(vLo, vHi)));
- case 2:
- return Mem::permute128<X1, X0>(v);
- case 3:
- return avx_cast<V>(concat(SSE::alignr_epi8<sizeof(T)>(vLo, vHi),
- SSE::alignr_epi8<sizeof(T)>(vHi, vLo)));
- }
- return avx_cast<V>(_mm256_setzero_ps());
- }
- template <typename T, size_t N, typename V>
- static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 32 && N == 8), V> rotated(V v,
- int amount)
- {
- using namespace AVX;
- const __m128i vLo = avx_cast<__m128i>(lo128(v));
- const __m128i vHi = avx_cast<__m128i>(hi128(v));
- switch (static_cast<unsigned int>(amount) % N) {
- case 0:
- return v;
- case 1:
- return avx_cast<V>(concat(SSE::alignr_epi8<1 * sizeof(T)>(vHi, vLo),
- SSE::alignr_epi8<1 * sizeof(T)>(vLo, vHi)));
- case 2:
- return avx_cast<V>(concat(SSE::alignr_epi8<2 * sizeof(T)>(vHi, vLo),
- SSE::alignr_epi8<2 * sizeof(T)>(vLo, vHi)));
- case 3:
- return avx_cast<V>(concat(SSE::alignr_epi8<3 * sizeof(T)>(vHi, vLo),
- SSE::alignr_epi8<3 * sizeof(T)>(vLo, vHi)));
- case 4:
- return Mem::permute128<X1, X0>(v);
- case 5:
- return avx_cast<V>(concat(SSE::alignr_epi8<1 * sizeof(T)>(vLo, vHi),
- SSE::alignr_epi8<1 * sizeof(T)>(vHi, vLo)));
- case 6:
- return avx_cast<V>(concat(SSE::alignr_epi8<2 * sizeof(T)>(vLo, vHi),
- SSE::alignr_epi8<2 * sizeof(T)>(vHi, vLo)));
- case 7:
- return avx_cast<V>(concat(SSE::alignr_epi8<3 * sizeof(T)>(vLo, vHi),
- SSE::alignr_epi8<3 * sizeof(T)>(vHi, vLo)));
- }
- return avx_cast<V>(_mm256_setzero_ps());
- }
- #ifdef Vc_IMPL_AVX2
- template <typename T, size_t N, typename V>
- static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 32 && N == 16), V> rotated(
- V v, int amount)
- {
- using namespace AVX;
- const __m128i vLo = avx_cast<__m128i>(lo128(v));
- const __m128i vHi = avx_cast<__m128i>(hi128(v));
- switch (static_cast<unsigned int>(amount) % N) {
- case 0:
- return v;
- case 1:
- return avx_cast<V>(concat(SSE::alignr_epi8<1 * sizeof(T)>(vHi, vLo),
- SSE::alignr_epi8<1 * sizeof(T)>(vLo, vHi)));
- case 2:
- return avx_cast<V>(concat(SSE::alignr_epi8<2 * sizeof(T)>(vHi, vLo),
- SSE::alignr_epi8<2 * sizeof(T)>(vLo, vHi)));
- case 3:
- return avx_cast<V>(concat(SSE::alignr_epi8<3 * sizeof(T)>(vHi, vLo),
- SSE::alignr_epi8<3 * sizeof(T)>(vLo, vHi)));
- case 4:
- return Mem::permute4x64<X1, X2, X3, X0>(v);
- case 5:
- return avx_cast<V>(concat(SSE::alignr_epi8<5 * sizeof(T)>(vHi, vLo),
- SSE::alignr_epi8<5 * sizeof(T)>(vLo, vHi)));
- case 6:
- return avx_cast<V>(concat(SSE::alignr_epi8<6 * sizeof(T)>(vHi, vLo),
- SSE::alignr_epi8<6 * sizeof(T)>(vLo, vHi)));
- case 7:
- return avx_cast<V>(concat(SSE::alignr_epi8<7 * sizeof(T)>(vHi, vLo),
- SSE::alignr_epi8<7 * sizeof(T)>(vLo, vHi)));
- case 8:
- return Mem::permute128<X1, X0>(v);
- case 9:
- return avx_cast<V>(concat(SSE::alignr_epi8<1 * sizeof(T)>(vLo, vHi),
- SSE::alignr_epi8<1 * sizeof(T)>(vHi, vLo)));
- case 10:
- return avx_cast<V>(concat(SSE::alignr_epi8<2 * sizeof(T)>(vLo, vHi),
- SSE::alignr_epi8<2 * sizeof(T)>(vHi, vLo)));
- case 11:
- return avx_cast<V>(concat(SSE::alignr_epi8<3 * sizeof(T)>(vLo, vHi),
- SSE::alignr_epi8<3 * sizeof(T)>(vHi, vLo)));
- case 12:
- return Mem::permute4x64<X3, X0, X1, X2>(v);
- case 13:
- return avx_cast<V>(concat(SSE::alignr_epi8<5 * sizeof(T)>(vLo, vHi),
- SSE::alignr_epi8<5 * sizeof(T)>(vHi, vLo)));
- case 14:
- return avx_cast<V>(concat(SSE::alignr_epi8<6 * sizeof(T)>(vLo, vHi),
- SSE::alignr_epi8<6 * sizeof(T)>(vHi, vLo)));
- case 15:
- return avx_cast<V>(concat(SSE::alignr_epi8<7 * sizeof(T)>(vLo, vHi),
- SSE::alignr_epi8<7 * sizeof(T)>(vHi, vLo)));
- }
- return avx_cast<V>(_mm256_setzero_ps());
- }
- #endif
- Vc_INTRINSIC Vc_CONST int testc(__m128 a, __m128 b) { return _mm_testc_si128(_mm_castps_si128(a), _mm_castps_si128(b)); }
- Vc_INTRINSIC Vc_CONST int testc(__m256 a, __m256 b) { return _mm256_testc_ps(a, b); }
- Vc_INTRINSIC Vc_CONST int testc(__m256d a, __m256d b) { return _mm256_testc_pd(a, b); }
- Vc_INTRINSIC Vc_CONST int testc(__m256i a, __m256i b) { return _mm256_testc_si256(a, b); }
- Vc_INTRINSIC Vc_CONST int testz(__m128 a, __m128 b) { return _mm_testz_si128(_mm_castps_si128(a), _mm_castps_si128(b)); }
- Vc_INTRINSIC Vc_CONST int testz(__m256 a, __m256 b) { return _mm256_testz_ps(a, b); }
- Vc_INTRINSIC Vc_CONST int testz(__m256d a, __m256d b) { return _mm256_testz_pd(a, b); }
- Vc_INTRINSIC Vc_CONST int testz(__m256i a, __m256i b) { return _mm256_testz_si256(a, b); }
- Vc_INTRINSIC Vc_CONST int testnzc(__m128 a, __m128 b) { return _mm_testnzc_si128(_mm_castps_si128(a), _mm_castps_si128(b)); }
- Vc_INTRINSIC Vc_CONST int testnzc(__m256 a, __m256 b) { return _mm256_testnzc_ps(a, b); }
- Vc_INTRINSIC Vc_CONST int testnzc(__m256d a, __m256d b) { return _mm256_testnzc_pd(a, b); }
- Vc_INTRINSIC Vc_CONST int testnzc(__m256i a, __m256i b) { return _mm256_testnzc_si256(a, b); }
- Vc_INTRINSIC Vc_CONST int movemask(__m256i a) { return AVX::movemask_epi8(a); }
- Vc_INTRINSIC Vc_CONST int movemask(__m128i a) { return _mm_movemask_epi8(a); }
- Vc_INTRINSIC Vc_CONST int movemask(__m256d a) { return _mm256_movemask_pd(a); }
- Vc_INTRINSIC Vc_CONST int movemask(__m128d a) { return _mm_movemask_pd(a); }
- Vc_INTRINSIC Vc_CONST int movemask(__m256 a) { return _mm256_movemask_ps(a); }
- Vc_INTRINSIC Vc_CONST int movemask(__m128 a) { return _mm_movemask_ps(a); }
- template <size_t N, typename Flags>
- Vc_INTRINSIC void mask_store(__m256i k, bool *mem, Flags)
- {
- static_assert(
- N == 4 || N == 8 || N == 16,
- "mask_store(__m256i, bool *) is only implemented for 4, 8, and 16 entries");
- switch (N) {
- case 4:
- *aliasing_cast<int32_t>(mem) = (_mm_movemask_epi8(AVX::lo128(k)) |
- (_mm_movemask_epi8(AVX::hi128(k)) << 16)) &
- 0x01010101;
- break;
- case 8: {
- const auto k2 = _mm_srli_epi16(_mm_packs_epi16(AVX::lo128(k), AVX::hi128(k)), 15);
- const auto k3 = _mm_packs_epi16(k2, _mm_setzero_si128());
- #ifdef __x86_64__
- *aliasing_cast<int64_t>(mem) = _mm_cvtsi128_si64(k3);
- #else
- *aliasing_cast<int32_t>(mem) = _mm_cvtsi128_si32(k3);
- *aliasing_cast<int32_t>(mem + 4) = _mm_extract_epi32(k3, 1);
- #endif
- } break;
- case 16: {
- const auto bools = Detail::and_(_mm_set1_epi8(1),
- _mm_packs_epi16(AVX::lo128(k), AVX::hi128(k)));
- if (Flags::IsAligned) {
- _mm_store_si128(reinterpret_cast<__m128i *>(mem), bools);
- } else {
- _mm_storeu_si128(reinterpret_cast<__m128i *>(mem), bools);
- }
- } break;
- default:
- Vc_UNREACHABLE();
- }
- }
- template <typename R, size_t N, typename Flags>
- Vc_INTRINSIC R mask_load(const bool *mem, Flags,
- enable_if<std::is_same<R, __m128>::value> = nullarg)
- {
- static_assert(N == 4 || N == 8,
- "mask_load<__m128>(const bool *) is only implemented for 4, 8 entries");
- switch (N) {
- case 4: {
- __m128i k = _mm_cvtsi32_si128(*aliasing_cast<int32_t>(mem));
- k = _mm_unpacklo_epi8(k, k);
- k = _mm_unpacklo_epi16(k, k);
- k = _mm_cmpgt_epi32(k, _mm_setzero_si128());
- return AVX::avx_cast<__m128>(k);
- }
- case 8: {
- #ifdef __x86_64__
- __m128i k = _mm_cvtsi64_si128(*aliasing_cast<int64_t>(mem));
- #else
- __m128i k = _mm_castpd_si128(_mm_load_sd(aliasing_cast<double>(mem)));
- #endif
- return AVX::avx_cast<__m128>(
- _mm_cmpgt_epi16(_mm_unpacklo_epi8(k, k), _mm_setzero_si128()));
- }
- default:
- Vc_UNREACHABLE();
- }
- }
- template <typename R, size_t N, typename Flags>
- Vc_INTRINSIC R mask_load(const bool *mem, Flags,
- enable_if<std::is_same<R, __m256>::value> = nullarg)
- {
- static_assert(
- N == 4 || N == 8 || N == 16,
- "mask_load<__m256>(const bool *) is only implemented for 4, 8, and 16 entries");
- switch (N) {
- case 4: {
- __m128i k = AVX::avx_cast<__m128i>(_mm_and_ps(
- _mm_set1_ps(*aliasing_cast<float>(mem)),
- AVX::avx_cast<__m128>(_mm_setr_epi32(0x1, 0x100, 0x10000, 0x1000000))));
- k = _mm_cmpgt_epi32(k, _mm_setzero_si128());
- return AVX::avx_cast<__m256>(
- AVX::concat(_mm_unpacklo_epi32(k, k), _mm_unpackhi_epi32(k, k)));
- }
- case 8: {
- #ifdef __x86_64__
- __m128i k = _mm_cvtsi64_si128(*aliasing_cast<int64_t>(mem));
- #else
- __m128i k = _mm_castpd_si128(_mm_load_sd(aliasing_cast<double>(mem)));
- #endif
- k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(k, k), _mm_setzero_si128());
- return AVX::avx_cast<__m256>(
- AVX::concat(_mm_unpacklo_epi16(k, k), _mm_unpackhi_epi16(k, k)));
- }
- case 16: {
- const auto k128 = _mm_cmpgt_epi8(
- Flags::IsAligned ? _mm_load_si128(reinterpret_cast<const __m128i *>(mem))
- : _mm_loadu_si128(reinterpret_cast<const __m128i *>(mem)),
- _mm_setzero_si128());
- return AVX::avx_cast<__m256>(
- AVX::concat(_mm_unpacklo_epi8(k128, k128), _mm_unpackhi_epi8(k128, k128)));
- }
- default:
- Vc_UNREACHABLE();
- return R();
- }
- }
- template <size_t Size>
- Vc_INTRINSIC_L Vc_CONST_L int mask_to_int(__m256i x) Vc_INTRINSIC_R Vc_CONST_R;
- template <> Vc_INTRINSIC Vc_CONST int mask_to_int<4>(__m256i k)
- {
- return movemask(AVX::avx_cast<__m256d>(k));
- }
- template <> Vc_INTRINSIC Vc_CONST int mask_to_int<8>(__m256i k)
- {
- return movemask(AVX::avx_cast<__m256>(k));
- }
- #ifdef Vc_IMPL_BMI2
- template <> Vc_INTRINSIC Vc_CONST int mask_to_int<16>(__m256i k)
- {
- return _pext_u32(movemask(k), 0x55555555u);
- }
- #endif
- template <> Vc_INTRINSIC Vc_CONST int mask_to_int<32>(__m256i k)
- {
- return movemask(k);
- }
- template<typename V> struct InterleaveImpl<V, 16, 32> {
- template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,
- const typename V::AsArg v0,
- const typename V::AsArg v1)
- {
- const __m256i tmp0 = AVX::unpacklo_epi16(v0.data(), v1.data());
- const __m256i tmp1 = AVX::unpackhi_epi16(v0.data(), v1.data());
- using namespace AVX;
- *aliasing_cast<uint32_t>(&data[i[ 0]]) = _mm_cvtsi128_si32(lo128(tmp0));
- *aliasing_cast<uint32_t>(&data[i[ 1]]) = _mm_extract_epi32(lo128(tmp0), 1);
- *aliasing_cast<uint32_t>(&data[i[ 2]]) = _mm_extract_epi32(lo128(tmp0), 2);
- *aliasing_cast<uint32_t>(&data[i[ 3]]) = _mm_extract_epi32(lo128(tmp0), 3);
- *aliasing_cast<uint32_t>(&data[i[ 4]]) = _mm_cvtsi128_si32(lo128(tmp1));
- *aliasing_cast<uint32_t>(&data[i[ 5]]) = _mm_extract_epi32(lo128(tmp1), 1);
- *aliasing_cast<uint32_t>(&data[i[ 6]]) = _mm_extract_epi32(lo128(tmp1), 2);
- *aliasing_cast<uint32_t>(&data[i[ 7]]) = _mm_extract_epi32(lo128(tmp1), 3);
- *aliasing_cast<uint32_t>(&data[i[ 8]]) = _mm_cvtsi128_si32(hi128(tmp0));
- *aliasing_cast<uint32_t>(&data[i[ 9]]) = _mm_extract_epi32(hi128(tmp0), 1);
- *aliasing_cast<uint32_t>(&data[i[10]]) = _mm_extract_epi32(hi128(tmp0), 2);
- *aliasing_cast<uint32_t>(&data[i[11]]) = _mm_extract_epi32(hi128(tmp0), 3);
- *aliasing_cast<uint32_t>(&data[i[12]]) = _mm_cvtsi128_si32(hi128(tmp1));
- *aliasing_cast<uint32_t>(&data[i[13]]) = _mm_extract_epi32(hi128(tmp1), 1);
- *aliasing_cast<uint32_t>(&data[i[14]]) = _mm_extract_epi32(hi128(tmp1), 2);
- *aliasing_cast<uint32_t>(&data[i[15]]) = _mm_extract_epi32(hi128(tmp1), 3);
- }
- static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<2> &i,
- const typename V::AsArg v0, const typename V::AsArg v1)
- {
- const __m256i tmp0 = AVX::unpacklo_epi16(v0.data(), v1.data());
- const __m256i tmp1 = AVX::unpackhi_epi16(v0.data(), v1.data());
- V(Mem::shuffle128<X0, Y0>(tmp0, tmp1)).store(&data[i[0]], Vc::Unaligned);
- V(Mem::shuffle128<X1, Y1>(tmp0, tmp1)).store(&data[i[8]], Vc::Unaligned);
- }
- template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,
- const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2)
- {
- interleave(data, i, v0, v1);
- v2.scatter(data + 2, i);
- }
- template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,
- const typename V::AsArg v0, const typename V::AsArg v1,
- const typename V::AsArg v2, const typename V::AsArg v3)
- {
- const __m256i tmp0 = AVX::unpacklo_epi16(v0.data(), v2.data());
- const __m256i tmp1 = AVX::unpackhi_epi16(v0.data(), v2.data());
- const __m256i tmp2 = AVX::unpacklo_epi16(v1.data(), v3.data());
- const __m256i tmp3 = AVX::unpackhi_epi16(v1.data(), v3.data());
- const __m256i tmp4 = AVX::unpacklo_epi16(tmp0, tmp2);
- const __m256i tmp5 = AVX::unpackhi_epi16(tmp0, tmp2);
- const __m256i tmp6 = AVX::unpacklo_epi16(tmp1, tmp3);
- const __m256i tmp7 = AVX::unpackhi_epi16(tmp1, tmp3);
- using namespace AVX;
- auto &&store = [&](__m256i x, int offset) {
- _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[offset + 0]]), lo128(x));
- _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[offset + 8]]), hi128(x));
- _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[offset + 1]]), avx_cast<__m128>(x));
- _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[offset + 9]]), avx_cast<__m128>(hi128(x)));
- };
- store(tmp4, 0);
- store(tmp5, 2);
- store(tmp6, 4);
- store(tmp7, 6);
- }
- static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<4> &i,
- const typename V::AsArg v0, const typename V::AsArg v1,
- const typename V::AsArg v2, const typename V::AsArg v3)
- {
- const __m256i tmp0 = AVX::unpacklo_epi16(v0.data(), v2.data());
- const __m256i tmp1 = AVX::unpackhi_epi16(v0.data(), v2.data());
- const __m256i tmp2 = AVX::unpacklo_epi16(v1.data(), v3.data());
- const __m256i tmp3 = AVX::unpackhi_epi16(v1.data(), v3.data());
- const __m256i tmp4 = AVX::unpacklo_epi16(tmp0, tmp2);
- const __m256i tmp5 = AVX::unpackhi_epi16(tmp0, tmp2);
- const __m256i tmp6 = AVX::unpacklo_epi16(tmp1, tmp3);
- const __m256i tmp7 = AVX::unpackhi_epi16(tmp1, tmp3);
- V(Mem::shuffle128<X0, Y0>(tmp4, tmp5)).store(&data[i[0]], ::Vc::Unaligned);
- V(Mem::shuffle128<X0, Y0>(tmp6, tmp7)).store(&data[i[4]], ::Vc::Unaligned);
- V(Mem::shuffle128<X1, Y1>(tmp4, tmp5)).store(&data[i[8]], ::Vc::Unaligned);
- V(Mem::shuffle128<X1, Y1>(tmp6, tmp7)).store(&data[i[12]], ::Vc::Unaligned);
- }
- template <typename I>
- static inline void interleave(typename V::EntryType *const data, const I &i,
- const typename V::AsArg v0, const typename V::AsArg v1,
- const typename V::AsArg v2, const typename V::AsArg v3,
- const typename V::AsArg v4)
- {
- interleave(data, i, v0, v1, v2, v3);
- v4.scatter(data + 4, i);
- }
- // Interleave six vectors as a 4-vector interleave plus a 2-vector interleave
- // shifted by 4 entries within each record.
- template <typename I>
- static inline void interleave(typename V::EntryType *const data, const I &i,
- const typename V::AsArg v0, const typename V::AsArg v1,
- const typename V::AsArg v2, const typename V::AsArg v3,
- const typename V::AsArg v4, const typename V::AsArg v5)
- {
- interleave(data, i, v0, v1, v2, v3);
- interleave(data + 4, i, v4, v5);
- }
- // Interleave seven vectors as a 4-vector interleave plus a 3-vector interleave
- // shifted by 4 entries within each record.
- template <typename I>
- static inline void interleave(typename V::EntryType *const data, const I &i,
- const typename V::AsArg v0, const typename V::AsArg v1,
- const typename V::AsArg v2, const typename V::AsArg v3,
- const typename V::AsArg v4, const typename V::AsArg v5,
- const typename V::AsArg v6)
- {
- interleave(data, i, v0, v1, v2, v3);
- interleave(data + 4, i, v4, v5, v6);
- }
- // Interleave eight vectors as two independent 4-vector interleaves
- // (entries 0-3 and 4-7 of each record).
- template <typename I>
- static inline void interleave(typename V::EntryType *const data, const I &i,
- const typename V::AsArg v0, const typename V::AsArg v1,
- const typename V::AsArg v2, const typename V::AsArg v3,
- const typename V::AsArg v4, const typename V::AsArg v5,
- const typename V::AsArg v6, const typename V::AsArg v7)
- {
- interleave(data, i, v0, v1, v2, v3);
- interleave(data + 4, i, v4, v5, v6, v7);
- }
- // Gather 16 two-entry records (one 32-bit load per record via aliasing_cast<int>)
- // and transpose them back into two 16-bit vectors v0 and v1.
- template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
- const I &i, V &v0, V &v1)
- {
- const __m256i tmp4 =
- _mm256_setr_epi32(
- *aliasing_cast<int>(&data[i[0]]), *aliasing_cast<int>(&data[i[1]]),
- *aliasing_cast<int>(&data[i[2]]), *aliasing_cast<int>(&data[i[3]]),
- *aliasing_cast<int>(&data[i[8]]), *aliasing_cast<int>(&data[i[9]]),
- *aliasing_cast<int>(&data[i[10]]), *aliasing_cast<int>(&data[i[11]]));
- const __m256i tmp5 =
- _mm256_setr_epi32(
- *aliasing_cast<int>(&data[i[4]]), *aliasing_cast<int>(&data[i[5]]),
- *aliasing_cast<int>(&data[i[6]]), *aliasing_cast<int>(&data[i[7]]),
- *aliasing_cast<int>(&data[i[12]]), *aliasing_cast<int>(&data[i[13]]),
- *aliasing_cast<int>(&data[i[14]]), *aliasing_cast<int>(&data[i[15]]));
- // Three unpack stages separate the a/b members of each record.
- const __m256i tmp2 = AVX::unpacklo_epi16(tmp4, tmp5);
- const __m256i tmp3 = AVX::unpackhi_epi16(tmp4, tmp5);
- const __m256i tmp0 = AVX::unpacklo_epi16(tmp2, tmp3);
- const __m256i tmp1 = AVX::unpackhi_epi16(tmp2, tmp3);
- v0.data() = AVX::unpacklo_epi16(tmp0, tmp1);
- v1.data() = AVX::unpackhi_epi16(tmp0, tmp1);
- }
- // Deinterleave three 16-bit vectors: load 64 bits (four entries) per record via
- // aliasing_cast<double>, then unpack-transpose. Reads 4 entries per record even
- // though only 3 are used — NOTE(review): assumes the 4th entry is readable.
- template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
- const I &i, V &v0, V &v1, V &v2)
- {
- using namespace AVX;
- const __m256i tmp0 = avx_cast<__m256i>(_mm256_setr_pd(
- *aliasing_cast<double>(&data[i[0]]), *aliasing_cast<double>(&data[i[1]]),
- *aliasing_cast<double>(&data[i[8]]), *aliasing_cast<double>(&data[i[9]])));
- const __m256i tmp1 = avx_cast<__m256i>(_mm256_setr_pd(
- *aliasing_cast<double>(&data[i[2]]), *aliasing_cast<double>(&data[i[3]]),
- *aliasing_cast<double>(&data[i[10]]), *aliasing_cast<double>(&data[i[11]])));
- const __m256i tmp2 = avx_cast<__m256i>(_mm256_setr_pd(
- *aliasing_cast<double>(&data[i[4]]), *aliasing_cast<double>(&data[i[5]]),
- *aliasing_cast<double>(&data[i[12]]), *aliasing_cast<double>(&data[i[13]])));
- const __m256i tmp3 = avx_cast<__m256i>(_mm256_setr_pd(
- *aliasing_cast<double>(&data[i[6]]), *aliasing_cast<double>(&data[i[7]]),
- *aliasing_cast<double>(&data[i[14]]), *aliasing_cast<double>(&data[i[15]])));
- const __m256i tmp4 = AVX::unpacklo_epi16(tmp0, tmp2);
- const __m256i tmp5 = AVX::unpackhi_epi16(tmp0, tmp2);
- const __m256i tmp6 = AVX::unpacklo_epi16(tmp1, tmp3);
- const __m256i tmp7 = AVX::unpackhi_epi16(tmp1, tmp3);
- const __m256i tmp8 = AVX::unpacklo_epi16(tmp4, tmp6);
- const __m256i tmp9 = AVX::unpackhi_epi16(tmp4, tmp6);
- const __m256i tmp10 = AVX::unpacklo_epi16(tmp5, tmp7);
- const __m256i tmp11 = AVX::unpackhi_epi16(tmp5, tmp7);
- v0.data() = AVX::unpacklo_epi16(tmp8, tmp10);
- v1.data() = AVX::unpackhi_epi16(tmp8, tmp10);
- v2.data() = AVX::unpacklo_epi16(tmp9, tmp11);
- }
- // Deinterleave four 16-bit vectors. Identical load/transpose network to the
- // 3-vector overload, but the final unpackhi is kept for v3.
- template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
- const I &i, V &v0, V &v1, V &v2, V &v3)
- {
- using namespace AVX;
- const __m256i tmp0 = avx_cast<__m256i>(_mm256_setr_pd(
- *aliasing_cast<double>(&data[i[0]]), *aliasing_cast<double>(&data[i[1]]),
- *aliasing_cast<double>(&data[i[8]]), *aliasing_cast<double>(&data[i[9]])));
- const __m256i tmp1 = avx_cast<__m256i>(_mm256_setr_pd(
- *aliasing_cast<double>(&data[i[2]]), *aliasing_cast<double>(&data[i[3]]),
- *aliasing_cast<double>(&data[i[10]]), *aliasing_cast<double>(&data[i[11]])));
- const __m256i tmp2 = avx_cast<__m256i>(_mm256_setr_pd(
- *aliasing_cast<double>(&data[i[4]]), *aliasing_cast<double>(&data[i[5]]),
- *aliasing_cast<double>(&data[i[12]]), *aliasing_cast<double>(&data[i[13]])));
- const __m256i tmp3 = avx_cast<__m256i>(_mm256_setr_pd(
- *aliasing_cast<double>(&data[i[6]]), *aliasing_cast<double>(&data[i[7]]),
- *aliasing_cast<double>(&data[i[14]]), *aliasing_cast<double>(&data[i[15]])));
- const __m256i tmp4 = AVX::unpacklo_epi16(tmp0, tmp2);
- const __m256i tmp5 = AVX::unpackhi_epi16(tmp0, tmp2);
- const __m256i tmp6 = AVX::unpacklo_epi16(tmp1, tmp3);
- const __m256i tmp7 = AVX::unpackhi_epi16(tmp1, tmp3);
- const __m256i tmp8 = AVX::unpacklo_epi16(tmp4, tmp6);
- const __m256i tmp9 = AVX::unpackhi_epi16(tmp4, tmp6);
- const __m256i tmp10 = AVX::unpacklo_epi16(tmp5, tmp7);
- const __m256i tmp11 = AVX::unpackhi_epi16(tmp5, tmp7);
- v0.data() = AVX::unpacklo_epi16(tmp8, tmp10);
- v1.data() = AVX::unpackhi_epi16(tmp8, tmp10);
- v2.data() = AVX::unpacklo_epi16(tmp9, tmp11);
- v3.data() = AVX::unpackhi_epi16(tmp9, tmp11);
- }
- // Deinterleave five 16-bit vectors. Loads a full 128 bits (eight entries) per
- // record — NOTE(review): reads 3 entries beyond the 5 used; assumes readable.
- template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
- const I &i, V &v0, V &v1, V &v2, V &v3, V &v4)
- {
- using namespace AVX;
- // a..h each hold records k and k+8 in the low/high 128-bit halves.
- const __m256i a = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]])),
- _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[8]])));
- const __m256i b = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]])),
- _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[9]])));
- const __m256i c = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]])),
- _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[10]])));
- const __m256i d = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]])),
- _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[11]])));
- const __m256i e = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]])),
- _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[12]])));
- const __m256i f = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]])),
- _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[13]])));
- const __m256i g = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]])),
- _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[14]])));
- const __m256i h = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]])),
- _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[15]])));
- const __m256i tmp2 = AVX::unpacklo_epi16(a, e);
- const __m256i tmp4 = AVX::unpacklo_epi16(b, f);
- const __m256i tmp3 = AVX::unpacklo_epi16(c, g);
- const __m256i tmp5 = AVX::unpacklo_epi16(d, h);
- const __m256i tmp10 = AVX::unpackhi_epi16(a, e);
- const __m256i tmp11 = AVX::unpackhi_epi16(c, g);
- const __m256i tmp12 = AVX::unpackhi_epi16(b, f);
- const __m256i tmp13 = AVX::unpackhi_epi16(d, h);
- const __m256i tmp0 = AVX::unpacklo_epi16(tmp2, tmp3);
- const __m256i tmp1 = AVX::unpacklo_epi16(tmp4, tmp5);
- const __m256i tmp6 = AVX::unpackhi_epi16(tmp2, tmp3);
- const __m256i tmp7 = AVX::unpackhi_epi16(tmp4, tmp5);
- const __m256i tmp8 = AVX::unpacklo_epi16(tmp10, tmp11);
- const __m256i tmp9 = AVX::unpacklo_epi16(tmp12, tmp13);
- v0.data() = AVX::unpacklo_epi16(tmp0, tmp1);
- v1.data() = AVX::unpackhi_epi16(tmp0, tmp1);
- v2.data() = AVX::unpacklo_epi16(tmp6, tmp7);
- v3.data() = AVX::unpackhi_epi16(tmp6, tmp7);
- v4.data() = AVX::unpacklo_epi16(tmp8, tmp9);
- }
- // Deinterleave six 16-bit vectors. Same load/transpose network as the 5-vector
- // overload, additionally extracting v5 from the high half of tmp8/tmp9.
- template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
- const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
- {
- using namespace AVX;
- const __m256i a = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]])),
- _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[8]])));
- const __m256i b = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]])),
- _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[9]])));
- const __m256i c = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]])),
- _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[10]])));
- const __m256i d = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]])),
- _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[11]])));
- const __m256i e = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]])),
- _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[12]])));
- const __m256i f = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]])),
- _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[13]])));
- const __m256i g = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]])),
- _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[14]])));
- const __m256i h = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]])),
- _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[15]])));
- const __m256i tmp2 = AVX::unpacklo_epi16(a, e);
- const __m256i tmp4 = AVX::unpacklo_epi16(b, f);
- const __m256i tmp3 = AVX::unpacklo_epi16(c, g);
- const __m256i tmp5 = AVX::unpacklo_epi16(d, h);
- const __m256i tmp10 = AVX::unpackhi_epi16(a, e);
- const __m256i tmp11 = AVX::unpackhi_epi16(c, g);
- const __m256i tmp12 = AVX::unpackhi_epi16(b, f);
- const __m256i tmp13 = AVX::unpackhi_epi16(d, h);
- const __m256i tmp0 = AVX::unpacklo_epi16(tmp2, tmp3);
- const __m256i tmp1 = AVX::unpacklo_epi16(tmp4, tmp5);
- const __m256i tmp6 = AVX::unpackhi_epi16(tmp2, tmp3);
- const __m256i tmp7 = AVX::unpackhi_epi16(tmp4, tmp5);
- const __m256i tmp8 = AVX::unpacklo_epi16(tmp10, tmp11);
- const __m256i tmp9 = AVX::unpacklo_epi16(tmp12, tmp13);
- v0.data() = AVX::unpacklo_epi16(tmp0, tmp1);
- v1.data() = AVX::unpackhi_epi16(tmp0, tmp1);
- v2.data() = AVX::unpacklo_epi16(tmp6, tmp7);
- v3.data() = AVX::unpackhi_epi16(tmp6, tmp7);
- v4.data() = AVX::unpacklo_epi16(tmp8, tmp9);
- v5.data() = AVX::unpackhi_epi16(tmp8, tmp9);
- }
- // Deinterleave seven 16-bit vectors; extends the 6-vector network with
- // tmp14/tmp15 to recover v6.
- template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
- const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6)
- {
- using namespace AVX;
- const __m256i a = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]])),
- _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[8]])));
- const __m256i b = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]])),
- _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[9]])));
- const __m256i c = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]])),
- _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[10]])));
- const __m256i d = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]])),
- _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[11]])));
- const __m256i e = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]])),
- _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[12]])));
- const __m256i f = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]])),
- _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[13]])));
- const __m256i g = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]])),
- _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[14]])));
- const __m256i h = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]])),
- _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[15]])));
- const __m256i tmp2 = AVX::unpacklo_epi16(a, e);
- const __m256i tmp4 = AVX::unpacklo_epi16(b, f);
- const __m256i tmp3 = AVX::unpacklo_epi16(c, g);
- const __m256i tmp5 = AVX::unpacklo_epi16(d, h);
- const __m256i tmp10 = AVX::unpackhi_epi16(a, e);
- const __m256i tmp11 = AVX::unpackhi_epi16(c, g);
- const __m256i tmp12 = AVX::unpackhi_epi16(b, f);
- const __m256i tmp13 = AVX::unpackhi_epi16(d, h);
- const __m256i tmp0 = AVX::unpacklo_epi16(tmp2, tmp3);
- const __m256i tmp1 = AVX::unpacklo_epi16(tmp4, tmp5);
- const __m256i tmp6 = AVX::unpackhi_epi16(tmp2, tmp3);
- const __m256i tmp7 = AVX::unpackhi_epi16(tmp4, tmp5);
- const __m256i tmp8 = AVX::unpacklo_epi16(tmp10, tmp11);
- const __m256i tmp9 = AVX::unpacklo_epi16(tmp12, tmp13);
- const __m256i tmp14 = AVX::unpackhi_epi16(tmp10, tmp11);
- const __m256i tmp15 = AVX::unpackhi_epi16(tmp12, tmp13);
- v0.data() = AVX::unpacklo_epi16(tmp0, tmp1);
- v1.data() = AVX::unpackhi_epi16(tmp0, tmp1);
- v2.data() = AVX::unpacklo_epi16(tmp6, tmp7);
- v3.data() = AVX::unpackhi_epi16(tmp6, tmp7);
- v4.data() = AVX::unpacklo_epi16(tmp8, tmp9);
- v5.data() = AVX::unpackhi_epi16(tmp8, tmp9);
- v6.data() = AVX::unpacklo_epi16(tmp14, tmp15);
- }
- // Full 8x16 deinterleave (complete 16x16-bit transpose of 16 records of 8
- // entries each); every load is fully consumed.
- template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
- const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7)
- {
- using namespace AVX;
- const __m256i a = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]])),
- _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[8]])));
- const __m256i b = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]])),
- _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[9]])));
- const __m256i c = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]])),
- _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[10]])));
- const __m256i d = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]])),
- _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[11]])));
- const __m256i e = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]])),
- _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[12]])));
- const __m256i f = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]])),
- _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[13]])));
- const __m256i g = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]])),
- _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[14]])));
- const __m256i h = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]])),
- _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[15]])));
- const __m256i tmp2 = AVX::unpacklo_epi16(a, e);
- const __m256i tmp4 = AVX::unpacklo_epi16(b, f);
- const __m256i tmp3 = AVX::unpacklo_epi16(c, g);
- const __m256i tmp5 = AVX::unpacklo_epi16(d, h);
- const __m256i tmp10 = AVX::unpackhi_epi16(a, e);
- const __m256i tmp11 = AVX::unpackhi_epi16(c, g);
- const __m256i tmp12 = AVX::unpackhi_epi16(b, f);
- const __m256i tmp13 = AVX::unpackhi_epi16(d, h);
- const __m256i tmp0 = AVX::unpacklo_epi16(tmp2, tmp3);
- const __m256i tmp1 = AVX::unpacklo_epi16(tmp4, tmp5);
- const __m256i tmp6 = AVX::unpackhi_epi16(tmp2, tmp3);
- const __m256i tmp7 = AVX::unpackhi_epi16(tmp4, tmp5);
- const __m256i tmp8 = AVX::unpacklo_epi16(tmp10, tmp11);
- const __m256i tmp9 = AVX::unpacklo_epi16(tmp12, tmp13);
- const __m256i tmp14 = AVX::unpackhi_epi16(tmp10, tmp11);
- const __m256i tmp15 = AVX::unpackhi_epi16(tmp12, tmp13);
- v0.data() = AVX::unpacklo_epi16(tmp0, tmp1);
- v1.data() = AVX::unpackhi_epi16(tmp0, tmp1);
- v2.data() = AVX::unpacklo_epi16(tmp6, tmp7);
- v3.data() = AVX::unpackhi_epi16(tmp6, tmp7);
- v4.data() = AVX::unpacklo_epi16(tmp8, tmp9);
- v5.data() = AVX::unpackhi_epi16(tmp8, tmp9);
- v6.data() = AVX::unpacklo_epi16(tmp14, tmp15);
- v7.data() = AVX::unpackhi_epi16(tmp14, tmp15);
- }
- };
- template<typename V> struct InterleaveImpl<V, 8, 32> {
- static_assert(sizeof(typename V::value_type) == 4, "");
- // Interleave two 32-bit vectors: unpack into (v0,v1) pairs, then store one
- // 64-bit pair per record via storel/storeh on the four 128-bit halves.
- template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,
- const typename V::AsArg v0, const typename V::AsArg v1)
- {
- using namespace AVX;
- const m256 tmp0 = _mm256_unpacklo_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v1.data()));
- const m256 tmp1 = _mm256_unpackhi_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v1.data()));
- _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[0]]), lo128(tmp0));
- _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), lo128(tmp0));
- _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[2]]), lo128(tmp1));
- _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), lo128(tmp1));
- _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[4]]), hi128(tmp0));
- _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[5]]), hi128(tmp0));
- _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[6]]), hi128(tmp1));
- _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[7]]), hi128(tmp1));
- }
- // Contiguous-record fast path: with SuccessiveEntries<2> adjacent pairs can be
- // written with four unaligned 128-bit stores instead of eight 64-bit stores.
- static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<2> &i,
- const typename V::AsArg v0, const typename V::AsArg v1)
- {
- using namespace AVX;
- const m256 tmp0 = _mm256_unpacklo_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v1.data()));
- const m256 tmp1 = _mm256_unpackhi_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v1.data()));
- _mm_storeu_ps(aliasing_cast<float>(&data[i[0]]), lo128(tmp0));
- _mm_storeu_ps(aliasing_cast<float>(&data[i[2]]), lo128(tmp1));
- _mm_storeu_ps(aliasing_cast<float>(&data[i[4]]), hi128(tmp0));
- _mm_storeu_ps(aliasing_cast<float>(&data[i[6]]), hi128(tmp1));
- }
- // Interleave three 32-bit vectors. With maskstore support each record gets one
- // masked store writing only the low 3 lanes; otherwise fall back to a 2-vector
- // interleave plus a scatter of v2.
- template <typename I>
- static inline void interleave(typename V::EntryType *const data, const I &i,
- const typename V::AsArg v0, const typename V::AsArg v1,
- const typename V::AsArg v2)
- {
- using namespace AVX;
- #ifdef Vc_USE_MASKMOV_SCATTER
- const m256 tmp0 = _mm256_unpacklo_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v2.data()));
- const m256 tmp1 = _mm256_unpackhi_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v2.data()));
- const m256 tmp2 = _mm256_unpacklo_ps(avx_cast<m256>(v1.data()), avx_cast<m256>(v1.data()));
- const m256 tmp3 = _mm256_unpackhi_ps(avx_cast<m256>(v1.data()), avx_cast<m256>(v1.data()));
- const m256 tmp4 = _mm256_unpacklo_ps(tmp0, tmp2);
- const m256 tmp5 = _mm256_unpackhi_ps(tmp0, tmp2);
- const m256 tmp6 = _mm256_unpacklo_ps(tmp1, tmp3);
- const m256 tmp7 = _mm256_unpackhi_ps(tmp1, tmp3);
- // mask enables only the low 3 of 4 floats, so the 4th slot is never written.
- const m128i mask = _mm_set_epi32(0, -1, -1, -1);
- _mm_maskstore_ps(aliasing_cast<float>(&data[i[0]]), mask, lo128(tmp4));
- _mm_maskstore_ps(aliasing_cast<float>(&data[i[1]]), mask, lo128(tmp5));
- _mm_maskstore_ps(aliasing_cast<float>(&data[i[2]]), mask, lo128(tmp6));
- _mm_maskstore_ps(aliasing_cast<float>(&data[i[3]]), mask, lo128(tmp7));
- _mm_maskstore_ps(aliasing_cast<float>(&data[i[4]]), mask, hi128(tmp4));
- _mm_maskstore_ps(aliasing_cast<float>(&data[i[5]]), mask, hi128(tmp5));
- _mm_maskstore_ps(aliasing_cast<float>(&data[i[6]]), mask, hi128(tmp6));
- _mm_maskstore_ps(aliasing_cast<float>(&data[i[7]]), mask, hi128(tmp7));
- #else
- interleave(data, i, v0, v1);
- v2.scatter(data + 2, i);
- #endif
- }
- // Contiguous 3-way interleave: pre-shuffle each input, blend the three vectors
- // into w0..w2, then emit three unaligned 256-bit stores covering 24 floats.
- static inline void interleave(typename V::EntryType *const data,
- const Common::SuccessiveEntries<3> &i,
- const typename V::AsArg v0_,
- const typename V::AsArg v1_,
- const typename V::AsArg v2_)
- {
- __m256 v0 = AVX::avx_cast<__m256>(v0_.data());
- __m256 v1 = AVX::avx_cast<__m256>(v1_.data());
- __m256 v2 = AVX::avx_cast<__m256>(v2_.data());
- v0 = _mm256_shuffle_ps(v0, v0, 0x6c);
- v1 = _mm256_shuffle_ps(v1, v1, 0xb1);
- v2 = _mm256_shuffle_ps(v2, v2, 0xc6);
- __m256 w0 = Mem::blend<X0, X1, Y2, X3, Y4, X5, X6, Y7>(
- Mem::blend<X0, Y1, X2, X3, X4, X5, Y6, X7>(v0, v1), v2);
- __m256 w1 = Mem::blend<X0, Y1, X2, X3, X4, Y5, X6, X7>(
- Mem::blend<Y0, X1, X2, Y3, Y4, X5, X6, Y7>(v0, v1), v2);
- __m256 w2 = Mem::blend<Y0, X1, X2, Y3, X4, X5, Y6, X7>(
- Mem::blend<X0, X1, Y2, X3, X4, Y5, X6, X7>(v0, v1), v2);
- _mm256_storeu_ps(aliasing_cast<float>(&data[i[0]]),
- _mm256_permute2f128_ps(w0, w1, 0x20));
- _mm256_storeu_ps(aliasing_cast<float>(&data[i[0]] + 8), w2);
- _mm256_storeu_ps(aliasing_cast<float>(&data[i[0]] + 16),
- _mm256_permute2f128_ps(w1, w0, 0x31));
- }
- // Interleave four 32-bit vectors: 4x4 unpack transpose, then one unaligned
- // 128-bit store per record (records 0-3 from low halves, 4-7 from high halves).
- template <typename I>
- static inline void interleave(typename V::EntryType *const data, const I &i,
- const typename V::AsArg v0, const typename V::AsArg v1,
- const typename V::AsArg v2, const typename V::AsArg v3)
- {
- using namespace AVX;
- const __m256 tmp0 =
- _mm256_unpacklo_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v2.data()));
- const __m256 tmp1 =
- _mm256_unpackhi_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v2.data()));
- const __m256 tmp2 =
- _mm256_unpacklo_ps(avx_cast<m256>(v1.data()), avx_cast<m256>(v3.data()));
- const __m256 tmp3 =
- _mm256_unpackhi_ps(avx_cast<m256>(v1.data()), avx_cast<m256>(v3.data()));
- // _04 holds records 0 (low half) and 4 (high half), etc.
- const __m256 _04 = _mm256_unpacklo_ps(tmp0, tmp2);
- const __m256 _15 = _mm256_unpackhi_ps(tmp0, tmp2);
- const __m256 _26 = _mm256_unpacklo_ps(tmp1, tmp3);
- const __m256 _37 = _mm256_unpackhi_ps(tmp1, tmp3);
- _mm_storeu_ps(aliasing_cast<float>(&data[i[0]]), lo128(_04));
- _mm_storeu_ps(aliasing_cast<float>(&data[i[1]]), lo128(_15));
- _mm_storeu_ps(aliasing_cast<float>(&data[i[2]]), lo128(_26));
- _mm_storeu_ps(aliasing_cast<float>(&data[i[3]]), lo128(_37));
- _mm_storeu_ps(aliasing_cast<float>(&data[i[4]]), hi128(_04));
- _mm_storeu_ps(aliasing_cast<float>(&data[i[5]]), hi128(_15));
- _mm_storeu_ps(aliasing_cast<float>(&data[i[6]]), hi128(_26));
- _mm_storeu_ps(aliasing_cast<float>(&data[i[7]]), hi128(_37));
- }
- // Contiguous-record variant of the 4-way interleave: same transpose, but the
- // eight 128-bit record stores are fused into four 256-bit permute+store pairs.
- static inline void interleave(typename V::EntryType *const data,
- const Common::SuccessiveEntries<4> &i,
- const typename V::AsArg v0, const typename V::AsArg v1,
- const typename V::AsArg v2, const typename V::AsArg v3)
- {
- using namespace AVX;
- const __m256 tmp0 =
- _mm256_unpacklo_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v2.data()));
- const __m256 tmp1 =
- _mm256_unpackhi_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v2.data()));
- const __m256 tmp2 =
- _mm256_unpacklo_ps(avx_cast<m256>(v1.data()), avx_cast<m256>(v3.data()));
- const __m256 tmp3 =
- _mm256_unpackhi_ps(avx_cast<m256>(v1.data()), avx_cast<m256>(v3.data()));
- const __m256 _04 = _mm256_unpacklo_ps(tmp0, tmp2);
- const __m256 _15 = _mm256_unpackhi_ps(tmp0, tmp2);
- const __m256 _26 = _mm256_unpacklo_ps(tmp1, tmp3);
- const __m256 _37 = _mm256_unpackhi_ps(tmp1, tmp3);
- _mm256_storeu_ps(aliasing_cast<float>(&data[i[0]]),
- _mm256_permute2f128_ps(_04, _15, 0x20));
- _mm256_storeu_ps(aliasing_cast<float>(&data[i[0]] + 8),
- _mm256_permute2f128_ps(_26, _37, 0x20));
- _mm256_storeu_ps(aliasing_cast<float>(&data[i[0]] + 16),
- _mm256_permute2f128_ps(_04, _15, 0x31));
- _mm256_storeu_ps(aliasing_cast<float>(&data[i[0]] + 24),
- _mm256_permute2f128_ps(_26, _37, 0x31));
- }
- // Interleave five vectors: 4-vector interleave, then scatter v4 into the
- // fifth slot of each record.
- template <typename I>
- static inline void interleave(typename V::EntryType *const data, const I &i,
- const typename V::AsArg v0, const typename V::AsArg v1,
- const typename V::AsArg v2, const typename V::AsArg v3,
- const typename V::AsArg v4)
- {
- interleave(data, i, v0, v1, v2, v3);
- v4.scatter(data + 4, i);
- }
- // Interleave six vectors as a 4-vector plus a 2-vector interleave at offset 4.
- template <typename I>
- static inline void interleave(typename V::EntryType *const data, const I &i,
- const typename V::AsArg v0, const typename V::AsArg v1,
- const typename V::AsArg v2, const typename V::AsArg v3,
- const typename V::AsArg v4, const typename V::AsArg v5)
- {
- interleave(data, i, v0, v1, v2, v3);
- interleave(data + 4, i, v4, v5);
- }
- // Interleave seven vectors as a 4-vector plus a 3-vector interleave at offset 4.
- template <typename I>
- static inline void interleave(typename V::EntryType *const data, const I &i,
- const typename V::AsArg v0, const typename V::AsArg v1,
- const typename V::AsArg v2, const typename V::AsArg v3,
- const typename V::AsArg v4, const typename V::AsArg v5,
- const typename V::AsArg v6)
- {
- interleave(data, i, v0, v1, v2, v3);
- interleave(data + 4, i, v4, v5, v6);
- }
- // Interleave eight vectors as two independent 4-vector interleaves.
- template <typename I>
- static inline void interleave(typename V::EntryType *const data, const I &i,
- const typename V::AsArg v0, const typename V::AsArg v1,
- const typename V::AsArg v2, const typename V::AsArg v3,
- const typename V::AsArg v4, const typename V::AsArg v5,
- const typename V::AsArg v6, const typename V::AsArg v7)
- {
- interleave(data, i, v0, v1, v2, v3);
- interleave(data + 4, i, v4, v5, v6, v7);
- }
- // Deinterleave two 32-bit vectors: gather one 64-bit pair per record with
- // loadl/loadh, concat to 256 bits, then unpack-transpose into v0/v1.
- template <typename I>
- static inline void deinterleave(typename V::EntryType const *const data, const I &i,
- V &v0, V &v1)
- {
- using namespace AVX;
- const m128 il0 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&data[i[0]]));
- const m128 il2 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&data[i[2]]));
- const m128 il4 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&data[i[4]]));
- const m128 il6 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&data[i[6]]));
- const m128 il01 = _mm_loadh_pi( il0, reinterpret_cast<__m64 const *>(&data[i[1]]));
- const m128 il23 = _mm_loadh_pi( il2, reinterpret_cast<__m64 const *>(&data[i[3]]));
- const m128 il45 = _mm_loadh_pi( il4, reinterpret_cast<__m64 const *>(&data[i[5]]));
- const m128 il67 = _mm_loadh_pi( il6, reinterpret_cast<__m64 const *>(&data[i[7]]));
- const m256 tmp2 = concat(il01, il45);
- const m256 tmp3 = concat(il23, il67);
- const m256 tmp0 = _mm256_unpacklo_ps(tmp2, tmp3);
- const m256 tmp1 = _mm256_unpackhi_ps(tmp2, tmp3);
- v0.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(tmp0, tmp1));
- v1.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(tmp0, tmp1));
- }
- // Contiguous-record fast path: two 256-bit unaligned loads cover all eight
- // pairs; shuffle128 + unpack recovers v0/v1.
- static inline void deinterleave(typename V::EntryType const *const data,
- const Common::SuccessiveEntries<2> &i, V &v0, V &v1)
- {
- using namespace AVX;
- const m256 il0123 = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0]]));
- const m256 il4567 = _mm256_loadu_ps(aliasing_cast<float>(&data[i[4]]));
- const m256 tmp2 = Mem::shuffle128<X0, Y0>(il0123, il4567);
- const m256 tmp3 = Mem::shuffle128<X1, Y1>(il0123, il4567);
- const m256 tmp0 = _mm256_unpacklo_ps(tmp2, tmp3);
- const m256 tmp1 = _mm256_unpackhi_ps(tmp2, tmp3);
- v0.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(tmp0, tmp1));
- v1.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(tmp0, tmp1));
- }
- // Deinterleave three 32-bit vectors. Loads 4 floats per record — NOTE(review):
- // reads one entry past the 3 used per record; assumes that slot is readable.
- template <typename I>
- static inline void deinterleave(typename V::EntryType const *const data, const I &i,
- V &v0, V &v1, V &v2)
- {
- using namespace AVX;
- const m128 il0 = _mm_loadu_ps(aliasing_cast<float>(&data[i[0]]));
- const m128 il1 = _mm_loadu_ps(aliasing_cast<float>(&data[i[1]]));
- const m128 il2 = _mm_loadu_ps(aliasing_cast<float>(&data[i[2]]));
- const m128 il3 = _mm_loadu_ps(aliasing_cast<float>(&data[i[3]]));
- const m128 il4 = _mm_loadu_ps(aliasing_cast<float>(&data[i[4]]));
- const m128 il5 = _mm_loadu_ps(aliasing_cast<float>(&data[i[5]]));
- const m128 il6 = _mm_loadu_ps(aliasing_cast<float>(&data[i[6]]));
- const m128 il7 = _mm_loadu_ps(aliasing_cast<float>(&data[i[7]]));
- const m256 il04 = concat(il0, il4);
- const m256 il15 = concat(il1, il5);
- const m256 il26 = concat(il2, il6);
- const m256 il37 = concat(il3, il7);
- const m256 ab0246 = _mm256_unpacklo_ps(il04, il26);
- const m256 ab1357 = _mm256_unpacklo_ps(il15, il37);
- const m256 cd0246 = _mm256_unpackhi_ps(il04, il26);
- const m256 cd1357 = _mm256_unpackhi_ps(il15, il37);
- v0.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(ab0246, ab1357));
- v1.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(ab0246, ab1357));
- v2.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(cd0246, cd1357));
- }
- // Inverse of the SuccessiveEntries<3> interleave: three 256-bit loads, the
- // mirrored permute/blend network, then the same shuffle masks to restore order.
- static inline void deinterleave(typename V::EntryType const *const data,
- const Common::SuccessiveEntries<3> &i, V &v0, V &v1,
- V &v2)
- {
- __m256 in0 = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0]] + 0));
- __m256 in1 = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0]] + 8));
- __m256 in2 = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0]] + 16));
- const __m256 aaabffgg = _mm256_permute2f128_ps(in0, in2, 0x20);
- const __m256 cdddeeef = in1;
- const __m256 bbccghhh = _mm256_permute2f128_ps(in0, in2, 0x31);
- const __m256 x0 = _mm256_blend_ps(
- _mm256_blend_ps(aaabffgg, cdddeeef, 0 + 2 + 0 + 0 + 0x10 + 0 + 0 + 0x80),
- bbccghhh, 0 + 0 + 4 + 0 + 0 + 0x20 + 0 + 0);
- const __m256 x1 = _mm256_blend_ps(
- _mm256_blend_ps(aaabffgg, cdddeeef, 0 + 0 + 4 + 0 + 0 + 0x20 + 0 + 0),
- bbccghhh, 1 + 0 + 0 + 8 + 0 + 0 + 0x40 + 0);
- const __m256 x2 = _mm256_blend_ps(
- _mm256_blend_ps(aaabffgg, cdddeeef, 1 + 0 + 0 + 8 + 0 + 0 + 0x40 + 0),
- bbccghhh, 0 + 2 + 0 + 0 + 0x10 + 0 + 0 + 0x80);
- v0 = AVX::avx_cast<typename V::VectorType>(_mm256_shuffle_ps(x0, x0, 0x6c));
- v1 = AVX::avx_cast<typename V::VectorType>(_mm256_shuffle_ps(x1, x1, 0xb1));
- v2 = AVX::avx_cast<typename V::VectorType>(_mm256_shuffle_ps(x2, x2, 0xc6));
- }
- // Deinterleave four 32-bit vectors: eight 128-bit record loads, concat into
- // four 256-bit lanes, then a full 4x4 unpack transpose.
- template <typename I>
- static inline void deinterleave(typename V::EntryType const *const data, const I &i,
- V &v0, V &v1, V &v2, V &v3)
- {
- using namespace AVX;
- const m128 il0 = _mm_loadu_ps(aliasing_cast<float>(&data[i[0]]));
- const m128 il1 = _mm_loadu_ps(aliasing_cast<float>(&data[i[1]]));
- const m128 il2 = _mm_loadu_ps(aliasing_cast<float>(&data[i[2]]));
- const m128 il3 = _mm_loadu_ps(aliasing_cast<float>(&data[i[3]]));
- const m128 il4 = _mm_loadu_ps(aliasing_cast<float>(&data[i[4]]));
- const m128 il5 = _mm_loadu_ps(aliasing_cast<float>(&data[i[5]]));
- const m128 il6 = _mm_loadu_ps(aliasing_cast<float>(&data[i[6]]));
- const m128 il7 = _mm_loadu_ps(aliasing_cast<float>(&data[i[7]]));
- const m256 il04 = concat(il0, il4);
- const m256 il15 = concat(il1, il5);
- const m256 il26 = concat(il2, il6);
- const m256 il37 = concat(il3, il7);
- const m256 ab0246 = _mm256_unpacklo_ps(il04, il26);
- const m256 ab1357 = _mm256_unpacklo_ps(il15, il37);
- const m256 cd0246 = _mm256_unpackhi_ps(il04, il26);
- const m256 cd1357 = _mm256_unpackhi_ps(il15, il37);
- v0.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(ab0246, ab1357));
- v1.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(ab0246, ab1357));
- v2.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(cd0246, cd1357));
- v3.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(cd0246, cd1357));
- }
- // Contiguous-record variant: four 256-bit loads (two records each), permutes
- // to pair record k with k+4, then the same 4x4 unpack transpose.
- static inline void deinterleave(typename V::EntryType const *const data,
- const Common::SuccessiveEntries<4> &i, V &v0, V &v1,
- V &v2, V &v3)
- {
- using namespace AVX;
- const __m256 il01 = _mm256_loadu_ps(
- aliasing_cast<float>(&data[i[0]]));
- const __m256 il23 = _mm256_loadu_ps(
- aliasing_cast<float>(&data[i[2]]));
- const __m256 il45 = _mm256_loadu_ps(
- aliasing_cast<float>(&data[i[4]]));
- const __m256 il67 = _mm256_loadu_ps(
- aliasing_cast<float>(&data[i[6]]));
- const __m256 il04 = _mm256_permute2f128_ps(il01, il45, 0x20);
- const __m256 il15 = _mm256_permute2f128_ps(il01, il45, 0x31);
- const __m256 il26 = _mm256_permute2f128_ps(il23, il67, 0x20);
- const __m256 il37 = _mm256_permute2f128_ps(il23, il67, 0x31);
- const __m256 ab0246 = _mm256_unpacklo_ps(il04, il26);
- const __m256 ab1357 = _mm256_unpacklo_ps(il15, il37);
- const __m256 cd0246 = _mm256_unpackhi_ps(il04, il26);
- const __m256 cd1357 = _mm256_unpackhi_ps(il15, il37);
- v0.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(ab0246, ab1357));
- v1.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(ab0246, ab1357));
- v2.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(cd0246, cd1357));
- v3.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(cd0246, cd1357));
- }
- // Deinterleave five vectors: gather v4 from offset 4, then reuse the 4-vector
- // path for v0..v3. NOTE(review): v4 is gathered first — presumably the order
- // is deliberate; confirm before reordering.
- template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
- const I &i, V &v0, V &v1, V &v2, V &v3, V &v4)
- {
- v4.gather(data + 4, i);
- deinterleave(data, i, v0, v1, v2, v3);
- }
- // Deinterleave six vectors as a 4-vector plus a 2-vector deinterleave at
- // offset 4 within each record.
- template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
- const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
- {
- deinterleave(data, i, v0, v1, v2, v3);
- deinterleave(data + 4, i, v4, v5);
- }
- // Contiguous 6-way deinterleave: six 256-bit loads cover all 48 floats; a
- // shuffle128/unpack network performs the 8x6 transpose in one pass.
- static inline void deinterleave(typename V::EntryType const *const data,
- const Common::SuccessiveEntries<6> &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
- {
- using namespace AVX;
- const m256 a = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0]]));
- const m256 b = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0] + 1 * V::Size]));
- const m256 c = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0] + 2 * V::Size]));
- const m256 d = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0] + 3 * V::Size]));
- const m256 e = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0] + 4 * V::Size]));
- const m256 f = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0] + 5 * V::Size]));
- const __m256 tmp2 = Mem::shuffle128<X0, Y0>(a, d);
- const __m256 tmp3 = Mem::shuffle128<X1, Y1>(b, e);
- const __m256 tmp4 = Mem::shuffle128<X1, Y1>(a, d);
- const __m256 tmp5 = Mem::shuffle128<X0, Y0>(c, f);
- const __m256 tmp8 = Mem::shuffle128<X0, Y0>(b, e);
- const __m256 tmp9 = Mem::shuffle128<X1, Y1>(c, f);
- const __m256 tmp0 = _mm256_unpacklo_ps(tmp2, tmp3);
- const __m256 tmp1 = _mm256_unpackhi_ps(tmp4, tmp5);
- const __m256 tmp6 = _mm256_unpackhi_ps(tmp2, tmp3);
- const __m256 tmp7 = _mm256_unpacklo_ps(tmp8, tmp9);
- const __m256 tmp10 = _mm256_unpacklo_ps(tmp4, tmp5);
- const __m256 tmp11 = _mm256_unpackhi_ps(tmp8, tmp9);
- v0.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(tmp0, tmp1));
- v1.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(tmp0, tmp1));
- v2.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(tmp6, tmp7));
- v3.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(tmp6, tmp7));
- v4.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(tmp10, tmp11));
- v5.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(tmp10, tmp11));
- }
- template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
- const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6)
- {
- deinterleave(data, i, v0, v1, v2, v3);
- deinterleave(data + 4, i, v4, v5, v6);
- }
- template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
- const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7)
- {
- deinterleave(data, i, v0, v1, v2, v3);
- deinterleave(data + 4, i, v4, v5, v6, v7);
- }
- };
// InterleaveImpl specialization for vectors with 4 entries in 32-byte
// registers, i.e. 8-byte entry types on AVX (e.g. double_v).
// interleave() scatters SoA vectors into AoS records at the given indexes;
// deinterleave() is the inverse gather.
template<typename V> struct InterleaveImpl<V, 4, 32> {
    // Write 2-member records: data[i[n] + 0] = v0[n], data[i[n] + 1] = v1[n].
    template <typename I>
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1)
    {
        using namespace AVX;
        // tmp0 = (v0[0] v1[0] | v0[2] v1[2]), tmp1 = (v0[1] v1[1] | v0[3] v1[3])
        const m256d tmp0 = _mm256_unpacklo_pd(v0.data(), v1.data());
        const m256d tmp1 = _mm256_unpackhi_pd(v0.data(), v1.data());
        // Each 128-bit half is one complete record; store it unaligned.
        _mm_storeu_pd(&data[i[0]], lo128(tmp0));
        _mm_storeu_pd(&data[i[1]], lo128(tmp1));
        _mm_storeu_pd(&data[i[2]], hi128(tmp0));
        _mm_storeu_pd(&data[i[3]], hi128(tmp1));
    }
    // Write 3-member records. With Vc_USE_MASKMOV_SCATTER each record is
    // written in one masked 32-byte store (only the low three lanes are
    // enabled); otherwise v2 is scattered separately after a 2-way interleave.
    template <typename I>
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2)
    {
        using namespace AVX;
#ifdef Vc_USE_MASKMOV_SCATTER
        const m256d tmp0 = _mm256_unpacklo_pd(v0.data(), v1.data());
        const m256d tmp1 = _mm256_unpackhi_pd(v0.data(), v1.data());
        const m256d tmp2 = _mm256_unpacklo_pd(v2.data(), v2.data());
        const m256d tmp3 = _mm256_unpackhi_pd(v2.data(), v2.data());
#if defined(Vc_MSVC) && (Vc_MSVC < 170000000 || !defined(_WIN64))
        // Work around _mm256_set_epi64x being unavailable on pre-VS2012 or
        // 32-bit MSVC builds: assemble the lane mask from two 128-bit halves.
        const m256i mask = concat(_mm_setallone_si128(), _mm_set_epi32(0, 0, -1, -1));
#else
        const m256i mask = _mm256_set_epi64x(0, -1, -1, -1);
#endif
        // The mask enables the three low 64-bit lanes, so each store writes
        // exactly (v0[n], v1[n], v2[n]) and leaves the fourth lane untouched.
        _mm256_maskstore_pd(&data[i[0]], mask, Mem::shuffle128<X0, Y0>(tmp0, tmp2));
        _mm256_maskstore_pd(&data[i[1]], mask, Mem::shuffle128<X0, Y0>(tmp1, tmp3));
        _mm256_maskstore_pd(&data[i[2]], mask, Mem::shuffle128<X1, Y1>(tmp0, tmp2));
        _mm256_maskstore_pd(&data[i[3]], mask, Mem::shuffle128<X1, Y1>(tmp1, tmp3));
#else
        interleave(data, i, v0, v1);
        v2.scatter(data + 2, i);
#endif
    }
    // Write 4-member records with two 16-byte stores per index.
    template <typename I>
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3)
    {
        using namespace AVX;
        const m256d tmp0 = _mm256_unpacklo_pd(v0.data(), v1.data());
        const m256d tmp1 = _mm256_unpackhi_pd(v0.data(), v1.data());
        const m256d tmp2 = _mm256_unpacklo_pd(v2.data(), v3.data());
        const m256d tmp3 = _mm256_unpackhi_pd(v2.data(), v3.data());
        _mm_storeu_pd(&data[i[0] ], lo128(tmp0));
        _mm_storeu_pd(&data[i[0]+2], lo128(tmp2));
        _mm_storeu_pd(&data[i[1] ], lo128(tmp1));
        _mm_storeu_pd(&data[i[1]+2], lo128(tmp3));
        _mm_storeu_pd(&data[i[2] ], hi128(tmp0));
        _mm_storeu_pd(&data[i[2]+2], hi128(tmp2));
        _mm_storeu_pd(&data[i[3] ], hi128(tmp1));
        _mm_storeu_pd(&data[i[3]+2], hi128(tmp3));
    }
    // 5- to 8-member records: reuse the smaller interleaves (and a scatter
    // for an odd trailing member).
    template <typename I>
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4)
    {
        interleave(data, i, v0, v1, v2, v3);
        v4.scatter(data + 4, i);
    }
    template <typename I>
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4, const typename V::AsArg v5)
    {
        interleave(data, i, v0, v1, v2, v3);
        interleave(data + 4, i, v4, v5);
    }
    template <typename I>
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4, const typename V::AsArg v5,
                                  const typename V::AsArg v6)
    {
        interleave(data, i, v0, v1, v2, v3);
        interleave(data + 4, i, v4, v5, v6);
    }
    template <typename I>
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4, const typename V::AsArg v5,
                                  const typename V::AsArg v6, const typename V::AsArg v7)
    {
        interleave(data, i, v0, v1, v2, v3);
        interleave(data + 4, i, v4, v5, v6, v7);
    }
    // Read 2-member records: v0[n] = data[i[n] + 0], v1[n] = data[i[n] + 1].
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
                                                         const I &i, V &v0, V &v1)
    {
        using namespace Vc::AVX;
        // ab02 holds records 0 and 2, ab13 records 1 and 3; the unpacks
        // separate first and second members.
        const m256d ab02 = concat(_mm_loadu_pd(&data[i[0]]), _mm_loadu_pd(&data[i[2]]));
        const m256d ab13 = concat(_mm_loadu_pd(&data[i[1]]), _mm_loadu_pd(&data[i[3]]));
        v0.data() = _mm256_unpacklo_pd(ab02, ab13);
        v1.data() = _mm256_unpackhi_pd(ab02, ab13);
    }
    // 3- to 8-member deinterleaves reuse the 2-member version; odd trailing
    // members are fetched with a gather.
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
                                                         const I &i, V &v0, V &v1, V &v2)
    {
        v2.gather(data + 2, i);
        deinterleave(data, i, v0, v1);
    }
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
                                                         const I &i, V &v0, V &v1, V &v2, V &v3)
    {
        deinterleave(data, i, v0, v1);
        deinterleave(data + 2, i, v2, v3);
    }
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
                                                         const I &i, V &v0, V &v1, V &v2, V &v3, V &v4)
    {
        v4.gather(data + 4, i);
        deinterleave(data, i, v0, v1);
        deinterleave(data + 2, i, v2, v3);
    }
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
                                                         const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
    {
        deinterleave(data, i, v0, v1);
        deinterleave(data + 2, i, v2, v3);
        deinterleave(data + 4, i, v4, v5);
    }
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
                                                         const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6)
    {
        v6.gather(data + 6, i);
        deinterleave(data, i, v0, v1);
        deinterleave(data + 2, i, v2, v3);
        deinterleave(data + 4, i, v4, v5);
    }
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
                                                         const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7)
    {
        deinterleave(data, i, v0, v1);
        deinterleave(data + 2, i, v2, v3);
        deinterleave(data + 4, i, v4, v5);
        deinterleave(data + 6, i, v6, v7);
    }
};
- }
- }
- #endif
- namespace Vc_VERSIONED_NAMESPACE
- {
// AVX(2) mask type: one boolean per vector entry, represented as an
// all-ones / all-zeros lane of sizeof(T) bytes in a 32-byte register.
template <typename T> class Mask<T, VectorAbi::Avx>
{
public:
    using abi = VectorAbi::Avx;
    typedef bool EntryType;  // element type as seen by users
    using value_type = EntryType;
    using MaskBool = Common::MaskBool<sizeof(T)>;  // per-lane bool representation
    using VectorEntryType = MaskBool;
    using Vector = AVX2::Vector<T>;
    // Float/double/integer views of the same underlying register.
    using VectorTypeF = AVX::FloatVectorType<typename AVX::VectorTypeHelper<T>::Type>;
    using VectorTypeD = AVX::DoubleVectorType<VectorTypeF>;
    using VectorTypeI = AVX::IntegerVectorType<VectorTypeF>;
private:
    typedef const VectorTypeF VArg;
    typedef const VectorTypeD VdArg;
    typedef const VectorTypeI ViArg;
public:
    static constexpr size_t Size = sizeof(VectorTypeF) / sizeof(T);
    static constexpr size_t MemoryAlignment = Size;
    static constexpr std::size_t size() { return Size; }
    Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(VectorType));
private:
    typedef Common::Storage<T, Size> Storage;
public:
    using VectorType = typename Storage::VectorType;
    using EntryReference = Vc::Detail::ElementReference<Mask>;
    using reference = EntryReference;
    // Pass masks by reference on 32-bit MSVC, by value elsewhere.
#if defined Vc_MSVC && defined _WIN32
    typedef const Mask &AsArg;
#else
    typedef const Mask AsArg;
#endif
    Vc_INTRINSIC Mask() {}
    // Construct from a raw register value (any of the three views).
    Vc_INTRINSIC Mask(VArg x) : d(AVX::avx_cast<VectorType>(x)) {}
    Vc_INTRINSIC Mask(VdArg x) : d(AVX::avx_cast<VectorType>(x)) {}
    Vc_INTRINSIC Mask(ViArg x) : d(AVX::avx_cast<VectorType>(x)) {}
    Vc_INTRINSIC explicit Mask(VectorSpecialInitializerZero) : d(Detail::zero<VectorType>()) {}
    Vc_INTRINSIC explicit Mask(VectorSpecialInitializerOne) : d(Detail::allone<VectorType>()) {}
    // Broadcast constructor: all entries true or all entries false.
    Vc_INTRINSIC explicit Mask(bool b)
        : d(b ? Detail::allone<VectorType>() : Detail::zero<VectorType>())
    {
    }
    Vc_INTRINSIC static Mask Zero() { return Mask{Vc::Zero}; }
    Vc_INTRINSIC static Mask One() { return Mask{Vc::One}; }
    // Implicit conversion from masks of other entry types where allowed
    // (the mask_cast adapts the lane widths).
    template <typename U>
    Vc_INTRINSIC Mask(
        U &&rhs, Common::enable_if_mask_converts_implicitly<Mask, T, U> = nullarg)
        : d(AVX::avx_cast<VectorType>(
              Detail::mask_cast<Traits::decay<U>::Size, Size, VectorTypeF>(
                  rhs.dataI())))
    {
    }
#if Vc_IS_VERSION_1
    // Vc 1.x compatibility: explicit conversion between arbitrary mask types.
    template <typename U>
    Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between "
                  "mask types") Vc_INTRINSIC
        explicit Mask(U &&rhs,
                      Common::enable_if_mask_converts_explicitly<T, U> = nullarg);
#endif
    // Load/store as an array of Size bools.
    // NOTE(review): store() defaults its Flags to DefaultLoadTag rather than
    // DefaultStoreTag — possibly intentional for masks; verify upstream.
    template<typename Flags = DefaultLoadTag> Vc_INTRINSIC explicit Mask(const bool *mem, Flags f = Flags()) { load(mem, f); }
    template<typename Flags = DefaultLoadTag> Vc_INTRINSIC void load(const bool *mem, Flags = Flags());
    template<typename Flags = DefaultLoadTag> Vc_INTRINSIC void store(bool *mem, Flags = Flags()) const;
    Vc_INTRINSIC Mask &operator=(const Mask &) = default;
    Vc_INTRINSIC_L Mask &operator=(const std::array<bool, Size> &values) Vc_INTRINSIC_R;
    Vc_INTRINSIC_L operator std::array<bool, Size>() const Vc_INTRINSIC_R;
    // Whole-mask equality via the movemask bit pattern of the registers.
    Vc_INTRINSIC Vc_PURE bool operator==(const Mask &rhs) const
    { return Detail::movemask(d.v()) == Detail::movemask(rhs.d.v()); }
    Vc_INTRINSIC Vc_PURE bool operator!=(const Mask &rhs) const
    { return !operator==(rhs); }
    Vc_INTRINSIC Mask operator!() const
    {
#ifdef Vc_GCC
        // GCC vector extensions allow a direct bitwise complement.
        return ~dataI();
#else
        return Detail::andnot_(dataF(), Detail::allone<VectorTypeF>());
#endif
    }
    // Bitwise logic. Note that && and || are element-wise on masks and do
    // not short-circuit.
    Vc_INTRINSIC Mask &operator&=(const Mask &rhs) { d.v() = AVX::avx_cast<VectorType>(Detail::and_(data(), rhs.data())); return *this; }
    Vc_INTRINSIC Mask &operator|=(const Mask &rhs) { d.v() = AVX::avx_cast<VectorType>(Detail::or_ (data(), rhs.data())); return *this; }
    Vc_INTRINSIC Mask &operator^=(const Mask &rhs) { d.v() = AVX::avx_cast<VectorType>(Detail::xor_(data(), rhs.data())); return *this; }
    Vc_INTRINSIC Vc_PURE Mask operator&(const Mask &rhs) const { return Detail::and_(data(), rhs.data()); }
    Vc_INTRINSIC Vc_PURE Mask operator|(const Mask &rhs) const { return Detail::or_(data(), rhs.data()); }
    Vc_INTRINSIC Vc_PURE Mask operator^(const Mask &rhs) const { return Detail::xor_(data(), rhs.data()); }
    Vc_INTRINSIC Vc_PURE Mask operator&&(const Mask &rhs) const { return Detail::and_(data(), rhs.data()); }
    Vc_INTRINSIC Vc_PURE Mask operator||(const Mask &rhs) const { return Detail::or_(data(), rhs.data()); }
    // Population queries; defined out of line below.
    Vc_INTRINSIC_L bool isNotEmpty() const Vc_INTRINSIC_R;
    Vc_INTRINSIC_L bool isEmpty() const Vc_INTRINSIC_R;
    Vc_INTRINSIC_L bool isFull() const Vc_INTRINSIC_R;
    Vc_INTRINSIC_L bool isMix() const Vc_INTRINSIC_R;
    // Bit pattern of the integer view (finer than one bit per entry for
    // narrow entry types; see the int16_t get() specialization below).
    Vc_INTRINSIC Vc_PURE int shiftMask() const { return Detail::movemask(dataI()); }
    // Exactly one bit per entry.
    Vc_INTRINSIC Vc_PURE int toInt() const { return Detail::mask_to_int<Size>(dataI()); }
    Vc_INTRINSIC VectorType data () const { return d.v(); }
    Vc_INTRINSIC VectorTypeF dataF() const { return AVX::avx_cast<VectorTypeF>(d.v()); }
    Vc_INTRINSIC VectorTypeI dataI() const { return AVX::avx_cast<VectorTypeI>(d.v()); }
    Vc_INTRINSIC VectorTypeD dataD() const { return AVX::avx_cast<VectorTypeD>(d.v()); }
private:
    friend reference;
    // Read entry i by testing its bit in the per-entry bit mask.
    static Vc_INTRINSIC Vc_PURE value_type get(const Mask &m, int i) noexcept
    {
        return m.toInt() & (1 << i);
    }
    // Write entry i through the storage's lane setter.
    template <typename U>
    static Vc_INTRINSIC void set(Mask &m, int i,
                                 U &&v) noexcept(noexcept(MaskBool(std::declval<U>())))
    {
        m.d.set(i, MaskBool(std::forward<U>(v)));
    }
public:
    // Mutable element access goes through a proxy reference (the packed
    // lane representation has no addressable bool elements).
    Vc_ALWAYS_INLINE reference operator[](size_t index) noexcept
    {
        return {*this, int(index)};
    }
    Vc_ALWAYS_INLINE Vc_PURE value_type operator[](size_t index) const noexcept
    {
        return get(*this, index);
    }
    Vc_INTRINSIC Vc_PURE int count() const { return Detail::popcnt16(toInt()); }
    Vc_INTRINSIC Vc_PURE int firstOne() const { return _bit_scan_forward(toInt()); }
    template <typename G> static Vc_INTRINSIC_L Mask generate(G &&gen) Vc_INTRINSIC_R;
    Vc_INTRINSIC_L Vc_PURE_L Mask shifted(int amount) const Vc_INTRINSIC_R Vc_PURE_R;
private:
#ifdef Vc_COMPILE_BENCHMARKS
public:
#endif
    Storage d;
};
// Out-of-class definitions for the static constexpr members (needed for
// ODR-uses under C++11/14).
template <typename T> constexpr size_t Mask<T, VectorAbi::Avx>::Size;
template <typename T> constexpr size_t Mask<T, VectorAbi::Avx>::MemoryAlignment;
- }
- namespace Vc_VERSIONED_NAMESPACE
- {
- template <typename T>
- template <typename Flags>
- Vc_INTRINSIC void Mask<T, VectorAbi::Avx>::store(bool *mem, Flags f) const
- {
- Detail::mask_store<Size>(dataI(), mem, f);
- }
- template <typename T>
- template <typename Flags>
- Vc_INTRINSIC void Mask<T, VectorAbi::Avx>::load(const bool *mem, Flags f)
- {
- d.v() = AVX::avx_cast<VectorType>(Detail::mask_load<VectorTypeF, Size>(mem, f));
- }
- #ifdef Vc_IMPL_AVX2
- template <>
- Vc_INTRINSIC Vc_PURE bool AVX2::Mask<int16_t>::get(const AVX2::Mask<int16_t> &m,
- int index) noexcept
- {
- return m.shiftMask() & (1 << 2 * index);
- }
- template <>
- Vc_INTRINSIC Vc_PURE bool AVX2::Mask<uint16_t>::get(const AVX2::Mask<uint16_t> &m,
- int index) noexcept
- {
- return m.shiftMask() & (1 << 2 * index);
- }
- #endif
- template <> Vc_INTRINSIC Vc_PURE bool AVX2::double_m::operator==(const AVX2::double_m &rhs) const
- { return Detail::movemask(dataD()) == Detail::movemask(rhs.dataD()); }
- #ifdef Vc_IMPL_AVX2
- template <> Vc_INTRINSIC Vc_PURE bool AVX2::short_m::operator==(const AVX2::short_m &rhs) const
- { return Detail::movemask(dataI()) == Detail::movemask(rhs.dataI()); }
- template <> Vc_INTRINSIC Vc_PURE bool AVX2::ushort_m::operator==(const AVX2::ushort_m &rhs) const
- { return Detail::movemask(dataI()) == Detail::movemask(rhs.dataI()); }
- #endif
- template <typename T> Vc_INTRINSIC bool Mask<T, VectorAbi::Avx>::isFull() const {
- if (sizeof(T) == 8) {
- return 0 != Detail::testc(dataD(), Detail::allone<VectorTypeD>());
- } else if (sizeof(T) == 4) {
- return 0 != Detail::testc(dataF(), Detail::allone<VectorTypeF>());
- } else {
- return 0 != Detail::testc(dataI(), Detail::allone<VectorTypeI>());
- }
- }
- template <typename T> Vc_INTRINSIC bool Mask<T, VectorAbi::Avx>::isNotEmpty() const {
- if (sizeof(T) == 8) {
- return 0 == Detail::testz(dataD(), dataD());
- } else if (sizeof(T) == 4) {
- return 0 == Detail::testz(dataF(), dataF());
- } else {
- return 0 == Detail::testz(dataI(), dataI());
- }
- }
- template <typename T> Vc_INTRINSIC bool Mask<T, VectorAbi::Avx>::isEmpty() const {
- if (sizeof(T) == 8) {
- return 0 != Detail::testz(dataD(), dataD());
- } else if (sizeof(T) == 4) {
- return 0 != Detail::testz(dataF(), dataF());
- } else {
- return 0 != Detail::testz(dataI(), dataI());
- }
- }
- template <typename T> Vc_INTRINSIC bool Mask<T, VectorAbi::Avx>::isMix() const {
- if (sizeof(T) == 8) {
- return 0 != Detail::testnzc(dataD(), Detail::allone<VectorTypeD>());
- } else if (sizeof(T) == 4) {
- return 0 != Detail::testnzc(dataF(), Detail::allone<VectorTypeF>());
- } else {
- return 0 != Detail::testnzc(dataI(), Detail::allone<VectorTypeI>());
- }
- }
- template <typename M, typename G>
- Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 4 + 32>)
- {
- return _mm256_setr_epi64x(
- gen(0) ? 0xffffffffffffffffull : 0, gen(1) ? 0xffffffffffffffffull : 0,
- gen(2) ? 0xffffffffffffffffull : 0, gen(3) ? 0xffffffffffffffffull : 0);
- }
- template <typename M, typename G>
- Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 8 + 32>)
- {
- return _mm256_setr_epi32(gen(0) ? 0xfffffffful : 0, gen(1) ? 0xfffffffful : 0,
- gen(2) ? 0xfffffffful : 0, gen(3) ? 0xfffffffful : 0,
- gen(4) ? 0xfffffffful : 0, gen(5) ? 0xfffffffful : 0,
- gen(6) ? 0xfffffffful : 0, gen(7) ? 0xfffffffful : 0);
- }
- template <typename M, typename G>
- Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 16 + 32>)
- {
- return _mm256_setr_epi16(gen(0) ? 0xfffful : 0, gen(1) ? 0xfffful : 0,
- gen(2) ? 0xfffful : 0, gen(3) ? 0xfffful : 0,
- gen(4) ? 0xfffful : 0, gen(5) ? 0xfffful : 0,
- gen(6) ? 0xfffful : 0, gen(7) ? 0xfffful : 0,
- gen(8) ? 0xfffful : 0, gen(9) ? 0xfffful : 0,
- gen(10) ? 0xfffful : 0, gen(11) ? 0xfffful : 0,
- gen(12) ? 0xfffful : 0, gen(13) ? 0xfffful : 0,
- gen(14) ? 0xfffful : 0, gen(15) ? 0xfffful : 0);
- }
// Dispatch to the matching generate_impl overload. The integral_constant tag
// encodes entry count plus register size: Size (4, 8, or 16) + sizeof(Storage)
// (32 bytes), selecting the 4+32 / 8+32 / 16+32 overloads above.
template <typename T>
template <typename G>
Vc_INTRINSIC AVX2::Mask<T> Mask<T, VectorAbi::Avx>::generate(G &&gen)
{
    return generate_impl<AVX2::Mask<T>>(std::forward<G>(gen),
                                        std::integral_constant<int, Size + sizeof(Storage)>());
}
// Shift the mask by `amount` entries. The entry shift is converted to a byte
// shift of the underlying register (one entry == sizeof(VectorEntryType)
// bytes) and dispatched to the compile-time Detail::shifted<N>() overloads.
// Any byte shift outside [-31, 31] falls through and yields an empty mask.
template <typename T> Vc_INTRINSIC Vc_PURE AVX2::Mask<T> Mask<T, VectorAbi::Avx>::shifted(int amount) const
{
    switch (amount * int(sizeof(VectorEntryType))) {
    case 0: return *this;
    // positive byte shifts
    case 1: return Detail::shifted< 1>(dataI());
    case 2: return Detail::shifted< 2>(dataI());
    case 3: return Detail::shifted< 3>(dataI());
    case 4: return Detail::shifted< 4>(dataI());
    case 5: return Detail::shifted< 5>(dataI());
    case 6: return Detail::shifted< 6>(dataI());
    case 7: return Detail::shifted< 7>(dataI());
    case 8: return Detail::shifted< 8>(dataI());
    case 9: return Detail::shifted< 9>(dataI());
    case 10: return Detail::shifted< 10>(dataI());
    case 11: return Detail::shifted< 11>(dataI());
    case 12: return Detail::shifted< 12>(dataI());
    case 13: return Detail::shifted< 13>(dataI());
    case 14: return Detail::shifted< 14>(dataI());
    case 15: return Detail::shifted< 15>(dataI());
    case 16: return Detail::shifted< 16>(dataI());
    case 17: return Detail::shifted< 17>(dataI());
    case 18: return Detail::shifted< 18>(dataI());
    case 19: return Detail::shifted< 19>(dataI());
    case 20: return Detail::shifted< 20>(dataI());
    case 21: return Detail::shifted< 21>(dataI());
    case 22: return Detail::shifted< 22>(dataI());
    case 23: return Detail::shifted< 23>(dataI());
    case 24: return Detail::shifted< 24>(dataI());
    case 25: return Detail::shifted< 25>(dataI());
    case 26: return Detail::shifted< 26>(dataI());
    case 27: return Detail::shifted< 27>(dataI());
    case 28: return Detail::shifted< 28>(dataI());
    case 29: return Detail::shifted< 29>(dataI());
    case 30: return Detail::shifted< 30>(dataI());
    case 31: return Detail::shifted< 31>(dataI());
    // negative byte shifts
    case -1: return Detail::shifted< -1>(dataI());
    case -2: return Detail::shifted< -2>(dataI());
    case -3: return Detail::shifted< -3>(dataI());
    case -4: return Detail::shifted< -4>(dataI());
    case -5: return Detail::shifted< -5>(dataI());
    case -6: return Detail::shifted< -6>(dataI());
    case -7: return Detail::shifted< -7>(dataI());
    case -8: return Detail::shifted< -8>(dataI());
    case -9: return Detail::shifted< -9>(dataI());
    case -10: return Detail::shifted<-10>(dataI());
    case -11: return Detail::shifted<-11>(dataI());
    case -12: return Detail::shifted<-12>(dataI());
    case -13: return Detail::shifted<-13>(dataI());
    case -14: return Detail::shifted<-14>(dataI());
    case -15: return Detail::shifted<-15>(dataI());
    case -16: return Detail::shifted<-16>(dataI());
    case -17: return Detail::shifted<-17>(dataI());
    case -18: return Detail::shifted<-18>(dataI());
    case -19: return Detail::shifted<-19>(dataI());
    case -20: return Detail::shifted<-20>(dataI());
    case -21: return Detail::shifted<-21>(dataI());
    case -22: return Detail::shifted<-22>(dataI());
    case -23: return Detail::shifted<-23>(dataI());
    case -24: return Detail::shifted<-24>(dataI());
    case -25: return Detail::shifted<-25>(dataI());
    case -26: return Detail::shifted<-26>(dataI());
    case -27: return Detail::shifted<-27>(dataI());
    case -28: return Detail::shifted<-28>(dataI());
    case -29: return Detail::shifted<-29>(dataI());
    case -30: return Detail::shifted<-30>(dataI());
    case -31: return Detail::shifted<-31>(dataI());
    }
    // Shifted past the full register width: nothing remains.
    return Zero();
}
- }
- #endif
- #include <algorithm>
- #include <cmath>
- #ifdef isfinite
- #undef isfinite
- #endif
- #ifdef isnan
- #undef isnan
- #endif
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace Detail
- {
// Maps a (T, Abi) pair to the associated mask, vector, write-masked-vector,
// and intrinsic register types used by the Vector implementation below.
template <typename T, typename Abi> struct VectorTraits
{
    using mask_type = Vc::Mask<T, Abi>;
    using vector_type = Vc::Vector<T, Abi>;
    using writemasked_vector_type = Common::WriteMaskedVector<vector_type, mask_type>;
    // NOTE(review): the intrinsic type comes from the AVX helper regardless
    // of Abi — presumably this header is only instantiated for AVX ABIs;
    // confirm before reusing elsewhere.
    using intrinsic_type = typename AVX::VectorTypeHelper<T>::Type;
};
- }
- #define Vc_CURRENT_CLASS_NAME Vector
- template <typename T> class Vector<T, VectorAbi::Avx>
- {
- public:
- using abi = VectorAbi::Avx;
- private:
- using traits_type = Detail::VectorTraits<T, abi>;
- static_assert(
- std::is_arithmetic<T>::value,
- "Vector<T> only accepts arithmetic builtin types as template parameter T.");
- using WriteMaskedVector = typename traits_type::writemasked_vector_type;
- public:
- using VectorType = typename traits_type::intrinsic_type;
- using vector_type = VectorType;
- using mask_type = typename traits_type::mask_type;
- using Mask = mask_type;
- using MaskType = mask_type;
- using MaskArg Vc_DEPRECATED_ALIAS("Use MaskArgument instead.") = typename Mask::AsArg;
- using MaskArgument = typename Mask::AsArg;
- using reference = Detail::ElementReference<Vector>;
- Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(VectorType));
- using EntryType = T;
- using value_type = EntryType;
- typedef EntryType VectorEntryType;
- static constexpr size_t Size = sizeof(VectorType) / sizeof(EntryType);
- static constexpr size_t MemoryAlignment = alignof(VectorType);
- using IndexType = fixed_size_simd<int, Size>;
- typedef Vector<T, abi> AsArg;
- typedef VectorType VectorTypeArg;
- protected:
- template <typename U> using V = Vector<U, abi>;
- typedef AVX::VectorHelper<VectorType> HV;
- typedef AVX::VectorHelper<T> HT;
- template <typename V> static Vc_INTRINSIC VectorType _cast(V v)
- {
- return AVX::avx_cast<VectorType>(v);
- }
- typedef Common::VectorMemoryUnion<VectorType, EntryType> StorageType;
- StorageType d;
- using WidthT = Common::WidthT<VectorType>;
- public:
- public:
- Vc_INTRINSIC Vector() = default;
- static constexpr std::size_t size() { return Size; }
- explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerZero) Vc_INTRINSIC_R;
- explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerOne) Vc_INTRINSIC_R;
- explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerIndexesFromZero) Vc_INTRINSIC_R;
- static Vc_INTRINSIC Vc_CONST Vector Zero() { return Vector(Vc::Zero); }
- static Vc_INTRINSIC Vc_CONST Vector One() { return Vector(Vc::One); }
- static Vc_INTRINSIC Vc_CONST Vector IndexesFromZero()
- {
- return Vector(Vc::IndexesFromZero);
- }
- template <class G, int = 0,
- class = typename std::enable_if<std::is_convertible<
- decltype(std::declval<G>()(size_t())), value_type>::value>::type>
- explicit Vector(G &&g) : Vector(generate(std::forward<G>(g)))
- {
- }
- static Vc_ALWAYS_INLINE_L Vector Random() Vc_ALWAYS_INLINE_R;
- Vc_ALWAYS_INLINE Vector(VectorTypeArg x) : d(x) {}
- template <typename U>
- Vc_INTRINSIC Vector(
- V<U> x, typename std::enable_if<Traits::is_implicit_cast_allowed<U, T>::value,
- void *>::type = nullptr)
- : d(AVX::convert<U, T>(x.data()))
- {
- }
- #if Vc_IS_VERSION_1
- template <typename U>
- Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between "
- "vector types") Vc_INTRINSIC explicit Vector(
- V<U> x,
- typename std::enable_if<!Traits::is_implicit_cast_allowed<U, T>::value,
- void *>::type = nullptr)
- : d(Detail::zeroExtendIfNeeded(AVX::convert<U, T>(x.data())))
- {
- }
- template <typename U,
- typename = enable_if<Traits::is_simd_vector<U>::value &&
- !std::is_same<Vector, Traits::decay<U>>::value>>
- Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between "
- "vector types") Vc_INTRINSIC_L
- explicit Vector(U &&x) Vc_INTRINSIC_R;
- #endif
- Vc_INTRINSIC Vector(EntryType a) : d(Detail::avx_broadcast(a)) {}
- template <typename U>
- Vc_INTRINSIC Vector(U a,
- typename std::enable_if<std::is_same<U, int>::value &&
- !std::is_same<U, EntryType>::value,
- void *>::type = nullptr)
- : Vector(static_cast<EntryType>(a))
- {
- }
- explicit Vector(std::initializer_list<EntryType>)
- {
- static_assert(std::is_same<EntryType, void>::value,
- "A SIMD vector object cannot be initialized from an initializer list "
- "because the number of entries in the vector is target-dependent.");
- }
- explicit Vc_INTRINSIC Vector(const EntryType *mem)
- {
- load(mem);
- }
- template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
- explicit Vc_INTRINSIC Vector(const EntryType *mem, Flags flags)
- {
- load(mem, flags);
- }
- template <typename U, typename Flags = DefaultLoadTag,
- typename = enable_if<
- (!std::is_integral<U>::value || !std::is_integral<EntryType>::value ||
- sizeof(EntryType) >= sizeof(U)) &&
- std::is_arithmetic<U>::value &&Traits::is_load_store_flag<Flags>::value>>
- explicit Vc_INTRINSIC Vector(const U *x, Flags flags = Flags())
- {
- load<U, Flags>(x, flags);
- }
- Vc_INTRINSIC void load(const EntryType *mem)
- {
- load(mem, DefaultLoadTag());
- }
- template <typename Flags>
- Vc_INTRINSIC enable_if<Traits::is_load_store_flag<Flags>::value, void>
- load(const EntryType *mem, Flags flags)
- {
- load<EntryType, Flags>(mem, flags);
- }
- private:
- template <typename U, typename Flags>
- struct load_concept : public std::enable_if<
- (!std::is_integral<U>::value || !std::is_integral<EntryType>::value ||
- sizeof(EntryType) >= sizeof(U)) &&
- std::is_arithmetic<U>::value && Traits::is_load_store_flag<Flags>::value, void>
- {};
- public:
- template <typename U, typename Flags = DefaultLoadTag>
- Vc_INTRINSIC_L typename load_concept<U, Flags>::type load(const U *mem, Flags = Flags()) Vc_INTRINSIC_R;
- template <
- typename U,
- typename Flags = DefaultStoreTag,
- typename = enable_if<std::is_arithmetic<U>::value &&Traits::is_load_store_flag<Flags>::value>>
- Vc_INTRINSIC_L void store(U *mem, Flags flags = Flags()) const Vc_INTRINSIC_R;
- template <
- typename U,
- typename Flags = DefaultStoreTag,
- typename = enable_if<std::is_arithmetic<U>::value &&Traits::is_load_store_flag<Flags>::value>>
- Vc_INTRINSIC_L void Vc_VDECL store(U *mem, MaskType mask, Flags flags = Flags()) const Vc_INTRINSIC_R;
- Vc_INTRINSIC void store(EntryType *mem) const
- {
- store<EntryType, DefaultStoreTag>(mem, DefaultStoreTag());
- }
- template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
- Vc_INTRINSIC void store(EntryType *mem, Flags flags) const
- {
- store<EntryType, Flags>(mem, flags);
- }
- Vc_INTRINSIC void Vc_VDECL store(EntryType *mem, MaskType mask) const
- {
- store<EntryType, DefaultStoreTag>(mem, mask, DefaultStoreTag());
- }
- template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
- Vc_INTRINSIC void Vc_VDECL store(EntryType *mem, MaskType mask, Flags flags) const
- {
- store<EntryType, Flags>(mem, mask, flags);
- }
- Vc_INTRINSIC_L void setZero() Vc_INTRINSIC_R;
- Vc_INTRINSIC_L void setZero(const Mask &k) Vc_INTRINSIC_R;
- Vc_INTRINSIC_L void setZeroInverted(const Mask &k) Vc_INTRINSIC_R;
- Vc_INTRINSIC_L void setQnan() Vc_INTRINSIC_R;
- Vc_INTRINSIC_L void setQnan(MaskArgument k) Vc_INTRINSIC_R;
- #ifndef Vc_CURRENT_CLASS_NAME
- #error "incorrect use of common/gatherinterface.h: Vc_CURRENT_CLASS_NAME must be defined to the current class name for declaring constructors."
- #endif
- private:
- template <class MT, class IT, int Scale = 1>
- inline void gatherImplementation(const Common::GatherArguments<MT, IT, Scale> &);
- template <class MT, class IT, int Scale = 1>
- inline void gatherImplementation(const Common::GatherArguments<MT, IT, Scale> &,
- MaskArgument mask);
- public:
- #define Vc_ASSERT_GATHER_PARAMETER_TYPES_ \
- static_assert( \
- std::is_convertible<MT, EntryType>::value, \
- "The memory pointer needs to point to a type that can be converted to the " \
- "EntryType of this SIMD vector type."); \
- static_assert( \
- Vc::Traits::has_subscript_operator<IT>::value, \
- "The indexes argument must be a type that implements the subscript operator."); \
- static_assert( \
- !Traits::is_simd_vector<IT>::value || \
- Traits::simd_vector_size<IT>::value >= Size, \
- "If you use a SIMD vector for the indexes parameter, the index vector must " \
- "have at least as many entries as this SIMD vector."); \
- static_assert( \
- !std::is_array<T>::value || \
- (std::rank<T>::value == 1 && \
- (std::extent<T>::value == 0 || std::extent<T>::value >= Size)), \
- "If you use a simple array for the indexes parameter, the array must have " \
- "at least as many entries as this SIMD vector.")
    // Gathering constructor: initializes this vector with mem[indexes[0]],
    // ..., mem[indexes[Size - 1]]. Enabled for any index type that provides
    // a subscript operator.
    template <typename MT, typename IT,
              typename = enable_if<Traits::has_subscript_operator<IT>::value>>
    Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes)
    {
        Vc_ASSERT_GATHER_PARAMETER_TYPES_;
        gatherImplementation(
            Common::make_gather<1>(mem, Common::convertIndexVector(indexes)));
    }
    // Gathering constructor from a pre-packaged GatherArguments object (the
    // result of Vc's mem[indexes] subscript-gather expression).
    template <class MT, class IT, int Scale>
    Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments<MT, IT, Scale> &args)
    {
        Vc_ASSERT_GATHER_PARAMETER_TYPES_;
        gatherImplementation(args);
    }
    // Masked gathering constructor: only the elements selected by mask are
    // loaded; the remaining elements are left unspecified.
    template <typename MT, typename IT,
              typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
    Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes,
                                       MaskArgument mask)
    {
        Vc_ASSERT_GATHER_PARAMETER_TYPES_;
        gatherImplementation(
            Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask);
    }
    template <class MT, class IT, int Scale>
    Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments<MT, IT, Scale> &args,
                                       MaskArgument mask)
    {
        Vc_ASSERT_GATHER_PARAMETER_TYPES_;
        gatherImplementation(args, mask);
    }
    // gather(): same semantics as the gathering constructors, but overwrites
    // an existing vector object.
    template <typename MT, typename IT,
              typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
    Vc_INTRINSIC void gather(const MT *mem, const IT &indexes)
    {
        Vc_ASSERT_GATHER_PARAMETER_TYPES_;
        gatherImplementation(
            Common::make_gather<1>(mem, Common::convertIndexVector(indexes)));
    }
    // Masked gather: only the elements selected by mask are overwritten.
    template <typename MT, typename IT,
              typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
    Vc_INTRINSIC void gather(const MT *mem, const IT &indexes, MaskArgument mask)
    {
        Vc_ASSERT_GATHER_PARAMETER_TYPES_;
        gatherImplementation(
            Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask);
    }
    template <class MT, class IT, int Scale>
    Vc_INTRINSIC void gather(const Common::GatherArguments<MT, IT, Scale> &args)
    {
        Vc_ASSERT_GATHER_PARAMETER_TYPES_;
        gatherImplementation(args);
    }
    template <class MT, class IT, int Scale>
    Vc_INTRINSIC void gather(const Common::GatherArguments<MT, IT, Scale> &args,
                             MaskArgument mask)
    {
        Vc_ASSERT_GATHER_PARAMETER_TYPES_;
        gatherImplementation(args, mask);
    }
#undef Vc_ASSERT_GATHER_PARAMETER_TYPES_

private:
    // Implementation-specific scatter back ends; defined out of class.
    template <typename MT, typename IT>
    inline void scatterImplementation(MT *mem, IT &&indexes) const;
    template <typename MT, typename IT>
    inline void scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const;

public:
    // Compile-time checks shared by all scatter overloads: the memory type
    // must be convertible from EntryType, the index argument must be
    // subscriptable, and a SIMD-vector index argument must provide at least
    // Size entries.
    // NOTE(review): the array-extent assertion inspects T (the entry type)
    // rather than the index type IT; it looks like it should examine IT --
    // confirm against the upstream gather/scatter interface.
#define Vc_ASSERT_SCATTER_PARAMETER_TYPES_ \
    static_assert( \
        std::is_convertible<EntryType, MT>::value, \
        "The memory pointer needs to point to a type that the EntryType of this " \
        "SIMD vector type can be converted to."); \
    static_assert( \
        Vc::Traits::has_subscript_operator<IT>::value, \
        "The indexes argument must be a type that implements the subscript operator."); \
    static_assert( \
        !Traits::is_simd_vector<IT>::value || \
            Traits::simd_vector_size<IT>::value >= Size, \
        "If you use a SIMD vector for the indexes parameter, the index vector must " \
        "have at least as many entries as this SIMD vector."); \
    static_assert( \
        !std::is_array<T>::value || \
            (std::rank<T>::value == 1 && \
             (std::extent<T>::value == 0 || std::extent<T>::value >= Size)), \
        "If you use a simple array for the indexes parameter, the array must have " \
        "at least as many entries as this SIMD vector.")
    // scatter(): writes this vector's elements to mem[indexes[0]], ...,
    // mem[indexes[Size - 1]].
    template <typename MT,
              typename IT,
              typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
    Vc_INTRINSIC void scatter(MT *mem, IT &&indexes) const
    {
        Vc_ASSERT_SCATTER_PARAMETER_TYPES_;
        scatterImplementation(mem, std::forward<IT>(indexes));
    }
    // Masked scatter: only the elements selected by mask are written.
    template <typename MT,
              typename IT,
              typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
    Vc_INTRINSIC void scatter(MT *mem, IT &&indexes, MaskArgument mask) const
    {
        Vc_ASSERT_SCATTER_PARAMETER_TYPES_;
        scatterImplementation(mem, std::forward<IT>(indexes), mask);
    }
    // Convenience overloads taking the packaged result of a subscript-scatter
    // expression; they forward to the pointer/index overloads above.
    template <typename MT, typename IT>
    Vc_INTRINSIC void scatter(const Common::ScatterArguments<MT, IT> &args) const
    {
        scatter(args.address, args.indexes);
    }
    template <typename MT, typename IT>
    Vc_INTRINSIC void scatter(const Common::ScatterArguments<MT, IT> &args, MaskArgument mask) const
    {
        scatter(args.address, args.indexes, mask);
    }
#undef Vc_ASSERT_SCATTER_PARAMETER_TYPES_
#if defined Vc_IMPL_AVX2 && !defined Vc_MSVC
    // Hardware-gather path for 4- and 8-byte element types: uses the AVX2
    // gather intrinsics via AVX::gather with a byte scale of
    // sizeof(T) * Scale. Indexes are converted to a 32-bit integer vector of
    // matching width first.
    template <class U, class A, int Scale, int N = Vector<U, A>::size(),
              class = enable_if<(Vector<U, A>::size() >= size() && sizeof(T) >= 4)>>
    Vc_INTRINSIC void gatherImplementation(
        const Common::GatherArguments<T, Vector<U, A>, Scale> &args)
    {
        d.v() = AVX::gather<sizeof(T) * Scale>(
            args.address,
            simd_cast<conditional_t<Size == 4, SSE::int_v, AVX2::int_v>>(args.indexes)
                .data());
    }
    // Masked hardware gather: the current register contents serve as the
    // pass-through source for unselected lanes.
    template <class U, class A, int Scale, int N = Vector<U, A>::size(),
              class = enable_if<(Vector<U, A>::size() >= size() && sizeof(T) >= 4)>>
    Vc_INTRINSIC void gatherImplementation(
        const Common::GatherArguments<T, Vector<U, A>, Scale> &args, MaskArgument k)
    {
        d.v() = AVX::gather<sizeof(T) * Scale>(
            d.v(), k.data(), args.address,
            simd_cast<conditional_t<Size == 4, SSE::int_v, AVX2::int_v>>(args.indexes)
                .data());
    }
    // 16-bit element types with 1- or 2-byte memory: no 16-bit hardware
    // gather exists, so gather 32-bit chunks with two int_v gathers (one per
    // index half) and narrow. For 1-byte memory the extra bytes gathered per
    // element are fixed up afterwards: sign-extend via shift pair for signed
    // MT, zero out the high byte for unsigned MT.
    template <
        class MT, class U, class A, int Scale,
        class = enable_if<(sizeof(T) == 2 && std::is_integral<MT>::value &&
                           (sizeof(MT) <= 2) && Vector<U, A>::size() >= size())>>
    Vc_INTRINSIC void gatherImplementation(
        const Common::GatherArguments<MT, Vector<U, A>, Scale> &args)
    {
        using AVX2::int_v;
        const auto idx0 = simd_cast<int_v, 0>(args.indexes).data();
        const auto idx1 = simd_cast<int_v, 1>(args.indexes).data();
        *this = simd_cast<Vector>(int_v(AVX::gather<sizeof(MT) * Scale>(
                                      aliasing_cast<int>(args.address), idx0)),
                                  int_v(AVX::gather<sizeof(MT) * Scale>(
                                      aliasing_cast<int>(args.address), idx1)));
        if (sizeof(MT) == 1) {
            if (std::is_signed<MT>::value) {
                using Signed = AVX2::Vector<typename std::make_signed<T>::type>;
                *this = (simd_cast<Signed>(*this) << 8) >> 8;
            } else {
                *this &= 0xff;
            }
        }
    }
    // Masked variant of the 16-bit path: both half-gathers use a zeroed
    // pass-through source and a half of the converted mask; the final result
    // is merged under the original mask.
    template <
        class MT, class U, class A, int Scale,
        class = enable_if<(sizeof(T) == 2 && std::is_integral<MT>::value &&
                           (sizeof(MT) <= 2) && Vector<U, A>::size() >= size())>>
    Vc_INTRINSIC void gatherImplementation(
        const Common::GatherArguments<MT, Vector<U, A>, Scale> &args, MaskArgument k)
    {
        using AVX2::int_v;
        const auto idx0 = simd_cast<int_v, 0>(args.indexes).data();
        const auto idx1 = simd_cast<int_v, 1>(args.indexes).data();
        const auto k0 = simd_cast<AVX2::int_m, 0>(k).data();
        const auto k1 = simd_cast<AVX2::int_m, 1>(k).data();
        auto v = simd_cast<Vector>(
            int_v(AVX::gather<sizeof(MT) * Scale>(
                _mm256_setzero_si256(), k0, aliasing_cast<int>(args.address), idx0)),
            int_v(AVX::gather<sizeof(MT) * Scale>(
                _mm256_setzero_si256(), k1, aliasing_cast<int>(args.address), idx1)));
        if (sizeof(MT) == 1) {
            if (std::is_signed<MT>::value) {
                using Signed = AVX2::Vector<typename std::make_signed<T>::type>;
                v = (simd_cast<Signed>(v) << 8) >> 8;
            } else {
                v &= 0xff;
            }
        }
        assign(v, k);
    }
    // Fallback for the remaining converting gathers: gather into a
    // fixed_size_simd of the memory type and convert to this vector type.
    template <class MT, class U, class A, int Scale>
    Vc_INTRINSIC enable_if<((sizeof(T) != 2 || sizeof(MT) > 2) &&
                            Traits::is_valid_vector_argument<MT>::value &&
                            !std::is_same<MT, T>::value &&
                            Vector<U, A>::size() >= size()),
                           void>
    gatherImplementation(const Common::GatherArguments<MT, Vector<U, A>, Scale> &args)
    {
        *this = simd_cast<Vector>(fixed_size_simd<MT, Size>(args));
    }
    template <class MT, class U, class A, int Scale>
    Vc_INTRINSIC enable_if<((sizeof(T) != 2 || sizeof(MT) > 2) &&
                            Traits::is_valid_vector_argument<MT>::value &&
                            !std::is_same<MT, T>::value &&
                            Vector<U, A>::size() >= size()),
                           void>
    gatherImplementation(const Common::GatherArguments<MT, Vector<U, A>, Scale> &args,
                         MaskArgument k)
    {
        assign(simd_cast<Vector>(fixed_size_simd<MT, Size>(args, k)), k);
    }
#endif
- Vc_ALWAYS_INLINE Vector &operator++() { data() = Detail::add(data(), Detail::one(T()), T()); return *this; }
- Vc_ALWAYS_INLINE Vector &operator--() { data() = Detail::sub(data(), Detail::one(T()), T()); return *this; }
- Vc_ALWAYS_INLINE Vector operator++(int) { const Vector r = *this; data() = Detail::add(data(), Detail::one(T()), T()); return r; }
- Vc_ALWAYS_INLINE Vector operator--(int) { const Vector r = *this; data() = Detail::sub(data(), Detail::one(T()), T()); return r; }
private:
    friend reference;
    // Element accessors used by the smart-reference proxy type.
    Vc_INTRINSIC static value_type get(const Vector &o, int i) noexcept
    {
        return o.d.m(i);
    }
    template <typename U>
    Vc_INTRINSIC static void set(Vector &o, int i, U &&v) noexcept(
        noexcept(std::declval<value_type &>() = v))
    {
        return o.d.set(i, v);
    }

public:
    // Mutable subscript returns a proxy so that writes are routed through
    // set() above (SIMD register lanes are not individually addressable).
    Vc_ALWAYS_INLINE reference operator[](size_t index) noexcept
    {
        static_assert(noexcept(reference{std::declval<Vector &>(), int()}), "");
        return {*this, int(index)};
    }
    // Read-only subscript returns the element by value.
    Vc_ALWAYS_INLINE value_type operator[](size_t index) const noexcept
    {
        return d.m(index);
    }
    // Permutations: reversed element order and arbitrary permutation by an
    // index vector; defined out of class.
    Vc_INTRINSIC_L Vc_PURE_L Vector operator[](Permutation::ReversedTag) const Vc_INTRINSIC_R Vc_PURE_R;
    Vc_INTRINSIC_L Vc_PURE_L Vector operator[](const IndexType &perm) const Vc_INTRINSIC_R Vc_PURE_R;
    // Logical negation: returns a mask that is true where the element is 0.
    Vc_INTRINSIC Vc_PURE Mask operator!() const
    {
        return *this == Zero();
    }
    // Bitwise complement of every element (andnot with an all-ones register).
    Vc_ALWAYS_INLINE Vector operator~() const
    {
#ifndef Vc_ENABLE_FLOAT_BIT_OPERATORS
        static_assert(std::is_integral<T>::value,
                      "bit-complement can only be used with Vectors of integral type");
#endif
        return Detail::andnot_(data(), Detail::allone<VectorType>());
    }
    // Unary minus (defined out of class) and unary plus (identity).
    Vc_ALWAYS_INLINE_L Vc_PURE_L Vector operator-() const Vc_ALWAYS_INLINE_R Vc_PURE_R;
    Vc_INTRINSIC Vc_PURE Vector operator+() const { return *this; }
    // Declares op= and defines op for the vector-by-vector shift operators.
    // The inline body carries a static_assert so that shifting non-integral
    // vectors produces a readable diagnostic.
    // NOTE(review): the operator body has no return statement; it appears to
    // exist only for the diagnostic -- confirm that integral instantiations
    // are handled elsewhere before relying on this path.
#define Vc_OP_VEC(op) \
    Vc_INTRINSIC Vector &operator op##=(AsArg x); \
    Vc_INTRINSIC Vc_PURE Vector operator op(AsArg x) const \
    { \
        static_assert( \
            std::is_integral<T>::value, \
            "bitwise-operators can only be used with Vectors of integral type"); \
    }
    Vc_ALL_SHIFTS(Vc_OP_VEC);
#undef Vc_OP_VEC
    // Shifts by a scalar amount, applied to every element; defined out of
    // class.
    Vc_ALWAYS_INLINE_L Vector &operator>>=(int x) Vc_ALWAYS_INLINE_R;
    Vc_ALWAYS_INLINE_L Vector &operator<<=(int x) Vc_ALWAYS_INLINE_R;
    Vc_ALWAYS_INLINE_L Vector operator>>(int x) const Vc_ALWAYS_INLINE_R;
    Vc_ALWAYS_INLINE_L Vector operator<<(int x) const Vc_ALWAYS_INLINE_R;
    // Deprecated: use the free function Vc::isnegative instead.
    Vc_DEPRECATED("use isnegative(x) instead") Vc_INTRINSIC Vc_PURE Mask
    isNegative() const
    {
        return Vc::isnegative(*this);
    }
    // Masked assignment: elements where mask is true are taken from v, the
    // rest keep their current value.
    Vc_ALWAYS_INLINE void assign( const Vector &v, const Mask &mask ) {
        data() = Detail::blend(data(), v.data(), mask.data());
    }
    // Deprecated converting cast (value conversion) -- use simd_cast.
    template <typename V2>
    Vc_DEPRECATED("Use simd_cast instead of Vector::staticCast") Vc_ALWAYS_INLINE V2
    staticCast() const
    {
        return V2(*this);
    }
    // Deprecated bit-reinterpreting cast -- use reinterpret_components_cast.
    template <typename V2>
    Vc_DEPRECATED("use reinterpret_components_cast instead") Vc_ALWAYS_INLINE V2
    reinterpretCast() const
    {
        return AVX::avx_cast<typename V2::VectorType>(data());
    }
    // operator()(mask) returns a write-masked view: assignments through it
    // only modify the selected elements.
    Vc_ALWAYS_INLINE WriteMaskedVector operator()(const Mask &k)
    {
        return {*this, k};
    }
    // Direct access to the underlying SIMD register.
    Vc_ALWAYS_INLINE VectorType &data() { return d.v(); }
    Vc_ALWAYS_INLINE const VectorType &data() const { return d.v(); }
    template<int Index>
    Vc_INTRINSIC_L Vector broadcast() const Vc_INTRINSIC_R;
    // Horizontal reductions over all elements; *Index variants also report
    // the position of the extremum.
    Vc_INTRINSIC_L std::pair<Vector, int> minIndex() const Vc_INTRINSIC_R;
    Vc_INTRINSIC_L std::pair<Vector, int> maxIndex() const Vc_INTRINSIC_R;
    Vc_ALWAYS_INLINE EntryType min() const { return Detail::min(data(), T()); }
    Vc_ALWAYS_INLINE EntryType max() const { return Detail::max(data(), T()); }
    Vc_ALWAYS_INLINE EntryType product() const { return Detail::mul(data(), T()); }
    Vc_ALWAYS_INLINE EntryType sum() const { return Detail::add(data(), T()); }
    Vc_ALWAYS_INLINE_L Vector partialSum() const Vc_ALWAYS_INLINE_R;
    // Masked reductions: only the elements selected by m participate.
    Vc_ALWAYS_INLINE_L EntryType min(MaskArgument m) const Vc_ALWAYS_INLINE_R;
    Vc_ALWAYS_INLINE_L EntryType max(MaskArgument m) const Vc_ALWAYS_INLINE_R;
    Vc_ALWAYS_INLINE_L EntryType product(MaskArgument m) const Vc_ALWAYS_INLINE_R;
    Vc_ALWAYS_INLINE_L EntryType sum(MaskArgument m) const Vc_ALWAYS_INLINE_R;
    // Element rearrangement; all defined out of class.
    Vc_INTRINSIC_L Vector shifted(int amount, Vector shiftIn) const Vc_INTRINSIC_R;
    Vc_INTRINSIC_L Vector shifted(int amount) const Vc_INTRINSIC_R;
    Vc_INTRINSIC_L Vector rotated(int amount) const Vc_INTRINSIC_R;
    Vc_INTRINSIC_L Vc_PURE_L Vector reversed() const Vc_INTRINSIC_R Vc_PURE_R;
    Vc_ALWAYS_INLINE_L Vc_PURE_L Vector sorted() const Vc_ALWAYS_INLINE_R Vc_PURE_R;
- template <typename F> void callWithValuesSorted(F &&f)
- {
- EntryType value = d.m(0);
- f(value);
- for (size_t i = 1; i < Size; ++i) {
- if (d.m(i) != value) {
- value = d.m(i);
- f(value);
- }
- }
- }
    // Invokes f for every element value.
    template <typename F> Vc_INTRINSIC void call(F &&f) const
    {
        Common::for_all_vector_entries<Size>([&](size_t i) { f(EntryType(d.m(i))); });
    }
    // Invokes f only for the elements selected by mask.
    template <typename F> Vc_INTRINSIC void call(F &&f, const Mask &mask) const
    {
        for (size_t i : where(mask)) {
            f(EntryType(d.m(i)));
        }
    }
    // Returns a new vector with r[i] = f(this[i]).
    template <typename F> Vc_INTRINSIC Vector apply(F &&f) const
    {
        Vector r;
        Common::for_all_vector_entries<Size>(
            [&](size_t i) { r.d.set(i, f(EntryType(d.m(i)))); });
        return r;
    }
    // Same, but f is only applied to the elements selected by mask; the
    // remaining elements are copied through unchanged.
    template <typename F> Vc_INTRINSIC Vector apply(F &&f, const Mask &mask) const
    {
        Vector r(*this);
        for (size_t i : where(mask)) {
            r.d.set(i, f(EntryType(r.d.m(i))));
        }
        return r;
    }
    // Fills the vector with f(0), f(1), ..., f(Size - 1).
    template<typename IndexT> Vc_INTRINSIC void fill(EntryType (&f)(IndexT)) {
        Common::for_all_vector_entries<Size>([&](size_t i) { d.set(i, f(i)); });
    }
    // Fills the vector by calling f() once per element.
    Vc_INTRINSIC void fill(EntryType (&f)()) {
        Common::for_all_vector_entries<Size>([&](size_t i) { d.set(i, f()); });
    }
    // Builds a vector from gen(0), ..., gen(Size - 1); specialized per
    // EntryType further below.
    template <typename G> static Vc_INTRINSIC_L Vector generate(G gen) Vc_INTRINSIC_R;
    // Deprecated: use the free function Vc::copysign instead.
    Vc_DEPRECATED("use copysign(x, y) instead") Vc_INTRINSIC Vector
    copySign(AsArg x) const
    {
        return Vc::copysign(*this, x);
    }
- Vc_DEPRECATED("use exponent(x) instead") Vc_INTRINSIC Vector exponent() const
- {
- Vc::exponent(*this);
- }
    // Interleaves the elements of *this with x (low/high halves of the
    // interleaved sequence); defined out of class.
    Vc_INTRINSIC_L Vector interleaveLow(Vector x) const Vc_INTRINSIC_R;
    Vc_INTRINSIC_L Vector interleaveHigh(Vector x) const Vc_INTRINSIC_R;
};
#undef Vc_CURRENT_CLASS_NAME
// Out-of-class definitions for the static constexpr data members (required
// when they are ODR-used, pre-C++17).
template <typename T> constexpr size_t Vector<T, VectorAbi::Avx>::Size;
template <typename T> constexpr size_t Vector<T, VectorAbi::Avx>::MemoryAlignment;
// conditional_assign(lhs, mask, rhs): applies "lhs op= rhs" only to the
// elements selected by mask, dispatched at compile time on the Operator tag.
#define Vc_CONDITIONAL_ASSIGN(name_,op_) \
    template <Operator O, typename T, typename M, typename U> \
    Vc_INTRINSIC enable_if<O == Operator::name_, void> conditional_assign( \
        AVX2::Vector<T> &lhs, M &&mask, U &&rhs) \
    { \
        lhs(mask) op_ rhs; \
    } \
    Vc_NOTHING_EXPECTING_SEMICOLON
Vc_CONDITIONAL_ASSIGN(          Assign,  =);
Vc_CONDITIONAL_ASSIGN(      PlusAssign, +=);
Vc_CONDITIONAL_ASSIGN(     MinusAssign, -=);
Vc_CONDITIONAL_ASSIGN(  MultiplyAssign, *=);
Vc_CONDITIONAL_ASSIGN(    DivideAssign, /=);
Vc_CONDITIONAL_ASSIGN( RemainderAssign, %=);
Vc_CONDITIONAL_ASSIGN(       XorAssign, ^=);
Vc_CONDITIONAL_ASSIGN(       AndAssign, &=);
Vc_CONDITIONAL_ASSIGN(        OrAssign, |=);
Vc_CONDITIONAL_ASSIGN( LeftShiftAssign,<<=);
Vc_CONDITIONAL_ASSIGN(RightShiftAssign,>>=);
#undef Vc_CONDITIONAL_ASSIGN
// Masked increment/decrement, returning the expression's value.
#define Vc_CONDITIONAL_ASSIGN(name_,expr_) \
    template <Operator O, typename T, typename M> \
    Vc_INTRINSIC enable_if<O == Operator::name_, AVX2::Vector<T>> conditional_assign( \
        AVX2::Vector<T> &lhs, M &&mask) \
    { \
        return expr_; \
    } \
    Vc_NOTHING_EXPECTING_SEMICOLON
Vc_CONDITIONAL_ASSIGN(PostIncrement, lhs(mask)++);
Vc_CONDITIONAL_ASSIGN( PreIncrement, ++lhs(mask));
Vc_CONDITIONAL_ASSIGN(PostDecrement, lhs(mask)--);
Vc_CONDITIONAL_ASSIGN( PreDecrement, --lhs(mask));
#undef Vc_CONDITIONAL_ASSIGN
- }
#ifndef VC_AVX_LIMITS_H_
#define VC_AVX_LIMITS_H_
namespace std
{
// numeric_limits specializations for the integral AVX2 vector types. Only
// max/min/lowest carry SIMD-specific values; epsilon, round_error, infinity,
// the NaNs and denorm_min return Zero(), matching std::numeric_limits for
// integer types (which yields 0 for all of these).
#define Vc_NUM_LIM(T,_max,_min) \
    template <> struct numeric_limits<Vc::AVX2::Vector<T>> : public numeric_limits<T> { \
        static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> max() Vc_NOEXCEPT \
        { \
            return _max; \
        } \
        static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> min() Vc_NOEXCEPT \
        { \
            return _min; \
        } \
        static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> lowest() Vc_NOEXCEPT \
        { \
            return min(); \
        } \
        static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> epsilon() Vc_NOEXCEPT \
        { \
            return Vc::AVX2::Vector<T>::Zero(); \
        } \
        static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> round_error() Vc_NOEXCEPT \
        { \
            return Vc::AVX2::Vector<T>::Zero(); \
        } \
        static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> infinity() Vc_NOEXCEPT \
        { \
            return Vc::AVX2::Vector<T>::Zero(); \
        } \
        static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> quiet_NaN() Vc_NOEXCEPT \
        { \
            return Vc::AVX2::Vector<T>::Zero(); \
        } \
        static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> signaling_NaN() Vc_NOEXCEPT \
        { \
            return Vc::AVX2::Vector<T>::Zero(); \
        } \
        static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> denorm_min() Vc_NOEXCEPT \
        { \
            return Vc::AVX2::Vector<T>::Zero(); \
        } \
    }
#ifdef Vc_IMPL_AVX2
// unsigned: max = all ones, min = 0.
// signed: max = 0x7fff.../0x7fffffff... pattern, min = sign-bit pattern.
Vc_NUM_LIM(unsigned short, Vc::Detail::allone<__m256i>(), Vc::Detail::zero<__m256i>());
Vc_NUM_LIM(         short, _mm256_srli_epi16(Vc::Detail::allone<__m256i>(), 1), Vc::AVX::setmin_epi16());
Vc_NUM_LIM(  unsigned int, Vc::Detail::allone<__m256i>(), Vc::Detail::zero<__m256i>());
Vc_NUM_LIM(           int, _mm256_srli_epi32(Vc::Detail::allone<__m256i>(), 1), Vc::AVX::setmin_epi32());
#endif
#undef Vc_NUM_LIM
}
#endif
#ifndef VC_AVX_CONST_H_
#define VC_AVX_CONST_H_
#include <cstddef>
namespace Vc_VERSIONED_NAMESPACE
{
namespace AVX
{
// Maps an entry type to the matching precomputed 0, 1, 2, ... table used by
// the IndexesFromZero constructors. Same-width signed/unsigned types share
// one table via reinterpret_cast.
template<typename T> struct IndexesFromZeroData;
template<> struct IndexesFromZeroData<int> {
    static Vc_ALWAYS_INLINE Vc_CONST const int *address() { return reinterpret_cast<const int *>(&_IndexesFromZero32[0]); }
};
template<> struct IndexesFromZeroData<unsigned int> {
    static Vc_ALWAYS_INLINE Vc_CONST const unsigned int *address() { return &_IndexesFromZero32[0]; }
};
template<> struct IndexesFromZeroData<short> {
    static Vc_ALWAYS_INLINE Vc_CONST const short *address() { return reinterpret_cast<const short *>(&_IndexesFromZero16[0]); }
};
template<> struct IndexesFromZeroData<unsigned short> {
    static Vc_ALWAYS_INLINE Vc_CONST const unsigned short *address() { return &_IndexesFromZero16[0]; }
};
template<> struct IndexesFromZeroData<signed char> {
    static Vc_ALWAYS_INLINE Vc_CONST const signed char *address() { return reinterpret_cast<const signed char *>(&_IndexesFromZero8[0]); }
};
template<> struct IndexesFromZeroData<char> {
    static Vc_ALWAYS_INLINE Vc_CONST const char *address() { return reinterpret_cast<const char *>(&_IndexesFromZero8[0]); }
};
template<> struct IndexesFromZeroData<unsigned char> {
    static Vc_ALWAYS_INLINE Vc_CONST const unsigned char *address() { return &_IndexesFromZero8[0]; }
};
// Broadcast accessors for the precomputed math constants used by Vc's
// trigonometric (c_trig<T>::data) and logarithm (c_log<T>::d) kernels. The
// literal numbers and "base + i" expressions are fixed indices into those
// tables.
template<typename _T> struct Const
{
    typedef Vector<_T> V;
    typedef typename V::EntryType T;
    typedef typename V::Mask M;
    static Vc_ALWAYS_INLINE Vc_CONST V _pi_4()        { return V(c_trig<T>::data[0]); }
    static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_hi()     { return V(c_trig<T>::data[1]); }
    static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_rem1()   { return V(c_trig<T>::data[2]); }
    static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_rem2()   { return V(c_trig<T>::data[3]); }
    static Vc_ALWAYS_INLINE Vc_CONST V _1_16()        { return V(c_trig<T>::data[4]); }
    static Vc_ALWAYS_INLINE Vc_CONST V _16()          { return V(c_trig<T>::data[5]); }
    // Polynomial coefficients for atan, indexed relative to their table base.
    static Vc_ALWAYS_INLINE Vc_CONST V atanP(int i)   { return V(c_trig<T>::data[(12 + i)]); }
    static Vc_ALWAYS_INLINE Vc_CONST V atanQ(int i)   { return V(c_trig<T>::data[(17 + i)]); }
    static Vc_ALWAYS_INLINE Vc_CONST V atanThrsHi()   { return V(c_trig<T>::data[22]); }
    static Vc_ALWAYS_INLINE Vc_CONST V atanThrsLo()   { return V(c_trig<T>::data[23]); }
    static Vc_ALWAYS_INLINE Vc_CONST V _pi_2_rem()    { return V(c_trig<T>::data[24]); }
    static Vc_ALWAYS_INLINE Vc_CONST V lossThreshold() { return V(c_trig<T>::data[8]); }
    static Vc_ALWAYS_INLINE Vc_CONST V _4_pi()        { return V(c_trig<T>::data[9]); }
    static Vc_ALWAYS_INLINE Vc_CONST V _pi_2()        { return V(c_trig<T>::data[10]); }
    static Vc_ALWAYS_INLINE Vc_CONST V _pi()          { return V(c_trig<T>::data[11]); }
    // Polynomial coefficients for asin.
    static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff0(int i) { return V(c_trig<T>::data[(28 + i)]); }
    static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff1(int i) { return V(c_trig<T>::data[(33 + i)]); }
    static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff2(int i) { return V(c_trig<T>::data[(37 + i)]); }
    static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff3(int i) { return V(c_trig<T>::data[(43 + i)]); }
    static Vc_ALWAYS_INLINE Vc_CONST V smallAsinInput() { return V(c_trig<T>::data[25]); }
    static Vc_ALWAYS_INLINE Vc_CONST V largeAsinInput() { return V(c_trig<T>::data[26]); }
    // Constants for the logarithm kernels.
    static Vc_ALWAYS_INLINE Vc_CONST M exponentMask() { return M(V(c_log<T>::d(1)).data()); }
    static Vc_ALWAYS_INLINE Vc_CONST V _1_2()         { return V(c_log<T>::d(18)); }
    static Vc_ALWAYS_INLINE Vc_CONST V _1_sqrt2()     { return V(c_log<T>::d(15)); }
    static Vc_ALWAYS_INLINE Vc_CONST V P(int i)       { return V(c_log<T>::d(2 + i)); }
    static Vc_ALWAYS_INLINE Vc_CONST V Q(int i)       { return V(c_log<T>::d(8 + i)); }
    static Vc_ALWAYS_INLINE Vc_CONST V min()          { return V(c_log<T>::d(14)); }
    static Vc_ALWAYS_INLINE Vc_CONST V ln2_small()    { return V(c_log<T>::d(17)); }
    static Vc_ALWAYS_INLINE Vc_CONST V ln2_large()    { return V(c_log<T>::d(16)); }
    static Vc_ALWAYS_INLINE Vc_CONST V neginf()       { return V(c_log<T>::d(13)); }
    static Vc_ALWAYS_INLINE Vc_CONST V log10_e()      { return V(c_log<T>::d(19)); }
    static Vc_ALWAYS_INLINE Vc_CONST V log2_e()       { return V(c_log<T>::d(20)); }
    static Vc_ALWAYS_INLINE_L Vc_CONST_L V highMask() Vc_ALWAYS_INLINE_R Vc_CONST_R;
    static Vc_ALWAYS_INLINE_L Vc_CONST_L V highMask(int bits) Vc_ALWAYS_INLINE_R Vc_CONST_R;
};
// highMask(): broadcasts the precomputed highMaskFloat / highMaskDouble bit
// pattern to every lane.
template <> Vc_ALWAYS_INLINE Vc_CONST Vector<float> Const<float>::highMask()
{
    return _mm256_broadcast_ss(
        reinterpret_cast<const float *>(&c_general::highMaskFloat));
}
template <> Vc_ALWAYS_INLINE Vc_CONST Vector<double> Const<double>::highMask()
{
    return _mm256_broadcast_sd(
        reinterpret_cast<const double *>(&c_general::highMaskDouble));
}
// highMask(bits): per 32-bit lane, all ones with the low `bits` bits cleared.
template <> Vc_ALWAYS_INLINE Vc_CONST Vector<float> Const<float>::highMask(int bits)
{
#ifdef Vc_IMPL_AVX2
#if defined Vc_ICC || defined Vc_MSVC
    // ICC/MSVC do not support ~__m256i(); comparing a register with itself
    // via vpcmpeqb yields all ones regardless of its (uninitialized) content.
    __m256i allone;
    allone = _mm256_cmpeq_epi8(allone, allone);
#else
    auto allone = ~__m256i();
#endif
    return _mm256_castsi256_ps(_mm256_slli_epi32(allone, bits));
#else
    // AVX1: build the mask in a 128-bit register and duplicate it.
    __m128 tmp = _mm_castsi128_ps(_mm_slli_epi32(_mm_setallone_si128(), bits));
    return concat(tmp, tmp);
#endif
}
// highMask(bits): per 64-bit lane, all ones with the low `bits` bits cleared.
template <> Vc_ALWAYS_INLINE Vc_CONST Vector<double> Const<double>::highMask(int bits)
{
#ifdef Vc_IMPL_AVX2
#if defined Vc_ICC || defined Vc_MSVC
    // See the float overload: self-compare produces an all-ones register.
    __m256i allone;
    allone = _mm256_cmpeq_epi8(allone, allone);
#else
    auto allone = ~__m256i();
#endif
    return _mm256_castsi256_pd(_mm256_slli_epi64(allone, bits));
#else
    __m128d tmp = _mm_castsi128_pd(_mm_slli_epi64(_mm_setallone_si128(), bits));
    return concat(tmp, tmp);
#endif
}
}
namespace AVX2
{
// The AVX2 implementation shares the constant tables of the AVX namespace.
using AVX::IndexesFromZeroData;
using AVX::Const;
}
}
#endif
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace Detail
- {
// Element-wise floating-point comparisons returning masks.
// NOTE(review): operator>= uses cmpnlt ("not less than"), which -- unlike a
// plain ge compare -- is also true for unordered (NaN) operands; presumably
// intentional, confirm against the mask semantics callers expect.
Vc_INTRINSIC AVX2::double_m operator==(AVX2::double_v a, AVX2::double_v b) { return AVX::cmpeq_pd(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: float_m operator==(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpeq_ps(a.data(), b.data()); }
Vc_INTRINSIC AVX2::double_m operator!=(AVX2::double_v a, AVX2::double_v b) { return AVX::cmpneq_pd(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: float_m operator!=(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpneq_ps(a.data(), b.data()); }
Vc_INTRINSIC AVX2::double_m operator>=(AVX2::double_v a, AVX2::double_v b) { return AVX::cmpnlt_pd(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: float_m operator>=(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpnlt_ps(a.data(), b.data()); }
Vc_INTRINSIC AVX2::double_m operator<=(AVX2::double_v a, AVX2::double_v b) { return AVX::cmple_pd(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: float_m operator<=(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmple_ps(a.data(), b.data()); }
Vc_INTRINSIC AVX2::double_m operator> (AVX2::double_v a, AVX2::double_v b) { return AVX::cmpgt_pd(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: float_m operator> (AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpgt_ps(a.data(), b.data()); }
Vc_INTRINSIC AVX2::double_m operator< (AVX2::double_v a, AVX2::double_v b) { return AVX::cmplt_pd(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: float_m operator< (AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmplt_ps(a.data(), b.data()); }
#ifdef Vc_IMPL_AVX2
// Element-wise integer comparisons. AVX2 only provides eq/gt natively; the
// other relations are formed by complementing (not_) the opposite comparison,
// and unsigned variants go through the cmp*_epu* helpers.
Vc_INTRINSIC AVX2::   int_m operator==(AVX2::   int_v a, AVX2::   int_v b) { return AVX::cmpeq_epi32(a.data(), b.data()); }
Vc_INTRINSIC AVX2::  uint_m operator==(AVX2::  uint_v a, AVX2::  uint_v b) { return AVX::cmpeq_epi32(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: short_m operator==(AVX2:: short_v a, AVX2:: short_v b) { return AVX::cmpeq_epi16(a.data(), b.data()); }
Vc_INTRINSIC AVX2::ushort_m operator==(AVX2::ushort_v a, AVX2::ushort_v b) { return AVX::cmpeq_epi16(a.data(), b.data()); }
Vc_INTRINSIC AVX2::   int_m operator!=(AVX2::   int_v a, AVX2::   int_v b) { return not_(AVX::cmpeq_epi32(a.data(), b.data())); }
Vc_INTRINSIC AVX2::  uint_m operator!=(AVX2::  uint_v a, AVX2::  uint_v b) { return not_(AVX::cmpeq_epi32(a.data(), b.data())); }
Vc_INTRINSIC AVX2:: short_m operator!=(AVX2:: short_v a, AVX2:: short_v b) { return not_(AVX::cmpeq_epi16(a.data(), b.data())); }
Vc_INTRINSIC AVX2::ushort_m operator!=(AVX2::ushort_v a, AVX2::ushort_v b) { return not_(AVX::cmpeq_epi16(a.data(), b.data())); }
Vc_INTRINSIC AVX2::   int_m operator>=(AVX2::   int_v a, AVX2::   int_v b) { return not_(AVX::cmplt_epi32(a.data(), b.data())); }
Vc_INTRINSIC AVX2::  uint_m operator>=(AVX2::  uint_v a, AVX2::  uint_v b) { return not_(AVX::cmplt_epu32(a.data(), b.data())); }
Vc_INTRINSIC AVX2:: short_m operator>=(AVX2:: short_v a, AVX2:: short_v b) { return not_(AVX::cmplt_epi16(a.data(), b.data())); }
Vc_INTRINSIC AVX2::ushort_m operator>=(AVX2::ushort_v a, AVX2::ushort_v b) { return not_(AVX::cmplt_epu16(a.data(), b.data())); }
Vc_INTRINSIC AVX2::   int_m operator<=(AVX2::   int_v a, AVX2::   int_v b) { return not_(AVX::cmpgt_epi32(a.data(), b.data())); }
Vc_INTRINSIC AVX2::  uint_m operator<=(AVX2::  uint_v a, AVX2::  uint_v b) { return not_(AVX::cmpgt_epu32(a.data(), b.data())); }
Vc_INTRINSIC AVX2:: short_m operator<=(AVX2:: short_v a, AVX2:: short_v b) { return not_(AVX::cmpgt_epi16(a.data(), b.data())); }
Vc_INTRINSIC AVX2::ushort_m operator<=(AVX2::ushort_v a, AVX2::ushort_v b) { return not_(AVX::cmpgt_epu16(a.data(), b.data())); }
Vc_INTRINSIC AVX2::   int_m operator> (AVX2::   int_v a, AVX2::   int_v b) { return AVX::cmpgt_epi32(a.data(), b.data()); }
Vc_INTRINSIC AVX2::  uint_m operator> (AVX2::  uint_v a, AVX2::  uint_v b) { return AVX::cmpgt_epu32(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: short_m operator> (AVX2:: short_v a, AVX2:: short_v b) { return AVX::cmpgt_epi16(a.data(), b.data()); }
Vc_INTRINSIC AVX2::ushort_m operator> (AVX2::ushort_v a, AVX2::ushort_v b) { return AVX::cmpgt_epu16(a.data(), b.data()); }
Vc_INTRINSIC AVX2::   int_m operator< (AVX2::   int_v a, AVX2::   int_v b) { return AVX::cmplt_epi32(a.data(), b.data()); }
Vc_INTRINSIC AVX2::  uint_m operator< (AVX2::  uint_v a, AVX2::  uint_v b) { return AVX::cmplt_epu32(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: short_m operator< (AVX2:: short_v a, AVX2:: short_v b) { return AVX::cmplt_epi16(a.data(), b.data()); }
Vc_INTRINSIC AVX2::ushort_m operator< (AVX2::ushort_v a, AVX2::ushort_v b) { return AVX::cmplt_epu16(a.data(), b.data()); }
#endif
// Element-wise bitwise and arithmetic operators, forwarding to the
// type-dispatched Detail helpers (the trailing T() tag selects the
// per-entry-type implementation).
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator^(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
    return xor_(a.data(), b.data());
}
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator&(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
    return and_(a.data(), b.data());
}
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator|(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
    return or_(a.data(), b.data());
}
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator+(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
    return add(a.data(), b.data(), T());
}
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator-(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
    return sub(a.data(), b.data(), T());
}
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator*(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
    return mul(a.data(), b.data(), T());
}
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator/(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
    return div(a.data(), b.data(), T());
}
// ushort has no native SIMD division: convert each 128-bit half to 8 floats,
// divide, and convert back. The float -> 16-bit conversion used by
// convert<float, short> saturates at the signed range, so whenever any
// quotient in a half exceeds 32767 that half takes the unsigned conversion
// path instead.
Vc_INTRINSIC AVX2::Vector<ushort> operator/(AVX2::Vector<ushort> a,
                                            AVX2::Vector<ushort> b)
{
    using namespace AVX;
    const __m256 lo = _mm256_div_ps(convert<ushort, float>(lo128(a.data())),
                                    convert<ushort, float>(lo128(b.data())));
    const __m256 hi = _mm256_div_ps(convert<ushort, float>(hi128(a.data())),
                                    convert<ushort, float>(hi128(b.data())));
    const float_v threshold = 32767.f;
    using Detail::operator>;
    const __m128i loShort = (Vc_IS_UNLIKELY((float_v(lo) > threshold).isNotEmpty()))
                                ? convert<float, ushort>(lo)
                                : convert<float, short>(lo);
    const __m128i hiShort = (Vc_IS_UNLIKELY((float_v(hi) > threshold).isNotEmpty()))
                                ? convert<float, ushort>(hi)
                                : convert<float, short>(hi);
    return concat(loShort, hiShort);
}
- template <typename T>
- Vc_INTRINSIC enable_if<std::is_integral<T>::value, AVX2::Vector<T>> operator%(
- AVX2::Vector<T> a, AVX2::Vector<T> b)
- {
- return a - a / b * b;
- }
- }
// generate(): evaluates gen(0), gen(1), ... in ascending index order (each
// into its own named temporary, fixing the evaluation order) and packs the
// results into a vector.
template <> template <typename G> Vc_INTRINSIC AVX2::double_v AVX2::double_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    const auto tmp2 = gen(2);
    const auto tmp3 = gen(3);
    return _mm256_setr_pd(tmp0, tmp1, tmp2, tmp3);
}
template <> template <typename G> Vc_INTRINSIC AVX2::float_v AVX2::float_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    const auto tmp2 = gen(2);
    const auto tmp3 = gen(3);
    const auto tmp4 = gen(4);
    const auto tmp5 = gen(5);
    const auto tmp6 = gen(6);
    const auto tmp7 = gen(7);
    return _mm256_setr_ps(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
}
#ifdef Vc_IMPL_AVX2
// Integer generate() specializations; same ordered-evaluation pattern as the
// floating-point versions above.
template <> template <typename G> Vc_INTRINSIC AVX2::int_v AVX2::int_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    const auto tmp2 = gen(2);
    const auto tmp3 = gen(3);
    const auto tmp4 = gen(4);
    const auto tmp5 = gen(5);
    const auto tmp6 = gen(6);
    const auto tmp7 = gen(7);
    return _mm256_setr_epi32(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
}
template <> template <typename G> Vc_INTRINSIC AVX2::uint_v AVX2::uint_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    const auto tmp2 = gen(2);
    const auto tmp3 = gen(3);
    const auto tmp4 = gen(4);
    const auto tmp5 = gen(5);
    const auto tmp6 = gen(6);
    const auto tmp7 = gen(7);
    return _mm256_setr_epi32(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
}
template <> template <typename G> Vc_INTRINSIC AVX2::short_v AVX2::short_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    const auto tmp2 = gen(2);
    const auto tmp3 = gen(3);
    const auto tmp4 = gen(4);
    const auto tmp5 = gen(5);
    const auto tmp6 = gen(6);
    const auto tmp7 = gen(7);
    const auto tmp8 = gen(8);
    const auto tmp9 = gen(9);
    const auto tmp10 = gen(10);
    const auto tmp11 = gen(11);
    const auto tmp12 = gen(12);
    const auto tmp13 = gen(13);
    const auto tmp14 = gen(14);
    const auto tmp15 = gen(15);
    return _mm256_setr_epi16(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15);
}
template <> template <typename G> Vc_INTRINSIC AVX2::ushort_v AVX2::ushort_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    const auto tmp2 = gen(2);
    const auto tmp3 = gen(3);
    const auto tmp4 = gen(4);
    const auto tmp5 = gen(5);
    const auto tmp6 = gen(6);
    const auto tmp7 = gen(7);
    const auto tmp8 = gen(8);
    const auto tmp9 = gen(9);
    const auto tmp10 = gen(10);
    const auto tmp11 = gen(11);
    const auto tmp12 = gen(12);
    const auto tmp13 = gen(13);
    const auto tmp14 = gen(14);
    const auto tmp15 = gen(15);
    return _mm256_setr_epi16(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15);
}
#endif
// Special-initializer constructors.
// Zero: value-initializes the register storage.
template <typename T> Vc_INTRINSIC Vector<T, VectorAbi::Avx>::Vector(VectorSpecialInitializerZero) : d{} {}
// One: per-entry-type broadcast of the value 1.
template <> Vc_INTRINSIC Vector<double, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_pd()) {}
template <> Vc_INTRINSIC Vector< float, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_ps()) {}
#ifdef Vc_IMPL_AVX2
template <> Vc_INTRINSIC Vector<   int, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epi32()) {}
template <> Vc_INTRINSIC Vector<  uint, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epu32()) {}
template <> Vc_INTRINSIC Vector< short, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epi16()) {}
template <> Vc_INTRINSIC Vector<ushort, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epu16()) {}
template <> Vc_INTRINSIC Vector< schar, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epi8()) {}
template <> Vc_INTRINSIC Vector< uchar, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epu8()) {}
#endif
// IndexesFromZero: aligned load of the precomputed 0, 1, 2, ... table. The
// float and double specializations load from the int table (the delegated
// load constructor handles the conversion).
template <typename T>
Vc_ALWAYS_INLINE Vector<T, VectorAbi::Avx>::Vector(
    VectorSpecialInitializerIndexesFromZero)
    : Vector(AVX::IndexesFromZeroData<T>::address(), Vc::Aligned)
{
}
template <>
Vc_ALWAYS_INLINE Vector<float, VectorAbi::Avx>::Vector(VectorSpecialInitializerIndexesFromZero)
    : Vector(AVX::IndexesFromZeroData<int>::address(), Vc::Aligned)
{
}
template <>
Vc_ALWAYS_INLINE Vector<double, VectorAbi::Avx>::Vector(VectorSpecialInitializerIndexesFromZero)
    : Vector(AVX::IndexesFromZeroData<int>::address(), Vc::Aligned)
{
}
// Loads (and possibly converts) Size elements of SrcT from mem into this
// vector, honoring the prefetch hints carried by Flags.
// NOTE(review): load_concept<SrcT, Flags>::type is presumably a void-like
// enable_if alias — there is no return statement; confirm in the declaration.
// The #ifndef works around MSVC's rejection of the `template` disambiguator.
template <typename DstT>
template <typename SrcT, typename Flags>
Vc_INTRINSIC typename Vector<DstT, VectorAbi::Avx>::
#ifndef Vc_MSVC
    template
#endif
    load_concept<SrcT, Flags>::type Vector<DstT, VectorAbi::Avx>::load(const SrcT *mem, Flags flags)
{
    Common::handleLoadPrefetches(mem, flags);
    d.v() = Detail::load<VectorType, DstT>(mem, flags);
}
// Sets the whole vector to zero.
template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::setZero()
{
    data() = Detail::zero<VectorType>();
}
// Zeroes only the lanes selected by mask k (andnot clears bits where k is set).
template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::setZero(const Mask &k)
{
    data() = Detail::andnot_(k.data(), data());
}
// Zeroes the lanes NOT selected by k (keeps bits only where k is set).
template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::setZeroInverted(const Mask &k)
{
    data() = Detail::and_(k.data(), data());
}
// Sets every lane to a quiet NaN: the all-ones bit pattern has the exponent
// field fully set and a non-zero mantissa, which is a QNaN for float/double.
template<> Vc_INTRINSIC void Vector<double, VectorAbi::Avx>::setQnan()
{
    data() = Detail::allone<VectorType>();
}
// Sets the lanes selected by k to QNaN by OR-ing in the all-ones mask bits.
template<> Vc_INTRINSIC void Vector<double, VectorAbi::Avx>::setQnan(MaskArgument k)
{
    data() = _mm256_or_pd(data(), k.dataD());
}
template<> Vc_INTRINSIC void Vector<float, VectorAbi::Avx>::setQnan()
{
    data() = Detail::allone<VectorType>();
}
template<> Vc_INTRINSIC void Vector<float, VectorAbi::Avx>::setQnan(MaskArgument k)
{
    data() = _mm256_or_ps(data(), k.dataF());
}
// Stores the vector to mem (flags select aligned/streaming behavior and
// carry prefetch hints, handled up front).
template <typename T>
template <typename U,
          typename Flags,
          typename>
Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::store(U *mem, Flags flags) const
{
    Common::handleStorePrefetches(mem, flags);
    HV::template store<Flags>(mem, data());
}
// Masked store: only the lanes selected by mask are written to mem.
template <typename T>
template <typename U,
          typename Flags,
          typename>
Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::store(U *mem, Mask mask, Flags flags) const
{
    Common::handleStorePrefetches(mem, flags);
    HV::template store<Flags>(mem, data(), mask.data());
}
#ifdef Vc_IMPL_AVX2
// Per-lane shifts by a vector of counts. 32-bit lanes use the AVX2 variable
// shift intrinsics; note >> is arithmetic (srav) for signed int and logical
// (srlv) for uint. No 16-bit variable-shift intrinsic exists, so short/ushort
// fall back to an element-wise generate() loop.
template <> Vc_ALWAYS_INLINE AVX2::Vector< int> Vector< int, VectorAbi::Avx>::operator<<(AsArg x) const { return _mm256_sllv_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE AVX2::Vector< uint> Vector< uint, VectorAbi::Avx>::operator<<(AsArg x) const { return _mm256_sllv_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE AVX2::Vector< int> Vector< int, VectorAbi::Avx>::operator>>(AsArg x) const { return _mm256_srav_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE AVX2::Vector< uint> Vector< uint, VectorAbi::Avx>::operator>>(AsArg x) const { return _mm256_srlv_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE AVX2::Vector< short> Vector< short, VectorAbi::Avx>::operator<<(AsArg x) const { return generate([&](int i) { return get(*this, i) << get(x, i); }); }
template <> Vc_ALWAYS_INLINE AVX2::Vector<ushort> Vector<ushort, VectorAbi::Avx>::operator<<(AsArg x) const { return generate([&](int i) { return get(*this, i) << get(x, i); }); }
template <> Vc_ALWAYS_INLINE AVX2::Vector< short> Vector< short, VectorAbi::Avx>::operator>>(AsArg x) const { return generate([&](int i) { return get(*this, i) >> get(x, i); }); }
template <> Vc_ALWAYS_INLINE AVX2::Vector<ushort> Vector<ushort, VectorAbi::Avx>::operator>>(AsArg x) const { return generate([&](int i) { return get(*this, i) >> get(x, i); }); }
// Compound assignment forms, restricted to integral element types.
template <typename T>
Vc_ALWAYS_INLINE AVX2::Vector<T> &Vector<T, VectorAbi::Avx>::operator<<=(AsArg x)
{
    static_assert(std::is_integral<T>::value,
                  "bitwise-operators can only be used with Vectors of integral type");
    return *this = *this << x;
}
template <typename T>
Vc_ALWAYS_INLINE AVX2::Vector<T> &Vector<T, VectorAbi::Avx>::operator>>=(AsArg x)
{
    static_assert(std::is_integral<T>::value,
                  "bitwise-operators can only be used with Vectors of integral type");
    return *this = *this >> x;
}
#endif
// Shifts of all lanes by the same scalar count; the per-element-type
// dispatch happens in Detail::shiftLeft / Detail::shiftRight via the T() tag.
template<typename T> Vc_ALWAYS_INLINE AVX2::Vector<T> &Vector<T, VectorAbi::Avx>::operator>>=(int shift) {
    d.v() = Detail::shiftRight(d.v(), shift, T());
    return *static_cast<AVX2::Vector<T> *>(this);
}
template<typename T> Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::operator>>(int shift) const {
    return Detail::shiftRight(d.v(), shift, T());
}
template<typename T> Vc_ALWAYS_INLINE AVX2::Vector<T> &Vector<T, VectorAbi::Avx>::operator<<=(int shift) {
    d.v() = Detail::shiftLeft(d.v(), shift, T());
    return *static_cast<AVX2::Vector<T> *>(this);
}
template<typename T> Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::operator<<(int shift) const {
    return Detail::shiftLeft(d.v(), shift, T());
}
// Mask of lanes whose sign bit is set. Isolate the sign bit, then arithmetic
// shift right by 31 broadcasts it across the whole 32-bit lane (all-ones or
// all-zeros), which is exactly the mask representation.
Vc_INTRINSIC Vc_CONST AVX2::float_m isnegative(AVX2::float_v x)
{
    return AVX::avx_cast<__m256>(AVX::srai_epi32<31>(
        AVX::avx_cast<__m256i>(_mm256_and_ps(AVX::setsignmask_ps(), x.data()))));
}
// double: the 32-bit shift only fills the high half of each 64-bit lane, so
// the odd 32-bit elements (which hold the sign) are duplicated into the even
// ones via the permute to cover the full 64-bit lane.
Vc_INTRINSIC Vc_CONST AVX2::double_m isnegative(AVX2::double_v x)
{
    return Mem::permute<X1, X1, X3, X3>(AVX::avx_cast<__m256>(AVX::srai_epi32<31>(
        AVX::avx_cast<__m256i>(_mm256_and_pd(AVX::setsignmask_pd(), x.data())))));
}
// Unmasked gathers: element i is read from address[Scale * indexes[i]] and
// the results are assembled with a setr intrinsic (no hardware gather used).
#define Vc_GATHER_IMPL(V_)                                                \
    template <>                                                           \
    template <class MT, class IT, int Scale>                              \
    inline void AVX2::V_::gatherImplementation(                           \
        const Common::GatherArguments<MT, IT, Scale> &args)
// One scaled, converted load per lane.
#define Vc_M(i_) static_cast<value_type>(args.address[Scale * args.indexes[i_]])
Vc_GATHER_IMPL(double_v) { d.v() = _mm256_setr_pd(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3)); }
Vc_GATHER_IMPL(float_v)
{
    d.v() = _mm256_setr_ps(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5), Vc_M(6),
                           Vc_M(7));
}
#ifdef Vc_IMPL_AVX2
Vc_GATHER_IMPL(int_v)
{
    d.v() = _mm256_setr_epi32(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5),
                              Vc_M(6), Vc_M(7));
}
Vc_GATHER_IMPL(uint_v)
{
    d.v() = _mm256_setr_epi32(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5),
                              Vc_M(6), Vc_M(7));
}
Vc_GATHER_IMPL(short_v)
{
    d.v() = _mm256_setr_epi16(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5),
                              Vc_M(6), Vc_M(7), Vc_M(8), Vc_M(9), Vc_M(10), Vc_M(11),
                              Vc_M(12), Vc_M(13), Vc_M(14), Vc_M(15));
}
Vc_GATHER_IMPL(ushort_v)
{
    d.v() = _mm256_setr_epi16(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5),
                              Vc_M(6), Vc_M(7), Vc_M(8), Vc_M(9), Vc_M(10), Vc_M(11),
                              Vc_M(12), Vc_M(13), Vc_M(14), Vc_M(15));
}
#endif
#undef Vc_M
#undef Vc_GATHER_IMPL
// Masked gather: only lanes selected by mask are loaded. The gather strategy
// (set/bit-scan/popcount/simple loop) is chosen at compile time via the
// Vc_USE_* configuration macros and dispatched through an integral_constant
// tag to Common::executeGather.
template <class T>
template <class MT, class IT, int Scale>
inline void Vector<T, VectorAbi::Avx>::gatherImplementation(
    const Common::GatherArguments<MT, IT, Scale> &args, MaskArgument mask)
{
    const auto *mem = args.address;
    // Pre-scale the index vector so the strategies below can index directly.
    const auto indexes = Scale * args.indexes;
    using Selector = std::integral_constant < Common::GatherScatterImplementation,
#ifdef Vc_USE_SET_GATHERS
          Traits::is_simd_vector<IT>::value ? Common::GatherScatterImplementation::SetIndexZero :
#endif
#ifdef Vc_USE_BSF_GATHERS
                                              Common::GatherScatterImplementation::BitScanLoop
#elif defined Vc_USE_POPCNT_BSF_GATHERS
              Common::GatherScatterImplementation::PopcntSwitch
#else
                                  Common::GatherScatterImplementation::SimpleLoop
#endif
                                                  > ;
    Common::executeGather(Selector(), *this, mem, indexes, mask);
}
// Unmasked scatter: store every lane to mem[indexes[i]] with a fully
// unrolled per-element loop.
template <typename T>
template <typename MT, typename IT>
inline void Vector<T, VectorAbi::Avx>::scatterImplementation(MT *mem, IT &&indexes) const
{
    Common::unrolled_loop<std::size_t, 0, Size>([&](std::size_t i) { mem[indexes[i]] = d.m(i); });
}
// Masked scatter: strategy selection mirrors the masked gather above.
template <typename T>
template <typename MT, typename IT>
inline void Vector<T, VectorAbi::Avx>::scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const
{
    using Selector = std::integral_constant < Common::GatherScatterImplementation,
#ifdef Vc_USE_SET_GATHERS
          Traits::is_simd_vector<IT>::value ? Common::GatherScatterImplementation::SetIndexZero :
#endif
#ifdef Vc_USE_BSF_GATHERS
                                              Common::GatherScatterImplementation::BitScanLoop
#elif defined Vc_USE_POPCNT_BSF_GATHERS
              Common::GatherScatterImplementation::PopcntSwitch
#else
                                  Common::GatherScatterImplementation::SimpleLoop
#endif
                                                  > ;
    Common::executeScatter(Selector(), *this, mem, std::forward<IT>(indexes), mask);
}
// Unary negation. When GCC-style builtin vector types are available the
// compiler's own vector negation is used; otherwise Detail::negate dispatches
// on sizeof(T) to the appropriate intrinsic sequence.
#ifdef Vc_USE_BUILTIN_VECTOR_TYPES
template<typename T> Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::operator-() const
{
    return VectorType(-d.builtin());
}
#else
template<typename T> Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::operator-() const
{
    return Detail::negate(d.v(), std::integral_constant<std::size_t, sizeof(T)>());
}
#endif
// Returns {broadcast of the minimum, index of its first occurrence}:
// reduce to the min, then find the first lane comparing equal to it.
template <typename T>
Vc_INTRINSIC std::pair<Vector<T, VectorAbi::Avx>, int>
Vector<T, VectorAbi::Avx>::minIndex() const
{
    AVX2::Vector<T> x = min();
    return std::make_pair(x, (*this == x).firstOne());
}
// Same scheme for the maximum.
template <typename T>
Vc_INTRINSIC std::pair<Vector<T, VectorAbi::Avx>, int>
Vector<T, VectorAbi::Avx>::maxIndex() const
{
    AVX2::Vector<T> x = max();
    return std::make_pair(x, (*this == x).firstOne());
}
// float specialization: a log2(Size) tournament reduction that carries the
// lane indexes along with the values. Each round compares the current
// candidates against a permuted copy (swap 128-bit halves, then 64-bit pairs,
// then adjacent elements) and blends both value and index by the comparison
// mask, so after three rounds lane 0 holds the minimum and its index.
template <> Vc_INTRINSIC std::pair<AVX2::float_v, int> AVX2::float_v::minIndex() const
{
    __m256 x = d.v();
    __m256 idx = Vector<float>::IndexesFromZero().data();
    __m256 y = Mem::permute128<X1, X0>(x);
    __m256 idy = Mem::permute128<X1, X0>(idx);
    __m256 less = AVX::cmplt_ps(x, y);
    x = _mm256_blendv_ps(y, x, less);
    idx = _mm256_blendv_ps(idy, idx, less);
    y = Reg::permute<X2, X3, X0, X1>(x);
    idy = Reg::permute<X2, X3, X0, X1>(idx);
    less = AVX::cmplt_ps(x, y);
    x = _mm256_blendv_ps(y, x, less);
    idx = _mm256_blendv_ps(idy, idx, less);
    y = Reg::permute<X1, X0, X3, X2>(x);
    idy = Reg::permute<X1, X0, X3, X2>(idx);
    less = AVX::cmplt_ps(x, y);
    idx = _mm256_blendv_ps(idy, idx, less);
    // The index is still a float bit pattern of a small integer; read lane 0.
    const auto index = _mm_cvtsi128_si32(AVX::avx_cast<__m128i>(idx));
#ifdef Vc_GNU_ASM
    // NOTE(review): empty asm barrier — presumably forces the index extraction
    // to be scheduled before the final blend; confirm intent before removing.
    __asm__ __volatile__("");
#endif
    x = _mm256_blendv_ps(y, x, less);
    return std::make_pair(x, index);
}
- template<typename T> Vc_ALWAYS_INLINE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::partialSum() const
- {
- AVX2::Vector<T> tmp = *this;
- if (Size > 1) tmp += tmp.shifted(-1);
- if (Size > 2) tmp += tmp.shifted(-2);
- if (Size > 4) tmp += tmp.shifted(-4);
- if (Size > 8) tmp += tmp.shifted(-8);
- if (Size > 16) tmp += tmp.shifted(-16);
- return tmp;
- }
// Masked reductions: fill a temporary with the operation's neutral element
// (numeric max for min, numeric min for max, 1 for product, 0 for sum),
// overwrite the masked lanes with this vector's values, then reduce fully.
template<typename T> Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Avx>::EntryType Vector<T, VectorAbi::Avx>::min(MaskArgument m) const
{
    AVX2::Vector<T> tmp = std::numeric_limits<AVX2::Vector<T> >::max();
    tmp(m) = *this;
    return tmp.min();
}
template<typename T> Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Avx>::EntryType Vector<T, VectorAbi::Avx>::max(MaskArgument m) const
{
    AVX2::Vector<T> tmp = std::numeric_limits<AVX2::Vector<T> >::min();
    tmp(m) = *this;
    return tmp.max();
}
template<typename T> Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Avx>::EntryType Vector<T, VectorAbi::Avx>::product(MaskArgument m) const
{
    AVX2::Vector<T> tmp(Vc::One);
    tmp(m) = *this;
    return tmp.product();
}
template<typename T> Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Avx>::EntryType Vector<T, VectorAbi::Avx>::sum(MaskArgument m) const
{
    AVX2::Vector<T> tmp(Vc::Zero);
    tmp(m) = *this;
    return tmp.sum();
}
namespace Detail
{
// Extracts the unbiased binary exponent of each lane as a float: shift the
// IEEE exponent field down (23 bits for float), subtract the bias (0x7f),
// convert to float. Processes the two 128-bit halves separately because the
// shifts here are SSE intrinsics. Assumes the wrapper's x >= 0 precondition.
Vc_INTRINSIC Vc_CONST __m256 exponent(__m256 v)
{
    using namespace AVX;
    __m128i tmp0 = _mm_srli_epi32(avx_cast<__m128i>(v), 23);
    __m128i tmp1 = _mm_srli_epi32(avx_cast<__m128i>(hi128(v)), 23);
    tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x7f));
    tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x7f));
    return _mm256_cvtepi32_ps(concat(tmp0, tmp1));
}
// double: 52-bit shift and 0x3ff bias; after the 64-bit shift the upper 32
// bits of each lane are zero, so a 32-bit subtract suffices. The shuffle
// compacts the four low 32-bit results before the int->double conversion.
Vc_INTRINSIC Vc_CONST __m256d exponent(__m256d v)
{
    using namespace AVX;
    __m128i tmp0 = _mm_srli_epi64(avx_cast<__m128i>(v), 52);
    __m128i tmp1 = _mm_srli_epi64(avx_cast<__m128i>(hi128(v)), 52);
    tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x3ff));
    tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x3ff));
    return _mm256_cvtepi32_pd(avx_cast<__m128i>(Mem::shuffle<X0, X2, Y0, Y2>(avx_cast<__m128>(tmp0), avx_cast<__m128>(tmp1))));
}
}
// Public exponent() wrappers: assert the non-negative-input precondition in
// debug builds, then delegate to the bit-twiddling implementation above.
Vc_INTRINSIC Vc_CONST AVX2::float_v exponent(AVX2::float_v x)
{
    using Detail::operator>=;
    Vc_ASSERT((x >= x.Zero()).isFull());
    return Detail::exponent(x.data());
}
Vc_INTRINSIC Vc_CONST AVX2::double_v exponent(AVX2::double_v x)
{
    using Detail::operator>=;
    Vc_ASSERT((x >= x.Zero()).isFull());
    return Detail::exponent(x.data());
}
// Advances the global RandomState with a rand48-style linear congruential
// step (multiplier 0xdeece66d, increment 11) and returns 256 bits of state.
// Two interleaved state streams are kept: the second stream's high half is
// XOR-mixed into the first to improve the returned bits. Without AVX2 the
// same scheme runs on four 128-bit SSE streams and concatenates two of them.
static Vc_ALWAYS_INLINE __m256i _doRandomStep()
{
    using Detail::operator*;
    using Detail::operator+;
#ifdef Vc_IMPL_AVX2
    using AVX2::uint_v;
    uint_v state0(&Common::RandomState[0]);
    uint_v state1(&Common::RandomState[uint_v::Size]);
    (state1 * uint_v(0xdeece66du) + uint_v(11)).store(&Common::RandomState[uint_v::Size]);
    uint_v(Detail::xor_((state0 * uint_v(0xdeece66du) + uint_v(11)).data(),
                        _mm256_srli_epi32(state1.data(), 16)))
        .store(&Common::RandomState[0]);
    return state0.data();
#else
    using SSE::uint_v;
    uint_v state0(&Common::RandomState[0]);
    uint_v state1(&Common::RandomState[uint_v::Size]);
    uint_v state2(&Common::RandomState[2 * uint_v::Size]);
    uint_v state3(&Common::RandomState[3 * uint_v::Size]);
    (state2 * uint_v(0xdeece66du) + uint_v(11))
        .store(&Common::RandomState[2 * uint_v::Size]);
    (state3 * uint_v(0xdeece66du) + uint_v(11))
        .store(&Common::RandomState[3 * uint_v::Size]);
    uint_v(Detail::xor_((state0 * uint_v(0xdeece66du) + uint_v(11)).data(),
                        _mm_srli_epi32(state2.data(), 16)))
        .store(&Common::RandomState[0]);
    uint_v(Detail::xor_((state1 * uint_v(0xdeece66du) + uint_v(11)).data(),
                        _mm_srli_epi32(state3.data(), 16)))
        .store(&Common::RandomState[uint_v::Size]);
    return AVX::concat(state0.data(), state1.data());
#endif
}
#ifdef Vc_IMPL_AVX2
// Integer vectors: one raw step of the global RNG is the result.
template<typename T> Vc_ALWAYS_INLINE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::Random()
{
    return {_doRandomStep()};
}
#endif
// float: drop the random bits into the mantissa of 1.0f (srli by 2 keeps the
// bits below the exponent field), giving a uniform value in [1, 2), then
// subtract 1 to map into [0, 1).
template <> Vc_ALWAYS_INLINE AVX2::float_v AVX2::float_v::Random()
{
    return HT::sub(Detail::or_(_cast(AVX::srli_epi32<2>(_doRandomStep())), HT::one()),
                   HT::one());
}
// double: read 256 bits of state, then advance it scalar-wise as four 64-bit
// lanes with the full drand48 LCG constant 0x5deece66d (the 32-bit paths use
// its truncation 0xdeece66d). The pre-step state, shifted right by 12 so the
// random bits land in the 52-bit mantissa, is OR-ed with 1.0 to produce a
// uniform value in [1, 2), then shifted into [0, 1) by subtracting 1.
template<> Vc_ALWAYS_INLINE AVX2::double_v AVX2::double_v::Random()
{
    const __m256i state = Detail::load(&Common::RandomState[0], Vc::Aligned,
                                       Detail::LoadTag<__m256i, int>());
    for (size_t k = 0; k < 8; k += 2) {
        typedef unsigned long long uint64 Vc_MAY_ALIAS;
        const uint64 stateX = *aliasing_cast<uint64>(&Common::RandomState[k]);
        *aliasing_cast<uint64>(&Common::RandomState[k]) = (stateX * 0x5deece66dull + 11);
    }
    return HT::sub(Detail::or_(_cast(AVX::srli_epi64<12>(state)), HT::one()), HT::one());
}
// Shifts lane contents by `amount` positions (zero-filling); dispatch on the
// element type happens inside Detail::shifted.
template<typename T> Vc_INTRINSIC AVX2::Vector<T> Vector<T, VectorAbi::Avx>::shifted(int amount) const
{
    return Detail::shifted<EntryType>(d.v(), amount);
}
// Half-register concatenation helpers used by shifted(amount, shiftIn) when
// the shift is exactly half the vector: take the upper half of `left` and the
// lower half of `right`, for 128-bit and 256-bit registers respectively.
template <typename VectorType>
Vc_INTRINSIC Vc_CONST VectorType shifted_shortcut(VectorType left, VectorType right, Common::WidthT<__m128>)
{
    return Mem::shuffle<X2, X3, Y0, Y1>(left, right);
}
template <typename VectorType>
Vc_INTRINSIC Vc_CONST VectorType shifted_shortcut(VectorType left, VectorType right, Common::WidthT<__m256>)
{
    return Mem::shuffle128<X1, Y0>(left, right);
}
// Shift with fill: lanes shifted out of *this are replaced by lanes from
// shiftIn. When the compiler can prove `amount` constant (GCC builtin), the
// common cases use single alignr/permute instruction sequences; otherwise the
// generic fallback combines two zero-filling shifts with OR.
template<typename T> Vc_INTRINSIC AVX2::Vector<T> Vector<T, VectorAbi::Avx>::shifted(int amount, Vector shiftIn) const
{
#ifdef __GNUC__
    if (__builtin_constant_p(amount)) {
        const __m256i a = AVX::avx_cast<__m256i>(d.v());
        const __m256i b = AVX::avx_cast<__m256i>(shiftIn.d.v());
        // A shift of exactly half the vector is a plain half-register swap.
        if (amount * 2 == int(Size)) {
            return shifted_shortcut(d.v(), shiftIn.d.v(), WidthT());
        }
        if (amount * 2 == -int(Size)) {
            return shifted_shortcut(shiftIn.d.v(), d.v(), WidthT());
        }
        // Small positive shifts: byte-wise alignr across the lane boundary.
        // With AVX2 the cross-lane neighbor is built by permute2x128; without
        // it, each 128-bit half is stitched with SSE alignr.
        switch (amount) {
        case 1:
            return AVX::avx_cast<VectorType>(
#ifdef Vc_IMPL_AVX2
                _mm256_alignr_epi8(_mm256_permute2x128_si256(a, b, 0x21), a,
                                   sizeof(EntryType))
#else
                AVX::concat(
                    _mm_alignr_epi8(AVX::hi128(a), AVX::lo128(a), sizeof(EntryType)),
                    _mm_alignr_epi8(AVX::lo128(b), AVX::hi128(a), sizeof(EntryType)))
#endif
                    );
        case 2:
            return AVX::avx_cast<VectorType>(
#ifdef Vc_IMPL_AVX2
                _mm256_alignr_epi8(_mm256_permute2x128_si256(a, b, 0x21), a,
                                   2 * sizeof(EntryType))
#else
                AVX::concat(
                    _mm_alignr_epi8(AVX::hi128(a), AVX::lo128(a), 2 * sizeof(EntryType)),
                    _mm_alignr_epi8(AVX::lo128(b), AVX::hi128(a), 2 * sizeof(EntryType)))
#endif
                    );
        case 3:
            // Only valid when 3 elements fit strictly inside a half register.
            if (6u < Size) {
                return AVX::avx_cast<VectorType>(
#ifdef Vc_IMPL_AVX2
                    _mm256_alignr_epi8(_mm256_permute2x128_si256(a, b, 0x21), a,
                                       3 * sizeof(EntryType))
#else
                    AVX::concat(_mm_alignr_epi8(AVX::hi128(a), AVX::lo128(a),
                                                3 * sizeof(EntryType)),
                                _mm_alignr_epi8(AVX::lo128(b), AVX::hi128(a),
                                                3 * sizeof(EntryType)))
#endif
                        );
            }
        }
    }
#endif
    // Generic path: zero-fill shift of *this OR-ed with the complementary
    // shift of shiftIn.
    using Detail::operator|;
    return shifted(amount) | (amount > 0 ?
                              shiftIn.shifted(amount - Size) :
                              shiftIn.shifted(Size + amount));
}
// Cyclic rotation of the lanes by `amount`; per-type work is in Detail.
template<typename T> Vc_INTRINSIC AVX2::Vector<T> Vector<T, VectorAbi::Avx>::rotated(int amount) const
{
    return Detail::rotated<EntryType, size()>(d.v(), amount);
}
// Returns a lane-sorted copy (ascending); the sorting network lives in Detail.
template <typename T>
Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Avx> Vector<T, VectorAbi::Avx>::sorted()
    const
{
    return Detail::sorted(*this);
}
// Interleaving: AVX unpacklo/unpackhi operate within each 128-bit half, so a
// final shuffle128 recombines the halves — <X0, Y0> picks the two low halves
// (interleaveLow) and <X1, Y1> the two high halves (interleaveHigh).
template <> Vc_INTRINSIC AVX2::double_v AVX2::double_v::interleaveLow(AVX2::double_v x) const
{
    return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_pd(data(), x.data()),
                                   _mm256_unpackhi_pd(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::double_v AVX2::double_v::interleaveHigh(AVX2::double_v x) const
{
    return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_pd(data(), x.data()),
                                   _mm256_unpackhi_pd(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::float_v AVX2::float_v::interleaveLow(AVX2::float_v x) const
{
    return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_ps(data(), x.data()),
                                   _mm256_unpackhi_ps(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::float_v AVX2::float_v::interleaveHigh(AVX2::float_v x) const
{
    return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_ps(data(), x.data()),
                                   _mm256_unpackhi_ps(data(), x.data()));
}
#ifdef Vc_IMPL_AVX2
// Same pattern for the AVX2 integer element widths.
template <> Vc_INTRINSIC AVX2::int_v AVX2::int_v::interleaveLow ( AVX2::int_v x) const {
    return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_epi32(data(), x.data()),
                                   _mm256_unpackhi_epi32(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::int_v AVX2::int_v::interleaveHigh( AVX2::int_v x) const {
    return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_epi32(data(), x.data()),
                                   _mm256_unpackhi_epi32(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::uint_v AVX2::uint_v::interleaveLow ( AVX2::uint_v x) const {
    return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_epi32(data(), x.data()),
                                   _mm256_unpackhi_epi32(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::uint_v AVX2::uint_v::interleaveHigh( AVX2::uint_v x) const {
    return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_epi32(data(), x.data()),
                                   _mm256_unpackhi_epi32(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::short_v AVX2::short_v::interleaveLow ( AVX2::short_v x) const {
    return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_epi16(data(), x.data()),
                                   _mm256_unpackhi_epi16(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::short_v AVX2::short_v::interleaveHigh( AVX2::short_v x) const {
    return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_epi16(data(), x.data()),
                                   _mm256_unpackhi_epi16(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::ushort_v AVX2::ushort_v::interleaveLow (AVX2::ushort_v x) const {
    return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_epi16(data(), x.data()),
                                   _mm256_unpackhi_epi16(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::ushort_v AVX2::ushort_v::interleaveHigh(AVX2::ushort_v x) const {
    return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_epi16(data(), x.data()),
                                   _mm256_unpackhi_epi16(data(), x.data()));
}
#endif
// Lane reversal: reverse within each 128-bit half, then swap the halves.
template <> Vc_INTRINSIC Vc_PURE AVX2::double_v AVX2::double_v::operator[](Permutation::ReversedTag) const
{
    return Mem::permute128<X1, X0>(Mem::permute<X1, X0, X3, X2>(d.v()));
}
template <> Vc_INTRINSIC Vc_PURE AVX2::float_v AVX2::float_v::operator[](Permutation::ReversedTag) const
{
    return Mem::permute128<X1, X0>(Mem::permute<X3, X2, X1, X0>(d.v()));
}
#ifdef Vc_IMPL_AVX2
template <>
Vc_INTRINSIC Vc_PURE AVX2::int_v AVX2::int_v::operator[](Permutation::ReversedTag) const
{
    return Mem::permute128<X1, X0>(Mem::permute<X3, X2, X1, X0>(d.v()));
}
template <>
Vc_INTRINSIC Vc_PURE AVX2::uint_v AVX2::uint_v::operator[](Permutation::ReversedTag) const
{
    return Mem::permute128<X1, X0>(Mem::permute<X3, X2, X1, X0>(d.v()));
}
// 16-bit lanes: reverse each 64-bit quarter with permuteHi/permuteLo, swap
// the quarters with a double-typed shuffle, then swap the 128-bit halves.
template <>
Vc_INTRINSIC Vc_PURE AVX2::short_v AVX2::short_v::operator[](
    Permutation::ReversedTag) const
{
    return Mem::permute128<X1, X0>(AVX::avx_cast<__m256i>(Mem::shuffle<X1, Y0, X3, Y2>(
        AVX::avx_cast<__m256d>(Mem::permuteHi<X7, X6, X5, X4>(d.v())),
        AVX::avx_cast<__m256d>(Mem::permuteLo<X3, X2, X1, X0>(d.v())))));
}
template <>
Vc_INTRINSIC Vc_PURE AVX2::ushort_v AVX2::ushort_v::operator[](
    Permutation::ReversedTag) const
{
    return Mem::permute128<X1, X0>(AVX::avx_cast<__m256i>(Mem::shuffle<X1, Y0, X3, Y2>(
        AVX::avx_cast<__m256d>(Mem::permuteHi<X7, X6, X5, X4>(d.v())),
        AVX::avx_cast<__m256d>(Mem::permuteLo<X3, X2, X1, X0>(d.v())))));
}
#endif
- template <> Vc_INTRINSIC AVX2::float_v Vector<float, VectorAbi::Avx>::operator[](const IndexType & ) const
- {
- return *this;
- #ifdef Vc_IMPL_AVX2
- #else
- #endif
- }
// Convenience wrapper over the ReversedTag subscript operator above.
template <typename T>
Vc_INTRINSIC Vc_PURE Vector<T, VectorAbi::Avx> Vector<T, VectorAbi::Avx>::reversed() const
{
    return (*this)[Permutation::Reversed];
}
// broadcast<Index>(): replicate lane Index into every lane. Split the index
// into the 128-bit half (Outer) and the position within that half (Inner),
// broadcast the half first, then the element within it.
template <> template <int Index> Vc_INTRINSIC AVX2::float_v AVX2::float_v::broadcast() const
{
    constexpr VecPos Inner = static_cast<VecPos>(Index & 0x3);
    constexpr VecPos Outer = static_cast<VecPos>((Index & 0x4) / 4);
    return Mem::permute<Inner, Inner, Inner, Inner>(Mem::permute128<Outer, Outer>(d.v()));
}
template <> template <int Index> Vc_INTRINSIC AVX2::double_v AVX2::double_v::broadcast() const
{
    constexpr VecPos Inner = static_cast<VecPos>(Index & 0x1);
    constexpr VecPos Outer = static_cast<VecPos>((Index & 0x2) / 2);
    return Mem::permute<Inner, Inner>(Mem::permute128<Outer, Outer>(d.v()));
}
- }
- #ifndef VC_AVX_SIMD_CAST_H_
- #define VC_AVX_SIMD_CAST_H_
- #ifndef VC_AVX_VECTOR_H_
- #error "Vc/avx/vector.h needs to be included before Vc/avx/simd_cast.h"
- #endif
- namespace Vc_VERSIONED_NAMESPACE
- {
// Declaration helper macros for simd_cast overloads. Each expands to the
// signature of a simd_cast taking N source vectors and SFINAE-restricted
// (via enable_if) to the given destination type. The _AVX_ variants take
// AVX2:: source/destination names; the plain variants take fully qualified
// types. Vc_SIMD_CAST_OFFSET additionally selects a sub-vector by `offset`
// and static_asserts that the offset fits the size ratio.
#define Vc_SIMD_CAST_AVX_1(from_,to_)                                                    \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(                                                  \
        AVX2::from_ x, enable_if<std::is_same<To, AVX2::to_>::value> = nullarg)
#define Vc_SIMD_CAST_AVX_2(from_,to_)                                                    \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(                                                  \
        AVX2::from_ x0, AVX2::from_ x1,                                                  \
        enable_if<std::is_same<To, AVX2::to_>::value> = nullarg)
#define Vc_SIMD_CAST_AVX_3(from_,to_)                                                    \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(                                                  \
        AVX2::from_ x0, AVX2::from_ x1, AVX2::from_ x2,                                  \
        enable_if<std::is_same<To, AVX2::to_>::value> = nullarg)
#define Vc_SIMD_CAST_AVX_4(from_,to_)                                                    \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(                                                  \
        AVX2::from_ x0, AVX2::from_ x1, AVX2::from_ x2, AVX2::from_ x3,                  \
        enable_if<std::is_same<To, AVX2::to_>::value> = nullarg)
#define Vc_SIMD_CAST_1(from_,to_)                                                        \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(                                                  \
        from_ x, enable_if<std::is_same<To, to_>::value> = nullarg)
#define Vc_SIMD_CAST_2(from_,to_)                                                        \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(                                                  \
        from_ x0, from_ x1, enable_if<std::is_same<To, to_>::value> = nullarg)
#define Vc_SIMD_CAST_3(from_,to_)                                                        \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(                                                  \
        from_ x0, from_ x1, from_ x2, enable_if<std::is_same<To, to_>::value> = nullarg)
#define Vc_SIMD_CAST_4(from_,to_)                                                        \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(                                                  \
        from_ x0, from_ x1, from_ x2, from_ x3,                                          \
        enable_if<std::is_same<To, to_>::value> = nullarg)
#define Vc_SIMD_CAST_5(from_,to_)                                                        \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(                                                  \
        from_ x0, from_ x1, from_ x2, from_ x3, from_ x4,                                \
        enable_if<std::is_same<To, to_>::value> = nullarg)
#define Vc_SIMD_CAST_6(from_,to_)                                                        \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(                                                  \
        from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, from_ x5,                      \
        enable_if<std::is_same<To, to_>::value> = nullarg)
#define Vc_SIMD_CAST_7(from_,to_)                                                        \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(                                                  \
        from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, from_ x5, from_ x6,            \
        enable_if<std::is_same<To, to_>::value> = nullarg)
#define Vc_SIMD_CAST_8(from_,to_)                                                        \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(                                                  \
        from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, from_ x5, from_ x6, from_ x7,  \
        enable_if<std::is_same<To, to_>::value> = nullarg)
#define Vc_SIMD_CAST_OFFSET(from_,to_,offset_)                                           \
    static_assert(from_::size() >= to_::size() * (offset_ + 1),                          \
                  "this offset cannot exist for this type combination");                 \
    template <typename To, int offset>                                                   \
    Vc_INTRINSIC Vc_CONST To simd_cast(                                                  \
        from_ x,                                                                         \
        enable_if<(offset == offset_ && std::is_same<To, to_>::value)> = nullarg)
// Generic forward declarations for casting 1/2/3/4/8 SSE vectors to an AVX
// vector whose element count matches the corresponding SSE vector's size.
template <typename To, typename From>
Vc_INTRINSIC Vc_CONST To
simd_cast(From x, enable_if<(AVX2::is_vector<To>::value && SSE::is_vector<From>::value &&
                             SSE::Vector<typename To::EntryType>::Size == To::Size)> =
                      nullarg);
template <typename To, typename From>
Vc_INTRINSIC Vc_CONST To simd_cast(
    From x0, From x1,
    enable_if<(AVX2::is_vector<To>::value && SSE::is_vector<From>::value &&
               SSE::Vector<typename To::EntryType>::Size == To::Size)> = nullarg);
template <typename To, typename From>
Vc_INTRINSIC Vc_CONST To simd_cast(
    From x0, From x1, From x2,
    enable_if<(AVX2::is_vector<To>::value && SSE::is_vector<From>::value &&
               SSE::Vector<typename To::EntryType>::Size == To::Size)> = nullarg);
template <typename To, typename From>
Vc_INTRINSIC Vc_CONST To simd_cast(
    From x0, From x1, From x2, From x3,
    enable_if<(AVX2::is_vector<To>::value && SSE::is_vector<From>::value &&
               SSE::Vector<typename To::EntryType>::Size == To::Size)> = nullarg);
template <typename To, typename From>
Vc_INTRINSIC Vc_CONST To simd_cast(
    From x0, From x1, From x2, From x3, From x4, From x5, From x6, From x7,
    enable_if<(AVX2::is_vector<To>::value && SSE::is_vector<From>::value &&
               SSE::Vector<typename To::EntryType>::Size == To::Size)> = nullarg);
// --- AVX -> AVX casts (integer vector combinations need AVX2) ---
Vc_SIMD_CAST_AVX_1( float_v, double_v);
Vc_SIMD_CAST_AVX_1(double_v,  float_v);
Vc_SIMD_CAST_AVX_2(double_v,  float_v);
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_AVX_1(   int_v, double_v);
Vc_SIMD_CAST_AVX_1(  uint_v, double_v);
Vc_SIMD_CAST_AVX_1( short_v, double_v);
Vc_SIMD_CAST_AVX_1(ushort_v, double_v);
Vc_SIMD_CAST_AVX_1(   int_v,  float_v);
Vc_SIMD_CAST_AVX_1(  uint_v,  float_v);
Vc_SIMD_CAST_AVX_1( short_v,  float_v);
Vc_SIMD_CAST_AVX_1(ushort_v,  float_v);
Vc_SIMD_CAST_AVX_1(double_v,    int_v);
Vc_SIMD_CAST_AVX_1( float_v,    int_v);
Vc_SIMD_CAST_AVX_1(  uint_v,    int_v);
Vc_SIMD_CAST_AVX_1( short_v,    int_v);
Vc_SIMD_CAST_AVX_1(ushort_v,    int_v);
Vc_SIMD_CAST_AVX_2(double_v,    int_v);
Vc_SIMD_CAST_AVX_1(double_v,   uint_v);
Vc_SIMD_CAST_AVX_1( float_v,   uint_v);
Vc_SIMD_CAST_AVX_1(   int_v,   uint_v);
Vc_SIMD_CAST_AVX_1( short_v,   uint_v);
Vc_SIMD_CAST_AVX_1(ushort_v,   uint_v);
Vc_SIMD_CAST_AVX_2(double_v,   uint_v);
Vc_SIMD_CAST_AVX_1(double_v,  short_v);
Vc_SIMD_CAST_AVX_1( float_v,  short_v);
Vc_SIMD_CAST_AVX_1(   int_v,  short_v);
Vc_SIMD_CAST_AVX_1(  uint_v,  short_v);
Vc_SIMD_CAST_AVX_1(ushort_v,  short_v);
Vc_SIMD_CAST_AVX_2(double_v,  short_v);
Vc_SIMD_CAST_AVX_2( float_v,  short_v);
Vc_SIMD_CAST_AVX_2(   int_v,  short_v);
Vc_SIMD_CAST_AVX_2(  uint_v,  short_v);
Vc_SIMD_CAST_AVX_3(double_v,  short_v);
Vc_SIMD_CAST_AVX_4(double_v,  short_v);
Vc_SIMD_CAST_AVX_1(double_v, ushort_v);
Vc_SIMD_CAST_AVX_1( float_v, ushort_v);
Vc_SIMD_CAST_AVX_1(   int_v, ushort_v);
Vc_SIMD_CAST_AVX_1(  uint_v, ushort_v);
Vc_SIMD_CAST_AVX_1( short_v, ushort_v);
Vc_SIMD_CAST_AVX_2(double_v, ushort_v);
Vc_SIMD_CAST_AVX_2( float_v, ushort_v);
Vc_SIMD_CAST_AVX_2(   int_v, ushort_v);
Vc_SIMD_CAST_AVX_2(  uint_v, ushort_v);
Vc_SIMD_CAST_AVX_3(double_v, ushort_v);
Vc_SIMD_CAST_AVX_4(double_v, ushort_v);
#endif
// --- 1 SSE vector -> AVX vector ---
Vc_SIMD_CAST_1(SSE::double_v, AVX2::double_v);
Vc_SIMD_CAST_1(SSE:: float_v, AVX2::double_v);
Vc_SIMD_CAST_1(SSE::   int_v, AVX2::double_v);
Vc_SIMD_CAST_1(SSE::  uint_v, AVX2::double_v);
Vc_SIMD_CAST_1(SSE:: short_v, AVX2::double_v);
Vc_SIMD_CAST_1(SSE::ushort_v, AVX2::double_v);
Vc_SIMD_CAST_1(SSE::double_v, AVX2:: float_v);
Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: float_v);
Vc_SIMD_CAST_1(SSE::   int_v, AVX2:: float_v);
Vc_SIMD_CAST_1(SSE::  uint_v, AVX2:: float_v);
Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: float_v);
Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: float_v);
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_1(SSE::double_v, AVX2::   int_v);
Vc_SIMD_CAST_1(SSE::double_v, AVX2::  uint_v);
Vc_SIMD_CAST_1(SSE::double_v, AVX2:: short_v);
Vc_SIMD_CAST_1(SSE::double_v, AVX2::ushort_v);
Vc_SIMD_CAST_1(SSE:: float_v, AVX2::   int_v);
Vc_SIMD_CAST_1(SSE:: float_v, AVX2::  uint_v);
Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: short_v);
Vc_SIMD_CAST_1(SSE:: float_v, AVX2::ushort_v);
Vc_SIMD_CAST_1(SSE::   int_v, AVX2::   int_v);
Vc_SIMD_CAST_1(SSE::  uint_v, AVX2::   int_v);
Vc_SIMD_CAST_1(SSE:: short_v, AVX2::   int_v);
Vc_SIMD_CAST_1(SSE::ushort_v, AVX2::   int_v);
Vc_SIMD_CAST_1(SSE::   int_v, AVX2::  uint_v);
Vc_SIMD_CAST_1(SSE::  uint_v, AVX2::  uint_v);
Vc_SIMD_CAST_1(SSE:: short_v, AVX2::  uint_v);
Vc_SIMD_CAST_1(SSE::ushort_v, AVX2::  uint_v);
Vc_SIMD_CAST_1(SSE::   int_v, AVX2:: short_v);
Vc_SIMD_CAST_1(SSE::  uint_v, AVX2:: short_v);
Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: short_v);
Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: short_v);
Vc_SIMD_CAST_1(SSE::   int_v, AVX2::ushort_v);
Vc_SIMD_CAST_1(SSE::  uint_v, AVX2::ushort_v);
Vc_SIMD_CAST_1(SSE:: short_v, AVX2::ushort_v);
Vc_SIMD_CAST_1(SSE::ushort_v, AVX2::ushort_v);
#endif
// --- 2 SSE vectors -> AVX vector ---
Vc_SIMD_CAST_2(SSE::double_v, AVX2::double_v);
Vc_SIMD_CAST_2(SSE::double_v, AVX2:: float_v);
Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: float_v);
Vc_SIMD_CAST_2(SSE::   int_v, AVX2:: float_v);
Vc_SIMD_CAST_2(SSE::  uint_v, AVX2:: float_v);
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_2(SSE::double_v, AVX2::   int_v);
Vc_SIMD_CAST_2(SSE::double_v, AVX2::  uint_v);
Vc_SIMD_CAST_2(SSE::double_v, AVX2:: short_v);
Vc_SIMD_CAST_2(SSE::double_v, AVX2::ushort_v);
Vc_SIMD_CAST_2(SSE:: float_v, AVX2::   int_v);
Vc_SIMD_CAST_2(SSE:: float_v, AVX2::  uint_v);
Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: short_v);
Vc_SIMD_CAST_2(SSE:: float_v, AVX2::ushort_v);
Vc_SIMD_CAST_2(SSE::   int_v, AVX2::   int_v);
Vc_SIMD_CAST_2(SSE::  uint_v, AVX2::   int_v);
Vc_SIMD_CAST_2(SSE::   int_v, AVX2::  uint_v);
Vc_SIMD_CAST_2(SSE::  uint_v, AVX2::  uint_v);
Vc_SIMD_CAST_2(SSE::   int_v, AVX2:: short_v);
Vc_SIMD_CAST_2(SSE::  uint_v, AVX2:: short_v);
Vc_SIMD_CAST_2(SSE:: short_v, AVX2:: short_v);
Vc_SIMD_CAST_2(SSE::ushort_v, AVX2:: short_v);
Vc_SIMD_CAST_2(SSE::   int_v, AVX2::ushort_v);
Vc_SIMD_CAST_2(SSE::  uint_v, AVX2::ushort_v);
Vc_SIMD_CAST_2(SSE:: short_v, AVX2::ushort_v);
Vc_SIMD_CAST_2(SSE::ushort_v, AVX2::ushort_v);
#endif
// --- 3 and 4 SSE vectors -> AVX vector ---
Vc_SIMD_CAST_3(SSE::double_v, AVX2:: float_v);
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_3(SSE::double_v, AVX2::   int_v);
Vc_SIMD_CAST_3(SSE::double_v, AVX2::  uint_v);
Vc_SIMD_CAST_3(SSE::double_v, AVX2:: short_v);
Vc_SIMD_CAST_3(SSE::double_v, AVX2::ushort_v);
Vc_SIMD_CAST_3(SSE:: float_v, AVX2:: short_v);
Vc_SIMD_CAST_3(SSE:: float_v, AVX2::ushort_v);
Vc_SIMD_CAST_3(SSE::   int_v, AVX2:: short_v);
Vc_SIMD_CAST_3(SSE::  uint_v, AVX2:: short_v);
Vc_SIMD_CAST_3(SSE::   int_v, AVX2::ushort_v);
Vc_SIMD_CAST_3(SSE::  uint_v, AVX2::ushort_v);
#endif
Vc_SIMD_CAST_4(SSE::double_v, AVX2:: float_v);
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_4(SSE::double_v, AVX2::   int_v);
Vc_SIMD_CAST_4(SSE::double_v, AVX2::  uint_v);
Vc_SIMD_CAST_4(SSE::double_v, AVX2:: short_v);
Vc_SIMD_CAST_4(SSE::double_v, AVX2::ushort_v);
Vc_SIMD_CAST_4(SSE:: float_v, AVX2:: short_v);
Vc_SIMD_CAST_4(SSE:: float_v, AVX2::ushort_v);
Vc_SIMD_CAST_4(SSE::   int_v, AVX2:: short_v);
Vc_SIMD_CAST_4(SSE::  uint_v, AVX2:: short_v);
Vc_SIMD_CAST_4(SSE::   int_v, AVX2::ushort_v);
Vc_SIMD_CAST_4(SSE::  uint_v, AVX2::ushort_v);
#endif
// --- 5..8 SSE double vectors -> AVX 16-bit vectors ---
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_5(SSE::double_v, AVX2:: short_v);
Vc_SIMD_CAST_5(SSE::double_v, AVX2::ushort_v);
#endif
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_6(SSE::double_v, AVX2:: short_v);
Vc_SIMD_CAST_6(SSE::double_v, AVX2::ushort_v);
#endif
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_7(SSE::double_v, AVX2:: short_v);
Vc_SIMD_CAST_7(SSE::double_v, AVX2::ushort_v);
#endif
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_8(SSE::double_v, AVX2:: short_v);
Vc_SIMD_CAST_8(SSE::double_v, AVX2::ushort_v);
#endif
// --- AVX vector -> SSE vector ---
Vc_SIMD_CAST_1(AVX2::double_v, SSE::double_v);
Vc_SIMD_CAST_1(AVX2::double_v, SSE:: float_v);
Vc_SIMD_CAST_1(AVX2::double_v, SSE::   int_v);
Vc_SIMD_CAST_1(AVX2::double_v, SSE::  uint_v);
Vc_SIMD_CAST_1(AVX2::double_v, SSE:: short_v);
Vc_SIMD_CAST_1(AVX2::double_v, SSE::ushort_v);
Vc_SIMD_CAST_1(AVX2:: float_v, SSE::double_v);
Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: float_v);
Vc_SIMD_CAST_1(AVX2:: float_v, SSE::   int_v);
Vc_SIMD_CAST_1(AVX2:: float_v, SSE::  uint_v);
Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: short_v);
Vc_SIMD_CAST_1(AVX2:: float_v, SSE::ushort_v);
- #ifdef Vc_IMPL_AVX2
- Vc_SIMD_CAST_1(AVX2:: int_v, SSE::double_v);
- Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: float_v);
- Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: uint_v);
- Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: int_v);
- Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: short_v);
- Vc_SIMD_CAST_1(AVX2:: int_v, SSE::ushort_v);
- Vc_SIMD_CAST_1(AVX2:: uint_v, SSE::double_v);
- Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: float_v);
- Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: int_v);
- Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: uint_v);
- Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: short_v);
- Vc_SIMD_CAST_1(AVX2:: uint_v, SSE::ushort_v);
- Vc_SIMD_CAST_1(AVX2:: short_v, SSE::double_v);
- Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: float_v);
- Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: int_v);
- Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: uint_v);
- Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: short_v);
- Vc_SIMD_CAST_1(AVX2:: short_v, SSE::ushort_v);
- Vc_SIMD_CAST_1(AVX2::ushort_v, SSE::double_v);
- Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: float_v);
- Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: int_v);
- Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: uint_v);
- Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: short_v);
- Vc_SIMD_CAST_1(AVX2::ushort_v, SSE::ushort_v);
- #endif
- Vc_SIMD_CAST_2(AVX2::double_v, SSE:: short_v);
- Vc_SIMD_CAST_2(AVX2::double_v, SSE::ushort_v);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x,
- enable_if<std::is_same<Return, AVX2::double_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x,
- enable_if<std::is_same<Return, AVX2::float_v>::value> = nullarg);
- #ifdef Vc_IMPL_AVX2
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x,
- enable_if<std::is_same<Return, AVX2::int_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x,
- enable_if<std::is_same<Return, AVX2::uint_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x,
- enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x,
- enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
- #endif
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
- enable_if<std::is_same<Return, AVX2::double_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
- enable_if<std::is_same<Return, AVX2::float_v>::value> = nullarg);
- #ifdef Vc_IMPL_AVX2
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
- enable_if<std::is_same<Return, AVX2::int_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
- enable_if<std::is_same<Return, AVX2::uint_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
- enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
- enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
- #endif
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- enable_if<std::is_same<Return, AVX2::double_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- enable_if<std::is_same<Return, AVX2::float_v>::value> = nullarg);
- #ifdef Vc_IMPL_AVX2
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- enable_if<std::is_same<Return, AVX2::int_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- enable_if<std::is_same<Return, AVX2::uint_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
- #endif
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3,
- enable_if<std::is_same<Return, AVX2::double_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3,
- enable_if<std::is_same<Return, AVX2::float_v>::value> = nullarg);
- #ifdef Vc_IMPL_AVX2
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3,
- enable_if<std::is_same<Return, AVX2::int_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3,
- enable_if<std::is_same<Return, AVX2::uint_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3,
- enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3,
- enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
- #endif
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4,
- enable_if<std::is_same<Return, AVX2::float_v>::value> = nullarg);
- #ifdef Vc_IMPL_AVX2
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4,
- enable_if<std::is_same<Return, AVX2::int_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4,
- enable_if<std::is_same<Return, AVX2::uint_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4,
- enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4,
- enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
- #endif
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- enable_if<std::is_same<Return, AVX2::float_v>::value> = nullarg);
- #ifdef Vc_IMPL_AVX2
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- enable_if<std::is_same<Return, AVX2::int_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- enable_if<std::is_same<Return, AVX2::uint_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
- #endif
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6,
- enable_if<std::is_same<Return, AVX2::float_v>::value> = nullarg);
- #ifdef Vc_IMPL_AVX2
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6,
- enable_if<std::is_same<Return, AVX2::int_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6,
- enable_if<std::is_same<Return, AVX2::uint_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6,
- enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6,
- enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
- #endif
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7,
- enable_if<std::is_same<Return, AVX2::float_v>::value> = nullarg);
- #ifdef Vc_IMPL_AVX2
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7,
- enable_if<std::is_same<Return, AVX2::int_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7,
- enable_if<std::is_same<Return, AVX2::uint_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7,
- enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7,
- enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
- #endif
- #ifdef Vc_IMPL_AVX2
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
- enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
- enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
- #endif
- #ifdef Vc_IMPL_AVX2
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
- Scalar::Vector<T> x9,
- enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
- Scalar::Vector<T> x9,
- enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
- #endif
- #ifdef Vc_IMPL_AVX2
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
- Scalar::Vector<T> x9, Scalar::Vector<T> x10,
- enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
- Scalar::Vector<T> x9, Scalar::Vector<T> x10,
- enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
- #endif
- #ifdef Vc_IMPL_AVX2
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
- Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
- enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
- Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
- enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
- #endif
- #ifdef Vc_IMPL_AVX2
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
- Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
- Scalar::Vector<T> x12,
- enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
- Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
- Scalar::Vector<T> x12,
- enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
- #endif
- #ifdef Vc_IMPL_AVX2
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
- Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
- Scalar::Vector<T> x12, Scalar::Vector<T> x13,
- enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
- Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
- Scalar::Vector<T> x12, Scalar::Vector<T> x13,
- enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
- #endif
- #ifdef Vc_IMPL_AVX2
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
- Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
- Scalar::Vector<T> x12, Scalar::Vector<T> x13, Scalar::Vector<T> x14,
- enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
- Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
- Scalar::Vector<T> x12, Scalar::Vector<T> x13, Scalar::Vector<T> x14,
- enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
- #endif
- #ifdef Vc_IMPL_AVX2
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
- Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
- Scalar::Vector<T> x12, Scalar::Vector<T> x13, Scalar::Vector<T> x14,
- Scalar::Vector<T> x15,
- enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
- Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
- Scalar::Vector<T> x12, Scalar::Vector<T> x13, Scalar::Vector<T> x14,
- Scalar::Vector<T> x15,
- enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
- #endif
- template <typename To, typename FromT>
- Vc_INTRINSIC Vc_CONST To simd_cast(AVX2::Vector<FromT> x,
- enable_if<Scalar::is_vector<To>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(const AVX2::Mask<T> &k, enable_if<AVX2::is_mask<Return>::value> = nullarg);
- Vc_SIMD_CAST_AVX_2(double_m, float_m);
- #ifdef Vc_IMPL_AVX2
- Vc_SIMD_CAST_AVX_2(double_m, int_m);
- Vc_SIMD_CAST_AVX_2(double_m, uint_m);
- Vc_SIMD_CAST_AVX_2(double_m, short_m);
- Vc_SIMD_CAST_AVX_2(double_m, ushort_m);
- Vc_SIMD_CAST_AVX_2( float_m, short_m);
- Vc_SIMD_CAST_AVX_2( float_m, ushort_m);
- Vc_SIMD_CAST_AVX_2( int_m, short_m);
- Vc_SIMD_CAST_AVX_2( int_m, ushort_m);
- Vc_SIMD_CAST_AVX_2( uint_m, short_m);
- Vc_SIMD_CAST_AVX_2( uint_m, ushort_m);
- #endif
- #ifdef Vc_IMPL_AVX2
- Vc_SIMD_CAST_AVX_4(double_m, short_m);
- Vc_SIMD_CAST_AVX_4(double_m, ushort_m);
- #endif
- Vc_SIMD_CAST_1(SSE::double_m, AVX2::double_m);
- Vc_SIMD_CAST_1(SSE::double_m, AVX2:: float_m);
- #ifdef Vc_IMPL_AVX2
- Vc_SIMD_CAST_1(SSE::double_m, AVX2:: int_m);
- Vc_SIMD_CAST_1(SSE::double_m, AVX2:: uint_m);
- Vc_SIMD_CAST_1(SSE::double_m, AVX2:: short_m);
- Vc_SIMD_CAST_1(SSE::double_m, AVX2::ushort_m);
- #endif
- Vc_SIMD_CAST_1(SSE:: float_m, AVX2::double_m);
- Vc_SIMD_CAST_1(SSE:: int_m, AVX2::double_m);
- Vc_SIMD_CAST_1(SSE:: uint_m, AVX2::double_m);
- Vc_SIMD_CAST_1(SSE:: short_m, AVX2::double_m);
- Vc_SIMD_CAST_1(SSE::ushort_m, AVX2::double_m);
- Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: float_m);
- Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: float_m);
- Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: float_m);
- Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: float_m);
- Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: float_m);
- #ifdef Vc_IMPL_AVX2
- Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: int_m);
- Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: uint_m);
- Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: int_m);
- Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: uint_m);
- Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: int_m);
- Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: uint_m);
- Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: short_m);
- Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: short_m);
- Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: short_m);
- Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: short_m);
- Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: short_m);
- Vc_SIMD_CAST_1(SSE:: float_m, AVX2::ushort_m);
- Vc_SIMD_CAST_1(SSE:: int_m, AVX2::ushort_m);
- Vc_SIMD_CAST_1(SSE:: uint_m, AVX2::ushort_m);
- Vc_SIMD_CAST_1(SSE:: short_m, AVX2::ushort_m);
- Vc_SIMD_CAST_1(SSE::ushort_m, AVX2::ushort_m);
- Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: int_m);
- Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: uint_m);
- Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: int_m);
- Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: uint_m);
- #endif
- Vc_SIMD_CAST_2(SSE::double_m, AVX2::double_m);
- Vc_SIMD_CAST_2(SSE::double_m, AVX2:: float_m);
- Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: float_m);
- Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: float_m);
- Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: float_m);
- #ifdef Vc_IMPL_AVX2
- Vc_SIMD_CAST_2(SSE::double_m, AVX2:: int_m);
- Vc_SIMD_CAST_2(SSE::double_m, AVX2:: uint_m);
- Vc_SIMD_CAST_2(SSE::double_m, AVX2:: short_m);
- Vc_SIMD_CAST_2(SSE::double_m, AVX2::ushort_m);
- Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: int_m);
- Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: uint_m);
- Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: short_m);
- Vc_SIMD_CAST_2(SSE:: float_m, AVX2::ushort_m);
- Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: int_m);
- Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: uint_m);
- Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: short_m);
- Vc_SIMD_CAST_2(SSE:: int_m, AVX2::ushort_m);
- Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: int_m);
- Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: uint_m);
- Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: short_m);
- Vc_SIMD_CAST_2(SSE:: uint_m, AVX2::ushort_m);
- Vc_SIMD_CAST_2(SSE:: short_m, AVX2:: short_m);
- Vc_SIMD_CAST_2(SSE:: short_m, AVX2::ushort_m);
- Vc_SIMD_CAST_2(SSE::ushort_m, AVX2:: short_m);
- Vc_SIMD_CAST_2(SSE::ushort_m, AVX2::ushort_m);
- #endif
- Vc_SIMD_CAST_4(SSE::double_m, AVX2:: float_m);
- #ifdef Vc_IMPL_AVX2
- Vc_SIMD_CAST_4(SSE::double_m, AVX2:: int_m);
- Vc_SIMD_CAST_4(SSE::double_m, AVX2:: uint_m);
- Vc_SIMD_CAST_4(SSE::double_m, AVX2:: short_m);
- Vc_SIMD_CAST_4(SSE::double_m, AVX2::ushort_m);
- Vc_SIMD_CAST_4(SSE:: float_m, AVX2:: short_m);
- Vc_SIMD_CAST_4(SSE:: float_m, AVX2::ushort_m);
- Vc_SIMD_CAST_4(SSE:: int_m, AVX2:: short_m);
- Vc_SIMD_CAST_4(SSE:: int_m, AVX2::ushort_m);
- Vc_SIMD_CAST_4(SSE:: uint_m, AVX2:: short_m);
- Vc_SIMD_CAST_4(SSE:: uint_m, AVX2::ushort_m);
- #endif
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Mask<T> k,
- enable_if<AVX2::is_mask<Return>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Mask<T> k0, Scalar::Mask<T> k1,
- enable_if<AVX2::is_mask<Return>::value> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return simd_cast(
- Scalar::Mask<T> k0, Scalar::Mask<T> k1, Scalar::Mask<T> k2, Scalar::Mask<T> k3,
- enable_if<(AVX2::is_mask<Return>::value && Return::Size >= 4)> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return simd_cast(
- Scalar::Mask<T> k0, Scalar::Mask<T> k1, Scalar::Mask<T> k2, Scalar::Mask<T> k3,
- Scalar::Mask<T> k4, Scalar::Mask<T> k5, Scalar::Mask<T> k6, Scalar::Mask<T> k7,
- enable_if<(AVX2::is_mask<Return>::value && Return::Size >= 8)> = nullarg);
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Mask<T> k0, Scalar::Mask<T> k1, Scalar::Mask<T> k2, Scalar::Mask<T> k3,
- Scalar::Mask<T> k4, Scalar::Mask<T> k5, Scalar::Mask<T> k6, Scalar::Mask<T> k7,
- Scalar::Mask<T> k8, Scalar::Mask<T> k9, Scalar::Mask<T> k10,
- Scalar::Mask<T> k11, Scalar::Mask<T> k12, Scalar::Mask<T> k13,
- Scalar::Mask<T> k14, Scalar::Mask<T> k15,
- enable_if<(AVX2::is_mask<Return>::value && Return::Size >= 16)> = nullarg);
- Vc_SIMD_CAST_1(AVX2::double_m, SSE::double_m);
- Vc_SIMD_CAST_1(AVX2::double_m, SSE:: float_m);
- Vc_SIMD_CAST_1(AVX2::double_m, SSE:: int_m);
- Vc_SIMD_CAST_1(AVX2::double_m, SSE:: uint_m);
- Vc_SIMD_CAST_1(AVX2::double_m, SSE:: short_m);
- Vc_SIMD_CAST_1(AVX2::double_m, SSE::ushort_m);
- Vc_SIMD_CAST_1(AVX2:: float_m, SSE::double_m);
- Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: float_m);
- Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: int_m);
- Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: uint_m);
- Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: short_m);
- Vc_SIMD_CAST_1(AVX2:: float_m, SSE::ushort_m);
- #ifdef Vc_IMPL_AVX2
- Vc_SIMD_CAST_1(AVX2:: int_m, SSE::double_m);
- Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: float_m);
- Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: int_m);
- Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: uint_m);
- Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: short_m);
- Vc_SIMD_CAST_1(AVX2:: int_m, SSE::ushort_m);
- Vc_SIMD_CAST_1(AVX2:: uint_m, SSE::double_m);
- Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: float_m);
- Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: int_m);
- Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: uint_m);
- Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: short_m);
- Vc_SIMD_CAST_1(AVX2:: uint_m, SSE::ushort_m);
- Vc_SIMD_CAST_1(AVX2:: short_m, SSE::double_m);
- Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: float_m);
- Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: int_m);
- Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: uint_m);
- Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: short_m);
- Vc_SIMD_CAST_1(AVX2:: short_m, SSE::ushort_m);
- Vc_SIMD_CAST_1(AVX2::ushort_m, SSE::double_m);
- Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: float_m);
- Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: int_m);
- Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: uint_m);
- Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: short_m);
- Vc_SIMD_CAST_1(AVX2::ushort_m, SSE::ushort_m);
- #endif
- Vc_SIMD_CAST_2(AVX2::double_m, SSE:: short_m);
- Vc_SIMD_CAST_2(AVX2::double_m, SSE::ushort_m);
- template <typename To, typename FromT>
- Vc_INTRINSIC Vc_CONST To simd_cast(AVX2::Mask<FromT> x,
- enable_if<Scalar::is_mask<To>::value> = nullarg);
- template <typename Return, int offset, typename From>
- Vc_INTRINSIC Vc_CONST enable_if<
- (offset == 0 &&
- ((AVX2::is_vector<From>::value && !Scalar::is_vector<Return>::value &&
- Traits::is_simd_vector<Return>::value && !Traits::isSimdArray<Return>::value) ||
- (AVX2::is_mask<From>::value && !Scalar::is_mask<Return>::value &&
- Traits::is_simd_mask<Return>::value &&
- !Traits::isSimdMaskArray<Return>::value))),
- Return>
- simd_cast(const From &x);
- template <typename Return, int offset, typename From>
- Vc_INTRINSIC Vc_CONST Return simd_cast(
- const From &x,
- enable_if<offset == 0 && ((SSE::is_vector<From>::value &&
- AVX2::is_vector<Return>::value) ||
- (SSE::is_mask<From>::value &&
- AVX2::is_mask<Return>::value))> = nullarg);
- template <typename Return, int offset, typename T>
- Vc_INTRINSIC Vc_CONST enable_if<(AVX2::is_vector<Return>::value && offset != 0),
- Return>
- simd_cast(AVX2::Vector<T> x);
- template <typename Return, int offset, typename T>
- Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_vector<Return>::value &&
- sizeof(AVX2::Vector<T>) == 32),
- Return>
- simd_cast(AVX2::Vector<T> x);
- template <typename Return, int offset, typename T>
- Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_vector<Return>::value &&
- sizeof(AVX2::Vector<T>) == 16),
- Return>
- simd_cast(AVX2::Vector<T> x);
- Vc_SIMD_CAST_OFFSET(SSE:: short_v, AVX2::double_v, 1);
- Vc_SIMD_CAST_OFFSET(SSE::ushort_v, AVX2::double_v, 1);
- Vc_SIMD_CAST_OFFSET(SSE:: short_m, AVX2::double_m, 1);
- Vc_SIMD_CAST_OFFSET(SSE::ushort_m, AVX2::double_m, 1);
- template <typename Return, int offset, typename T>
- Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_mask<Return>::value &&
- sizeof(AVX2::Mask<T>) == 32),
- Return>
- simd_cast(AVX2::Mask<T> x);
- template <typename Return, int offset, typename T>
- Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_mask<Return>::value &&
- sizeof(AVX2::Mask<T>) == 16),
- Return>
- simd_cast(AVX2::Mask<T> x);
// ---------------------------------------------------------------------------
// Helper macros that spell out simd_cast overload *signatures*.  Each macro is
// redefined here (#undef of the earlier declaration-only variant first) so
// that an invocation followed by a braced body becomes a function definition.
// The numeric suffix is the number of source vectors the overload consumes.
// ---------------------------------------------------------------------------

// Cast N AVX2 vectors of type from_ to one AVX2 vector of type to_.
#undef Vc_SIMD_CAST_AVX_1
#define Vc_SIMD_CAST_AVX_1(from_,to_)                                                    \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(AVX2::from_ x,                                    \
                                       enable_if<std::is_same<To, AVX2::to_>::value>)
#undef Vc_SIMD_CAST_AVX_2
// Two sources must fit into the destination: guarded by a static_assert.
#define Vc_SIMD_CAST_AVX_2(from_,to_)                                                    \
    static_assert(AVX2::from_::size() * 2 <= AVX2::to_::size(),                          \
                  "this type combination is wrong");                                     \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(AVX2::from_ x0, AVX2::from_ x1,                   \
                                       enable_if<std::is_same<To, AVX2::to_>::value>)
#undef Vc_SIMD_CAST_AVX_3
#define Vc_SIMD_CAST_AVX_3(from_,to_)                                                    \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(AVX2::from_ x0, AVX2::from_ x1, AVX2::from_ x2,   \
                                       enable_if<std::is_same<To, AVX2::to_>::value>)
#undef Vc_SIMD_CAST_AVX_4
#define Vc_SIMD_CAST_AVX_4(from_,to_)                                                    \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(AVX2::from_ x0, AVX2::from_ x1, AVX2::from_ x2,   \
                                       AVX2::from_ x3,                                   \
                                       enable_if<std::is_same<To, AVX2::to_>::value>)
// Cast N vectors of (fully qualified) type from_ to one vector of type to_.
// Used for the cross-ISA combinations (SSE <-> AVX2, Scalar -> AVX2).
#undef Vc_SIMD_CAST_1
#define Vc_SIMD_CAST_1(from_,to_)                                                        \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(from_ x, enable_if<std::is_same<To, to_>::value>)
#undef Vc_SIMD_CAST_2
#define Vc_SIMD_CAST_2(from_,to_)                                                        \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1,                               \
                                       enable_if<std::is_same<To, to_>::value>)
#undef Vc_SIMD_CAST_3
#define Vc_SIMD_CAST_3(from_,to_)                                                        \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2,                     \
                                       enable_if<std::is_same<To, to_>::value>)
#undef Vc_SIMD_CAST_4
#define Vc_SIMD_CAST_4(from_,to_)                                                        \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, from_ x3,           \
                                       enable_if<std::is_same<To, to_>::value>)
#undef Vc_SIMD_CAST_5
#define Vc_SIMD_CAST_5(from_,to_)                                                        \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, \
                                       enable_if<std::is_same<To, to_>::value>)
#undef Vc_SIMD_CAST_6
#define Vc_SIMD_CAST_6(from_,to_)                                                        \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, \
                                       from_ x5,                                         \
                                       enable_if<std::is_same<To, to_>::value>)
#undef Vc_SIMD_CAST_7
#define Vc_SIMD_CAST_7(from_,to_)                                                        \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, \
                                       from_ x5, from_ x6,                               \
                                       enable_if<std::is_same<To, to_>::value>)
#undef Vc_SIMD_CAST_8
#define Vc_SIMD_CAST_8(from_,to_)                                                        \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, \
                                       from_ x5, from_ x6, from_ x7,                     \
                                       enable_if<std::is_same<To, to_>::value>)
// Extract the offset_-th chunk of to_::size() entries out of a from_ vector.
// The static_assert rejects offsets past the end of the source vector.
#undef Vc_SIMD_CAST_OFFSET
#define Vc_SIMD_CAST_OFFSET(from_,to_,offset_)                                           \
    static_assert(from_::size() >= to_::size() * (offset_ + 1),                          \
                  "this offset cannot exist for this type combination");                 \
    template <typename To, int offset>                                                   \
    Vc_INTRINSIC Vc_CONST To simd_cast(                                                  \
        from_ x, enable_if<(offset == offset_ && std::is_same<To, to_>::value)>)
// ---------------------------------------------------------------------------
// Generic SSE -> AVX2 casts for the case where the AVX2 destination type has
// exactly as many entries as the SSE vector with the same EntryType (i.e. the
// destination is effectively backed by 128-bit storage).  Each overload
// forwards to the corresponding SSE-level simd_cast and wraps the raw
// register via .data(); overloads differ only in the number of inputs.
// NOTE(review): this relies on To being constructible from the SSE register
// type — confirm against the AVX2::Vector constructors.
// ---------------------------------------------------------------------------
template <typename To, typename From>
Vc_INTRINSIC Vc_CONST To
simd_cast(From x, enable_if<(AVX2::is_vector<To>::value && SSE::is_vector<From>::value &&
                             SSE::Vector<typename To::EntryType>::Size == To::Size)>)
{
    return simd_cast<SSE::Vector<typename To::EntryType>>(x).data();
}
template <typename To, typename From>
Vc_INTRINSIC Vc_CONST To
simd_cast(From x0, From x1,
          enable_if<(AVX2::is_vector<To>::value && SSE::is_vector<From>::value &&
                     SSE::Vector<typename To::EntryType>::Size == To::Size)>)
{
    return simd_cast<SSE::Vector<typename To::EntryType>>(x0, x1).data();
}
template <typename To, typename From>
Vc_INTRINSIC Vc_CONST To
simd_cast(From x0, From x1, From x2,
          enable_if<(AVX2::is_vector<To>::value && SSE::is_vector<From>::value &&
                     SSE::Vector<typename To::EntryType>::Size == To::Size)>)
{
    return simd_cast<SSE::Vector<typename To::EntryType>>(x0, x1, x2).data();
}
template <typename To, typename From>
Vc_INTRINSIC Vc_CONST To
simd_cast(From x0, From x1, From x2, From x3,
          enable_if<(AVX2::is_vector<To>::value && SSE::is_vector<From>::value &&
                     SSE::Vector<typename To::EntryType>::Size == To::Size)>)
{
    return simd_cast<SSE::Vector<typename To::EntryType>>(x0, x1, x2, x3).data();
}
// 8-input variant (note: no 5/6/7-input variants exist at this level)
template <typename To, typename From>
Vc_INTRINSIC Vc_CONST To
simd_cast(From x0, From x1, From x2, From x3, From x4, From x5, From x6, From x7,
          enable_if<(AVX2::is_vector<To>::value && SSE::is_vector<From>::value &&
                     SSE::Vector<typename To::EntryType>::Size == To::Size)>)
{
    return simd_cast<SSE::Vector<typename To::EntryType>>(x0, x1, x2, x3, x4, x5, x6, x7)
        .data();
}
// ---------------------------------------------------------------------------
// AVX2 -> AVX2 casts.  Integer-vector variants are only compiled when real
// AVX2 is available (Vc_IMPL_AVX2); the float/double variants work with AVX1.
// Conventions visible below: AVX::lo128/hi128 select register halves,
// AVX::concat joins two 128-bit registers, AVX::zeroExtend widens a 128-bit
// register to 256 bits with a zeroed upper half.
// ---------------------------------------------------------------------------

// 1 vector -> double_v: only the low half of the source contributes, since
// double_v holds half as many entries.
Vc_SIMD_CAST_AVX_1( float_v, double_v) { return _mm256_cvtps_pd(AVX::lo128(x.data())); }
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_AVX_1(   int_v, double_v) { return AVX::convert< int, double>(AVX::lo128(x.data())); }
Vc_SIMD_CAST_AVX_1(  uint_v, double_v) { return AVX::convert< uint, double>(AVX::lo128(x.data())); }
Vc_SIMD_CAST_AVX_1( short_v, double_v) { return AVX::convert< short, double>(AVX::lo128(x.data())); }
Vc_SIMD_CAST_AVX_1(ushort_v, double_v) { return AVX::convert<ushort, double>(AVX::lo128(x.data())); }
#endif
// 1 vector -> float_v
Vc_SIMD_CAST_AVX_1(double_v,  float_v) { return AVX::zeroExtend(_mm256_cvtpd_ps(x.data())); }
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_AVX_1(   int_v,  float_v) { return AVX::convert< int, float>(x.data()); }
Vc_SIMD_CAST_AVX_1(  uint_v,  float_v) { return AVX::convert< uint, float>(x.data()); }
Vc_SIMD_CAST_AVX_1( short_v,  float_v) { return AVX::convert< short, float>(AVX::lo128(x.data())); }
Vc_SIMD_CAST_AVX_1(ushort_v,  float_v) { return AVX::convert<ushort, float>(AVX::lo128(x.data())); }
#endif
// 2 double_v -> 1 float_v: each double_v converts to 4 floats; concatenate.
Vc_SIMD_CAST_AVX_2(double_v,  float_v) { return AVX::concat(_mm256_cvtpd_ps(x0.data()), _mm256_cvtpd_ps(x1.data())); }
#ifdef Vc_IMPL_AVX2
// 1 vector -> int_v (truncating float->int conversions)
Vc_SIMD_CAST_AVX_1(double_v,    int_v) { return AVX::zeroExtend(_mm256_cvttpd_epi32(x.data())); }
Vc_SIMD_CAST_AVX_1( float_v,    int_v) { return _mm256_cvttps_epi32(x.data()); }
Vc_SIMD_CAST_AVX_1(  uint_v,    int_v) { return x.data(); }  // bit-identical reinterpretation
Vc_SIMD_CAST_AVX_1( short_v,    int_v) { return _mm256_cvtepi16_epi32(AVX::lo128(x.data())); }  // sign-extend
Vc_SIMD_CAST_AVX_1(ushort_v,    int_v) { return _mm256_cvtepu16_epi32(AVX::lo128(x.data())); }  // zero-extend
#endif
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_AVX_2(double_v,    int_v) { return AVX::concat(_mm256_cvttpd_epi32(x0.data()), _mm256_cvttpd_epi32(x1.data())); }
#endif
#ifdef Vc_IMPL_AVX2
// 1 vector -> uint_v
Vc_SIMD_CAST_AVX_1(double_v,   uint_v) { return AVX::zeroExtend(AVX::convert<double, uint>(x.data())); }
Vc_SIMD_CAST_AVX_1( float_v,   uint_v) {
    // Values >= 2^31 overflow the signed cvttps conversion.  Convert
    // (x - 2^31) instead and add 2^31 back in the integer domain, selecting
    // that path per lane where x >= 2^31.
    return _mm256_blendv_epi8(
        _mm256_cvttps_epi32(x.data()),
        _mm256_add_epi32(
            _mm256_cvttps_epi32(_mm256_sub_ps(x.data(), AVX::set2power31_ps())),
            AVX::set2power31_epu32()),
        _mm256_castps_si256(AVX::cmpge_ps(x.data(), AVX::set2power31_ps())));
}
Vc_SIMD_CAST_AVX_1(   int_v,   uint_v) { return x.data(); }  // bit-identical reinterpretation
Vc_SIMD_CAST_AVX_1( short_v,   uint_v) { return _mm256_cvtepi16_epi32(AVX::lo128(x.data())); }
Vc_SIMD_CAST_AVX_1(ushort_v,   uint_v) { return _mm256_cvtepu16_epi32(AVX::lo128(x.data())); }
#endif
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_AVX_2(double_v,   uint_v) { return AVX::concat(AVX::convert<double, uint>(x0.data()), AVX::convert<double, uint>(x1.data())); }
#endif
#ifdef Vc_IMPL_AVX2
// 1 vector -> short_v (signed saturating packs where applicable)
Vc_SIMD_CAST_AVX_1(double_v,  short_v) { return AVX::zeroExtend(_mm_packs_epi32(_mm256_cvttpd_epi32(x.data()), _mm_setzero_si128())); }
Vc_SIMD_CAST_AVX_1( float_v,  short_v) {
    const auto tmp = _mm256_cvttps_epi32(x.data());
    return AVX::zeroExtend(_mm_packs_epi32(AVX::lo128(tmp), AVX::hi128(tmp)));
}
Vc_SIMD_CAST_AVX_1(   int_v,  short_v) { return AVX::zeroExtend(AVX::convert< int, short>(x.data())); }
Vc_SIMD_CAST_AVX_1(  uint_v,  short_v) { return AVX::zeroExtend(AVX::convert<uint, short>(x.data())); }
Vc_SIMD_CAST_AVX_1(ushort_v,  short_v) { return x.data(); }  // bit-identical reinterpretation
#endif
#ifdef Vc_IMPL_AVX2
// 2 vectors -> short_v
Vc_SIMD_CAST_AVX_2(double_v,  short_v) {
    const auto tmp0 = _mm256_cvttpd_epi32(x0.data());
    const auto tmp1 = _mm256_cvttpd_epi32(x1.data());
    return AVX::zeroExtend(_mm_packs_epi32(tmp0, tmp1));
}
Vc_SIMD_CAST_AVX_2( float_v,  short_v) {
    // go through int_v to reuse the int -> short narrowing below
    using AVX2::short_v;
    using AVX2::int_v;
    return simd_cast<short_v>(simd_cast<int_v>(x0), simd_cast<int_v>(x1));
}
Vc_SIMD_CAST_AVX_2(   int_v,  short_v) {
    // Take the low 16 bits of each 32-bit lane (-0x80 in the shuffle mask
    // zeroes a byte), then interleave and fix the 64-bit lane order.
    const auto shuf = _mm256_setr_epi8(
        0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
        0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80);
    auto a = _mm256_shuffle_epi8(x0.data(), shuf);
    auto b = _mm256_shuffle_epi8(x1.data(), shuf);
    return Mem::permute4x64<X0, X2, X1, X3>(_mm256_unpacklo_epi64(a, b));
}
Vc_SIMD_CAST_AVX_2(  uint_v,  short_v) {
    const auto shuf = _mm256_setr_epi8(
        0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
        0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80);
    auto a = _mm256_shuffle_epi8(x0.data(), shuf);
    auto b = _mm256_shuffle_epi8(x1.data(), shuf);
    return Mem::permute4x64<X0, X2, X1, X3>(_mm256_unpacklo_epi64(a, b));
}
#endif
#ifdef Vc_IMPL_AVX2
// 3 double_v -> short_v: 12 entries, remaining 4 zero-filled
Vc_SIMD_CAST_AVX_3(double_v,  short_v) {
    const auto tmp0 = _mm256_cvttpd_epi32(x0.data());
    const auto tmp1 = _mm256_cvttpd_epi32(x1.data());
    const auto tmp2 = _mm256_cvttpd_epi32(x2.data());
    return AVX::concat(_mm_packs_epi32(tmp0, tmp1), _mm_packs_epi32(tmp2, _mm_setzero_si128()));
}
#endif
#ifdef Vc_IMPL_AVX2
// 4 double_v -> short_v: fills all 16 entries
Vc_SIMD_CAST_AVX_4(double_v,  short_v) {
    const auto tmp0 = _mm256_cvttpd_epi32(x0.data());
    const auto tmp1 = _mm256_cvttpd_epi32(x1.data());
    const auto tmp2 = _mm256_cvttpd_epi32(x2.data());
    const auto tmp3 = _mm256_cvttpd_epi32(x3.data());
    return AVX::concat(_mm_packs_epi32(tmp0, tmp1), _mm_packs_epi32(tmp2, tmp3));
}
#endif
#ifdef Vc_IMPL_AVX2
// 1 vector -> ushort_v (unsigned saturating packus where applicable)
Vc_SIMD_CAST_AVX_1(double_v, ushort_v) {
    const auto tmp = _mm256_cvttpd_epi32(x.data());
    return AVX::zeroExtend(_mm_packus_epi32(tmp, _mm_setzero_si128()));
}
Vc_SIMD_CAST_AVX_1( float_v, ushort_v) {
    const auto tmp = _mm256_cvttps_epi32(x.data());
    return AVX::zeroExtend(_mm_packus_epi32(AVX::lo128(tmp), AVX::hi128(tmp)));
}
Vc_SIMD_CAST_AVX_1(   int_v, ushort_v) { return AVX::zeroExtend(AVX::convert< int, ushort>(x.data())); }
Vc_SIMD_CAST_AVX_1(  uint_v, ushort_v) { return AVX::zeroExtend(AVX::convert<uint, ushort>(x.data())); }
Vc_SIMD_CAST_AVX_1( short_v, ushort_v) { return x.data(); }  // bit-identical reinterpretation
#endif
#ifdef Vc_IMPL_AVX2
// 2 vectors -> ushort_v
Vc_SIMD_CAST_AVX_2(double_v, ushort_v) {
    const auto tmp0 = _mm256_cvttpd_epi32(x0.data());
    const auto tmp1 = _mm256_cvttpd_epi32(x1.data());
    return AVX::zeroExtend(_mm_packus_epi32(tmp0, tmp1));
}
Vc_SIMD_CAST_AVX_2( float_v, ushort_v) {
    using AVX2::ushort_v;
    using AVX2::int_v;
    return simd_cast<ushort_v>(simd_cast<int_v>(x0), simd_cast<int_v>(x1));
}
Vc_SIMD_CAST_AVX_2(   int_v, ushort_v) {
    // unpack/repack sequence gathering the low 16 bits of each 32-bit lane,
    // with a final permute4x64 to restore entry order across lanes
    auto tmp0 = _mm256_unpacklo_epi16(x0.data(), x1.data());
    auto tmp1 = _mm256_unpackhi_epi16(x0.data(), x1.data());
    auto tmp2 = _mm256_unpacklo_epi16(tmp0, tmp1);
    auto tmp3 = _mm256_unpackhi_epi16(tmp0, tmp1);
    return Mem::permute4x64<X0, X2, X1, X3>(_mm256_unpacklo_epi16(tmp2, tmp3));
}
Vc_SIMD_CAST_AVX_2(  uint_v, ushort_v) {
    auto tmp0 = _mm256_unpacklo_epi16(x0.data(), x1.data());
    auto tmp1 = _mm256_unpackhi_epi16(x0.data(), x1.data());
    auto tmp2 = _mm256_unpacklo_epi16(tmp0, tmp1);
    auto tmp3 = _mm256_unpackhi_epi16(tmp0, tmp1);
    return Mem::permute4x64<X0, X2, X1, X3>(_mm256_unpacklo_epi16(tmp2, tmp3));
}
#endif
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_AVX_3(double_v, ushort_v) {
    const auto tmp0 = _mm256_cvttpd_epi32(x0.data());
    const auto tmp1 = _mm256_cvttpd_epi32(x1.data());
    const auto tmp2 = _mm256_cvttpd_epi32(x2.data());
    return AVX::concat(_mm_packus_epi32(tmp0, tmp1),
                       _mm_packus_epi32(tmp2, _mm_setzero_si128()));
}
#endif
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_AVX_4(double_v, ushort_v) {
    const auto tmp0 = _mm256_cvttpd_epi32(x0.data());
    const auto tmp1 = _mm256_cvttpd_epi32(x1.data());
    const auto tmp2 = _mm256_cvttpd_epi32(x2.data());
    const auto tmp3 = _mm256_cvttpd_epi32(x3.data());
    return AVX::concat(_mm_packus_epi32(tmp0, tmp1), _mm_packus_epi32(tmp2, tmp3));
}
#endif
// ---------------------------------------------------------------------------
// SSE -> AVX2 casts.  One 128-bit source fills the low half (zeroExtend);
// two sources of matching entry width are concatenated; wider fan-ins go
// through intermediate simd_casts.  Integer destinations require Vc_IMPL_AVX2.
// ---------------------------------------------------------------------------

// 1 SSE vector -> 1 AVX2 vector
Vc_SIMD_CAST_1(SSE::double_v, AVX2::double_v) { return AVX::zeroExtend(x.data()); }
Vc_SIMD_CAST_1(SSE:: float_v, AVX2::double_v) { return _mm256_cvtps_pd(x.data()); }
Vc_SIMD_CAST_1(SSE::   int_v, AVX2::double_v) { return _mm256_cvtepi32_pd(x.data()); }
// uint -> double: no unsigned conversion instruction; bias by -2^31 (integer),
// convert signed, then add 2^31 back as a double.
Vc_SIMD_CAST_1(SSE::  uint_v, AVX2::double_v) { using namespace AvxIntrinsics; return _mm256_add_pd(_mm256_cvtepi32_pd(_mm_sub_epi32(x.data(), _mm_setmin_epi32())), set1_pd(1u << 31)); }
Vc_SIMD_CAST_1(SSE:: short_v, AVX2::double_v) { return simd_cast<AVX2::double_v>(simd_cast<SSE::int_v>(x)); }
Vc_SIMD_CAST_1(SSE::ushort_v, AVX2::double_v) { return simd_cast<AVX2::double_v>(simd_cast<SSE::int_v>(x)); }
Vc_SIMD_CAST_1(SSE::double_v, AVX2:: float_v) { return AVX::zeroExtend(simd_cast<SSE:: float_v>(x).data()); }
Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: float_v) { return AVX::zeroExtend(x.data()); }
Vc_SIMD_CAST_1(SSE::   int_v, AVX2:: float_v) { return AVX::zeroExtend(_mm_cvtepi32_ps(x.data())); }
Vc_SIMD_CAST_1(SSE::  uint_v, AVX2:: float_v) { return AVX::zeroExtend(simd_cast<SSE::float_v>(x).data()); }
Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: float_v) { return AVX::convert< short, float>(x.data()); }
Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: float_v) { return AVX::convert<ushort, float>(x.data()); }
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_1(SSE::double_v, AVX2::   int_v) { return AVX::zeroExtend(simd_cast<SSE::   int_v>(x).data()); }
Vc_SIMD_CAST_1(SSE::double_v, AVX2::  uint_v) { return AVX::zeroExtend(simd_cast<SSE::  uint_v>(x).data()); }
Vc_SIMD_CAST_1(SSE::double_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast<SSE:: short_v>(x).data()); }
Vc_SIMD_CAST_1(SSE::double_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast<SSE::ushort_v>(x).data()); }
Vc_SIMD_CAST_1(SSE:: float_v, AVX2::   int_v) { return AVX::zeroExtend(simd_cast<SSE::int_v>(x).data()); }
Vc_SIMD_CAST_1(SSE:: float_v, AVX2::  uint_v) { return AVX::zeroExtend(simd_cast<SSE::uint_v>(x).data()); }
Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast<SSE::short_v>(x).data()); }
Vc_SIMD_CAST_1(SSE:: float_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast<SSE::ushort_v>(x).data()); }
Vc_SIMD_CAST_1(SSE::   int_v, AVX2::   int_v) { return AVX::zeroExtend(x.data()); }
Vc_SIMD_CAST_1(SSE::  uint_v, AVX2::   int_v) { return AVX::zeroExtend(x.data()); }
Vc_SIMD_CAST_1(SSE:: short_v, AVX2::   int_v) { return AVX::convert< short, int>(x.data()); }
Vc_SIMD_CAST_1(SSE::ushort_v, AVX2::   int_v) { return AVX::convert<ushort, int>(x.data()); }
Vc_SIMD_CAST_1(SSE::   int_v, AVX2::  uint_v) { return AVX::zeroExtend(x.data()); }
Vc_SIMD_CAST_1(SSE::  uint_v, AVX2::  uint_v) { return AVX::zeroExtend(x.data()); }
Vc_SIMD_CAST_1(SSE:: short_v, AVX2::  uint_v) { return AVX::convert< short, uint>(x.data()); }
Vc_SIMD_CAST_1(SSE::ushort_v, AVX2::  uint_v) { return AVX::convert<ushort, uint>(x.data()); }
Vc_SIMD_CAST_1(SSE::   int_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast<SSE::short_v>(x).data()); }
Vc_SIMD_CAST_1(SSE::  uint_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast<SSE::short_v>(x).data()); }
Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: short_v) { return AVX::zeroExtend(x.data()); }
Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: short_v) { return AVX::zeroExtend(x.data()); }
Vc_SIMD_CAST_1(SSE::   int_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast<SSE::ushort_v>(x).data()); }
Vc_SIMD_CAST_1(SSE::  uint_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast<SSE::ushort_v>(x).data()); }
Vc_SIMD_CAST_1(SSE:: short_v, AVX2::ushort_v) { return AVX::zeroExtend(x.data()); }
Vc_SIMD_CAST_1(SSE::ushort_v, AVX2::ushort_v) { return AVX::zeroExtend(x.data()); }
#endif
// 2 SSE vectors -> 1 AVX2 vector
Vc_SIMD_CAST_2(SSE::double_v, AVX2::double_v) { return AVX::concat(x0.data(), x1.data()); }
Vc_SIMD_CAST_2(SSE::double_v, AVX2:: float_v) { return AVX::zeroExtend(simd_cast<SSE:: float_v>(x0, x1).data()); }
Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: float_v) { return AVX::concat(x0.data(), x1.data()); }
Vc_SIMD_CAST_2(SSE::   int_v, AVX2:: float_v) { return AVX::convert< int, float>(AVX::concat(x0.data(), x1.data())); }
Vc_SIMD_CAST_2(SSE::  uint_v, AVX2:: float_v) { return AVX::convert<uint, float>(AVX::concat(x0.data(), x1.data())); }
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_2(SSE::double_v, AVX2::   int_v) { return AVX::zeroExtend(simd_cast<SSE::   int_v>(x0, x1).data()); }
Vc_SIMD_CAST_2(SSE::double_v, AVX2::  uint_v) { return AVX::zeroExtend(simd_cast<SSE::  uint_v>(x0, x1).data()); }
Vc_SIMD_CAST_2(SSE::double_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast<SSE:: short_v>(x0, x1).data()); }
Vc_SIMD_CAST_2(SSE::double_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast<SSE::ushort_v>(x0, x1).data()); }
Vc_SIMD_CAST_2(SSE:: float_v, AVX2::   int_v) { return simd_cast<AVX2::   int_v>(simd_cast<AVX2::float_v>(x0, x1)); }
Vc_SIMD_CAST_2(SSE:: float_v, AVX2::  uint_v) { return simd_cast<AVX2::uint_v>(simd_cast<AVX2::float_v>(x0, x1)); }
Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast<SSE:: short_v>(x0, x1).data()); }
Vc_SIMD_CAST_2(SSE:: float_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast<SSE::ushort_v>(x0, x1).data()); }
Vc_SIMD_CAST_2(SSE::   int_v, AVX2::   int_v) { return AVX::concat(x0.data(), x1.data()); }
Vc_SIMD_CAST_2(SSE::  uint_v, AVX2::   int_v) { return AVX::concat(x0.data(), x1.data()); }
Vc_SIMD_CAST_2(SSE::   int_v, AVX2::  uint_v) { return AVX::concat(x0.data(), x1.data()); }
Vc_SIMD_CAST_2(SSE::  uint_v, AVX2::  uint_v) { return AVX::concat(x0.data(), x1.data()); }
Vc_SIMD_CAST_2(SSE::   int_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast<SSE:: short_v>(x0, x1).data()); }
Vc_SIMD_CAST_2(SSE::  uint_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast<SSE:: short_v>(x0, x1).data()); }
Vc_SIMD_CAST_2(SSE:: short_v, AVX2:: short_v) { return AVX::concat(x0.data(), x1.data()); }
Vc_SIMD_CAST_2(SSE::ushort_v, AVX2:: short_v) { return AVX::concat(x0.data(), x1.data()); }
Vc_SIMD_CAST_2(SSE::   int_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast<SSE::ushort_v>(x0, x1).data()); }
Vc_SIMD_CAST_2(SSE::  uint_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast<SSE::ushort_v>(x0, x1).data()); }
Vc_SIMD_CAST_2(SSE:: short_v, AVX2::ushort_v) { return AVX::concat(x0.data(), x1.data()); }
Vc_SIMD_CAST_2(SSE::ushort_v, AVX2::ushort_v) { return AVX::concat(x0.data(), x1.data()); }
#endif
// 3 SSE vectors -> 1 AVX2 vector: pair up into AVX2::double_v / int_v first
Vc_SIMD_CAST_3(SSE::double_v, AVX2:: float_v) { return simd_cast<AVX2:: float_v>(simd_cast<AVX2::double_v>(x0, x1), simd_cast<AVX2::double_v>(x2)); }
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_3(SSE::double_v, AVX2::   int_v) { return simd_cast<AVX2::   int_v>(simd_cast<AVX2::double_v>(x0, x1), simd_cast<AVX2::double_v>(x2)); }
Vc_SIMD_CAST_3(SSE::double_v, AVX2::  uint_v) { return simd_cast<AVX2::uint_v>(simd_cast<AVX2::double_v>(x0, x1), simd_cast<AVX2::double_v>(x2)); }
Vc_SIMD_CAST_3(SSE::double_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast<SSE:: short_v>(x0, x1, x2).data()); }
Vc_SIMD_CAST_3(SSE::double_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast<SSE::ushort_v>(x0, x1, x2).data()); }
Vc_SIMD_CAST_3(SSE:: float_v, AVX2:: short_v) { return simd_cast<AVX2:: short_v>(simd_cast<AVX2::float_v>(x0, x1), simd_cast<AVX2::float_v>(x2)); }
Vc_SIMD_CAST_3(SSE:: float_v, AVX2::ushort_v) { return simd_cast<AVX2::ushort_v>(simd_cast<AVX2::float_v>(x0, x1), simd_cast<AVX2::float_v>(x2)); }
Vc_SIMD_CAST_3(SSE::   int_v, AVX2:: short_v) { return simd_cast<AVX2:: short_v>(simd_cast<AVX2:: int_v>(x0, x1), simd_cast<AVX2:: int_v>(x2)); }
Vc_SIMD_CAST_3(SSE::  uint_v, AVX2:: short_v) { return simd_cast<AVX2:: short_v>(simd_cast<AVX2::uint_v>(x0, x1), simd_cast<AVX2::uint_v>(x2)); }
Vc_SIMD_CAST_3(SSE::   int_v, AVX2::ushort_v) { return simd_cast<AVX2::ushort_v>(simd_cast<AVX2:: int_v>(x0, x1), simd_cast<AVX2:: int_v>(x2)); }
Vc_SIMD_CAST_3(SSE::  uint_v, AVX2::ushort_v) { return simd_cast<AVX2::ushort_v>(simd_cast<AVX2::uint_v>(x0, x1), simd_cast<AVX2::uint_v>(x2)); }
#endif
// 4 SSE vectors -> 1 AVX2 vector
Vc_SIMD_CAST_4(SSE::double_v, AVX2:: float_v) { return simd_cast<AVX2:: float_v>(simd_cast<AVX2::double_v>(x0, x1), simd_cast<AVX2::double_v>(x2, x3)); }
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_4(SSE::double_v, AVX2::   int_v) { return simd_cast<AVX2::   int_v>(simd_cast<AVX2::double_v>(x0, x1), simd_cast<AVX2::double_v>(x2, x3)); }
Vc_SIMD_CAST_4(SSE::double_v, AVX2::  uint_v) { return simd_cast<AVX2::uint_v>(simd_cast<AVX2::double_v>(x0, x1), simd_cast<AVX2::double_v>(x2, x3)); }
Vc_SIMD_CAST_4(SSE::double_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast<SSE:: short_v>(x0, x1, x2, x3).data()); }
Vc_SIMD_CAST_4(SSE::double_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast<SSE::ushort_v>(x0, x1, x2, x3).data()); }
Vc_SIMD_CAST_4(SSE:: float_v, AVX2:: short_v) { return simd_cast<AVX2:: short_v>(simd_cast<AVX2::float_v>(x0, x1), simd_cast<AVX2::float_v>(x2, x3)); }
Vc_SIMD_CAST_4(SSE:: float_v, AVX2::ushort_v) { return simd_cast<AVX2::ushort_v>(simd_cast<AVX2::float_v>(x0, x1), simd_cast<AVX2::float_v>(x2, x3)); }
Vc_SIMD_CAST_4(SSE::   int_v, AVX2:: short_v) { return simd_cast<AVX2:: short_v>(simd_cast<AVX2:: int_v>(x0, x1), simd_cast<AVX2:: int_v>(x2, x3)); }
Vc_SIMD_CAST_4(SSE::  uint_v, AVX2:: short_v) { return simd_cast<AVX2:: short_v>(simd_cast<AVX2::uint_v>(x0, x1), simd_cast<AVX2::uint_v>(x2, x3)); }
Vc_SIMD_CAST_4(SSE::   int_v, AVX2::ushort_v) { return simd_cast<AVX2::ushort_v>(simd_cast<AVX2:: int_v>(x0, x1), simd_cast<AVX2:: int_v>(x2, x3)); }
Vc_SIMD_CAST_4(SSE::  uint_v, AVX2::ushort_v) { return simd_cast<AVX2::ushort_v>(simd_cast<AVX2::uint_v>(x0, x1), simd_cast<AVX2::uint_v>(x2, x3)); }
#endif
// 5..8 SSE::double_v (2 entries each) -> one 16-entry AVX2 short/ushort vector
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_5(SSE::double_v, AVX2:: short_v) { return simd_cast<AVX2:: short_v>(simd_cast<AVX2::double_v>(x0, x1), simd_cast<AVX2::double_v>(x2, x3), simd_cast<AVX2::double_v>(x4)); }
Vc_SIMD_CAST_5(SSE::double_v, AVX2::ushort_v) { return simd_cast<AVX2::ushort_v>(simd_cast<AVX2::double_v>(x0, x1), simd_cast<AVX2::double_v>(x2, x3), simd_cast<AVX2::double_v>(x4)); }
#endif
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_6(SSE::double_v, AVX2:: short_v) { return simd_cast<AVX2:: short_v>(simd_cast<AVX2::double_v>(x0, x1), simd_cast<AVX2::double_v>(x2, x3), simd_cast<AVX2::double_v>(x4, x5)); }
Vc_SIMD_CAST_6(SSE::double_v, AVX2::ushort_v) { return simd_cast<AVX2::ushort_v>(simd_cast<AVX2::double_v>(x0, x1), simd_cast<AVX2::double_v>(x2, x3), simd_cast<AVX2::double_v>(x4, x5)); }
#endif
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_7(SSE::double_v, AVX2:: short_v) { return simd_cast<AVX2:: short_v>(simd_cast<AVX2::double_v>(x0, x1), simd_cast<AVX2::double_v>(x2, x3), simd_cast<AVX2::double_v>(x4, x5), simd_cast<AVX2::double_v>(x6)); }
Vc_SIMD_CAST_7(SSE::double_v, AVX2::ushort_v) { return simd_cast<AVX2::ushort_v>(simd_cast<AVX2::double_v>(x0, x1), simd_cast<AVX2::double_v>(x2, x3), simd_cast<AVX2::double_v>(x4, x5), simd_cast<AVX2::double_v>(x6)); }
#endif
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_8(SSE::double_v, AVX2:: short_v) { return simd_cast<AVX2:: short_v>(simd_cast<AVX2::double_v>(x0, x1), simd_cast<AVX2::double_v>(x2, x3), simd_cast<AVX2::double_v>(x4, x5), simd_cast<AVX2::double_v>(x6, x7)); }
Vc_SIMD_CAST_8(SSE::double_v, AVX2::ushort_v) { return simd_cast<AVX2::ushort_v>(simd_cast<AVX2::double_v>(x0, x1), simd_cast<AVX2::double_v>(x2, x3), simd_cast<AVX2::double_v>(x4, x5), simd_cast<AVX2::double_v>(x6, x7)); }
#endif
// ---------------------------------------------------------------------------
// AVX2 -> SSE casts.  Same entry type keeps only the low 128 bits; otherwise
// convert directly (AVX::convert / SSE::convert) or via an intermediate SSE
// cast.  Integer-vector variants require Vc_IMPL_AVX2.
// ---------------------------------------------------------------------------

// same EntryType: drop the upper half
Vc_SIMD_CAST_1(AVX2::double_v, SSE::double_v) { return AVX::lo128(x.data()); }
Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: float_v) { return AVX::lo128(x.data()); }
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_1(AVX2::   int_v, SSE::   int_v) { return AVX::lo128(x.data()); }
Vc_SIMD_CAST_1(AVX2::  uint_v, SSE::  uint_v) { return AVX::lo128(x.data()); }
Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: short_v) { return AVX::lo128(x.data()); }
Vc_SIMD_CAST_1(AVX2::ushort_v, SSE::ushort_v) { return AVX::lo128(x.data()); }
#endif
// converting casts
Vc_SIMD_CAST_1(AVX2::double_v, SSE:: float_v) { return simd_cast<SSE:: float_v>(simd_cast<AVX2:: float_v>(x)); }
Vc_SIMD_CAST_1(AVX2::double_v, SSE::   int_v) { return AVX::convert<double, int>(x.data()); }
Vc_SIMD_CAST_1(AVX2::double_v, SSE::  uint_v) { return AVX::convert<double, unsigned int>(x.data()); }
Vc_SIMD_CAST_1(AVX2::double_v, SSE:: short_v) { return AVX::convert<double, short>(x.data()); }
Vc_SIMD_CAST_1(AVX2::double_v, SSE::ushort_v) { return AVX::convert<double, unsigned short>(x.data()); }
Vc_SIMD_CAST_1(AVX2:: float_v, SSE::double_v) { return simd_cast<SSE::double_v>(simd_cast<SSE:: float_v>(x)); }
Vc_SIMD_CAST_1(AVX2:: float_v, SSE::   int_v) { return simd_cast<SSE::   int_v>(simd_cast<SSE:: float_v>(x)); }
Vc_SIMD_CAST_1(AVX2:: float_v, SSE::  uint_v) { return simd_cast<SSE::  uint_v>(simd_cast<SSE:: float_v>(x)); }
Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: short_v) { return AVX::convert<float, short>(x.data()); }
Vc_SIMD_CAST_1(AVX2:: float_v, SSE::ushort_v) { return AVX::convert<float, unsigned short>(x.data()); }
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_1(AVX2::   int_v, SSE::double_v) { return SSE::convert<int, double>(AVX::lo128(x.data())); }
Vc_SIMD_CAST_1(AVX2::   int_v, SSE:: float_v) { return SSE::convert<int, float>(AVX::lo128(x.data())); }
Vc_SIMD_CAST_1(AVX2::   int_v, SSE::  uint_v) { return AVX::lo128(x.data()); }  // bit-identical reinterpretation
Vc_SIMD_CAST_1(AVX2::   int_v, SSE:: short_v) { return AVX::convert<int, short>(x.data()); }
Vc_SIMD_CAST_1(AVX2::   int_v, SSE::ushort_v) { return AVX::convert<int, ushort>(x.data()); }
Vc_SIMD_CAST_1(AVX2::  uint_v, SSE::double_v) { return SSE::convert<uint, double>(AVX::lo128(x.data())); }
Vc_SIMD_CAST_1(AVX2::  uint_v, SSE:: float_v) { return SSE::convert<uint, float>(AVX::lo128(x.data())); }
Vc_SIMD_CAST_1(AVX2::  uint_v, SSE::   int_v) { return AVX::lo128(x.data()); }  // bit-identical reinterpretation
Vc_SIMD_CAST_1(AVX2::  uint_v, SSE:: short_v) { return AVX::convert<uint, short>(x.data()); }
Vc_SIMD_CAST_1(AVX2::  uint_v, SSE::ushort_v) { return AVX::convert<uint, ushort>(x.data()); }
// short/ushort sources: narrow to the SSE vector of the same entry type
// first, then cast from there
Vc_SIMD_CAST_1(AVX2:: short_v, SSE::double_v) { return simd_cast<SSE::double_v>(simd_cast<SSE:: short_v>(x)); }
Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: float_v) { return simd_cast<SSE:: float_v>(simd_cast<SSE:: short_v>(x)); }
Vc_SIMD_CAST_1(AVX2:: short_v, SSE::   int_v) { return simd_cast<SSE::   int_v>(simd_cast<SSE:: short_v>(x)); }
Vc_SIMD_CAST_1(AVX2:: short_v, SSE::  uint_v) { return simd_cast<SSE::  uint_v>(simd_cast<SSE:: short_v>(x)); }
Vc_SIMD_CAST_1(AVX2:: short_v, SSE::ushort_v) { return simd_cast<SSE::ushort_v>(simd_cast<SSE:: short_v>(x)); }
Vc_SIMD_CAST_1(AVX2::ushort_v, SSE::double_v) { return simd_cast<SSE::double_v>(simd_cast<SSE::ushort_v>(x)); }
Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: float_v) { return simd_cast<SSE:: float_v>(simd_cast<SSE::ushort_v>(x)); }
Vc_SIMD_CAST_1(AVX2::ushort_v, SSE::   int_v) { return simd_cast<SSE::   int_v>(simd_cast<SSE::ushort_v>(x)); }
Vc_SIMD_CAST_1(AVX2::ushort_v, SSE::  uint_v) { return simd_cast<SSE::  uint_v>(simd_cast<SSE::ushort_v>(x)); }
Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: short_v) { return simd_cast<SSE:: short_v>(simd_cast<SSE::ushort_v>(x)); }
#endif
// 2 AVX2::double_v (8 doubles) -> one 8-entry SSE short/ushort vector
Vc_SIMD_CAST_2(AVX2::double_v, SSE:: short_v) {
    const auto tmp0 = _mm256_cvttpd_epi32(x0.data());
    const auto tmp1 = _mm256_cvttpd_epi32(x1.data());
    return _mm_packs_epi32(tmp0, tmp1);   // signed saturation
}
Vc_SIMD_CAST_2(AVX2::double_v, SSE::ushort_v) {
    const auto tmp0 = _mm256_cvttpd_epi32(x0.data());
    const auto tmp1 = _mm256_cvttpd_epi32(x1.data());
    return _mm_packus_epi32(tmp0, tmp1);  // unsigned saturation
}
// ---------------------------------------------------------------------------
// Scalar -> AVX2 casts: build an AVX2 vector from 1..4 Scalar::Vector<T>
// operands.  Entries beyond the supplied operands are zero-filled.  The
// scalar value is converted to the destination entry type by the implicit
// conversion in the setr intrinsic's argument list (uint destinations use an
// explicit uint() cast to avoid narrowing warnings).  Integer destinations
// require Vc_IMPL_AVX2.  Where the operands fill at most 128 bits, the
// 128-bit setr + AVX::zeroExtend form is used.
// ---------------------------------------------------------------------------

// 1 scalar operand
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x,
          enable_if<std::is_same<Return, AVX2::double_v>::value>)
{
    return AVX::zeroExtend(_mm_setr_pd(x.data(), 0.));
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x,
          enable_if<std::is_same<Return, AVX2::float_v>::value>)
{
    return AVX::zeroExtend(_mm_setr_ps(x.data(), 0.f, 0.f, 0.f));
}
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x,
          enable_if<std::is_same<Return, AVX2::int_v>::value>)
{
    return _mm256_setr_epi32(x.data(), 0, 0, 0, 0, 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x,
          enable_if<std::is_same<Return, AVX2::uint_v>::value>)
{
    return _mm256_setr_epi32(uint(x.data()), 0, 0, 0, 0, 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x,
          enable_if<std::is_same<Return, AVX2::short_v>::value>)
{
    return _mm256_setr_epi16(x.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x,
          enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
{
    return _mm256_setr_epi16(x.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}
#endif
// 2 scalar operands
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
          enable_if<std::is_same<Return, AVX2::double_v>::value>)
{
    return AVX::zeroExtend(_mm_setr_pd(x0.data(), x1.data()));
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
          enable_if<std::is_same<Return, AVX2::float_v>::value>)
{
    return AVX::zeroExtend(_mm_setr_ps(x0.data(), x1.data(), 0.f, 0.f));
}
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
          enable_if<std::is_same<Return, AVX2::int_v>::value>)
{
    return _mm256_setr_epi32(x0.data(), x1.data(), 0, 0, 0, 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
          enable_if<std::is_same<Return, AVX2::uint_v>::value>)
{
    return _mm256_setr_epi32(uint(x0.data()), uint(x1.data()), 0, 0, 0, 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
          enable_if<std::is_same<Return, AVX2::short_v>::value>)
{
    return _mm256_setr_epi16(x0.data(), x1.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
          enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
{
    return _mm256_setr_epi16(x0.data(), x1.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}
#endif
// 3 scalar operands
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          enable_if<std::is_same<Return, AVX2::double_v>::value>)
{
    return _mm256_setr_pd(x0.data(), x1.data(), x2.data(), 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          enable_if<std::is_same<Return, AVX2::float_v>::value>)
{
    return AVX::zeroExtend(_mm_setr_ps(x0.data(), x1.data(), x2.data(), 0));
}
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          enable_if<std::is_same<Return, AVX2::int_v>::value>)
{
    return _mm256_setr_epi32(x0.data(), x1.data(), x2.data(), 0, 0, 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          enable_if<std::is_same<Return, AVX2::uint_v>::value>)
{
    return _mm256_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()), 0, 0, 0,
                             0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          enable_if<std::is_same<Return, AVX2::short_v>::value>)
{
    return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
{
    return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}
#endif
// 4 scalar operands
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          Scalar::Vector<T> x3,
          enable_if<std::is_same<Return, AVX2::double_v>::value>)
{
    return _mm256_setr_pd(x0.data(), x1.data(), x2.data(), x3.data());
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          Scalar::Vector<T> x3,
          enable_if<std::is_same<Return, AVX2::float_v>::value>)
{
    return AVX::zeroExtend(_mm_setr_ps(x0.data(), x1.data(), x2.data(), x3.data()));
}
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          Scalar::Vector<T> x3,
          enable_if<std::is_same<Return, AVX2::int_v>::value>)
{
    return _mm256_setr_epi32(x0.data(), x1.data(), x2.data(), x3.data(), 0, 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          Scalar::Vector<T> x3,
          enable_if<std::is_same<Return, AVX2::uint_v>::value>)
{
    return _mm256_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()),
                             uint(x3.data()), 0, 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          Scalar::Vector<T> x3,
          enable_if<std::is_same<Return, AVX2::short_v>::value>)
{
    return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3,
- enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
- {
- return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
- }
- #endif
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4,
- enable_if<std::is_same<Return, AVX2::float_v>::value>)
- {
- return _mm256_setr_ps(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), 0, 0, 0);
- }
- #ifdef Vc_IMPL_AVX2
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4,
- enable_if<std::is_same<Return, AVX2::int_v>::value>)
- {
- return _mm256_setr_epi32(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), 0, 0, 0);
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4,
- enable_if<std::is_same<Return, AVX2::uint_v>::value>)
- {
- return _mm256_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()),
- uint(x3.data()), uint(x4.data()), 0, 0, 0);
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4,
- enable_if<std::is_same<Return, AVX2::short_v>::value>)
- {
- return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4,
- enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
- {
- return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
- }
- #endif
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- enable_if<std::is_same<Return, AVX2::float_v>::value>)
- {
- return _mm256_setr_ps(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
- x5.data(), 0, 0);
- }
- #ifdef Vc_IMPL_AVX2
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- enable_if<std::is_same<Return, AVX2::int_v>::value>)
- {
- return _mm256_setr_epi32(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
- x5.data(), 0, 0);
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- enable_if<std::is_same<Return, AVX2::uint_v>::value>)
- {
- return _mm256_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()),
- uint(x3.data()), uint(x4.data()), uint(x5.data()), 0, 0);
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- enable_if<std::is_same<Return, AVX2::short_v>::value>)
- {
- return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
- x5.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
- {
- return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
- x5.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
- }
- #endif
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6,
- enable_if<std::is_same<Return, AVX2::float_v>::value>)
- {
- return _mm256_setr_ps(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
- x5.data(), x6.data(), 0);
- }
- #ifdef Vc_IMPL_AVX2
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6,
- enable_if<std::is_same<Return, AVX2::int_v>::value>)
- {
- return _mm256_setr_epi32(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
- x5.data(), x6.data(), 0);
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6,
- enable_if<std::is_same<Return, AVX2::uint_v>::value>)
- {
- return _mm256_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()),
- uint(x3.data()), uint(x4.data()), uint(x5.data()),
- uint(x6.data()), 0);
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6,
- enable_if<std::is_same<Return, AVX2::short_v>::value>)
- {
- return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
- x5.data(), x6.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0);
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6,
- enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
- {
- return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
- x5.data(), x6.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0);
- }
- #endif
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7,
- enable_if<std::is_same<Return, AVX2::float_v>::value>)
- {
- return _mm256_setr_ps(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
- x5.data(), x6.data(), x7.data());
- }
- #ifdef Vc_IMPL_AVX2
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7,
- enable_if<std::is_same<Return, AVX2::int_v>::value>)
- {
- return _mm256_setr_epi32(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
- x5.data(), x6.data(), x7.data());
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7,
- enable_if<std::is_same<Return, AVX2::uint_v>::value>)
- {
- return _mm256_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()),
- uint(x3.data()), uint(x4.data()), uint(x5.data()),
- uint(x6.data()), uint(x7.data()));
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7,
- enable_if<std::is_same<Return, AVX2::short_v>::value>)
- {
- return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
- x5.data(), x6.data(), x7.data(), 0, 0, 0, 0, 0, 0, 0, 0);
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7,
- enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
- {
- return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
- x5.data(), x6.data(), x7.data(), 0, 0, 0, 0, 0, 0, 0, 0);
- }
- #endif
- #ifdef Vc_IMPL_AVX2
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
- enable_if<std::is_same<Return, AVX2::short_v>::value>)
- {
- return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
- x5.data(), x6.data(), x7.data(), x8.data(), 0, 0, 0, 0, 0, 0,
- 0);
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
- enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
- {
- return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
- x5.data(), x6.data(), x7.data(), x8.data(), 0, 0, 0, 0, 0, 0,
- 0);
- }
- #endif
- #ifdef Vc_IMPL_AVX2
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
- Scalar::Vector<T> x9, enable_if<std::is_same<Return, AVX2::short_v>::value>)
- {
- return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
- x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), 0, 0,
- 0, 0, 0, 0);
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
- Scalar::Vector<T> x9, enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
- {
- return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
- x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), 0, 0,
- 0, 0, 0, 0);
- }
- #endif
- #ifdef Vc_IMPL_AVX2
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
- Scalar::Vector<T> x9, Scalar::Vector<T> x10,
- enable_if<std::is_same<Return, AVX2::short_v>::value>)
- {
- return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
- x5.data(), x6.data(), x7.data(), x8.data(), x9.data(),
- x10.data(), 0, 0, 0, 0, 0);
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
- Scalar::Vector<T> x9, Scalar::Vector<T> x10,
- enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
- {
- return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
- x5.data(), x6.data(), x7.data(), x8.data(), x9.data(),
- x10.data(), 0, 0, 0, 0, 0);
- }
- #endif
- #ifdef Vc_IMPL_AVX2
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
- Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
- enable_if<std::is_same<Return, AVX2::short_v>::value>)
- {
- return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
- x5.data(), x6.data(), x7.data(), x8.data(), x9.data(),
- x10.data(), x11.data(), 0, 0, 0, 0);
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
- Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
- enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
- {
- return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
- x5.data(), x6.data(), x7.data(), x8.data(), x9.data(),
- x10.data(), x11.data(), 0, 0, 0, 0);
- }
- #endif
- #ifdef Vc_IMPL_AVX2
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
- Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
- Scalar::Vector<T> x12, enable_if<std::is_same<Return, AVX2::short_v>::value>)
- {
- return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
- x5.data(), x6.data(), x7.data(), x8.data(), x9.data(),
- x10.data(), x11.data(), x12.data(), 0, 0, 0);
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
- Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
- Scalar::Vector<T> x12, enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
- {
- return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
- x5.data(), x6.data(), x7.data(), x8.data(), x9.data(),
- x10.data(), x11.data(), x12.data(), 0, 0, 0);
- }
- #endif
- #ifdef Vc_IMPL_AVX2
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
- Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
- Scalar::Vector<T> x12, Scalar::Vector<T> x13,
- enable_if<std::is_same<Return, AVX2::short_v>::value>)
- {
- return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
- x5.data(), x6.data(), x7.data(), x8.data(), x9.data(),
- x10.data(), x11.data(), x12.data(), x13.data(), 0, 0);
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
- Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
- Scalar::Vector<T> x12, Scalar::Vector<T> x13,
- enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
- {
- return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
- x5.data(), x6.data(), x7.data(), x8.data(), x9.data(),
- x10.data(), x11.data(), x12.data(), x13.data(), 0, 0);
- }
- #endif
- #ifdef Vc_IMPL_AVX2
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
- Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
- Scalar::Vector<T> x12, Scalar::Vector<T> x13, Scalar::Vector<T> x14,
- enable_if<std::is_same<Return, AVX2::short_v>::value>)
- {
- return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
- x5.data(), x6.data(), x7.data(), x8.data(), x9.data(),
- x10.data(), x11.data(), x12.data(), x13.data(), x14.data(),
- 0);
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
- Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
- Scalar::Vector<T> x12, Scalar::Vector<T> x13, Scalar::Vector<T> x14,
- enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
- {
- return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
- x5.data(), x6.data(), x7.data(), x8.data(), x9.data(),
- x10.data(), x11.data(), x12.data(), x13.data(), x14.data(),
- 0);
- }
- #endif
- #ifdef Vc_IMPL_AVX2
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
- Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
- Scalar::Vector<T> x12, Scalar::Vector<T> x13, Scalar::Vector<T> x14,
- Scalar::Vector<T> x15, enable_if<std::is_same<Return, AVX2::short_v>::value>)
- {
- return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
- x5.data(), x6.data(), x7.data(), x8.data(), x9.data(),
- x10.data(), x11.data(), x12.data(), x13.data(), x14.data(),
- x15.data());
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
- Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
- Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
- Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
- Scalar::Vector<T> x12, Scalar::Vector<T> x13, Scalar::Vector<T> x14,
- Scalar::Vector<T> x15, enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
- {
- return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
- x5.data(), x6.data(), x7.data(), x8.data(), x9.data(),
- x10.data(), x11.data(), x12.data(), x13.data(), x14.data(),
- x15.data());
- }
- #endif
- template <typename To, typename FromT>
- Vc_INTRINSIC Vc_CONST To
- simd_cast(AVX2::Vector<FromT> x, enable_if<Scalar::is_vector<To>::value>)
- {
- return static_cast<To>(x[0]);
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(const AVX2::Mask<T> &k, enable_if<AVX2::is_mask<Return>::value>)
- {
- return {Detail::mask_cast<Mask<T, VectorAbi::Avx>::Size, Return::Size,
- typename Return::VectorTypeF>(k.dataI())};
- }
- Vc_SIMD_CAST_AVX_2(double_m, float_m) { return AVX::concat(_mm_packs_epi32(AVX::lo128(x0.dataI()), AVX::hi128(x0.dataI())), _mm_packs_epi32(AVX::lo128(x1.dataI()), AVX::hi128(x1.dataI()))); }
- #ifdef Vc_IMPL_AVX2
- Vc_SIMD_CAST_AVX_2(double_m, int_m) { return Mem::permute4x64<X0, X2, X1, X3>(_mm256_packs_epi32(x0.dataI(), x1.dataI())); }
- Vc_SIMD_CAST_AVX_2(double_m, uint_m) { return Mem::permute4x64<X0, X2, X1, X3>(_mm256_packs_epi32(x0.dataI(), x1.dataI())); }
- Vc_SIMD_CAST_AVX_2(double_m, short_m) { return AVX::zeroExtend(_mm_packs_epi16(_mm_packs_epi32(AVX::lo128(x0.dataI()), AVX::hi128(x0.dataI())), _mm_packs_epi32(AVX::lo128(x1.dataI()), AVX::hi128(x1.dataI())))); }
- Vc_SIMD_CAST_AVX_2(double_m, ushort_m) { return AVX::zeroExtend(_mm_packs_epi16(_mm_packs_epi32(AVX::lo128(x0.dataI()), AVX::hi128(x0.dataI())), _mm_packs_epi32(AVX::lo128(x1.dataI()), AVX::hi128(x1.dataI())))); }
- Vc_SIMD_CAST_AVX_2( float_m, short_m) { return Mem::permute4x64<X0, X2, X1, X3>(_mm256_packs_epi16(x0.dataI(), x1.dataI())); }
- Vc_SIMD_CAST_AVX_2( float_m, ushort_m) { return Mem::permute4x64<X0, X2, X1, X3>(_mm256_packs_epi16(x0.dataI(), x1.dataI())); }
- Vc_SIMD_CAST_AVX_2( int_m, short_m) { return Mem::permute4x64<X0, X2, X1, X3>(_mm256_packs_epi16(x0.dataI(), x1.dataI())); }
- Vc_SIMD_CAST_AVX_2( int_m, ushort_m) { return Mem::permute4x64<X0, X2, X1, X3>(_mm256_packs_epi16(x0.dataI(), x1.dataI())); }
- Vc_SIMD_CAST_AVX_2( uint_m, short_m) { return Mem::permute4x64<X0, X2, X1, X3>(_mm256_packs_epi16(x0.dataI(), x1.dataI())); }
- Vc_SIMD_CAST_AVX_2( uint_m, ushort_m) { return Mem::permute4x64<X0, X2, X1, X3>(_mm256_packs_epi16(x0.dataI(), x1.dataI())); }
- #endif
- #ifdef Vc_IMPL_AVX2
- Vc_SIMD_CAST_AVX_4(double_m, short_m)
- {
- using namespace AVX;
- const auto tmp = _mm256_packs_epi32(
- _mm256_packs_epi32(x0.dataI(), x1.dataI())
- ,
- _mm256_packs_epi32(x2.dataI(), x3.dataI())
- );
- return concat(_mm_unpacklo_epi32(lo128(tmp), hi128(tmp)),
- _mm_unpackhi_epi32(lo128(tmp), hi128(tmp)));
- }
- Vc_SIMD_CAST_AVX_4(double_m, ushort_m) { return simd_cast<AVX2::short_m>(x0, x1, x2, x3).data(); }
- #endif
- Vc_SIMD_CAST_1(SSE::double_m, AVX2::double_m) { return AVX::zeroExtend(x.data()); }
- Vc_SIMD_CAST_1(SSE::double_m, AVX2:: float_m) { return AVX::zeroExtend(simd_cast<SSE:: float_m>(x).data()); }
- #ifdef Vc_IMPL_AVX2
- Vc_SIMD_CAST_1(SSE::double_m, AVX2:: int_m) { return AVX::zeroExtend(simd_cast<SSE:: int_m>(x).data()); }
- Vc_SIMD_CAST_1(SSE::double_m, AVX2:: uint_m) { return AVX::zeroExtend(simd_cast<SSE:: uint_m>(x).data()); }
- Vc_SIMD_CAST_1(SSE::double_m, AVX2:: short_m) { return AVX::zeroExtend(simd_cast<SSE:: short_m>(x).data()); }
- Vc_SIMD_CAST_1(SSE::double_m, AVX2::ushort_m) { return AVX::zeroExtend(simd_cast<SSE::ushort_m>(x).data()); }
- #endif
- Vc_SIMD_CAST_1(SSE:: float_m, AVX2::double_m) { return AVX::concat(_mm_unpacklo_ps(x.dataF(), x.dataF()), _mm_unpackhi_ps(x.dataF(), x.dataF())); }
- Vc_SIMD_CAST_1(SSE:: int_m, AVX2::double_m) { return AVX::concat(_mm_unpacklo_ps(x.dataF(), x.dataF()), _mm_unpackhi_ps(x.dataF(), x.dataF())); }
- Vc_SIMD_CAST_1(SSE:: uint_m, AVX2::double_m) { return AVX::concat(_mm_unpacklo_ps(x.dataF(), x.dataF()), _mm_unpackhi_ps(x.dataF(), x.dataF())); }
- Vc_SIMD_CAST_1(SSE:: short_m, AVX2::double_m) { auto tmp = _mm_unpacklo_epi16(x.dataI(), x.dataI()); return AVX::concat(_mm_unpacklo_epi32(tmp, tmp), _mm_unpackhi_epi32(tmp, tmp)); }
- Vc_SIMD_CAST_1(SSE::ushort_m, AVX2::double_m) { auto tmp = _mm_unpacklo_epi16(x.dataI(), x.dataI()); return AVX::concat(_mm_unpacklo_epi32(tmp, tmp), _mm_unpackhi_epi32(tmp, tmp)); }
- Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: float_m) { return AVX::zeroExtend(x.dataF()); }
- Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: float_m) { return AVX::zeroExtend(x.dataF()); }
- Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: float_m) { return AVX::zeroExtend(x.dataF()); }
- Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: float_m) { return AVX::concat(_mm_unpacklo_epi16(x.dataI(), x.dataI()), _mm_unpackhi_epi16(x.dataI(), x.dataI())); }
- Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: float_m) { return AVX::concat(_mm_unpacklo_epi16(x.dataI(), x.dataI()), _mm_unpackhi_epi16(x.dataI(), x.dataI())); }
- #ifdef Vc_IMPL_AVX2
- Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: int_m) { return AVX::zeroExtend(x.data()); }
- Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: uint_m) { return AVX::zeroExtend(x.data()); }
- Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: int_m) { return AVX::zeroExtend(x.data()); }
- Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: uint_m) { return AVX::zeroExtend(x.data()); }
- Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: int_m) { return AVX::zeroExtend(x.data()); }
- Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: uint_m) { return AVX::zeroExtend(x.data()); }
- Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: short_m) { return AVX::zeroExtend(simd_cast<SSE:: short_m>(x).data()); }
- Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: short_m) { return AVX::zeroExtend(simd_cast<SSE:: short_m>(x).data()); }
- Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: short_m) { return AVX::zeroExtend(simd_cast<SSE:: short_m>(x).data()); }
- Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: short_m) { return AVX::zeroExtend(simd_cast<SSE:: short_m>(x).data()); }
- Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: short_m) { return AVX::zeroExtend(simd_cast<SSE:: short_m>(x).data()); }
- Vc_SIMD_CAST_1(SSE:: float_m, AVX2::ushort_m) { return AVX::zeroExtend(simd_cast<SSE::ushort_m>(x).data()); }
- Vc_SIMD_CAST_1(SSE:: int_m, AVX2::ushort_m) { return AVX::zeroExtend(simd_cast<SSE::ushort_m>(x).data()); }
- Vc_SIMD_CAST_1(SSE:: uint_m, AVX2::ushort_m) { return AVX::zeroExtend(simd_cast<SSE::ushort_m>(x).data()); }
- Vc_SIMD_CAST_1(SSE:: short_m, AVX2::ushort_m) { return AVX::zeroExtend(simd_cast<SSE::ushort_m>(x).data()); }
- Vc_SIMD_CAST_1(SSE::ushort_m, AVX2::ushort_m) { return AVX::zeroExtend(simd_cast<SSE::ushort_m>(x).data()); }
- Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: int_m) { const auto v = Mem::permute4x64<X0, X2, X1, X3>(AVX::avx_cast<__m256i>(x.data())); return _mm256_unpacklo_epi16(v, v); }
- Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: uint_m) { const auto v = Mem::permute4x64<X0, X2, X1, X3>(AVX::avx_cast<__m256i>(x.data())); return _mm256_unpacklo_epi16(v, v); }
- Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: int_m) { const auto v = Mem::permute4x64<X0, X2, X1, X3>(AVX::avx_cast<__m256i>(x.data())); return _mm256_unpacklo_epi16(v, v); }
- Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: uint_m) { const auto v = Mem::permute4x64<X0, X2, X1, X3>(AVX::avx_cast<__m256i>(x.data())); return _mm256_unpacklo_epi16(v, v); }
- #endif
- Vc_SIMD_CAST_2(SSE::double_m, AVX2::double_m) { return AVX::concat(x0.data(), x1.data()); }
- Vc_SIMD_CAST_2(SSE::double_m, AVX2:: float_m) { return AVX::zeroExtend(_mm_packs_epi32(x0.dataI(), x1.dataI())); }
- Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: float_m) { return AVX::concat(x0.data(), x1.data()); }
- Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: float_m) { return AVX::concat(x0.data(), x1.data()); }
- Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: float_m) { return AVX::concat(x0.data(), x1.data()); }
- #ifdef Vc_IMPL_AVX2
- Vc_SIMD_CAST_2(SSE::double_m, AVX2:: int_m) { return AVX::zeroExtend(_mm_packs_epi32(x0.dataI(), x1.dataI())); }
- Vc_SIMD_CAST_2(SSE::double_m, AVX2:: uint_m) { return AVX::zeroExtend(_mm_packs_epi32(x0.dataI(), x1.dataI())); }
- Vc_SIMD_CAST_2(SSE::double_m, AVX2:: short_m) { return AVX::zeroExtend(_mm_packs_epi16(_mm_packs_epi32(x0.dataI(), x1.dataI()), _mm_setzero_si128())); }
- Vc_SIMD_CAST_2(SSE::double_m, AVX2::ushort_m) { return AVX::zeroExtend(_mm_packs_epi16(_mm_packs_epi32(x0.dataI(), x1.dataI()), _mm_setzero_si128())); }
- Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: int_m) { return AVX::concat(x0.data(), x1.data()); }
- Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: uint_m) { return AVX::concat(x0.data(), x1.data()); }
- Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: short_m) { return AVX::zeroExtend(_mm_packs_epi16(x0.dataI(), x1.dataI())); }
- Vc_SIMD_CAST_2(SSE:: float_m, AVX2::ushort_m) { return AVX::zeroExtend(_mm_packs_epi16(x0.dataI(), x1.dataI())); }
- Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: int_m) { return AVX::concat(x0.data(), x1.data()); }
- Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: uint_m) { return AVX::concat(x0.data(), x1.data()); }
- Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: short_m) { return AVX::zeroExtend(_mm_packs_epi16(x0.dataI(), x1.dataI())); }
- Vc_SIMD_CAST_2(SSE:: int_m, AVX2::ushort_m) { return AVX::zeroExtend(_mm_packs_epi16(x0.dataI(), x1.dataI())); }
- Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: int_m) { return AVX::concat(x0.data(), x1.data()); }
- Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: uint_m) { return AVX::concat(x0.data(), x1.data()); }
- Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: short_m) { return AVX::zeroExtend(_mm_packs_epi16(x0.dataI(), x1.dataI())); }
- // simd_cast overloads combining two SSE masks into one AVX2 mask.
- // _mm_packs_epi16 narrows 16-bit mask lanes to 8-bit; AVX::zeroExtend puts a
- // 128-bit result into the low half of a 256-bit register (upper half zeroed),
- // AVX::concat joins two 128-bit halves into one 256-bit register.
- Vc_SIMD_CAST_2(SSE:: uint_m, AVX2::ushort_m) { return AVX::zeroExtend(_mm_packs_epi16(x0.dataI(), x1.dataI())); }
- Vc_SIMD_CAST_2(SSE:: short_m, AVX2:: short_m) { return AVX::concat(x0.data(), x1.data()); }
- Vc_SIMD_CAST_2(SSE:: short_m, AVX2::ushort_m) { return AVX::concat(x0.data(), x1.data()); }
- Vc_SIMD_CAST_2(SSE::ushort_m, AVX2:: short_m) { return AVX::concat(x0.data(), x1.data()); }
- Vc_SIMD_CAST_2(SSE::ushort_m, AVX2::ushort_m) { return AVX::concat(x0.data(), x1.data()); }
- #endif
- // simd_cast overloads combining four SSE masks into one AVX2 mask.  The
- // double_m sources need two pack steps (64->32 via packs_epi32, then 32->16
- // via packs_epi16) before the halves are concatenated / zero-extended.
- Vc_SIMD_CAST_4(SSE::double_m, AVX2:: float_m) { return AVX::concat(_mm_packs_epi32(x0.dataI(), x1.dataI()), _mm_packs_epi32(x2.dataI(), x3.dataI())); }
- #ifdef Vc_IMPL_AVX2
- Vc_SIMD_CAST_4(SSE::double_m, AVX2:: int_m) { return AVX::concat(_mm_packs_epi32(x0.dataI(), x1.dataI()), _mm_packs_epi32(x2.dataI(), x3.dataI())); }
- Vc_SIMD_CAST_4(SSE::double_m, AVX2:: uint_m) { return AVX::concat(_mm_packs_epi32(x0.dataI(), x1.dataI()), _mm_packs_epi32(x2.dataI(), x3.dataI())); }
- Vc_SIMD_CAST_4(SSE::double_m, AVX2:: short_m) { return AVX::zeroExtend(_mm_packs_epi16(_mm_packs_epi32(x0.dataI(), x1.dataI()), _mm_packs_epi32(x2.dataI(), x3.dataI()))); }
- Vc_SIMD_CAST_4(SSE::double_m, AVX2::ushort_m) { return AVX::zeroExtend(_mm_packs_epi16(_mm_packs_epi32(x0.dataI(), x1.dataI()), _mm_packs_epi32(x2.dataI(), x3.dataI()))); }
- Vc_SIMD_CAST_4(SSE:: float_m, AVX2:: short_m) { return AVX::concat(_mm_packs_epi16(x0.dataI(), x1.dataI()), _mm_packs_epi16(x2.dataI(), x3.dataI())); }
- Vc_SIMD_CAST_4(SSE:: float_m, AVX2::ushort_m) { return AVX::concat(_mm_packs_epi16(x0.dataI(), x1.dataI()), _mm_packs_epi16(x2.dataI(), x3.dataI())); }
- Vc_SIMD_CAST_4(SSE:: int_m, AVX2:: short_m) { return AVX::concat(_mm_packs_epi16(x0.dataI(), x1.dataI()), _mm_packs_epi16(x2.dataI(), x3.dataI())); }
- Vc_SIMD_CAST_4(SSE:: int_m, AVX2::ushort_m) { return AVX::concat(_mm_packs_epi16(x0.dataI(), x1.dataI()), _mm_packs_epi16(x2.dataI(), x3.dataI())); }
- Vc_SIMD_CAST_4(SSE:: uint_m, AVX2:: short_m) { return AVX::concat(_mm_packs_epi16(x0.dataI(), x1.dataI()), _mm_packs_epi16(x2.dataI(), x3.dataI())); }
- Vc_SIMD_CAST_4(SSE:: uint_m, AVX2::ushort_m) { return AVX::concat(_mm_packs_epi16(x0.dataI(), x1.dataI()), _mm_packs_epi16(x2.dataI(), x3.dataI())); }
- #endif
- // simd_cast: build an AVX2 mask from 1/2/4/8/16 individual Scalar masks.
- // Each overload starts from an all-false mask and assigns one boolean per
- // lane; lanes beyond the supplied arguments stay false.  The enable_if
- // Return::Size guards select the overload with enough lanes.
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Mask<T> k, enable_if<AVX2::is_mask<Return>::value>)
- {
- Return r{false};
- r[0] = k.data();
- return r;
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Mask<T> k0, Scalar::Mask<T> k1,
- enable_if<AVX2::is_mask<Return>::value>)
- {
- Return r{false};
- r[0] = k0.data();
- r[1] = k1.data();
- return r;
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Mask<T> k0, Scalar::Mask<T> k1, Scalar::Mask<T> k2, Scalar::Mask<T> k3,
- enable_if<(AVX2::is_mask<Return>::value && Return::Size >= 4)>)
- {
- Return r{false};
- r[0] = k0.data();
- r[1] = k1.data();
- r[2] = k2.data();
- r[3] = k3.data();
- return r;
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Mask<T> k0, Scalar::Mask<T> k1, Scalar::Mask<T> k2, Scalar::Mask<T> k3,
- Scalar::Mask<T> k4, Scalar::Mask<T> k5, Scalar::Mask<T> k6, Scalar::Mask<T> k7,
- enable_if<(AVX2::is_mask<Return>::value && Return::Size >= 8)>)
- {
- Return r{false};
- r[0] = k0.data();
- r[1] = k1.data();
- r[2] = k2.data();
- r[3] = k3.data();
- r[4] = k4.data();
- r[5] = k5.data();
- r[6] = k6.data();
- r[7] = k7.data();
- return r;
- }
- template <typename Return, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(Scalar::Mask<T> k0, Scalar::Mask<T> k1, Scalar::Mask<T> k2, Scalar::Mask<T> k3,
- Scalar::Mask<T> k4, Scalar::Mask<T> k5, Scalar::Mask<T> k6, Scalar::Mask<T> k7,
- Scalar::Mask<T> k8, Scalar::Mask<T> k9, Scalar::Mask<T> k10,
- Scalar::Mask<T> k11, Scalar::Mask<T> k12, Scalar::Mask<T> k13,
- Scalar::Mask<T> k14, Scalar::Mask<T> k15,
- enable_if<(AVX2::is_mask<Return>::value && Return::Size >= 16)>)
- {
- Return r{false};
- r[0] = k0.data();
- r[1] = k1.data();
- r[2] = k2.data();
- r[3] = k3.data();
- r[4] = k4.data();
- r[5] = k5.data();
- r[6] = k6.data();
- r[7] = k7.data();
- r[8] = k8.data();
- r[9] = k9.data();
- r[10] = k10.data();
- r[11] = k11.data();
- r[12] = k12.data();
- r[13] = k13.data();
- r[14] = k14.data();
- r[15] = k15.data();
- return r;
- }
- // simd_cast: one AVX2 mask to one SSE mask.  AVX::lo128/hi128 extract the
- // 128-bit halves; packs intrinsics narrow lane widths; unpacklo duplicates
- // lanes when widening (e.g. float_m -> double_m).  For short/ushort sources
- // the work is delegated to the already-defined SSE->SSE casts on the low half.
- Vc_SIMD_CAST_1(AVX2::double_m, SSE::double_m) { return AVX::lo128(x.data()); }
- Vc_SIMD_CAST_1(AVX2::double_m, SSE:: float_m) { return _mm_packs_epi32(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); }
- Vc_SIMD_CAST_1(AVX2::double_m, SSE:: int_m) { return _mm_packs_epi32(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); }
- Vc_SIMD_CAST_1(AVX2::double_m, SSE:: uint_m) { return _mm_packs_epi32(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); }
- Vc_SIMD_CAST_1(AVX2::double_m, SSE:: short_m) { return _mm_packs_epi16(_mm_packs_epi32(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())), _mm_setzero_si128()); }
- Vc_SIMD_CAST_1(AVX2::double_m, SSE::ushort_m) { return _mm_packs_epi16(_mm_packs_epi32(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())), _mm_setzero_si128()); }
- Vc_SIMD_CAST_1(AVX2:: float_m, SSE::double_m) { return _mm_unpacklo_ps(AVX::lo128(x.data()), AVX::lo128(x.data())); }
- Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: float_m) { return AVX::lo128(x.data()); }
- Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: int_m) { return AVX::lo128(x.data()); }
- Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: uint_m) { return AVX::lo128(x.data()); }
- Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: short_m) { return _mm_packs_epi16(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); }
- Vc_SIMD_CAST_1(AVX2:: float_m, SSE::ushort_m) { return _mm_packs_epi16(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); }
- #ifdef Vc_IMPL_AVX2
- Vc_SIMD_CAST_1(AVX2:: int_m, SSE::double_m) { return _mm_unpacklo_epi32(AVX::lo128(x.dataI()), AVX::lo128(x.dataI())); }
- Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: float_m) { return AVX::lo128(x.dataI()); }
- Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: int_m) { return AVX::lo128(x.dataI()); }
- Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: uint_m) { return AVX::lo128(x.dataI()); }
- Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: short_m) { return _mm_packs_epi16(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); }
- Vc_SIMD_CAST_1(AVX2:: int_m, SSE::ushort_m) { return _mm_packs_epi16(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); }
- Vc_SIMD_CAST_1(AVX2:: uint_m, SSE::double_m) { return _mm_unpacklo_epi32(AVX::lo128(x.dataI()), AVX::lo128(x.dataI())); }
- Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: float_m) { return AVX::lo128(x.dataI()); }
- Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: int_m) { return AVX::lo128(x.dataI()); }
- Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: uint_m) { return AVX::lo128(x.dataI()); }
- Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: short_m) { return _mm_packs_epi16(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); }
- Vc_SIMD_CAST_1(AVX2:: uint_m, SSE::ushort_m) { return _mm_packs_epi16(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); }
- Vc_SIMD_CAST_1(AVX2:: short_m, SSE::double_m) { return simd_cast<SSE::double_m>(SSE::short_m(AVX::lo128(x.data()))); }
- Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: float_m) { return simd_cast<SSE:: float_m>(SSE::short_m(AVX::lo128(x.data()))); }
- Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: int_m) { return simd_cast<SSE:: int_m>(SSE::short_m(AVX::lo128(x.data()))); }
- Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: uint_m) { return simd_cast<SSE:: uint_m>(SSE::short_m(AVX::lo128(x.data()))); }
- Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: short_m) { return simd_cast<SSE:: short_m>(SSE::short_m(AVX::lo128(x.data()))); }
- Vc_SIMD_CAST_1(AVX2:: short_m, SSE::ushort_m) { return simd_cast<SSE::ushort_m>(SSE::short_m(AVX::lo128(x.data()))); }
- Vc_SIMD_CAST_1(AVX2::ushort_m, SSE::double_m) { return simd_cast<SSE::double_m>(SSE::ushort_m(AVX::lo128(x.data()))); }
- Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: float_m) { return simd_cast<SSE:: float_m>(SSE::ushort_m(AVX::lo128(x.data()))); }
- Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: int_m) { return simd_cast<SSE:: int_m>(SSE::ushort_m(AVX::lo128(x.data()))); }
- Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: uint_m) { return simd_cast<SSE:: uint_m>(SSE::ushort_m(AVX::lo128(x.data()))); }
- Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: short_m) { return simd_cast<SSE:: short_m>(SSE::ushort_m(AVX::lo128(x.data()))); }
- Vc_SIMD_CAST_1(AVX2::ushort_m, SSE::ushort_m) { return simd_cast<SSE::ushort_m>(SSE::ushort_m(AVX::lo128(x.data()))); }
- #endif
- // simd_cast: two AVX2 double masks (4 lanes each) into one 8-lane SSE mask.
- Vc_SIMD_CAST_2(AVX2::double_m, SSE:: short_m) { return _mm_packs_epi16(_mm_packs_epi32(AVX::lo128(x0.dataI()), AVX::hi128(x0.dataI())), _mm_packs_epi32(AVX::lo128(x1.dataI()), AVX::hi128(x1.dataI()))); }
- Vc_SIMD_CAST_2(AVX2::double_m, SSE::ushort_m) { return _mm_packs_epi16(_mm_packs_epi32(AVX::lo128(x0.dataI()), AVX::hi128(x0.dataI())), _mm_packs_epi32(AVX::lo128(x1.dataI()), AVX::hi128(x1.dataI()))); }
- // simd_cast: AVX2 mask to a Scalar mask — takes lane 0 only.
- template <typename To, typename FromT>
- Vc_INTRINSIC Vc_CONST To
- simd_cast(AVX2::Mask<FromT> x, enable_if<Scalar::is_mask<To>::value>)
- {
- return static_cast<To>(x[0]);
- }
- // offset == 0 overloads simply forward to the offset-less simd_cast above.
- // This one covers AVX2 vector/mask sources going to any non-scalar,
- // non-SimdArray SIMD type.
- template <typename Return, int offset, typename From>
- Vc_INTRINSIC Vc_CONST enable_if<
- (offset == 0 &&
- ((AVX2::is_vector<From>::value && !Scalar::is_vector<Return>::value &&
- Traits::is_simd_vector<Return>::value && !Traits::isSimdArray<Return>::value) ||
- (AVX2::is_mask<From>::value && !Scalar::is_mask<Return>::value &&
- Traits::is_simd_mask<Return>::value &&
- !Traits::isSimdMaskArray<Return>::value))),
- Return>
- simd_cast(const From &x)
- {
- return simd_cast<Return>(x);
- }
- // offset == 0 forwarder for SSE vector/mask sources going to AVX2 types.
- template <typename Return, int offset, typename From>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(const From &x,
- enable_if<offset == 0 && ((SSE::is_vector<From>::value &&
- AVX2::is_vector<Return>::value) ||
- (SSE::is_mask<From>::value &&
- AVX2::is_mask<Return>::value))>)
- {
- return simd_cast<Return>(x);
- }
- // simd_cast with offset != 0 from an AVX2 vector to an AVX2 vector: shift the
- // requested sub-vector down to element 0, then cast.  `shift` is the byte
- // offset of the sub-vector within the 32-byte register.
- template <typename Return, int offset, typename T>
- Vc_INTRINSIC Vc_CONST enable_if<(AVX2::is_vector<Return>::value && offset != 0),
- Return>
- simd_cast(AVX2::Vector<T> x)
- {
- using V = AVX2::Vector<T>;
- constexpr int shift = sizeof(T) * offset * Return::Size;
- static_assert(shift > 0 && shift < sizeof(x), "");
- if (shift < 16) {
- // sub-vector starts inside the low 128-bit half: byte-shift that half
- return simd_cast<Return>(V{AVX::avx_cast<typename V::VectorType>(
- _mm_srli_si128(AVX::avx_cast<__m128i>(AVX::lo128(x.data())), shift))});
- } else if (shift == 16) {
- // sub-vector is exactly the high half: move it to the low lane
- return simd_cast<Return>(V{Mem::permute128<X1, Const0>(x.data())});
- } else {
- #ifdef Vc_MSVC
- // MSVC warns about the constant shift below (C4556) although this branch
- // is only taken when shift - 16 is a valid immediate
- #pragma warning(push)
- #pragma warning(disable : 4556)
- #endif
- return simd_cast<Return>(V{AVX::avx_cast<typename V::VectorType>(
- _mm_srli_si128(AVX::avx_cast<__m128i>(AVX::hi128(x.data())), shift - 16))});
- #ifdef Vc_MSVC
- #pragma warning(pop)
- #endif
- }
- }
- // simd_cast with offset != 0 from a 32-byte AVX2 vector to an SSE vector:
- // either take the high half directly (shift == 16) or align the two halves.
- template <typename Return, int offset, typename T>
- Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_vector<Return>::value &&
- sizeof(AVX2::Vector<T>) == 32),
- Return>
- simd_cast(AVX2::Vector<T> x)
- {
- using V = AVX2::Vector<T>;
- constexpr int shift = sizeof(V) / V::Size * offset * Return::Size;
- static_assert(shift > 0, "");
- static_assert(shift < sizeof(V), "");
- using SseVector = SSE::Vector<typename V::EntryType>;
- if (shift == 16) {
- return simd_cast<Return>(SseVector{AVX::hi128(x.data())});
- }
- using Intrin = typename SseVector::VectorType;
- // _mm_alignr_epi8 concatenates hi:lo and extracts 16 bytes at `shift`
- return simd_cast<Return>(SseVector{AVX::avx_cast<Intrin>(
- _mm_alignr_epi8(AVX::avx_cast<__m128i>(AVX::hi128(x.data())),
- AVX::avx_cast<__m128i>(AVX::lo128(x.data())), shift))});
- }
- // Same as above for 16-byte AVX2::Vector types: a plain byte shift suffices.
- template <typename Return, int offset, typename T>
- Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_vector<Return>::value &&
- sizeof(AVX2::Vector<T>) == 16),
- Return>
- simd_cast(AVX2::Vector<T> x)
- {
- using V = AVX2::Vector<T>;
- constexpr int shift = sizeof(V) / V::Size * offset * Return::Size;
- static_assert(shift > 0, "");
- static_assert(shift < sizeof(V), "");
- using SseVector = SSE::Vector<typename V::EntryType>;
- return simd_cast<Return>(SseVector{_mm_srli_si128(x.data(), shift)});
- }
- // offset == 1 casts of SSE (u)short vectors to AVX2 double: widen via int_v.
- Vc_SIMD_CAST_OFFSET(SSE:: short_v, AVX2::double_v, 1) { return simd_cast<AVX2::double_v>(simd_cast<SSE::int_v, 1>(x)); }
- Vc_SIMD_CAST_OFFSET(SSE::ushort_v, AVX2::double_v, 1) { return simd_cast<AVX2::double_v>(simd_cast<SSE::int_v, 1>(x)); }
- // Widening AVX2 mask -> AVX2 mask extraction by offset.  Source mask has 2x
- // (first overload) or 4x (remaining overloads) the lanes of the destination;
- // the unpack(lo|hi) sequences duplicate each selected source lane into the
- // wider destination lanes.  offset picks which half/quarter is extracted.
- template <typename Return, int offset, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(const AVX2::Mask<T> &k,
- enable_if<(AVX2::is_mask<Return>::value && offset == 1 &&
- AVX2::Mask<T>::Size == Return::Size * 2)> = nullarg)
- {
- const auto tmp = AVX::hi128(k.dataI());
- return AVX::concat(_mm_unpacklo_epi8(tmp, tmp), _mm_unpackhi_epi8(tmp, tmp));
- }
- template <typename Return, int offset, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(const AVX2::Mask<T> &k,
- enable_if<(AVX2::is_mask<Return>::value && offset == 1 &&
- AVX2::Mask<T>::Size == Return::Size * 4)> = nullarg)
- {
- // second quarter: high bytes of the low 128-bit half
- auto tmp = AVX::lo128(k.dataI());
- tmp = _mm_unpackhi_epi8(tmp, tmp);
- return AVX::concat(_mm_unpacklo_epi16(tmp, tmp), _mm_unpackhi_epi16(tmp, tmp));
- }
- template <typename Return, int offset, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(const AVX2::Mask<T> &k,
- enable_if<(AVX2::is_mask<Return>::value && offset == 2 &&
- AVX2::Mask<T>::Size == Return::Size * 4)> = nullarg)
- {
- // third quarter: low bytes of the high 128-bit half
- auto tmp = AVX::hi128(k.dataI());
- tmp = _mm_unpacklo_epi8(tmp, tmp);
- return AVX::concat(_mm_unpacklo_epi16(tmp, tmp), _mm_unpackhi_epi16(tmp, tmp));
- }
- template <typename Return, int offset, typename T>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast(const AVX2::Mask<T> &k,
- enable_if<(AVX2::is_mask<Return>::value && offset == 3 &&
- AVX2::Mask<T>::Size == Return::Size * 4)> = nullarg)
- {
- // fourth quarter: high bytes of the high 128-bit half
- auto tmp = AVX::hi128(k.dataI());
- tmp = _mm_unpackhi_epi8(tmp, tmp);
- return AVX::concat(_mm_unpacklo_epi16(tmp, tmp), _mm_unpackhi_epi16(tmp, tmp));
- }
- // offset == 1 casts of SSE (u)short masks to AVX2 double masks: duplicate the
- // upper four 16-bit mask lanes out to 64-bit lanes.
- Vc_SIMD_CAST_OFFSET(SSE:: short_m, AVX2::double_m, 1) { auto tmp = _mm_unpackhi_epi16(x.dataI(), x.dataI()); return AVX::concat(_mm_unpacklo_epi32(tmp, tmp), _mm_unpackhi_epi32(tmp, tmp)); }
- Vc_SIMD_CAST_OFFSET(SSE::ushort_m, AVX2::double_m, 1) { auto tmp = _mm_unpackhi_epi16(x.dataI(), x.dataI()); return AVX::concat(_mm_unpacklo_epi32(tmp, tmp), _mm_unpackhi_epi32(tmp, tmp)); }
- // offset != 0 extraction from a 32-byte AVX2 mask into an SSE mask, mirroring
- // the vector overload above: high half directly for shift == 16, otherwise
- // _mm_alignr_epi8 across the two halves.
- template <typename Return, int offset, typename T>
- Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_mask<Return>::value &&
- sizeof(AVX2::Mask<T>) == 32),
- Return>
- simd_cast(AVX2::Mask<T> x)
- {
- using M = AVX2::Mask<T>;
- constexpr int shift = sizeof(M) / M::Size * offset * Return::Size;
- static_assert(shift > 0, "");
- static_assert(shift < sizeof(M), "");
- using SseVector = SSE::Mask<Traits::entry_type_of<typename M::Vector>>;
- if (shift == 16) {
- return simd_cast<Return>(SseVector{AVX::hi128(x.data())});
- }
- using Intrin = typename SseVector::VectorType;
- return simd_cast<Return>(SseVector{AVX::avx_cast<Intrin>(
- _mm_alignr_epi8(AVX::hi128(x.dataI()), AVX::lo128(x.dataI()), shift))});
- }
- // 16-byte AVX2 mask: convert to the SSE mask first, then extract by offset.
- template <typename Return, int offset, typename T>
- Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_mask<Return>::value &&
- sizeof(AVX2::Mask<T>) == 16),
- Return>
- simd_cast(AVX2::Mask<T> x)
- {
- return simd_cast<Return, offset>(simd_cast<SSE::Mask<T>>(x));
- }
- #undef Vc_SIMD_CAST_AVX_1
- #undef Vc_SIMD_CAST_AVX_2
- #undef Vc_SIMD_CAST_AVX_3
- #undef Vc_SIMD_CAST_AVX_4
- #undef Vc_SIMD_CAST_1
- #undef Vc_SIMD_CAST_2
- #undef Vc_SIMD_CAST_3
- #undef Vc_SIMD_CAST_4
- #undef Vc_SIMD_CAST_5
- #undef Vc_SIMD_CAST_6
- #undef Vc_SIMD_CAST_7
- #undef Vc_SIMD_CAST_8
- #undef Vc_SIMD_CAST_OFFSET
- }
- #endif
- #endif
- #endif
- // Default vector and mask type aliases for the active implementation, plus
- // <cstdint>-style fixed/least/fast-width aliases.  The exact-width aliases
- // are only provided when the corresponding INTn_MAX/UINTn_MAX macros exist.
- namespace Vc_VERSIONED_NAMESPACE
- {
- using double_v = Vector<double>;
- using float_v = Vector<float>;
- using int_v = Vector<int>;
- using uint_v = Vector<uint>;
- using short_v = Vector<short>;
- using ushort_v = Vector<ushort>;
- using llong_v = Vector<llong>;
- using ullong_v = Vector<ullong>;
- using long_v = Vector<long>;
- using ulong_v = Vector<ulong>;
- using schar_v = Vector<schar>;
- using uchar_v = Vector<uchar>;
- using double_m = Mask<double>;
- using float_m = Mask< float>;
- using llong_m = Mask< llong>;
- using ullong_m = Mask<ullong>;
- using long_m = Mask< long>;
- using ulong_m = Mask< ulong>;
- using int_m = Mask< int>;
- using uint_m = Mask< uint>;
- using short_m = Mask< short>;
- using ushort_m = Mask<ushort>;
- using schar_m = Mask< schar>;
- using uchar_m = Mask< uchar>;
- typedef Vector<std:: int_least64_t> int_least64_v;
- typedef Vector<std::uint_least64_t> uint_least64_v;
- typedef Vector<std:: int_least32_t> int_least32_v;
- typedef Vector<std::uint_least32_t> uint_least32_v;
- typedef Vector<std:: int_least16_t> int_least16_v;
- typedef Vector<std::uint_least16_t> uint_least16_v;
- typedef Vector<std:: int_least8_t> int_least8_v;
- typedef Vector<std:: uint_least8_t> uint_least8_v;
- typedef Mask<std:: int_least64_t> int_least64_m;
- typedef Mask<std::uint_least64_t> uint_least64_m;
- typedef Mask<std:: int_least32_t> int_least32_m;
- typedef Mask<std::uint_least32_t> uint_least32_m;
- typedef Mask<std:: int_least16_t> int_least16_m;
- typedef Mask<std::uint_least16_t> uint_least16_m;
- typedef Mask<std:: int_least8_t> int_least8_m;
- typedef Mask<std:: uint_least8_t> uint_least8_m;
- typedef Vector<std:: int_fast64_t> int_fast64_v;
- typedef Vector<std::uint_fast64_t> uint_fast64_v;
- typedef Vector<std:: int_fast32_t> int_fast32_v;
- typedef Vector<std::uint_fast32_t> uint_fast32_v;
- typedef Vector<std:: int_fast16_t> int_fast16_v;
- typedef Vector<std::uint_fast16_t> uint_fast16_v;
- typedef Vector<std:: int_fast8_t> int_fast8_v;
- typedef Vector<std:: uint_fast8_t> uint_fast8_v;
- typedef Mask<std:: int_fast64_t> int_fast64_m;
- typedef Mask<std::uint_fast64_t> uint_fast64_m;
- typedef Mask<std:: int_fast32_t> int_fast32_m;
- typedef Mask<std::uint_fast32_t> uint_fast32_m;
- typedef Mask<std:: int_fast16_t> int_fast16_m;
- typedef Mask<std::uint_fast16_t> uint_fast16_m;
- typedef Mask<std:: int_fast8_t> int_fast8_m;
- typedef Mask<std:: uint_fast8_t> uint_fast8_m;
- #if defined INT64_MAX && defined UINT64_MAX
- typedef Vector<std:: int64_t> int64_v;
- typedef Vector<std::uint64_t> uint64_v;
- typedef Mask<std:: int64_t> int64_m;
- typedef Mask<std::uint64_t> uint64_m;
- #endif
- #if defined INT32_MAX && defined UINT32_MAX
- typedef Vector<std:: int32_t> int32_v;
- typedef Vector<std::uint32_t> uint32_v;
- typedef Mask<std:: int32_t> int32_m;
- typedef Mask<std::uint32_t> uint32_m;
- #endif
- #if defined INT16_MAX && defined UINT16_MAX
- typedef Vector<std:: int16_t> int16_v;
- typedef Vector<std::uint16_t> uint16_v;
- typedef Mask<std:: int16_t> int16_m;
- typedef Mask<std::uint16_t> uint16_m;
- #endif
- #if defined INT8_MAX && defined UINT8_MAX
- typedef Vector<std:: int8_t> int8_v;
- typedef Vector<std::uint8_t> uint8_v;
- typedef Mask<std:: int8_t> int8_m;
- typedef Mask<std::uint8_t> uint8_m;
- #endif
- // Compile-time check that the Vc_*_V_SIZE macros match the actual vector
- // widths of the selected implementation.
- namespace {
- static_assert(double_v::Size == Vc_DOUBLE_V_SIZE, "Vc_DOUBLE_V_SIZE macro defined to an incorrect value");
- static_assert(float_v::Size == Vc_FLOAT_V_SIZE , "Vc_FLOAT_V_SIZE macro defined to an incorrect value ");
- static_assert(int_v::Size == Vc_INT_V_SIZE , "Vc_INT_V_SIZE macro defined to an incorrect value ");
- static_assert(uint_v::Size == Vc_UINT_V_SIZE , "Vc_UINT_V_SIZE macro defined to an incorrect value ");
- static_assert(short_v::Size == Vc_SHORT_V_SIZE , "Vc_SHORT_V_SIZE macro defined to an incorrect value ");
- static_assert(ushort_v::Size == Vc_USHORT_V_SIZE, "Vc_USHORT_V_SIZE macro defined to an incorrect value");
- }
- }
- #ifndef COMMON_OPERATORS_H_
- #define COMMON_OPERATORS_H_
- #ifndef VC_COMMON_SIMDARRAY_H_
- #define VC_COMMON_SIMDARRAY_H_
- #include <array>
- #ifndef VC_COMMON_SIMDARRAYHELPER_H_
- #define VC_COMMON_SIMDARRAYHELPER_H_
- namespace Vc_VERSIONED_NAMESPACE
- {
- // Tag type used to select internal/private constructors elsewhere in the
- // library.  NOTE(review): an anonymous namespace in what appears to be a
- // header gives each TU its own private_init_t type — presumably intentional
- // here since the tag is only used for overload selection; verify.
- namespace
- {
- static constexpr struct private_init_t {} private_init = {};
- }
- namespace Common
- {
- // Operation functor tags used by SimdArray to apply the same member function
- // or free function to each SIMD chunk.  All functors derive from `tag` so
- // is_operation<T> can recognize them.
- namespace Operations
- {
- struct tag {};
- // Functor that forwards to the member function v.name_(args...).
- #define Vc_DEFINE_OPERATION(name_) \
- struct name_ : public tag { \
- template <typename V, typename... Args> \
- Vc_INTRINSIC void operator()(V &v, Args &&... args) \
- { \
- v.name_(std::forward<Args>(args)...); \
- } \
- }
- Vc_DEFINE_OPERATION(gather);
- Vc_DEFINE_OPERATION(scatter);
- Vc_DEFINE_OPERATION(load);
- Vc_DEFINE_OPERATION(store);
- Vc_DEFINE_OPERATION(setZero);
- Vc_DEFINE_OPERATION(setZeroInverted);
- Vc_DEFINE_OPERATION(assign);
- #undef Vc_DEFINE_OPERATION
- // Functor that executes an arbitrary expression on v.
- #define Vc_DEFINE_OPERATION(name_,code_) \
- struct name_ : public tag { \
- template <typename V> Vc_INTRINSIC void operator()(V &v) { code_; } \
- }
- Vc_DEFINE_OPERATION(increment, ++(v));
- Vc_DEFINE_OPERATION(decrement, --(v));
- Vc_DEFINE_OPERATION(random, v = V::Random());
- #undef Vc_DEFINE_OPERATION
- // Functor that forwards to the free function name_(args...), either storing
- // the result into v or (nullptr overload) discarding it for functions whose
- // results are returned through reference arguments.
- #define Vc_DEFINE_OPERATION_FORWARD(name_) \
- struct Forward_##name_ : public tag \
- { \
- template <typename... Args, typename = decltype(name_(std::declval<Args>()...))> \
- Vc_INTRINSIC void operator()(decltype(name_(std::declval<Args>()...)) &v, \
- Args &&... args) \
- { \
- v = name_(std::forward<Args>(args)...); \
- } \
- template <typename... Args, typename = decltype(name_(std::declval<Args>()...))> \
- Vc_INTRINSIC void operator()(std::nullptr_t, Args && ... args) \
- { \
- name_(std::forward<Args>(args)...); \
- } \
- }
- Vc_DEFINE_OPERATION_FORWARD(abs);
- Vc_DEFINE_OPERATION_FORWARD(asin);
- Vc_DEFINE_OPERATION_FORWARD(atan);
- Vc_DEFINE_OPERATION_FORWARD(atan2);
- Vc_DEFINE_OPERATION_FORWARD(cos);
- Vc_DEFINE_OPERATION_FORWARD(ceil);
- Vc_DEFINE_OPERATION_FORWARD(copysign);
- Vc_DEFINE_OPERATION_FORWARD(exp);
- Vc_DEFINE_OPERATION_FORWARD(exponent);
- Vc_DEFINE_OPERATION_FORWARD(fma);
- Vc_DEFINE_OPERATION_FORWARD(floor);
- Vc_DEFINE_OPERATION_FORWARD(frexp);
- Vc_DEFINE_OPERATION_FORWARD(isfinite);
- Vc_DEFINE_OPERATION_FORWARD(isinf);
- Vc_DEFINE_OPERATION_FORWARD(isnan);
- Vc_DEFINE_OPERATION_FORWARD(isnegative);
- Vc_DEFINE_OPERATION_FORWARD(ldexp);
- Vc_DEFINE_OPERATION_FORWARD(log);
- Vc_DEFINE_OPERATION_FORWARD(log10);
- Vc_DEFINE_OPERATION_FORWARD(log2);
- Vc_DEFINE_OPERATION_FORWARD(reciprocal);
- Vc_DEFINE_OPERATION_FORWARD(round);
- Vc_DEFINE_OPERATION_FORWARD(rsqrt);
- Vc_DEFINE_OPERATION_FORWARD(sin);
- Vc_DEFINE_OPERATION_FORWARD(sincos);
- Vc_DEFINE_OPERATION_FORWARD(sqrt);
- Vc_DEFINE_OPERATION_FORWARD(trunc);
- Vc_DEFINE_OPERATION_FORWARD(min);
- Vc_DEFINE_OPERATION_FORWARD(max);
- #undef Vc_DEFINE_OPERATION_FORWARD
- template<typename T> using is_operation = std::is_base_of<tag, T>;
- }
- // A view on one of Pieces_ equally sized slices of a SIMD object T_.
- // operator[] indexes into the slice (EntryOffset shifts into the full
- // object); to_fixed_size() converts the slice to a fixed_size_simd via
- // simd_cast with the slice index as offset.
- template <typename T_, std::size_t Pieces_, std::size_t Index_> struct Segment
- {
- static_assert(Index_ < Pieces_, "You found a bug in Vc. Please report.");
- using type = T_;
- using type_decayed = typename std::decay<type>::type;
- static constexpr std::size_t Pieces = Pieces_;
- static constexpr std::size_t Index = Index_;
- using fixed_size_type =
- fixed_size_simd<conditional_t<Traits::is_simd_vector<type_decayed>::value,
- typename type_decayed::EntryType, float>,
- type_decayed::Size / Pieces>;
- type data;
- static constexpr std::size_t EntryOffset = Index * type_decayed::Size / Pieces;
- decltype(std::declval<const type &>()[0]) operator[](size_t i) const { return data[i + EntryOffset]; }
- fixed_size_type to_fixed_size() const
- {
- return simd_cast<fixed_size_type, Index>(data);
- }
- };
- // Pointer specialization of Segment: the slice is addressed rather than
- // copied.  to_fixed_size() reinterprets the pointed-to SIMD object as an
- // array of fixed_size_simd pieces and returns a pointer to piece `Index`.
- // NOTE(review): the GCC branch omits the MayAlias wrapper — presumably GCC's
- // may_alias attribute is applied elsewhere for this type; verify.
- template <typename T_, std::size_t Pieces_, std::size_t Index_>
- struct Segment<T_ *, Pieces_, Index_> {
- static_assert(Index_ < Pieces_, "You found a bug in Vc. Please report.");
- using type = T_ *;
- using type_decayed = typename std::decay<T_>::type;
- static constexpr size_t Pieces = Pieces_;
- static constexpr size_t Index = Index_;
- using fixed_size_type = fixed_size_simd<
- typename std::conditional<Traits::is_simd_vector<type_decayed>::value,
- typename type_decayed::VectorEntryType, float>::type,
- type_decayed::Size / Pieces> *;
- type data;
- static constexpr std::size_t EntryOffset = Index * type_decayed::size() / Pieces;
- fixed_size_type to_fixed_size() const
- {
- return reinterpret_cast<
- #ifdef Vc_GCC
- typename std::remove_pointer<fixed_size_type>::type
- #else
- MayAlias<typename std::remove_pointer<fixed_size_type>::type>
- #endif
- *>(data) +
- Index;
- }
- };
- // Compile-time tag pairing a type with an offset; carries no runtime state.
- template <typename T, std::size_t Offset> struct AddOffset
- {
- constexpr AddOffset() = default;
- };
- // Split: static helpers that take the lo/hi half of an argument for the
- // recursive SimdArray implementation.  Depending on the argument type, "half"
- // means: the two internal SimdArray members (internal_data0/1), a Segment
- // view of a single underlying vector/mask, the two SSE halves of an AVX
- // vector/mask, or a pointer advanced by secondOffset elements.  Arguments
- // with no lo/hi implementation (detected via SFINAE) pass through unchanged.
- template <std::size_t secondOffset> class Split
- {
- // SimdArray with two storage members (N != M): halves are its members.
- template <typename U, std::size_t N, typename V, std::size_t M,
- typename = enable_if<N != M>>
- static Vc_INTRINSIC auto loImpl(const SimdArray<U, N, V, M> &x)
- -> decltype(internal_data0(x))
- {
- return internal_data0(x);
- }
- template <typename U, std::size_t N, typename V, std::size_t M,
- typename = enable_if<N != M>>
- static Vc_INTRINSIC auto hiImpl(const SimdArray<U, N, V, M> &x)
- -> decltype(internal_data1(x))
- {
- return internal_data1(x);
- }
- template <typename U, std::size_t N, typename V, std::size_t M,
- typename = enable_if<N != M>>
- static Vc_INTRINSIC auto loImpl(SimdArray<U, N, V, M> *x)
- -> decltype(&internal_data0(*x))
- {
- return &internal_data0(*x);
- }
- template <typename U, std::size_t N, typename V, std::size_t M,
- typename = enable_if<N != M>>
- static Vc_INTRINSIC auto hiImpl(SimdArray<U, N, V, M> *x)
- -> decltype(&internal_data1(*x))
- {
- return &internal_data1(*x);
- }
- // SimdArray backed by a single vector (N == M): halves are Segment views.
- template <typename U, std::size_t N, typename V>
- static Vc_INTRINSIC Segment<V, 2, 0> loImpl(const SimdArray<U, N, V, N> &x)
- {
- return {internal_data(x)};
- }
- template <typename U, std::size_t N, typename V>
- static Vc_INTRINSIC Segment<V, 2, 1> hiImpl(const SimdArray<U, N, V, N> &x)
- {
- return {internal_data(x)};
- }
- template <typename U, std::size_t N, typename V>
- static Vc_INTRINSIC Segment<V *, 2, 0> loImpl(SimdArray<U, N, V, N> *x)
- {
- return {&internal_data(*x)};
- }
- template <typename U, std::size_t N, typename V>
- static Vc_INTRINSIC Segment<V *, 2, 1> hiImpl(SimdArray<U, N, V, N> *x)
- {
- return {&internal_data(*x)};
- }
- // Same pattern for SimdMaskArray.
- template <typename U, std::size_t N, typename V, std::size_t M>
- static Vc_INTRINSIC auto loImpl(const SimdMaskArray<U, N, V, M> &x) -> decltype(internal_data0(x))
- {
- return internal_data0(x);
- }
- template <typename U, std::size_t N, typename V, std::size_t M>
- static Vc_INTRINSIC auto hiImpl(const SimdMaskArray<U, N, V, M> &x) -> decltype(internal_data1(x))
- {
- return internal_data1(x);
- }
- template <typename U, std::size_t N, typename V>
- static Vc_INTRINSIC Segment<typename SimdMaskArray<U, N, V, N>::mask_type, 2, 0> loImpl(
- const SimdMaskArray<U, N, V, N> &x)
- {
- return {internal_data(x)};
- }
- template <typename U, std::size_t N, typename V>
- static Vc_INTRINSIC Segment<typename SimdMaskArray<U, N, V, N>::mask_type, 2, 1> hiImpl(
- const SimdMaskArray<U, N, V, N> &x)
- {
- return {internal_data(x)};
- }
- #ifdef Vc_IMPL_AVX
- // AVX vectors/masks split into their SSE halves via offset simd_cast.
- template <class T>
- static Vc_INTRINSIC SSE::Vector<T> loImpl(Vector<T, VectorAbi::Avx> &&x)
- {
- return simd_cast<SSE::Vector<T>, 0>(x);
- }
- template <class T>
- static Vc_INTRINSIC SSE::Vector<T> hiImpl(Vector<T, VectorAbi::Avx> &&x)
- {
- return simd_cast<SSE::Vector<T>, 1>(x);
- }
- template <class T>
- static Vc_INTRINSIC SSE::Mask<T> loImpl(Mask<T, VectorAbi::Avx> &&x)
- {
- return simd_cast<SSE::Mask<T>, 0>(x);
- }
- template <class T>
- static Vc_INTRINSIC SSE::Mask<T> hiImpl(Mask<T, VectorAbi::Avx> &&x)
- {
- return simd_cast<SSE::Mask<T>, 1>(x);
- }
- #endif
- template <typename T>
- static constexpr bool is_vector_or_mask(){
- return (Traits::is_simd_vector<T>::value && !Traits::isSimdArray<T>::value) ||
- (Traits::is_simd_mask<T>::value && !Traits::isSimdMaskArray<T>::value);
- }
- // Generic native vector/mask: wrap in a Segment view.
- template <typename V>
- static Vc_INTRINSIC Segment<V, 2, 0> loImpl(V &&x, enable_if<is_vector_or_mask<V>()> = nullarg)
- {
- return {std::forward<V>(x)};
- }
- template <typename V>
- static Vc_INTRINSIC Segment<V, 2, 1> hiImpl(V &&x, enable_if<is_vector_or_mask<V>()> = nullarg)
- {
- return {std::forward<V>(x)};
- }
- // std::vector: halves are pointer ranges, hi starting at secondOffset.
- template <class T, class A>
- static Vc_INTRINSIC const T *loImpl(const std::vector<T, A> &x)
- {
- return x.data();
- }
- template <class T, class A>
- static Vc_INTRINSIC const T *hiImpl(const std::vector<T, A> &x)
- {
- return x.data() + secondOffset;
- }
- // Splitting a Segment doubles Pieces and refines Index.
- template <typename V, std::size_t Pieces, std::size_t Index>
- static Vc_INTRINSIC Segment<V, 2 * Pieces, 2 * Index> loImpl(
- const Segment<V, Pieces, Index> &x)
- {
- return {x.data};
- }
- template <typename V, std::size_t Pieces, std::size_t Index>
- static Vc_INTRINSIC Segment<V, 2 * Pieces, 2 * Index + 1> hiImpl(
- const Segment<V, Pieces, Index> &x)
- {
- return {x.data};
- }
- // SFINAE probes: have_lo_impl<T>() / have_hi_impl<T>() are true iff an
- // loImpl/hiImpl overload accepts T (int overload preferred over float).
- template <typename T, typename = decltype(loImpl(std::declval<T>()))>
- static std::true_type have_lo_impl(int);
- template <typename T> static std::false_type have_lo_impl(float);
- template <typename T> static constexpr bool have_lo_impl()
- {
- return decltype(have_lo_impl<T>(1))::value;
- }
- template <typename T, typename = decltype(hiImpl(std::declval<T>()))>
- static std::true_type have_hi_impl(int);
- template <typename T> static std::false_type have_hi_impl(float);
- template <typename T> static constexpr bool have_hi_impl()
- {
- return decltype(have_hi_impl<T>(1))::value;
- }
- public:
- // gather/scatter pointers are not split; the hi half starts secondOffset
- // elements further into the array.
- template <typename U>
- static Vc_INTRINSIC const U *lo(Operations::gather, const U *ptr)
- {
- return ptr;
- }
- template <typename U>
- static Vc_INTRINSIC const U *hi(Operations::gather, const U *ptr)
- {
- return ptr + secondOffset;
- }
- template <typename U, typename = enable_if<!std::is_pointer<U>::value>>
- static Vc_ALWAYS_INLINE decltype(loImpl(std::declval<U>()))
- lo(Operations::gather, U &&x)
- {
- return loImpl(std::forward<U>(x));
- }
- template <typename U, typename = enable_if<!std::is_pointer<U>::value>>
- static Vc_ALWAYS_INLINE decltype(hiImpl(std::declval<U>()))
- hi(Operations::gather, U &&x)
- {
- return hiImpl(std::forward<U>(x));
- }
- template <typename U>
- static Vc_INTRINSIC const U *lo(Operations::scatter, const U *ptr)
- {
- return ptr;
- }
- template <typename U>
- static Vc_INTRINSIC const U *hi(Operations::scatter, const U *ptr)
- {
- return ptr + secondOffset;
- }
- // Generic lo/hi: dispatch to an loImpl/hiImpl overload if one exists,
- // otherwise pass the argument through unchanged.
- template <typename U>
- static Vc_ALWAYS_INLINE decltype(loImpl(std::declval<U>())) lo(U &&x)
- {
- return loImpl(std::forward<U>(x));
- }
- template <typename U>
- static Vc_ALWAYS_INLINE decltype(hiImpl(std::declval<U>())) hi(U &&x)
- {
- return hiImpl(std::forward<U>(x));
- }
- template <typename U>
- static Vc_ALWAYS_INLINE enable_if<!have_lo_impl<U>(), U> lo(U &&x)
- {
- return std::forward<U>(x);
- }
- template <typename U>
- static Vc_ALWAYS_INLINE enable_if<!have_hi_impl<U>(), U> hi(U &&x)
- {
- return std::forward<U>(x);
- }
- };
- // actual_value: unwrap a SimdArray/SimdMaskArray/Segment argument to the
- // underlying native vector/mask (or pointer to it) so Op can be applied to it
- // directly.  Segment arguments are materialized via to_fixed_size().
- template <typename Op, typename U, std::size_t M, typename V>
- static Vc_INTRINSIC const V &actual_value(Op, const SimdArray<U, M, V, M> &x)
- {
- return internal_data(x);
- }
- template <typename Op, typename U, std::size_t M, typename V>
- static Vc_INTRINSIC V *actual_value(Op, SimdArray<U, M, V, M> *x)
- {
- return &internal_data(*x);
- }
- template <typename Op, typename T, size_t Pieces, size_t Index>
- static Vc_INTRINSIC typename Segment<T, Pieces, Index>::fixed_size_type actual_value(
- Op, Segment<T, Pieces, Index> &&seg)
- {
- return seg.to_fixed_size();
- }
- template <typename Op, typename U, std::size_t M, typename V>
- static Vc_INTRINSIC const typename V::Mask &actual_value(Op, const SimdMaskArray<U, M, V, M> &x)
- {
- return internal_data(x);
- }
- template <typename Op, typename U, std::size_t M, typename V>
- static Vc_INTRINSIC typename V::Mask *actual_value(Op, SimdMaskArray<U, M, V, M> *x)
- {
- return &internal_data(*x);
- }
- // conditionalUnpack: apply actual_value to arg iff the tag is true_type;
- // otherwise forward arg unchanged.
- template <typename Op, typename Arg>
- Vc_INTRINSIC decltype(actual_value(std::declval<Op &>(), std::declval<Arg>()))
- conditionalUnpack(std::true_type, Op op, Arg &&arg)
- {
- return actual_value(op, std::forward<Arg>(arg));
- }
- template <typename Op, typename Arg>
- Vc_INTRINSIC Arg conditionalUnpack(std::false_type, Op, Arg &&arg)
- {
- return std::forward<Arg>(arg);
- }
- // selectorType: true when bit B of bitmask A is clear — i.e. unpack this
- // argument in the current combination A.
- template <size_t A, size_t B>
- struct selectorType : public std::integral_constant<bool, !((A & (size_t(1) << B)) != 0)> {
- };
- // unpackArgumentsAutoImpl: try calling op with the combination of
- // unpacked/forwarded arguments encoded in bitmask I.  The int-parameter
- // overload is preferred and SFINAEs away when the call is ill-formed; the
- // float-parameter fallback then recurses with I + 1 to try the next
- // combination.  The static_assert fires when every combination failed.
- template <size_t I, typename Op, typename R, typename... Args, size_t... Indexes>
- Vc_INTRINSIC decltype(std::declval<Op &>()(std::declval<R &>(),
- conditionalUnpack(selectorType<I, Indexes>(),
- std::declval<Op &>(),
- std::declval<Args>())...))
- unpackArgumentsAutoImpl(int, index_sequence<Indexes...>, Op op, R &&r, Args &&... args)
- {
- op(std::forward<R>(r),
- conditionalUnpack(selectorType<I, Indexes>(), op, std::forward<Args>(args))...);
- }
- template <size_t I, typename Op, typename R, typename... Args, size_t... Indexes>
- Vc_INTRINSIC enable_if<(I <= (size_t(1) << sizeof...(Args))), void> unpackArgumentsAutoImpl(
- float, index_sequence<Indexes...> is, Op op, R &&r, Args &&... args)
- {
- static_assert(
- I < (1 << sizeof...(Args)) - (std::is_same<R, std::nullptr_t>::value ? 1 : 0),
- "Vc or compiler bug. Please report. Failed to find a combination of "
- "actual_value(arg) transformations that allows calling Op.");
- unpackArgumentsAutoImpl<I + 1, Op, R, Args...>(int(), is, op, std::forward<R>(r),
- std::forward<Args>(args)...);
- }
- #ifdef Vc_ICC
- // ICC workaround: extracts the (decayed, de-pointered) type of the second
- // argument so frexp/ldexp calls can skip the first combinations below.
- template <size_t, typename... Ts> struct IccWorkaround {
- using type = void;
- };
- template <typename... Ts> struct IccWorkaround<2, Ts...> {
- using type = typename std::remove_pointer<typename std::decay<
- typename std::tuple_element<1, std::tuple<Ts...>>::type>::type>::type;
- };
- #endif
- // Entry point: search for a working unpack combination starting at bitmask 0
- // (on ICC, frexp/ldexp on SimdArray start at 2 to dodge a compiler bug).
- template <typename Op, typename R, typename... Args>
- Vc_INTRINSIC void unpackArgumentsAuto(Op op, R &&r, Args &&... args)
- {
- #ifdef Vc_ICC
- const int recursionStart =
- Traits::isSimdArray<
- typename IccWorkaround<sizeof...(Args), Args...>::type>::value &&
- (std::is_same<Op, Common::Operations::Forward_frexp>::value ||
- std::is_same<Op, Common::Operations::Forward_ldexp>::value)
- ? 2
- : 0;
- #else
- const int recursionStart = 0;
- #endif
- unpackArgumentsAutoImpl<recursionStart>(
- int(), make_index_sequence<sizeof...(Args)>(), op, std::forward<R>(r),
- std::forward<Args>(args)...);
- }
- }
- }
- #endif
- #ifndef VC_COMMON_SIMDMASKARRAY_H_
- #define VC_COMMON_SIMDMASKARRAY_H_
- #include <type_traits>
- #include <array>
- namespace Vc_VERSIONED_NAMESPACE
- {
// Specialization of SimdMaskArray for the case where the requested width N
// equals the width of the selected native vector type: the whole mask is
// stored in a single native mask object and every operation forwards to it.
template <typename T, std::size_t N, typename VectorType_>
class SimdMaskArray<T, N, VectorType_, N>
{
public:
    using VectorType = VectorType_;
    using vector_type = VectorType;
    using mask_type = typename vector_type::Mask;
    // A single native mask holds all N entries.
    using storage_type = mask_type;
    // Accessors to the wrapped native mask (used by simd_cast and friends).
    friend storage_type &internal_data(SimdMaskArray &m) { return m.data; }
    friend const storage_type &internal_data(const SimdMaskArray &m) { return m.data; }
    static constexpr std::size_t size() { return N; }
    static constexpr std::size_t Size = size();
    static constexpr std::size_t MemoryAlignment = storage_type::MemoryAlignment;
    static_assert(Size == vector_type::Size, "size mismatch");
    using vectorentry_type = typename mask_type::VectorEntryType;
    using value_type = typename mask_type::EntryType;
    using Mask = mask_type;
    using VectorEntryType = vectorentry_type;
    using EntryType = value_type;
    // Writable proxy returned by the non-const operator[].
    using EntryReference = Vc::Detail::ElementReference<storage_type, SimdMaskArray>;
    using reference = EntryReference;
    using Vector = fixed_size_simd<T, N>;
    Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(mask_type));
    // Copy/move are trivial; default construction leaves the mask uninitialized.
    SimdMaskArray() = default;
    SimdMaskArray(const SimdMaskArray &) = default;
    SimdMaskArray(SimdMaskArray &&) = default;
    SimdMaskArray &operator=(const SimdMaskArray &) = default;
    SimdMaskArray &operator=(SimdMaskArray &&) = default;
    // Broadcast constructors: every entry true (One), false (Zero), or b.
    Vc_INTRINSIC explicit SimdMaskArray(VectorSpecialInitializerOne one) : data(one) {}
    Vc_INTRINSIC explicit SimdMaskArray(VectorSpecialInitializerZero zero) : data(zero) {}
    Vc_INTRINSIC explicit SimdMaskArray(bool b) : data(b) {}
    Vc_INTRINSIC static SimdMaskArray Zero() { return {private_init, storage_type::Zero()}; }
    Vc_INTRINSIC static SimdMaskArray One() { return {private_init, storage_type::One()}; }
    // Converting constructors from other SimdMaskArrays of the same width.
    // Defined out of line (see the simd_cast caller definitions below); the
    // three overloads cover source storage of one, two, or four pieces.
    template <class U, class V, class = enable_if<N == V::Size>>
    Vc_INTRINSIC_L SimdMaskArray(const SimdMaskArray<U, N, V> &x) Vc_INTRINSIC_R;
    template <class U, class V, class = enable_if<(N > V::Size && N <= 2 * V::Size)>,
              class = U>
    Vc_INTRINSIC_L SimdMaskArray(const SimdMaskArray<U, N, V> &x) Vc_INTRINSIC_R;
    template <class U, class V, class = enable_if<(N > 2 * V::Size && N <= 4 * V::Size)>,
              class = U, class = U>
    Vc_INTRINSIC_L SimdMaskArray(const SimdMaskArray<U, N, V> &x) Vc_INTRINSIC_R;
    // Conversion from one piece (segment) of a larger mask.
    template <typename M, std::size_t Pieces, std::size_t Index>
    Vc_INTRINSIC_L SimdMaskArray(
        Common::Segment<M, Pieces, Index> &&x,
        enable_if<Traits::simd_vector_size<M>::value == Size * Pieces> = nullarg) Vc_INTRINSIC_R;
    // Conversion from a native (non-SimdMaskArray) mask of matching width.
    template <class M, class = enable_if<(Traits::is_simd_mask<M>::value &&
                                          !Traits::isSimdMaskArray<M>::value &&
                                          Traits::simd_vector_size<M>::value == Size)>>
    Vc_INTRINSIC_L SimdMaskArray(M k) Vc_INTRINSIC_R;
    // Conversion to a native mask of matching width (non-fixed-size ABIs only).
    template <class U, class A,
              class = enable_if<Vc::Mask<U, A>::Size == N &&
                                !detail::is_fixed_size_abi<A>::value>>
    operator Vc::Mask<U, A>() const
    {
        return simd_cast<Vc::Mask<U, A>>(data);
    }
    // NOTE(review): these casts assume fixed_size_simd_mask<T, N> is derived
    // from this class.
    operator fixed_size_simd_mask<T, N> &()
    {
        return static_cast<fixed_size_simd_mask<T, N> &>(*this);
    }
    operator const fixed_size_simd_mask<T, N> &() const
    {
        return static_cast<const fixed_size_simd_mask<T, N> &>(*this);
    }
    // Load/store of N boolean values from/to an array of bool.
    template <typename Flags = DefaultLoadTag>
    Vc_INTRINSIC explicit SimdMaskArray(const bool *mem, Flags f = Flags())
        : data(mem, f)
    {
    }
    Vc_INTRINSIC void load(const bool *mem) { data.load(mem); }
    template <typename Flags> Vc_INTRINSIC void load(const bool *mem, Flags f)
    {
        data.load(mem, f);
    }
    Vc_INTRINSIC void store(bool *mem) const { data.store(mem); }
    template <typename Flags> Vc_INTRINSIC void store(bool *mem, Flags f) const
    {
        data.store(mem, f);
    }
    // Whole-mask comparisons returning a single bool (forwarded to the native
    // mask's operator==/!=).
    Vc_INTRINSIC Vc_PURE bool operator==(const SimdMaskArray &rhs) const
    {
        return data == rhs.data;
    }
    Vc_INTRINSIC Vc_PURE bool operator!=(const SimdMaskArray &rhs) const
    {
        return data != rhs.data;
    }
    // Element-wise logical/bitwise operators, all forwarded to the native mask.
    Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator!() const
    {
        return {private_init, !data};
    }
    Vc_INTRINSIC SimdMaskArray &operator&=(const SimdMaskArray &rhs)
    {
        data &= rhs.data;
        return *this;
    }
    Vc_INTRINSIC SimdMaskArray &operator|=(const SimdMaskArray &rhs)
    {
        data |= rhs.data;
        return *this;
    }
    Vc_INTRINSIC SimdMaskArray &operator^=(const SimdMaskArray &rhs)
    {
        data ^= rhs.data;
        return *this;
    }
    Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator&(
        const SimdMaskArray &rhs) const
    {
        return {private_init, data & rhs.data};
    }
    Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator|(
        const SimdMaskArray &rhs) const
    {
        return {private_init, data | rhs.data};
    }
    Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator^(
        const SimdMaskArray &rhs) const
    {
        return {private_init, data ^ rhs.data};
    }
    Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator&&(
        const SimdMaskArray &rhs) const
    {
        return {private_init, data && rhs.data};
    }
    Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator||(
        const SimdMaskArray &rhs) const
    {
        return {private_init, data || rhs.data};
    }
    // Reductions over all entries, forwarded to the native mask.
    Vc_INTRINSIC Vc_PURE bool isFull() const { return data.isFull(); }
    Vc_INTRINSIC Vc_PURE bool isNotEmpty() const { return data.isNotEmpty(); }
    Vc_INTRINSIC Vc_PURE bool isEmpty() const { return data.isEmpty(); }
    Vc_INTRINSIC Vc_PURE bool isMix() const { return data.isMix(); }
    Vc_INTRINSIC Vc_PURE int shiftMask() const { return data.shiftMask(); }
    Vc_INTRINSIC Vc_PURE int toInt() const { return data.toInt(); }
private:
    // get/set implement element access for the reference proxy type.
    friend reference;
    static Vc_INTRINSIC value_type get(const storage_type &k, int i) noexcept
    {
        return k[i];
    }
    template <typename U>
    static Vc_INTRINSIC void set(storage_type &k, int i, U &&v) noexcept(
        noexcept(std::declval<storage_type &>()[0] = std::declval<U>()))
    {
        k[i] = std::forward<U>(v);
    }
public:
    // Element access; the non-const variant returns a writable proxy.
    Vc_INTRINSIC Vc_PURE reference operator[](size_t index) noexcept
    {
        return {data, int(index)};
    }
    Vc_INTRINSIC Vc_PURE value_type operator[](size_t index) const noexcept
    {
        return data[index];
    }
    // Number of true entries / index of the first true entry.
    Vc_INTRINSIC Vc_PURE int count() const { return data.count(); }
    Vc_INTRINSIC Vc_PURE int firstOne() const { return data.firstOne(); }
    // Builds a mask whose entry i is gen(i).
    template <typename G>
    static Vc_INTRINSIC fixed_size_simd_mask<T, N> generate(const G &gen)
    {
        return {private_init, mask_type::generate(gen)};
    }
    // Entry-shift, forwarded to the native mask's shifted().
    Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> shifted(int amount) const
    {
        return {private_init, data.shifted(amount)};
    }
    // Executes a forwarded operation, writing its result into a new mask.
    template <typename Op, typename... Args>
    static Vc_INTRINSIC fixed_size_simd_mask<T, N> fromOperation(Op op, Args &&... args)
    {
        fixed_size_simd_mask<T, N> r;
        Common::unpackArgumentsAuto(op, r.data, std::forward<Args>(args)...);
        return r;
    }
    // Internal constructor wrapping an existing native mask.
    Vc_INTRINSIC SimdMaskArray(private_init_t, mask_type &&x) : data(std::move(x)) {}
private:
    // BoundedAlignment keeps the requested alignment within platform limits.
    alignas(static_cast<std::size_t>(
        Common::BoundedAlignment<Common::NextPowerOfTwo<N>::value * sizeof(VectorType_) /
                                 VectorType_::size()>::value)) storage_type data;
};
// Out-of-class definitions needed when the static constexpr members are
// ODR-used (required before C++17 inline variables).
template <typename T, std::size_t N, typename VectorType> constexpr std::size_t SimdMaskArray<T, N, VectorType, N>::Size;
template <typename T, std::size_t N, typename VectorType>
constexpr std::size_t SimdMaskArray<T, N, VectorType, N>::MemoryAlignment;
// General SimdMaskArray: a mask of N entries that does not map onto a single
// native mask. The entries are split into two members — data0 holds the first
// N0 entries, data1 the remaining N - N0 — and every operation recurses into
// the two halves.
template <typename T, size_t N, typename V, size_t Wt>
class SimdMaskArray
{
    // Split point; Common::left_size<N>() determines how many entries go into
    // the first half.
    static constexpr std::size_t N0 = Common::left_size<N>();
    // Helper producing the low/high portion of arguments for the two halves.
    using Split = Common::Split<N0>;
public:
    using storage_type0 = fixed_size_simd_mask<T, N0>;
    using storage_type1 = fixed_size_simd_mask<T, N - N0>;
    static_assert(storage_type0::size() == N0, "");
    using vector_type = fixed_size_simd<T, N>;
    // Accessors to the two halves (used by simd_cast and friends).
    friend storage_type0 &internal_data0(SimdMaskArray &m) { return m.data0; }
    friend storage_type1 &internal_data1(SimdMaskArray &m) { return m.data1; }
    friend const storage_type0 &internal_data0(const SimdMaskArray &m) { return m.data0; }
    friend const storage_type1 &internal_data1(const SimdMaskArray &m) { return m.data1; }
    using mask_type = SimdMaskArray;
    static constexpr std::size_t size() { return N; }
    static constexpr std::size_t Size = size();
    // The stricter of the two halves' alignment requirements.
    static constexpr std::size_t MemoryAlignment =
        storage_type0::MemoryAlignment > storage_type1::MemoryAlignment
            ? storage_type0::MemoryAlignment
            : storage_type1::MemoryAlignment;
    static_assert(Size == vector_type::Size, "size mismatch");
    using vectorentry_type = typename storage_type0::VectorEntryType;
    using value_type = typename storage_type0::EntryType;
    using MaskType = mask_type;
    using VectorEntryType = vectorentry_type;
    using EntryType = value_type;
    // Writable proxy returned by the non-const operator[].
    using EntryReference = Vc::Detail::ElementReference<SimdMaskArray>;
    using reference = EntryReference;
    using Vector = fixed_size_simd<T, N>;
    Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(mask_type));
    SimdMaskArray() = default;
    SimdMaskArray(const SimdMaskArray &) = default;
    SimdMaskArray(SimdMaskArray &&) = default;
    SimdMaskArray &operator=(const SimdMaskArray &) = default;
    SimdMaskArray &operator=(SimdMaskArray &&) = default;
    // Conversion from an equal-width mask array with different element type or
    // storage; each half converts independently via Split.
    template <typename U, typename W>
    Vc_INTRINSIC SimdMaskArray(const SimdMaskArray<U, N, W> &rhs)
        : data0(Split::lo(rhs)), data1(Split::hi(rhs))
    {
    }
    // Conversion from one piece (segment) of a larger mask.
    template <typename M, std::size_t Pieces, std::size_t Index>
    Vc_INTRINSIC SimdMaskArray(
        Common::Segment<M, Pieces, Index> &&rhs,
        enable_if<Traits::simd_vector_size<M>::value == Size * Pieces> = nullarg)
        : data0(Split::lo(rhs)), data1(Split::hi(rhs))
    {
    }
    // Conversion from a native (non-SimdMaskArray) mask of matching width.
    template <class M, class = enable_if<(Traits::is_simd_mask<M>::value &&
                                          !Traits::isSimdMaskArray<M>::value &&
                                          Traits::simd_vector_size<M>::value == Size)>>
    Vc_INTRINSIC SimdMaskArray(M k) : data0(Split::lo(k)), data1(Split::hi(k))
    {
    }
    // Conversion to a native mask of matching width (non-fixed-size ABIs only).
    template <class U, class A,
              class = enable_if<Vc::Mask<U, A>::Size == N &&
                                !detail::is_fixed_size_abi<A>::value>>
    operator Vc::Mask<U, A>() const
    {
        return simd_cast<Vc::Mask<U, A>>(data0, data1);
    }
    // NOTE(review): these casts assume fixed_size_simd_mask<T, N> is derived
    // from this class.
    Vc_INTRINSIC operator fixed_size_simd_mask<T, N> &()
    {
        return static_cast<fixed_size_simd_mask<T, N> &>(*this);
    }
    Vc_INTRINSIC operator const fixed_size_simd_mask<T, N> &() const
    {
        return static_cast<const fixed_size_simd_mask<T, N> &>(*this);
    }
    // Broadcast constructors: every entry true (One), false (Zero), or b.
    Vc_INTRINSIC explicit SimdMaskArray(VectorSpecialInitializerOne one)
        : data0(one), data1(one)
    {
    }
    Vc_INTRINSIC explicit SimdMaskArray(VectorSpecialInitializerZero zero)
        : data0(zero), data1(zero)
    {
    }
    Vc_INTRINSIC explicit SimdMaskArray(bool b) : data0(b), data1(b) {}
    Vc_INTRINSIC static fixed_size_simd_mask<T, N> Zero()
    {
        return {storage_type0::Zero(), storage_type1::Zero()};
    }
    Vc_INTRINSIC static fixed_size_simd_mask<T, N> One()
    {
        return {storage_type0::One(), storage_type1::One()};
    }
    // Load/store of N boolean values; the second half accesses mem at offset N0.
    template <typename Flags = DefaultLoadTag>
    Vc_INTRINSIC explicit SimdMaskArray(const bool *mem, Flags f = Flags())
        : data0(mem, f), data1(mem + storage_type0::size(), f)
    {
    }
    Vc_INTRINSIC void load(const bool *mem)
    {
        data0.load(mem);
        data1.load(mem + storage_type0::size());
    }
    template <typename Flags> Vc_INTRINSIC void load(const bool *mem, Flags f)
    {
        data0.load(mem, f);
        data1.load(mem + storage_type0::size(), f);
    }
    Vc_INTRINSIC void store(bool *mem) const
    {
        data0.store(mem);
        data1.store(mem + storage_type0::size());
    }
    template <typename Flags> Vc_INTRINSIC void store(bool *mem, Flags f) const
    {
        data0.store(mem, f);
        data1.store(mem + storage_type0::size(), f);
    }
    // Whole-mask comparisons returning a single bool.
    Vc_INTRINSIC Vc_PURE bool operator==(const SimdMaskArray &mask) const
    {
        return data0 == mask.data0 && data1 == mask.data1;
    }
    Vc_INTRINSIC Vc_PURE bool operator!=(const SimdMaskArray &mask) const
    {
        return data0 != mask.data0 || data1 != mask.data1;
    }
    // Element-wise logical/bitwise operators, applied to both halves.
    Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator!() const
    {
        return {!data0, !data1};
    }
    Vc_INTRINSIC SimdMaskArray &operator&=(const SimdMaskArray &rhs)
    {
        data0 &= rhs.data0;
        data1 &= rhs.data1;
        return *this;
    }
    Vc_INTRINSIC SimdMaskArray &operator|=(const SimdMaskArray &rhs)
    {
        data0 |= rhs.data0;
        data1 |= rhs.data1;
        return *this;
    }
    Vc_INTRINSIC SimdMaskArray &operator^=(const SimdMaskArray &rhs)
    {
        data0 ^= rhs.data0;
        data1 ^= rhs.data1;
        return *this;
    }
    Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator&(
        const SimdMaskArray &rhs) const
    {
        return {data0 & rhs.data0, data1 & rhs.data1};
    }
    Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator|(
        const SimdMaskArray &rhs) const
    {
        return {data0 | rhs.data0, data1 | rhs.data1};
    }
    Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator^(
        const SimdMaskArray &rhs) const
    {
        return {data0 ^ rhs.data0, data1 ^ rhs.data1};
    }
    Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator&&(
        const SimdMaskArray &rhs) const
    {
        return {data0 && rhs.data0, data1 && rhs.data1};
    }
    Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator||(
        const SimdMaskArray &rhs) const
    {
        return {data0 || rhs.data0, data1 || rhs.data1};
    }
    // Reductions: combine the reductions of the two halves.
    Vc_INTRINSIC Vc_PURE bool isFull() const { return data0.isFull() && data1.isFull(); }
    Vc_INTRINSIC Vc_PURE bool isNotEmpty() const { return data0.isNotEmpty() || data1.isNotEmpty(); }
    Vc_INTRINSIC Vc_PURE bool isEmpty() const { return data0.isEmpty() && data1.isEmpty(); }
    Vc_INTRINSIC Vc_PURE bool isMix() const { return !isFull() && !isEmpty(); }
    // Bit field of the mask (entry i -> bit i); the high half's bits follow the
    // low half's. NOTE(review): assumes the result fits into an int.
    Vc_INTRINSIC Vc_PURE int toInt() const
    {
        return data0.toInt() | (data1.toInt() << data0.size());
    }
private:
    // get/set implement element access for the reference proxy, dispatching to
    // whichever half holds entry i.
    friend reference;
    static Vc_INTRINSIC value_type get(const SimdMaskArray &o, int i) noexcept
    {
        if (i < int(o.data0.size())) {
            return o.data0[i];
        } else {
            return o.data1[i - o.data0.size()];
        }
    }
    template <typename U>
    static Vc_INTRINSIC void set(SimdMaskArray &o, int i, U &&v) noexcept(
        noexcept(std::declval<storage_type0 &>()[0] = std::declval<U>()) &&
        noexcept(std::declval<storage_type1 &>()[0] = std::declval<U>()))
    {
        if (i < int(o.data0.size())) {
            o.data0[i] = std::forward<U>(v);
        } else {
            o.data1[i - o.data0.size()] = std::forward<U>(v);
        }
    }
public:
    // Element access; the non-const variant returns a writable proxy.
    Vc_INTRINSIC Vc_PURE reference operator[](size_t index) noexcept
    {
        return {*this, int(index)};
    }
    Vc_INTRINSIC Vc_PURE value_type operator[](size_t index) const noexcept
    {
        return get(*this, index);
    }
    Vc_INTRINSIC Vc_PURE int count() const { return data0.count() + data1.count(); }
    // Index of the first true entry; offsets into the high half if the low
    // half is all-false.
    Vc_INTRINSIC Vc_PURE int firstOne() const {
        if (data0.isEmpty()) {
            return data1.firstOne() + storage_type0::size();
        }
        return data0.firstOne();
    }
    // Builds a mask whose entry i is gen(i); the high half's generator indices
    // are offset by N0.
    template <typename G>
    static Vc_INTRINSIC fixed_size_simd_mask<T, N> generate(const G &gen)
    {
        return {storage_type0::generate(gen),
                storage_type1::generate([&](std::size_t i) { return gen(i + N0); })};
    }
    // Result entry i is entry i + amount of this mask, or false when
    // i + amount is out of range. The unsigned wrap-around of j also maps
    // negative indices (negative amount) to out-of-range, yielding false.
    inline Vc_PURE fixed_size_simd_mask<T, N> shifted(int amount) const
    {
        if (Vc_IS_UNLIKELY(amount == 0)) {
            return *this;
        }
        return generate([&](unsigned i) {
            const unsigned j = i + amount;
            return j < size() ? get(*this, j) : false;
        });
    }
    // Executes a forwarded operation, recursing into the two halves. The
    // braced initializer guarantees Split::lo runs before Split::hi may move
    // from the arguments.
    template <typename Op, typename... Args>
    static Vc_INTRINSIC fixed_size_simd_mask<T, N> fromOperation(Op op, Args &&... args)
    {
        fixed_size_simd_mask<T, N> r = {
            storage_type0::fromOperation(op, Split::lo(args)...),
            storage_type1::fromOperation(op, Split::hi(std::forward<Args>(args))...)};
        return r;
    }
    // Internal constructor from the two halves.
    Vc_INTRINSIC SimdMaskArray(storage_type0 &&x, storage_type1 &&y)
        : data0(std::move(x)), data1(std::move(y))
    {
    }
private:
    // BoundedAlignment keeps the requested alignment within platform limits.
    alignas(static_cast<std::size_t>(
        Common::BoundedAlignment<Common::NextPowerOfTwo<N>::value * sizeof(V) /
                                 V::size()>::value)) storage_type0 data0;
    storage_type1 data1;
};
// Out-of-class definitions needed when the static constexpr members are
// ODR-used (required before C++17 inline variables).
template <typename T, std::size_t N, typename V, std::size_t M>
constexpr std::size_t SimdMaskArray<T, N, V, M>::Size;
template <typename T, std::size_t N, typename V, std::size_t M>
constexpr std::size_t SimdMaskArray<T, N, V, M>::MemoryAlignment;
- }
- #ifndef VC_COMMON_SIMD_CAST_CALLER_TCC_
- #define VC_COMMON_SIMD_CAST_CALLER_TCC_
- namespace Vc_VERSIONED_NAMESPACE {
// Out-of-line definitions of the converting constructors declared in
// SimdMaskArray<T, N, VectorType, N> above; they are placed here because they
// require simd_cast and the internal_data* accessors.
// From an equal-width mask array whose storage is a single native mask.
template <class T, std::size_t N, class VectorType>
template <class U, class V, class>
Vc_INTRINSIC SimdMaskArray<T, N, VectorType, N>::SimdMaskArray(
    const SimdMaskArray<U, N, V> &x)
    : data(simd_cast<mask_type>(internal_data(x)))
{
}
// From a mask array split into two halves.
template <class T, std::size_t N, class VectorType>
template <class U, class V, class, class>
Vc_INTRINSIC SimdMaskArray<T, N, VectorType, N>::SimdMaskArray(
    const SimdMaskArray<U, N, V> &x)
    : data(simd_cast<mask_type>(internal_data(internal_data0(x)),
                                internal_data(internal_data1(x))))
{
}
// From a mask array split into four quarters.
template <class T, std::size_t N, class VectorType>
template <class U, class V, class, class, class>
Vc_INTRINSIC SimdMaskArray<T, N, VectorType, N>::SimdMaskArray(
    const SimdMaskArray<U, N, V> &x)
    : data(simd_cast<mask_type>(internal_data(internal_data0(internal_data0(x))),
                                internal_data(internal_data1(internal_data0(x))),
                                internal_data(internal_data0(internal_data1(x))),
                                internal_data(internal_data1(internal_data1(x)))))
{
}
// From one piece (segment) of a larger mask; Index selects the piece.
template <class T, std::size_t N, class VectorType>
template <class M, std::size_t Pieces, std::size_t Index>
Vc_INTRINSIC SimdMaskArray<T, N, VectorType, N>::SimdMaskArray(
    Common::Segment<M, Pieces, Index> &&x,
    enable_if<Traits::simd_vector_size<M>::value == Size * Pieces>)
    : data(simd_cast<mask_type, Index>(x.data))
{
}
// From a native mask of matching width.
template <class T, std::size_t N, class VectorType>
template <class M, class>
Vc_INTRINSIC SimdMaskArray<T, N, VectorType, N>::SimdMaskArray(M k)
    : data(simd_cast<mask_type>(k))
{
}
- }
- #endif
- #endif
- #ifndef VC_COMMON_INTERLEAVE_H_
- #define VC_COMMON_INTERLEAVE_H_
- namespace Vc_VERSIONED_NAMESPACE
- {
- template <typename V, typename = enable_if<Traits::is_simd_vector<V>::value>>
- std::pair<V, V> interleave(const V &a, const V &b)
- {
- return {a.interleaveLow(b), a.interleaveHigh(b)};
- }
- }
- #endif
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace Common
- {
// Recursive selection over an ordered candidate list: the first candidate
// whose Size does not exceed N is chosen; the last candidate is the
// unconditional fallback.
template <std::size_t N, class... Candidates> struct select_best_vector_type_impl;
// Base case: a single remaining candidate is always taken.
template <std::size_t N, class T> struct select_best_vector_type_impl<N, T> {
    using type = T;
};
// Recursion: if T is wider than N entries, try the remaining candidates.
template <std::size_t N, class T, class... Candidates>
struct select_best_vector_type_impl<N, T, Candidates...> {
    using type = typename std::conditional<
        (N < T::Size), typename select_best_vector_type_impl<N, Candidates...>::type,
        T>::type;
};
// Chooses the native vector type used to store N entries of T: candidates are
// listed widest first, restricted to the implementations enabled at compile
// time (AVX2 / AVX / SSE), with the scalar implementation as final fallback.
template <class T, std::size_t N>
struct select_best_vector_type : select_best_vector_type_impl<N,
#ifdef Vc_IMPL_AVX2
                                     Vc::AVX2::Vector<T>,
#elif defined Vc_IMPL_AVX
                                     Vc::AVX::Vector<T>,
#endif
#ifdef Vc_IMPL_SSE
                                     Vc::SSE::Vector<T>,
#endif
                                     Vc::Scalar::Vector<T>> {
};
- }
- namespace internal
- {
- template <typename T> T Vc_INTRINSIC Vc_PURE product_helper_(const T &l, const T &r) { return l * r; }
- template <typename T> T Vc_INTRINSIC Vc_PURE sum_helper_(const T &l, const T &r) { return l + r; }
- }
// Forward declarations of the element-wise min/max overloads for SimdArray;
// their definitions follow after the SimdArray class.
template <typename T, std::size_t N, typename V, std::size_t M>
inline fixed_size_simd<T, N> min(const SimdArray<T, N, V, M> &x,
                                 const SimdArray<T, N, V, M> &y);
template <typename T, std::size_t N, typename V, std::size_t M>
inline fixed_size_simd<T, N> max(const SimdArray<T, N, V, M> &x,
                                 const SimdArray<T, N, V, M> &y);
- #define Vc_CURRENT_CLASS_NAME SimdArray
- template <typename T, std::size_t N, typename VectorType_>
- class SimdArray<T, N, VectorType_, N>
- {
- static_assert(std::is_same<T, double>::value || std::is_same<T, float>::value ||
- std::is_same<T, int32_t>::value ||
- std::is_same<T, uint32_t>::value ||
- std::is_same<T, int16_t>::value ||
- std::is_same<T, uint16_t>::value,
- "SimdArray<T, N> may only be used with T = { double, float, int32_t, uint32_t, "
- "int16_t, uint16_t }");
- static_assert(
- std::is_same<VectorType_,
- typename Common::select_best_vector_type<T, N>::type>::value &&
- VectorType_::size() == N,
- "ERROR: leave the third and fourth template parameters with their defaults. They "
- "are implementation details.");
- public:
- static constexpr bool is_atomic = true;
- using VectorType = VectorType_;
- using vector_type = VectorType;
- using storage_type = vector_type;
- using vectorentry_type = typename vector_type::VectorEntryType;
- using value_type = T;
- using mask_type = fixed_size_simd_mask<T, N>;
- using index_type = fixed_size_simd<int, N>;
- static constexpr std::size_t size() { return N; }
- using Mask = mask_type;
- using MaskType = Mask;
- using MaskArgument = const MaskType &;
- using VectorEntryType = vectorentry_type;
- using EntryType = value_type;
- using IndexType = index_type;
- using AsArg = const SimdArray &;
- using reference = Detail::ElementReference<SimdArray>;
- static constexpr std::size_t Size = size();
- static constexpr std::size_t MemoryAlignment = storage_type::MemoryAlignment;
- Vc_INTRINSIC SimdArray() = default;
- Vc_INTRINSIC SimdArray(const SimdArray &) = default;
- Vc_INTRINSIC SimdArray(SimdArray &&) = default;
- Vc_INTRINSIC SimdArray &operator=(const SimdArray &) = default;
- Vc_INTRINSIC SimdArray(const value_type &a) : data(a) {}
- Vc_INTRINSIC SimdArray(value_type &a) : data(a) {}
- Vc_INTRINSIC SimdArray(value_type &&a) : data(a) {}
- template <
- typename U,
- typename = enable_if<std::is_same<U, int>::value && !std::is_same<int, value_type>::value>>
- Vc_INTRINSIC SimdArray(U a)
- : SimdArray(static_cast<value_type>(a))
- {
- }
- template <class U, class V, class = enable_if<N == V::Size>>
- Vc_INTRINSIC SimdArray(const SimdArray<U, N, V> &x)
- : data(simd_cast<vector_type>(internal_data(x)))
- {
- }
- template <class U, class V, class = enable_if<(N > V::Size && N <= 2 * V::Size)>,
- class = U>
- Vc_INTRINSIC SimdArray(const SimdArray<U, N, V> &x)
- : data(simd_cast<vector_type>(internal_data(internal_data0(x)),
- internal_data(internal_data1(x))))
- {
- }
- template <class U, class V, class = enable_if<(N > 2 * V::Size && N <= 4 * V::Size)>,
- class = U, class = U>
- Vc_INTRINSIC SimdArray(const SimdArray<U, N, V> &x)
- : data(simd_cast<vector_type>(internal_data(internal_data0(internal_data0(x))),
- internal_data(internal_data1(internal_data0(x))),
- internal_data(internal_data0(internal_data1(x))),
- internal_data(internal_data1(internal_data1(x)))))
- {
- }
- template <typename V, std::size_t Pieces, std::size_t Index>
- Vc_INTRINSIC SimdArray(Common::Segment<V, Pieces, Index> &&x)
- : data(simd_cast<vector_type, Index>(x.data))
- {
- }
- Vc_INTRINSIC SimdArray(const std::initializer_list<value_type> &init)
- : data(init.begin(), Vc::Unaligned)
- {
- Vc_ASSERT(init.size() == size());
- }
- template <
- typename V,
- typename = enable_if<Traits::is_simd_vector<V>::value && !Traits::isSimdArray<V>::value>>
- Vc_INTRINSIC SimdArray(const V &x)
- : data(simd_cast<vector_type>(x))
- {
- }
- template <typename U, typename A,
- typename =
- enable_if<std::is_convertible<T, U>::value && Vector<U, A>::Size == N &&
- !std::is_same<A, simd_abi::fixed_size<N>>::value>>
- Vc_INTRINSIC operator Vector<U, A>() const
- {
- return simd_cast<Vector<U, A>>(data);
- }
- operator fixed_size_simd<T, N> &()
- {
- return static_cast<fixed_size_simd<T, N> &>(*this);
- }
- operator const fixed_size_simd<T, N> &() const
- {
- return static_cast<const fixed_size_simd<T, N> &>(*this);
- }
- #ifndef Vc_CURRENT_CLASS_NAME
- #error "incorrect use of common/gatherinterface.h: Vc_CURRENT_CLASS_NAME must be defined to the current class name for declaring constructors."
- #endif
- private:
- template <class MT, class IT, int Scale = 1>
- inline void gatherImplementation(const Common::GatherArguments<MT, IT, Scale> &);
- template <class MT, class IT, int Scale = 1>
- inline void gatherImplementation(const Common::GatherArguments<MT, IT, Scale> &,
- MaskArgument mask);
- public:
- #define Vc_ASSERT_GATHER_PARAMETER_TYPES_ \
- static_assert( \
- std::is_convertible<MT, EntryType>::value, \
- "The memory pointer needs to point to a type that can be converted to the " \
- "EntryType of this SIMD vector type."); \
- static_assert( \
- Vc::Traits::has_subscript_operator<IT>::value, \
- "The indexes argument must be a type that implements the subscript operator."); \
- static_assert( \
- !Traits::is_simd_vector<IT>::value || \
- Traits::simd_vector_size<IT>::value >= Size, \
- "If you use a SIMD vector for the indexes parameter, the index vector must " \
- "have at least as many entries as this SIMD vector."); \
- static_assert( \
- !std::is_array<T>::value || \
- (std::rank<T>::value == 1 && \
- (std::extent<T>::value == 0 || std::extent<T>::value >= Size)), \
- "If you use a simple array for the indexes parameter, the array must have " \
- "at least as many entries as this SIMD vector.")
- template <typename MT, typename IT,
- typename = enable_if<Traits::has_subscript_operator<IT>::value>>
- Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes)
- {
- Vc_ASSERT_GATHER_PARAMETER_TYPES_;
- gatherImplementation(
- Common::make_gather<1>(mem, Common::convertIndexVector(indexes)));
- }
- template <class MT, class IT, int Scale>
- Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments<MT, IT, Scale> &args)
- {
- Vc_ASSERT_GATHER_PARAMETER_TYPES_;
- gatherImplementation(args);
- }
- template <typename MT, typename IT,
- typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
- Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes,
- MaskArgument mask)
- {
- Vc_ASSERT_GATHER_PARAMETER_TYPES_;
- gatherImplementation(
- Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask);
- }
- template <class MT, class IT, int Scale>
- Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments<MT, IT, Scale> &args,
- MaskArgument mask)
- {
- Vc_ASSERT_GATHER_PARAMETER_TYPES_;
- gatherImplementation(args, mask);
- }
- template <typename MT, typename IT,
- typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
- Vc_INTRINSIC void gather(const MT *mem, const IT &indexes)
- {
- Vc_ASSERT_GATHER_PARAMETER_TYPES_;
- gatherImplementation(
- Common::make_gather<1>(mem, Common::convertIndexVector(indexes)));
- }
- template <typename MT, typename IT,
- typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
- Vc_INTRINSIC void gather(const MT *mem, const IT &indexes, MaskArgument mask)
- {
- Vc_ASSERT_GATHER_PARAMETER_TYPES_;
- gatherImplementation(
- Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask);
- }
- template <class MT, class IT, int Scale>
- Vc_INTRINSIC void gather(const Common::GatherArguments<MT, IT, Scale> &args)
- {
- Vc_ASSERT_GATHER_PARAMETER_TYPES_;
- gatherImplementation(args);
- }
- template <class MT, class IT, int Scale>
- Vc_INTRINSIC void gather(const Common::GatherArguments<MT, IT, Scale> &args,
- MaskArgument mask)
- {
- Vc_ASSERT_GATHER_PARAMETER_TYPES_;
- gatherImplementation(args, mask);
- }
- #undef Vc_ASSERT_GATHER_PARAMETER_TYPES_
- private:
- template <typename MT, typename IT>
- inline void scatterImplementation(MT *mem, IT &&indexes) const;
- template <typename MT, typename IT>
- inline void scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const;
- public:
- #define Vc_ASSERT_SCATTER_PARAMETER_TYPES_ \
- static_assert( \
- std::is_convertible<EntryType, MT>::value, \
- "The memory pointer needs to point to a type that the EntryType of this " \
- "SIMD vector type can be converted to."); \
- static_assert( \
- Vc::Traits::has_subscript_operator<IT>::value, \
- "The indexes argument must be a type that implements the subscript operator."); \
- static_assert( \
- !Traits::is_simd_vector<IT>::value || \
- Traits::simd_vector_size<IT>::value >= Size, \
- "If you use a SIMD vector for the indexes parameter, the index vector must " \
- "have at least as many entries as this SIMD vector."); \
- static_assert( \
- !std::is_array<T>::value || \
- (std::rank<T>::value == 1 && \
- (std::extent<T>::value == 0 || std::extent<T>::value >= Size)), \
- "If you use a simple array for the indexes parameter, the array must have " \
- "at least as many entries as this SIMD vector.")
- template <typename MT,
- typename IT,
- typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
- Vc_INTRINSIC void scatter(MT *mem, IT &&indexes) const
- {
- Vc_ASSERT_SCATTER_PARAMETER_TYPES_;
- scatterImplementation(mem, std::forward<IT>(indexes));
- }
- template <typename MT,
- typename IT,
- typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
- Vc_INTRINSIC void scatter(MT *mem, IT &&indexes, MaskArgument mask) const
- {
- Vc_ASSERT_SCATTER_PARAMETER_TYPES_;
- scatterImplementation(mem, std::forward<IT>(indexes), mask);
- }
- template <typename MT, typename IT>
- Vc_INTRINSIC void scatter(const Common::ScatterArguments<MT, IT> &args) const
- {
- scatter(args.address, args.indexes);
- }
- template <typename MT, typename IT>
- Vc_INTRINSIC void scatter(const Common::ScatterArguments<MT, IT> &args, MaskArgument mask) const
- {
- scatter(args.address, args.indexes, mask);
- }
- #undef Vc_ASSERT_SCATTER_PARAMETER_TYPES_
- explicit Vc_INTRINSIC SimdArray(VectorSpecialInitializerZero) : data() {}
- explicit Vc_INTRINSIC SimdArray(VectorSpecialInitializerOne o) : data(o) {}
- explicit Vc_INTRINSIC SimdArray(VectorSpecialInitializerIndexesFromZero i) : data(i)
- {
- }
- template <std::size_t Offset>
- explicit Vc_INTRINSIC SimdArray(
- Common::AddOffset<VectorSpecialInitializerIndexesFromZero, Offset>)
- : data(Vc::IndexesFromZero)
- {
- data += value_type(Offset);
- }
- Vc_INTRINSIC void setZero() { data.setZero(); }
- Vc_INTRINSIC void setZero(mask_type k) { data.setZero(internal_data(k)); }
- Vc_INTRINSIC void setZeroInverted() { data.setZeroInverted(); }
- Vc_INTRINSIC void setZeroInverted(mask_type k) { data.setZeroInverted(internal_data(k)); }
- Vc_INTRINSIC void setQnan() { data.setQnan(); }
- Vc_INTRINSIC void setQnan(mask_type m) { data.setQnan(internal_data(m)); }
- template <typename Op, typename... Args>
- static Vc_INTRINSIC fixed_size_simd<T, N> fromOperation(Op op, Args &&... args)
- {
- fixed_size_simd<T, N> r;
- Common::unpackArgumentsAuto(op, r.data, std::forward<Args>(args)...);
- return r;
- }
- template <typename Op, typename... Args>
- static Vc_INTRINSIC void callOperation(Op op, Args &&... args)
- {
- Common::unpackArgumentsAuto(op, nullptr, std::forward<Args>(args)...);
- }
- static Vc_INTRINSIC fixed_size_simd<T, N> Zero()
- {
- return SimdArray(Vc::Zero);
- }
- static Vc_INTRINSIC fixed_size_simd<T, N> One()
- {
- return SimdArray(Vc::One);
- }
- static Vc_INTRINSIC fixed_size_simd<T, N> IndexesFromZero()
- {
- return SimdArray(Vc::IndexesFromZero);
- }
- static Vc_INTRINSIC fixed_size_simd<T, N> Random()
- {
- return fromOperation(Common::Operations::random());
- }
- template <class U, class Flags = DefaultLoadTag,
- class = enable_if<std::is_arithmetic<U>::value &&
- Traits::is_load_store_flag<Flags>::value>>
- explicit Vc_INTRINSIC SimdArray(const U *mem, Flags f = Flags()) : data(mem, f)
- {
- }
- template <typename... Args> Vc_INTRINSIC void load(Args &&... args)
- {
- data.load(std::forward<Args>(args)...);
- }
- template <typename... Args> Vc_INTRINSIC void store(Args &&... args) const
- {
- data.store(std::forward<Args>(args)...);
- }
- Vc_INTRINSIC mask_type operator!() const
- {
- return {private_init, !data};
- }
- Vc_INTRINSIC fixed_size_simd<T, N> operator-() const
- {
- return {private_init, -data};
- }
- Vc_INTRINSIC fixed_size_simd<T, N> operator+() const { return *this; }
- Vc_INTRINSIC fixed_size_simd<T, N> operator~() const
- {
- return {private_init, ~data};
- }
    // Shift operators; enabled only for integral T shifted by an integral count.
    template <typename U,
              typename = enable_if<std::is_integral<T>::value && std::is_integral<U>::value>>
    Vc_INTRINSIC Vc_CONST fixed_size_simd<T, N> operator<<(U x) const
    {
        return {private_init, data << x};
    }
    template <typename U,
              typename = enable_if<std::is_integral<T>::value && std::is_integral<U>::value>>
    Vc_INTRINSIC fixed_size_simd<T, N> &operator<<=(U x)
    {
        data <<= x;
        return *this;
    }
    template <typename U,
              typename = enable_if<std::is_integral<T>::value && std::is_integral<U>::value>>
    Vc_INTRINSIC Vc_CONST fixed_size_simd<T, N> operator>>(U x) const
    {
        return {private_init, data >> x};
    }
    template <typename U,
              typename = enable_if<std::is_integral<T>::value && std::is_integral<U>::value>>
    Vc_INTRINSIC fixed_size_simd<T, N> &operator>>=(U x)
    {
        data >>= x;
        return *this;
    }
// Generates the compound-assignment operators (+=, -=, ..., &=, <<=, ...) by
// forwarding to the wrapped vector. "op## =" token-pastes op with '='.
#define Vc_BINARY_OPERATOR_(op)                                                          \
    Vc_INTRINSIC fixed_size_simd<T, N> &operator op##=(const SimdArray &rhs)             \
    {                                                                                    \
        data op## = rhs.data;                                                            \
        return *this;                                                                    \
    }
    Vc_ALL_ARITHMETICS(Vc_BINARY_OPERATOR_);
    Vc_ALL_BINARY(Vc_BINARY_OPERATOR_);
    Vc_ALL_SHIFTS(Vc_BINARY_OPERATOR_);
#undef Vc_BINARY_OPERATOR_
    // Deprecated: kept for backwards compatibility; prefer the free function.
    Vc_DEPRECATED("use isnegative(x) instead") Vc_INTRINSIC MaskType isNegative() const
    {
        return {private_init, isnegative(data)};
    }
private:
    friend reference;
    // Element accessors backing Detail::ElementReference: read lane i ...
    Vc_INTRINSIC static value_type get(const SimdArray &o, int i) noexcept
    {
        return o.data[i];
    }
    // ... and write lane i; noexcept iff assigning U to value_type cannot throw.
    template <typename U>
    Vc_INTRINSIC static void set(SimdArray &o, int i, U &&v) noexcept(
        noexcept(std::declval<value_type &>() = v))
    {
        o.data[i] = v;
    }

public:
    // Mutable subscript returns a proxy reference (writes go through set()).
    Vc_INTRINSIC reference operator[](size_t i) noexcept
    {
        static_assert(noexcept(reference{std::declval<SimdArray &>(), int()}), "");
        return {*this, int(i)};
    }
    // Const subscript returns the lane by value.
    Vc_INTRINSIC value_type operator[](size_t i) const noexcept
    {
        return get(*this, int(i));
    }
    // Write-masked view: assignments through the result only touch lanes where k
    // is true.
    Vc_INTRINSIC Common::WriteMaskedVector<SimdArray, mask_type> operator()(const mask_type &k)
    {
        return {*this, k};
    }
    // Masked assignment: copy the lanes of v selected by k.
    Vc_INTRINSIC void assign(const SimdArray &v, const mask_type &k)
    {
        data.assign(v.data, internal_data(k));
    }
// Horizontal reductions (min/max/product/sum), plain and masked, forwarded to
// the wrapped vector.
#define Vc_REDUCTION_FUNCTION_(name_)                                                    \
    Vc_INTRINSIC Vc_PURE value_type name_() const { return data.name_(); }               \
    Vc_INTRINSIC Vc_PURE value_type name_(mask_type mask) const                          \
    {                                                                                    \
        return data.name_(internal_data(mask));                                          \
    }                                                                                    \
    Vc_NOTHING_EXPECTING_SEMICOLON
    Vc_REDUCTION_FUNCTION_(min);
    Vc_REDUCTION_FUNCTION_(max);
    Vc_REDUCTION_FUNCTION_(product);
    Vc_REDUCTION_FUNCTION_(sum);
#undef Vc_REDUCTION_FUNCTION_
    // Running (prefix) sum over the lanes.
    Vc_INTRINSIC Vc_PURE fixed_size_simd<T, N> partialSum() const
    {
        return {private_init, data.partialSum()};
    }
    // Apply callable f to every lane (optionally only where k is set).
    template <typename F> Vc_INTRINSIC fixed_size_simd<T, N> apply(F &&f) const
    {
        return {private_init, data.apply(std::forward<F>(f))};
    }
    template <typename F> Vc_INTRINSIC fixed_size_simd<T, N> apply(F &&f, const mask_type &k) const
    {
        return {private_init, data.apply(std::forward<F>(f), k)};
    }
    // Lane shift/rotate operations, forwarded to the wrapped vector; the two-
    // argument shifted() fills vacated lanes from shiftIn instead of zeros.
    Vc_INTRINSIC fixed_size_simd<T, N> shifted(int amount) const
    {
        return {private_init, data.shifted(amount)};
    }
    template <std::size_t NN>
    Vc_INTRINSIC fixed_size_simd<T, N> shifted(int amount, const SimdArray<value_type, NN> &shiftIn)
        const
    {
        return {private_init, data.shifted(amount, simd_cast<VectorType>(shiftIn))};
    }
    Vc_INTRINSIC fixed_size_simd<T, N> rotated(int amount) const
    {
        return {private_init, data.rotated(amount)};
    }
    // Deprecated: kept for backwards compatibility; prefer the free function.
    Vc_DEPRECATED("use exponent(x) instead") Vc_INTRINSIC fixed_size_simd<T, N> exponent() const
    {
        return {private_init, exponent(data)};
    }
    // Permutation operations, each forwarded to the wrapped vector.
    Vc_INTRINSIC fixed_size_simd<T, N> interleaveLow(SimdArray x) const
    {
        return {private_init, data.interleaveLow(x.data)};
    }
    Vc_INTRINSIC fixed_size_simd<T, N> interleaveHigh(SimdArray x) const
    {
        return {private_init, data.interleaveHigh(x.data)};
    }
    Vc_INTRINSIC fixed_size_simd<T, N> reversed() const
    {
        return {private_init, data.reversed()};
    }
    Vc_INTRINSIC fixed_size_simd<T, N> sorted() const
    {
        return {private_init, data.sorted()};
    }
    // Generator constructor: lane i is initialized with gen(i). SFINAE requires G
    // to be callable with std::size_t and to not itself be a SIMD vector.
    template <class G, class = decltype(std::declval<G>()(std::size_t())),
              class = enable_if<!Traits::is_simd_vector<G>::value>>
    Vc_INTRINSIC SimdArray(const G &gen) : data(gen)
    {
    }
    template <typename G> static Vc_INTRINSIC fixed_size_simd<T, N> generate(const G &gen)
    {
        return {private_init, VectorType::generate(gen)};
    }
    // Deprecated: kept for backwards compatibility; prefer the free function.
    Vc_DEPRECATED("use copysign(x, y) instead")
    Vc_INTRINSIC fixed_size_simd<T, N> copySign(const SimdArray &x) const
    {
        return {private_init, Vc::copysign(data, x.data)};
    }
    // The free internal_data() accessors need access to the private member.
    friend VectorType &internal_data<>(SimdArray &x);
    friend const VectorType &internal_data<>(const SimdArray &x);
    // Trusted constructor used internally to wrap an already-computed vector.
    Vc_INTRINSIC SimdArray(private_init_t, VectorType &&x) : data(std::move(x)) {}
    Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(storage_type));

private:
    // The single wrapped SIMD vector, over-aligned so arrays of this type tile
    // like one contiguous fixed-size vector.
    alignas(static_cast<std::size_t>(
        Common::BoundedAlignment<Common::NextPowerOfTwo<N>::value * sizeof(VectorType_) /
                                 VectorType_::size()>::value)) storage_type data;
- };
// Out-of-class definitions of the static constexpr members (required for ODR-use
// before C++17 inline variables).
template <typename T, std::size_t N, typename VectorType> constexpr std::size_t SimdArray<T, N, VectorType, N>::Size;
template <typename T, std::size_t N, typename VectorType>
constexpr std::size_t SimdArray<T, N, VectorType, N>::MemoryAlignment;
// Accessor for the private vector member of the single-vector SimdArray.
// NOTE(review): Vc_INTRINSIC is skipped on MSVC (see the #ifndef); the reason is
// not visible here - presumably the attribute is rejected in this position.
template <typename T, std::size_t N, typename VectorType>
#ifndef Vc_MSVC
Vc_INTRINSIC
#endif
VectorType &internal_data(SimdArray<T, N, VectorType, N> &x)
{
    return x.data;
}
// Const overload of the accessor above.
template <typename T, std::size_t N, typename VectorType>
#ifndef Vc_MSVC
Vc_INTRINSIC
#endif
const VectorType &internal_data(const SimdArray<T, N, VectorType, N> &x)
{
    return x.data;
}
- template <class T> Vc_INTRINSIC T unwrap(const T &x) { return x; }
- template <class T, size_t N, class V>
- Vc_INTRINSIC V unwrap(const SimdArray<T, N, V, N> &x)
- {
- return internal_data(x);
- }
- template <class T, size_t Pieces, size_t Index>
- Vc_INTRINSIC auto unwrap(const Common::Segment<T, Pieces, Index> &x)
- -> decltype(x.to_fixed_size())
- {
- return unwrap(x.to_fixed_size());
- }
// Gather: load lanes from args.address at the (Scale-multiplied) positions in
// args.indexes; index vectors that are SimdArrays/Segments are unwrapped first.
template <typename T, std::size_t N, typename VectorType>
template <class MT, class IT, int Scale>
Vc_INTRINSIC void SimdArray<T, N, VectorType, N>::gatherImplementation(
    const Common::GatherArguments<MT, IT, Scale> &args)
{
    data.gather(Common::make_gather<Scale>(args.address, unwrap(args.indexes)));
}
// Masked gather: only lanes with mask set are loaded.
template <typename T, std::size_t N, typename VectorType>
template <class MT, class IT, int Scale>
Vc_INTRINSIC void SimdArray<T, N, VectorType, N>::gatherImplementation(
    const Common::GatherArguments<MT, IT, Scale> &args, MaskArgument mask)
{
    data.gather(Common::make_gather<Scale>(args.address, unwrap(args.indexes)),
                mask);
}
// Scatter: store the lanes to mem at the positions given by indexes.
template <typename T, std::size_t N, typename VectorType>
template <typename MT, typename IT>
inline void SimdArray<T, N, VectorType, N>::scatterImplementation(MT *mem,
                                                                  IT &&indexes) const
{
    data.scatter(mem, unwrap(std::forward<IT>(indexes)));
}
// Masked scatter: only lanes with mask set are stored.
template <typename T, std::size_t N, typename VectorType>
template <typename MT, typename IT>
inline void SimdArray<T, N, VectorType, N>::scatterImplementation(MT *mem,
                                                                  IT &&indexes,
                                                                  MaskArgument mask) const
{
    data.scatter(mem, unwrap(std::forward<IT>(indexes)), mask);
}
// Generic SimdArray: implemented as the concatenation of two smaller storage
// vectors (data0 holding the first N0 lanes, data1 the remaining N1), with the
// split chosen by SimdArrayTraits.
template <typename T, size_t N, typename V, size_t Wt> class SimdArray
{
    static_assert(std::is_same<T, double>::value ||
                  std::is_same<T, float>::value ||
                  std::is_same<T, int32_t>::value ||
                  std::is_same<T, uint32_t>::value ||
                  std::is_same<T, int16_t>::value ||
                  std::is_same<T, uint16_t>::value, "SimdArray<T, N> may only be used with T = { double, float, int32_t, uint32_t, int16_t, uint16_t }");
    static_assert(
        std::is_same<V, typename Common::select_best_vector_type<T, N>::type>::value &&
            V::size() == Wt,
        "ERROR: leave the third and fourth template parameters with their defaults. They "
        "are implementation details.");
    static_assert(
        std::is_same<typename V::EntryType, typename V::VectorEntryType>::value ||
            (N % V::size() == 0),
        "SimdArray<(un)signed short, N> on MIC only works correctly for N = k * "
        "MIC::(u)short_v::size(), i.e. k * 16.");
    // N is split into N0 + N1; Split provides lo/hi halves of masks, index
    // vectors and argument packs at lane N0.
    using my_traits = SimdArrayTraits<T, N>;
    static constexpr std::size_t N0 = my_traits::N0;
    static constexpr std::size_t N1 = my_traits::N1;
    using Split = Common::Split<N0>;
    template <typename U, std::size_t K> using CArray = U[K];

public:
    static constexpr bool is_atomic = false;
    using storage_type0 = typename my_traits::storage_type0;
    using storage_type1 = typename my_traits::storage_type1;
    static_assert(storage_type0::size() == N0, "");
    using vector_type = V;
    using vectorentry_type = typename storage_type0::vectorentry_type;
    // May-alias typedef used for scalar element access via reinterpret_cast.
    typedef vectorentry_type alias_type Vc_MAY_ALIAS;
    using value_type = T;
    using mask_type = fixed_size_simd_mask<T, N>;
    using index_type = fixed_size_simd<int, N>;
    static constexpr std::size_t size() { return N; }
    using Mask = mask_type;
    using MaskType = Mask;
    using MaskArgument = const MaskType &;
    using VectorEntryType = vectorentry_type;
    using EntryType = value_type;
    using IndexType = index_type;
    using AsArg = const SimdArray &;
    using reference = Detail::ElementReference<SimdArray>;
    // The stricter of the two halves' load/store alignment requirements.
    static constexpr std::size_t MemoryAlignment =
        storage_type0::MemoryAlignment > storage_type1::MemoryAlignment
            ? storage_type0::MemoryAlignment
            : storage_type1::MemoryAlignment;
    // Named constructors forwarding to the special-initializer constructors.
    static Vc_INTRINSIC fixed_size_simd<T, N> Zero()
    {
        return SimdArray(Vc::Zero);
    }
    static Vc_INTRINSIC fixed_size_simd<T, N> One()
    {
        return SimdArray(Vc::One);
    }
    static Vc_INTRINSIC fixed_size_simd<T, N> IndexesFromZero()
    {
        return SimdArray(Vc::IndexesFromZero);
    }
    static Vc_INTRINSIC fixed_size_simd<T, N> Random()
    {
        return fromOperation(Common::Operations::random());
    }
    // Generator constructor: lane i gets gen(i); the second half offsets the
    // index by the width of the first half.
    template <class G, class = decltype(std::declval<G>()(std::size_t())),
              class = enable_if<!Traits::is_simd_vector<G>::value>>
    Vc_INTRINSIC SimdArray(const G &gen)
        : data0(gen), data1([&](std::size_t i) { return gen(i + storage_type0::size()); })
    {
    }
    template <typename G> static Vc_INTRINSIC fixed_size_simd<T, N> generate(const G &gen)
    {
        // NOTE(review): tmp is a separate statement, presumably to pin the order
        // in which gen is invoked for the two halves - confirm before changing.
        auto tmp = storage_type0::generate(gen);
        return {std::move(tmp),
                storage_type1::generate([&](std::size_t i) { return gen(i + N0); })};
    }
    SimdArray() = default;
    // Broadcast constructor: every lane is set to a.
    Vc_INTRINSIC SimdArray(value_type a) : data0(a), data1(a) {}
    // Allow int literals to broadcast into non-int SimdArrays.
    template <
        typename U,
        typename = enable_if<std::is_same<U, int>::value && !std::is_same<int, value_type>::value>>
    SimdArray(U a)
        : SimdArray(static_cast<value_type>(a))
    {
    }
    SimdArray(const SimdArray &) = default;
    SimdArray(SimdArray &&) = default;
    SimdArray &operator=(const SimdArray &) = default;
    // Load constructor: first half reads mem, second half reads mem + N0.
    template <typename U, typename Flags = DefaultLoadTag,
              typename = enable_if<std::is_arithmetic<U>::value &&
                                   Traits::is_load_store_flag<Flags>::value>>
    explicit Vc_INTRINSIC SimdArray(const U *mem, Flags f = Flags())
        : data0(mem, f), data1(mem + storage_type0::size(), f)
    {
    }
#ifndef Vc_MSVC
    // Load from a C array; disabled on MSVC (reason not visible here).
    template <typename U, std::size_t Extent, typename Flags = DefaultLoadTag,
              typename = enable_if<std::is_arithmetic<U>::value &&
                                   Traits::is_load_store_flag<Flags>::value>>
    explicit Vc_INTRINSIC SimdArray(CArray<U, Extent> &mem, Flags f = Flags())
        : data0(&mem[0], f), data1(&mem[storage_type0::size()], f)
    {
    }
    template <typename U, std::size_t Extent, typename Flags = DefaultLoadTag,
              typename = enable_if<std::is_arithmetic<U>::value &&
                                   Traits::is_load_store_flag<Flags>::value>>
    explicit Vc_INTRINSIC SimdArray(const CArray<U, Extent> &mem, Flags f = Flags())
        : data0(&mem[0], f), data1(&mem[storage_type0::size()], f)
    {
    }
#endif
    // Initializer-list constructor; the list must contain exactly N values.
    Vc_INTRINSIC SimdArray(const std::initializer_list<value_type> &init)
        : data0(init.begin(), Vc::Unaligned)
        , data1(init.begin() + storage_type0::size(), Vc::Unaligned)
    {
        Vc_ASSERT(init.size() == size());
    }
// This region is textually shared with common/gatherinterface.h and relies on
// Vc_CURRENT_CLASS_NAME naming the enclosing class for the gather constructors.
#ifndef Vc_CURRENT_CLASS_NAME
#error "incorrect use of common/gatherinterface.h: Vc_CURRENT_CLASS_NAME must be defined to the current class name for declaring constructors."
#endif

private:
    // Backends for all gather overloads below; Scale multiplies the index values.
    template <class MT, class IT, int Scale = 1>
    inline void gatherImplementation(const Common::GatherArguments<MT, IT, Scale> &);
    template <class MT, class IT, int Scale = 1>
    inline void gatherImplementation(const Common::GatherArguments<MT, IT, Scale> &,
                                     MaskArgument mask);

public:
// Shared compile-time sanity checks for the gather overloads: the memory element
// must convert to EntryType and the index argument must be subscriptable and
// wide enough.
#define Vc_ASSERT_GATHER_PARAMETER_TYPES_                                                \
    static_assert(                                                                       \
        std::is_convertible<MT, EntryType>::value,                                       \
        "The memory pointer needs to point to a type that can be converted to the "      \
        "EntryType of this SIMD vector type.");                                          \
    static_assert(                                                                       \
        Vc::Traits::has_subscript_operator<IT>::value,                                   \
        "The indexes argument must be a type that implements the subscript operator."); \
    static_assert(                                                                       \
        !Traits::is_simd_vector<IT>::value ||                                            \
            Traits::simd_vector_size<IT>::value >= Size,                                 \
        "If you use a SIMD vector for the indexes parameter, the index vector must "     \
        "have at least as many entries as this SIMD vector.");                           \
    static_assert(                                                                       \
        !std::is_array<T>::value ||                                                      \
            (std::rank<T>::value == 1 &&                                                 \
             (std::extent<T>::value == 0 || std::extent<T>::value >= Size)),             \
        "If you use a simple array for the indexes parameter, the array must have "      \
        "at least as many entries as this SIMD vector.")
    // Gathering constructors, plain and masked.
    template <typename MT, typename IT,
              typename = enable_if<Traits::has_subscript_operator<IT>::value>>
    Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes)
    {
        Vc_ASSERT_GATHER_PARAMETER_TYPES_;
        gatherImplementation(
            Common::make_gather<1>(mem, Common::convertIndexVector(indexes)));
    }
    template <class MT, class IT, int Scale>
    Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments<MT, IT, Scale> &args)
    {
        Vc_ASSERT_GATHER_PARAMETER_TYPES_;
        gatherImplementation(args);
    }
    template <typename MT, typename IT,
              typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
    Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes,
                                       MaskArgument mask)
    {
        Vc_ASSERT_GATHER_PARAMETER_TYPES_;
        gatherImplementation(
            Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask);
    }
    template <class MT, class IT, int Scale>
    Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments<MT, IT, Scale> &args,
                                       MaskArgument mask)
    {
        Vc_ASSERT_GATHER_PARAMETER_TYPES_;
        gatherImplementation(args, mask);
    }
    // gather() member functions, mirroring the constructors above.
    template <typename MT, typename IT,
              typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
    Vc_INTRINSIC void gather(const MT *mem, const IT &indexes)
    {
        Vc_ASSERT_GATHER_PARAMETER_TYPES_;
        gatherImplementation(
            Common::make_gather<1>(mem, Common::convertIndexVector(indexes)));
    }
    template <typename MT, typename IT,
              typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
    Vc_INTRINSIC void gather(const MT *mem, const IT &indexes, MaskArgument mask)
    {
        Vc_ASSERT_GATHER_PARAMETER_TYPES_;
        gatherImplementation(
            Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask);
    }
    template <class MT, class IT, int Scale>
    Vc_INTRINSIC void gather(const Common::GatherArguments<MT, IT, Scale> &args)
    {
        Vc_ASSERT_GATHER_PARAMETER_TYPES_;
        gatherImplementation(args);
    }
    template <class MT, class IT, int Scale>
    Vc_INTRINSIC void gather(const Common::GatherArguments<MT, IT, Scale> &args,
                             MaskArgument mask)
    {
        Vc_ASSERT_GATHER_PARAMETER_TYPES_;
        gatherImplementation(args, mask);
    }
#undef Vc_ASSERT_GATHER_PARAMETER_TYPES_
private:
    // Backends for the scatter overloads below.
    template <typename MT, typename IT>
    inline void scatterImplementation(MT *mem, IT &&indexes) const;
    template <typename MT, typename IT>
    inline void scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const;

public:
// Shared compile-time sanity checks for the scatter overloads (mirror image of
// the gather checks: EntryType must convert to the memory element type).
#define Vc_ASSERT_SCATTER_PARAMETER_TYPES_                                               \
    static_assert(                                                                       \
        std::is_convertible<EntryType, MT>::value,                                       \
        "The memory pointer needs to point to a type that the EntryType of this "        \
        "SIMD vector type can be converted to.");                                        \
    static_assert(                                                                       \
        Vc::Traits::has_subscript_operator<IT>::value,                                   \
        "The indexes argument must be a type that implements the subscript operator."); \
    static_assert(                                                                       \
        !Traits::is_simd_vector<IT>::value ||                                            \
            Traits::simd_vector_size<IT>::value >= Size,                                 \
        "If you use a SIMD vector for the indexes parameter, the index vector must "     \
        "have at least as many entries as this SIMD vector.");                           \
    static_assert(                                                                       \
        !std::is_array<T>::value ||                                                      \
            (std::rank<T>::value == 1 &&                                                 \
             (std::extent<T>::value == 0 || std::extent<T>::value >= Size)),             \
        "If you use a simple array for the indexes parameter, the array must have "      \
        "at least as many entries as this SIMD vector.")
    // Plain and masked scatter to mem at the given indexes.
    template <typename MT,
              typename IT,
              typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
    Vc_INTRINSIC void scatter(MT *mem, IT &&indexes) const
    {
        Vc_ASSERT_SCATTER_PARAMETER_TYPES_;
        scatterImplementation(mem, std::forward<IT>(indexes));
    }
    template <typename MT,
              typename IT,
              typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
    Vc_INTRINSIC void scatter(MT *mem, IT &&indexes, MaskArgument mask) const
    {
        Vc_ASSERT_SCATTER_PARAMETER_TYPES_;
        scatterImplementation(mem, std::forward<IT>(indexes), mask);
    }
    // Overloads taking the pre-packaged subscript-expression argument object.
    template <typename MT, typename IT>
    Vc_INTRINSIC void scatter(const Common::ScatterArguments<MT, IT> &args) const
    {
        scatter(args.address, args.indexes);
    }
    template <typename MT, typename IT>
    Vc_INTRINSIC void scatter(const Common::ScatterArguments<MT, IT> &args, MaskArgument mask) const
    {
        scatter(args.address, args.indexes, mask);
    }
#undef Vc_ASSERT_SCATTER_PARAMETER_TYPES_
    // Special-initializer constructors; IndexesFromZero continues counting in
    // the second half at offset N0 (via Common::AddOffset).
    explicit Vc_INTRINSIC SimdArray(VectorSpecialInitializerZero) : data0(), data1() {}
    explicit Vc_INTRINSIC SimdArray(VectorSpecialInitializerOne o) : data0(o), data1(o) {}
    explicit Vc_INTRINSIC SimdArray(VectorSpecialInitializerIndexesFromZero i)
        : data0(i)
        , data1(Common::AddOffset<VectorSpecialInitializerIndexesFromZero,
                                  storage_type0::size()>())
    {
    }
    template <size_t Offset>
    explicit Vc_INTRINSIC SimdArray(
        Common::AddOffset<VectorSpecialInitializerIndexesFromZero, Offset> i)
        : data0(i)
        , data1(Common::AddOffset<VectorSpecialInitializerIndexesFromZero,
                                  storage_type0::size() + Offset>())
    {
    }
    // Explicit conversion from any same-size SIMD vector that is not already an
    // implicitly convertible SimdArray; each half takes its part via Split.
    template <class W, class = enable_if<
                           (Traits::is_simd_vector<W>::value &&
                            Traits::simd_vector_size<W>::value == N &&
                            !(std::is_convertible<Traits::entry_type_of<W>, T>::value &&
                              Traits::isSimdArray<W>::value))>>
    Vc_INTRINSIC explicit SimdArray(W &&x) : data0(Split::lo(x)), data1(Split::hi(x))
    {
    }
    // Implicit conversion from a same-size SimdArray with convertible entry type.
    template <class W, class = enable_if<
                           (Traits::isSimdArray<W>::value &&
                            Traits::simd_vector_size<W>::value == N &&
                            std::is_convertible<Traits::entry_type_of<W>, T>::value)>,
              class = W>
    Vc_INTRINSIC SimdArray(W &&x) : data0(Split::lo(x)), data1(Split::hi(x))
    {
    }
    // Construction from a Segment: each half takes the matching sub-segment.
    template <class W, std::size_t Pieces, std::size_t Index>
    Vc_INTRINSIC SimdArray(Common::Segment<W, Pieces, Index> &&x)
        : data0(Common::Segment<W, 2 * Pieces, 2 * Index>{x.data})
        , data1(Common::Segment<W, 2 * Pieces, 2 * Index + 1>{x.data})
    {
    }
    // Conversion to a native Vector of equal size (excluding the fixed-size ABI).
    template <typename U, typename A,
              typename =
                  enable_if<std::is_convertible<T, U>::value && Vector<U, A>::Size == N &&
                            !std::is_same<A, simd_abi::fixed_size<N>>::value>>
    operator Vector<U, A>() const
    {
        auto r = simd_cast<Vector<U, A>>(data0, data1);
        return r;
    }
    // Reference conversions to the fixed_size_simd subclass view of this object.
    Vc_INTRINSIC operator fixed_size_simd<T, N> &()
    {
        return static_cast<fixed_size_simd<T, N> &>(*this);
    }
    Vc_INTRINSIC operator const fixed_size_simd<T, N> &() const
    {
        return static_cast<const fixed_size_simd<T, N> &>(*this);
    }
    // Lane setters, each applied to both halves (masks are split at lane N0).
    Vc_INTRINSIC void setZero()
    {
        data0.setZero();
        data1.setZero();
    }
    Vc_INTRINSIC void setZero(const mask_type &k)
    {
        data0.setZero(Split::lo(k));
        data1.setZero(Split::hi(k));
    }
    Vc_INTRINSIC void setZeroInverted()
    {
        data0.setZeroInverted();
        data1.setZeroInverted();
    }
    Vc_INTRINSIC void setZeroInverted(const mask_type &k)
    {
        data0.setZeroInverted(Split::lo(k));
        data1.setZeroInverted(Split::hi(k));
    }
    Vc_INTRINSIC void setQnan() {
        data0.setQnan();
        data1.setQnan();
    }
    Vc_INTRINSIC void setQnan(const mask_type &m) {
        data0.setQnan(Split::lo(m));
        data1.setQnan(Split::hi(m));
    }
    // Apply operation Op to each half, splitting every argument with Split.
    template <typename Op, typename... Args>
    static Vc_INTRINSIC fixed_size_simd<T, N> fromOperation(Op op, Args &&... args)
    {
        fixed_size_simd<T, N> r = {
            storage_type0::fromOperation(op, Split::lo(args)...),
            storage_type1::fromOperation(op, Split::hi(std::forward<Args>(args))...)};
        return r;
    }
    template <typename Op, typename... Args>
    static Vc_INTRINSIC void callOperation(Op op, Args &&... args)
    {
        storage_type0::callOperation(op, Split::lo(args)...);
        storage_type1::callOperation(op, Split::hi(std::forward<Args>(args))...);
    }
    // Load/store: the second half uses the memory offset by N0 elements.
    template <typename U, typename... Args> Vc_INTRINSIC void load(const U *mem, Args &&... args)
    {
        data0.load(mem, Split::lo(args)...);
        data1.load(mem + storage_type0::size(), Split::hi(std::forward<Args>(args))...);
    }
    template <typename U, typename... Args> Vc_INTRINSIC void store(U *mem, Args &&... args) const
    {
        data0.store(mem, Split::lo(args)...);
        data1.store(mem + storage_type0::size(), Split::hi(std::forward<Args>(args))...);
    }
    // Unary operators, applied per half.
    Vc_INTRINSIC mask_type operator!() const
    {
        return {!data0, !data1};
    }
    Vc_INTRINSIC fixed_size_simd<T, N> operator-() const
    {
        return {-data0, -data1};
    }
    // Unary plus is the identity.
    Vc_INTRINSIC fixed_size_simd<T, N> operator+() const { return *this; }
    Vc_INTRINSIC fixed_size_simd<T, N> operator~() const
    {
        return {~data0, ~data1};
    }
    // Shift operators; enabled only for integral T shifted by an integral count.
    template <typename U,
              typename = enable_if<std::is_integral<T>::value && std::is_integral<U>::value>>
    Vc_INTRINSIC Vc_CONST fixed_size_simd<T, N> operator<<(U x) const
    {
        return {data0 << x, data1 << x};
    }
    template <typename U,
              typename = enable_if<std::is_integral<T>::value && std::is_integral<U>::value>>
    Vc_INTRINSIC fixed_size_simd<T, N> &operator<<=(U x)
    {
        data0 <<= x;
        data1 <<= x;
        return *this;
    }
    template <typename U,
              typename = enable_if<std::is_integral<T>::value && std::is_integral<U>::value>>
    Vc_INTRINSIC Vc_CONST fixed_size_simd<T, N> operator>>(U x) const
    {
        return {data0 >> x, data1 >> x};
    }
    template <typename U,
              typename = enable_if<std::is_integral<T>::value && std::is_integral<U>::value>>
    Vc_INTRINSIC fixed_size_simd<T, N> &operator>>=(U x)
    {
        data0 >>= x;
        data1 >>= x;
        return *this;
    }
// Generates the compound-assignment operators, applied to both halves.
// "op## =" token-pastes op with '='.
#define Vc_BINARY_OPERATOR_(op)                                                          \
    Vc_INTRINSIC fixed_size_simd<T, N> &operator op##=(const SimdArray &rhs)             \
    {                                                                                    \
        data0 op## = rhs.data0;                                                          \
        data1 op## = rhs.data1;                                                          \
        return *this;                                                                    \
    }
    Vc_ALL_ARITHMETICS(Vc_BINARY_OPERATOR_);
    Vc_ALL_BINARY(Vc_BINARY_OPERATOR_);
    Vc_ALL_SHIFTS(Vc_BINARY_OPERATOR_);
#undef Vc_BINARY_OPERATOR_
private:
    friend reference;
    // Element accessors backing Detail::ElementReference. Scalar access spans
    // both halves, so it goes through a may-alias pointer over the whole object
    // rather than through data0/data1 individually.
    Vc_INTRINSIC static value_type get(const SimdArray &o, int i) noexcept
    {
        return reinterpret_cast<const alias_type *>(&o)[i];
    }
    template <typename U>
    Vc_INTRINSIC static void set(SimdArray &o, int i, U &&v) noexcept(
        noexcept(std::declval<value_type &>() = v))
    {
        reinterpret_cast<alias_type *>(&o)[i] = v;
    }

public:
    // Mutable subscript returns a proxy reference (writes go through set()).
    Vc_INTRINSIC reference operator[](size_t i) noexcept
    {
        static_assert(noexcept(reference{std::declval<SimdArray &>(), int()}), "");
        return {*this, int(i)};
    }
    // Const subscript returns the lane by value.
    Vc_INTRINSIC value_type operator[](size_t index) const noexcept
    {
        return get(*this, int(index));
    }
    // Write-masked view: assignments through the result only touch masked lanes.
    Vc_INTRINSIC Common::WriteMaskedVector<SimdArray, mask_type> operator()(
        const mask_type &mask)
    {
        return {*this, mask};
    }
    // Masked assignment: copy the lanes of v selected by k, per half.
    Vc_INTRINSIC void assign(const SimdArray &v, const mask_type &k)
    {
        data0.assign(v.data0, internal_data0(k));
        data1.assign(v.data1, internal_data1(k));
    }
// Horizontal reductions. If both halves have equal width, they are first
// combined element-wise with binary_fun_ and then reduced once; otherwise each
// half is reduced separately and the two scalars combined with scalar_fun_.
// The masked variants short-circuit a half whose mask part is empty.
#define Vc_REDUCTION_FUNCTION_(name_,binary_fun_,scalar_fun_)                            \
private:                                                                                 \
    template <typename ForSfinae = void>                                                 \
    Vc_INTRINSIC enable_if<std::is_same<ForSfinae, void>::value &&                       \
                               storage_type0::Size == storage_type1::Size,               \
                           value_type> name_##_impl() const                              \
    {                                                                                    \
        return binary_fun_(data0, data1).name_();                                        \
    }                                                                                    \
                                                                                         \
    template <typename ForSfinae = void>                                                 \
    Vc_INTRINSIC enable_if<std::is_same<ForSfinae, void>::value &&                       \
                               storage_type0::Size != storage_type1::Size,               \
                           value_type> name_##_impl() const                              \
    {                                                                                    \
        return scalar_fun_(data0.name_(), data1.name_());                                \
    }                                                                                    \
                                                                                         \
public:                                                                                  \
                                                                                         \
    Vc_INTRINSIC value_type name_() const { return name_##_impl(); }                     \
                                                                                         \
    Vc_INTRINSIC value_type name_(const mask_type &mask) const                           \
    {                                                                                    \
        if (Vc_IS_UNLIKELY(Split::lo(mask).isEmpty())) {                                 \
            return data1.name_(Split::hi(mask));                                         \
        } else if (Vc_IS_UNLIKELY(Split::hi(mask).isEmpty())) {                          \
            return data0.name_(Split::lo(mask));                                         \
        } else {                                                                         \
            return scalar_fun_(data0.name_(Split::lo(mask)),                             \
                               data1.name_(Split::hi(mask)));                            \
        }                                                                                \
    }                                                                                    \
    Vc_NOTHING_EXPECTING_SEMICOLON
    Vc_REDUCTION_FUNCTION_(min, Vc::min, std::min);
    Vc_REDUCTION_FUNCTION_(max, Vc::max, std::max);
    Vc_REDUCTION_FUNCTION_(product, internal::product_helper_, internal::product_helper_);
    Vc_REDUCTION_FUNCTION_(sum, internal::sum_helper_, internal::sum_helper_);
#undef Vc_REDUCTION_FUNCTION_
    // Running (prefix) sum: the second half's prefix sum is seeded with the last
    // prefix-sum value of the first half.
    Vc_INTRINSIC Vc_PURE fixed_size_simd<T, N> partialSum() const
    {
        auto ps0 = data0.partialSum();
        auto tmp = data1;
        tmp[0] += ps0[data0.size() - 1];
        return {std::move(ps0), tmp.partialSum()};
    }
    // Apply callable f to every lane (optionally only where k is set), per half.
    template <typename F> inline fixed_size_simd<T, N> apply(F &&f) const
    {
        return {data0.apply(f), data1.apply(f)};
    }
    template <typename F>
    inline fixed_size_simd<T, N> apply(F &&f, const mask_type &k) const
    {
        return {data0.apply(f, Split::lo(k)), data1.apply(f, Split::hi(k))};
    }
    // Shift the lanes by amount, filling with zeros. The case analysis decides
    // how much of each half crosses the data0/data1 boundary.
    inline fixed_size_simd<T, N> shifted(int amount) const
    {
        constexpr int SSize = Size;
        constexpr int SSize0 = storage_type0::Size;
        constexpr int SSize1 = storage_type1::Size;
        if (amount == 0) {
            return *this;
        }
        if (amount < 0) {
            if (amount > -SSize0) {
                return {data0.shifted(amount), data1.shifted(amount, data0)};
            }
            if (amount == -SSize0) {
                return {storage_type0(0), simd_cast<storage_type1>(data0)};
            }
            if (amount < -SSize0) {
                return {storage_type0(0), simd_cast<storage_type1>(data0.shifted(
                                              amount + SSize0))};
            }
            return Zero();  // not reached - the three cases above are exhaustive
        } else {
            if (amount >= SSize) {
                return Zero();
            } else if (amount >= SSize0) {
                return {
                    simd_cast<storage_type0>(data1).shifted(amount - SSize0),
                    storage_type1(0)};
            } else if (amount >= SSize1) {
                return {data0.shifted(amount, data1), storage_type1(0)};
            } else {
                return {data0.shifted(amount, data1), data1.shifted(amount)};
            }
        }
    }
    // Shift with shiftIn supplying the lanes that are shifted in. Generic
    // fallback for halves of unequal width: built lane-by-lane via the
    // generator constructor.
    template <std::size_t NN>
    inline enable_if<
        !(std::is_same<storage_type0, storage_type1>::value &&
          N == NN),
        fixed_size_simd<T, N>>
        shifted(int amount, const SimdArray<value_type, NN> &shiftIn) const
    {
        constexpr int SSize = Size;
        if (amount < 0) {
            return fixed_size_simd<T, N>([&](int i) -> value_type {
                i += amount;
                if (i >= 0) {
                    return operator[](i);
                } else if (i >= -SSize) {
                    return shiftIn[i + SSize];
                }
                return 0;
            });
        }
        return fixed_size_simd<T, N>([&](int i) -> value_type {
            i += amount;
            if (i < SSize) {
                return operator[](i);
            } else if (i < 2 * SSize) {
                return shiftIn[i - SSize];
            }
            return 0;
        });
    }

private:
    // True when the shift can be computed half-by-half (equal half types and a
    // shiftIn of the same total width).
    template <std::size_t NN> struct bisectable_shift
        : public std::integral_constant<bool,
                                        std::is_same<storage_type0, storage_type1>::value &&
                                            N == NN>
    {
    };

public:
    // Optimized shift-with-shiftIn for the bisectable case; every branch maps
    // the result halves onto shifted combinations of data0/data1/shiftIn.
    template <std::size_t NN>
    inline fixed_size_simd<T, N> shifted(
        enable_if<bisectable_shift<NN>::value, int> amount,
        const SimdArray<value_type, NN> &shiftIn) const
    {
        constexpr int SSize = Size;
        if (amount < 0) {
            if (amount > -static_cast<int>(storage_type0::Size)) {
                return {data0.shifted(amount, internal_data1(shiftIn)),
                        data1.shifted(amount, data0)};
            }
            if (amount == -static_cast<int>(storage_type0::Size)) {
                return {storage_type0(internal_data1(shiftIn)), storage_type1(data0)};
            }
            if (amount > -SSize) {
                return {
                    internal_data1(shiftIn)
                        .shifted(amount + static_cast<int>(storage_type0::Size), internal_data0(shiftIn)),
                    data0.shifted(amount + static_cast<int>(storage_type0::Size), internal_data1(shiftIn))};
            }
            if (amount == -SSize) {
                return shiftIn;
            }
            if (amount > -2 * SSize) {
                return shiftIn.shifted(amount + SSize);
            }
        }
        if (amount == 0) {
            return *this;
        }
        if (amount < static_cast<int>(storage_type0::Size)) {
            return {data0.shifted(amount, data1),
                    data1.shifted(amount, internal_data0(shiftIn))};
        }
        if (amount == static_cast<int>(storage_type0::Size)) {
            return {storage_type0(data1), storage_type1(internal_data0(shiftIn))};
        }
        if (amount < SSize) {
            return {data1.shifted(amount - static_cast<int>(storage_type0::Size), internal_data0(shiftIn)),
                    internal_data0(shiftIn)
                        .shifted(amount - static_cast<int>(storage_type0::Size), internal_data1(shiftIn))};
        }
        if (amount == SSize) {
            return shiftIn;
        }
        if (amount < 2 * SSize) {
            return shiftIn.shifted(amount - SSize);
        }
        return Zero();
    }
    // Rotate the lanes by amount (normalized to [0, N)). On MSVC the rotation is
    // done through a stack buffer; elsewhere it is composed from shifts on the
    // two halves and cross-half casts.
    Vc_INTRINSIC fixed_size_simd<T, N> rotated(int amount) const
    {
        amount %= int(size());
        if (amount == 0) {
            return *this;
        } else if (amount < 0) {
            amount += size();
        }
#ifdef Vc_MSVC
        // Store [data0 | data1 | data0] and reload both halves at the rotated
        // offsets; the trailing data0 copy covers wrap-around reads.
        alignas(MemoryAlignment) T tmp[N + data0.size()];
        data0.store(&tmp[0], Vc::Aligned);
        data1.store(&tmp[data0.size()], Vc::Aligned);
        data0.store(&tmp[N], Vc::Unaligned);
        fixed_size_simd<T, N> r;
        r.data0.load(&tmp[amount], Vc::Unaligned);
        r.data1.load(&tmp[(amount + data0.size()) % size()], Vc::Unaligned);
        return r;
#else
        auto &&d0cvtd = simd_cast<storage_type1>(data0);
        auto &&d1cvtd = simd_cast<storage_type0>(data1);
        constexpr int size0 = storage_type0::size();
        constexpr int size1 = storage_type1::size();
        if (amount == size0 && std::is_same<storage_type0, storage_type1>::value) {
            return {std::move(d1cvtd), std::move(d0cvtd)};
        } else if (amount < size1) {
            return {data0.shifted(amount, d1cvtd), data1.shifted(amount, d0cvtd)};
        } else if (amount == size1) {
            return {data0.shifted(amount, d1cvtd), std::move(d0cvtd)};
        } else if (int(size()) - amount < size1) {
            return {data0.shifted(amount - int(size()), d1cvtd.shifted(size1 - size0)),
                    data1.shifted(amount - int(size()), data0.shifted(size0 - size1))};
        } else if (int(size()) - amount == size1) {
            return {data0.shifted(-size1, d1cvtd.shifted(size1 - size0)),
                    simd_cast<storage_type1>(data0.shifted(size0 - size1))};
        } else if (amount <= size0) {
            return {data0.shifted(size1, d1cvtd).shifted(amount - size1, data0),
                    simd_cast<storage_type1>(data0.shifted(amount - size1))};
        } else {
            return {data0.shifted(size1, d1cvtd).shifted(amount - size1, data0),
                    simd_cast<storage_type1>(data0.shifted(amount - size1, d1cvtd))};
        }
        return *this;  // not reached - every branch above returns
#endif
    }
    // Interleave the low lanes of *this and x: result is
    // [this0, x0, this1, x1, ...] built from data0's low/high interleaves.
    Vc_INTRINSIC fixed_size_simd<T, N> interleaveLow(const SimdArray &x) const
    {
        // return data0[0], x.data0[0], data0[1], x.data0[1], ...
        return {data0.interleaveLow(x.data0),
                simd_cast<storage_type1>(data0.interleaveHigh(x.data0))};
    }
    // Interleave the high lanes; dispatched on whether the halves have equal width.
    Vc_INTRINSIC fixed_size_simd<T, N> interleaveHigh(const SimdArray &x) const
    {
        return interleaveHighImpl(
            x,
            std::integral_constant<bool, storage_type0::Size == storage_type1::Size>());
    }

private:
    // Equal-width halves: only data1 contributes to the high interleave.
    Vc_INTRINSIC fixed_size_simd<T, N> interleaveHighImpl(const SimdArray &x, std::true_type) const
    {
        return {data1.interleaveLow(x.data1), data1.interleaveHigh(x.data1)};
    }
    // Unequal-width halves: part of data0's high interleave still belongs in the
    // result and is shifted in.
    inline fixed_size_simd<T, N> interleaveHighImpl(const SimdArray &x, std::false_type) const
    {
        return {data0.interleaveHigh(x.data0)
                    .shifted(storage_type1::Size,
                             simd_cast<storage_type0>(data1.interleaveLow(x.data1))),
                data1.interleaveHigh(x.data1)};
    }

public:
    // Reverse the lane order. Equal-width halves swap and reverse; otherwise the
    // lanes crossing the boundary are realigned with shifts (stack buffer on MSVC).
    inline fixed_size_simd<T, N> reversed() const
    {
        if (std::is_same<storage_type0, storage_type1>::value) {
            return {simd_cast<storage_type0>(data1).reversed(),
                    simd_cast<storage_type1>(data0).reversed()};
        } else {
#ifdef Vc_MSVC
            alignas(MemoryAlignment) T tmp[N];
            data1.reversed().store(&tmp[0], Vc::Aligned);
            data0.reversed().store(&tmp[data1.size()], Vc::Unaligned);
            return fixed_size_simd<T, N>{&tmp[0], Vc::Aligned};
#else
            return {data0.shifted(storage_type1::Size, data1).reversed(),
                    simd_cast<storage_type1>(data0.reversed().shifted(
                        storage_type0::Size - storage_type1::Size))};
#endif
        }
    }
    // Sort the lanes ascending; dispatched on whether the halves have equal width.
    inline fixed_size_simd<T, N> sorted() const
    {
        return sortedImpl(
            std::integral_constant<bool, storage_type0::Size == storage_type1::Size>());
    }
    // Equal-width halves: bitonic-style merge of the two independently sorted
    // halves (min/max of one ascending and one descending sequence).
    Vc_INTRINSIC fixed_size_simd<T, N> sortedImpl(std::true_type) const
    {
#ifdef Vc_DEBUG_SORTED
        std::cerr << "-- " << data0 << data1 << '\n';
#endif
        const auto a = data0.sorted();
        const auto b = data1.sorted().reversed();
        const auto lo = Vc::min(a, b);
        const auto hi = Vc::max(a, b);
        return {lo.sorted(), hi.sorted()};
    }
    // Unequal-width halves: pad up to the next power of two with the largest
    // representable value, sort the padded vector, and truncate back.
    Vc_INTRINSIC fixed_size_simd<T, N> sortedImpl(std::false_type) const
    {
        using SortableArray =
            fixed_size_simd<value_type, Common::NextPowerOfTwo<size()>::value>;
        auto sortable = simd_cast<SortableArray>(*this);
        for (std::size_t i = Size; i < SortableArray::Size; ++i) {
            using limits = std::numeric_limits<value_type>;
            if (limits::has_infinity) {
                sortable[i] = limits::infinity();
            } else {
                sortable[i] = std::numeric_limits<value_type>::max();
            }
        }
        return simd_cast<fixed_size_simd<T, N>>(sortable.sorted());
    }
- static constexpr std::size_t Size = size();
- Vc_DEPRECATED("use exponent(x) instead")
- Vc_INTRINSIC fixed_size_simd<T, N> exponent() const
- {
- return {exponent(data0), exponent(data1)};
- }
- Vc_DEPRECATED("use isnegative(x) instead") Vc_INTRINSIC MaskType isNegative() const
- {
- return {isnegative(data0), isnegative(data1)};
- }
- Vc_DEPRECATED("use copysign(x, y) instead")
- Vc_INTRINSIC fixed_size_simd<T, N> copySign(const SimdArray &x) const
- {
- return {Vc::copysign(data0, x.data0),
- Vc::copysign(data1, x.data1)};
- }
- friend storage_type0 &internal_data0<>(SimdArray &x);
- friend storage_type1 &internal_data1<>(SimdArray &x);
- friend const storage_type0 &internal_data0<>(const SimdArray &x);
- friend const storage_type1 &internal_data1<>(const SimdArray &x);
- Vc_INTRINSIC SimdArray(storage_type0 &&x, storage_type1 &&y)
- : data0(std::move(x)), data1(std::move(y))
- {
- }
- Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(storage_type0));
- private:
- alignas(static_cast<std::size_t>(
- Common::BoundedAlignment<Common::NextPowerOfTwo<N>::value * sizeof(V) /
- V::size()>::value)) storage_type0 data0;
- storage_type1 data1;
- };
- #undef Vc_CURRENT_CLASS_NAME
- template <typename T, std::size_t N, typename V, std::size_t M>
- constexpr std::size_t SimdArray<T, N, V, M>::Size;
- template <typename T, std::size_t N, typename V, std::size_t M>
- constexpr std::size_t SimdArray<T, N, V, M>::MemoryAlignment;
- template <typename T, std::size_t N, typename VectorType, std::size_t M>
- template <class MT, class IT, int Scale>
- inline void SimdArray<T, N, VectorType, M>::gatherImplementation(
- const Common::GatherArguments<MT, IT, Scale> &args)
- {
- data0.gather(Common::make_gather<Scale>(
- args.address, Split::lo(Common::Operations::gather(), args.indexes)));
- data1.gather(Common::make_gather<Scale>(
- args.address, Split::hi(Common::Operations::gather(), args.indexes)));
- }
- template <typename T, std::size_t N, typename VectorType, std::size_t M>
- template <class MT, class IT, int Scale>
- inline void SimdArray<T, N, VectorType, M>::gatherImplementation(
- const Common::GatherArguments<MT, IT, Scale> &args, MaskArgument mask)
- {
- data0.gather(Common::make_gather<Scale>(
- args.address, Split::lo(Common::Operations::gather(), args.indexes)),
- Split::lo(mask));
- data1.gather(Common::make_gather<Scale>(
- args.address, Split::hi(Common::Operations::gather(), args.indexes)),
- Split::hi(mask));
- }
- template <typename T, std::size_t N, typename VectorType, std::size_t M>
- template <typename MT, typename IT>
- inline void SimdArray<T, N, VectorType, M>::scatterImplementation(MT *mem,
- IT &&indexes) const
- {
- data0.scatter(mem, Split::lo(Common::Operations::gather(),
- indexes));
- data1.scatter(mem, Split::hi(Common::Operations::gather(), std::forward<IT>(indexes)));
- }
- template <typename T, std::size_t N, typename VectorType, std::size_t M>
- template <typename MT, typename IT>
- inline void SimdArray<T, N, VectorType, M>::scatterImplementation(MT *mem,
- IT &&indexes, MaskArgument mask) const
- {
- data0.scatter(mem, Split::lo(Common::Operations::gather(), indexes),
- Split::lo(mask));
- data1.scatter(mem, Split::hi(Common::Operations::gather(), std::forward<IT>(indexes)),
- Split::hi(mask));
- }
- template <typename T, std::size_t N, typename V, std::size_t M>
- #ifndef Vc_MSVC
- Vc_INTRINSIC
- #endif
- typename SimdArrayTraits<T, N>::storage_type0 &internal_data0(
- SimdArray<T, N, V, M> &x)
- {
- return x.data0;
- }
- template <typename T, std::size_t N, typename V, std::size_t M>
- #ifndef Vc_MSVC
- Vc_INTRINSIC
- #endif
- typename SimdArrayTraits<T, N>::storage_type1 &internal_data1(
- SimdArray<T, N, V, M> &x)
- {
- return x.data1;
- }
- template <typename T, std::size_t N, typename V, std::size_t M>
- #ifndef Vc_MSVC
- Vc_INTRINSIC
- #endif
- const typename SimdArrayTraits<T, N>::storage_type0 &internal_data0(
- const SimdArray<T, N, V, M> &x)
- {
- return x.data0;
- }
- template <typename T, std::size_t N, typename V, std::size_t M>
- #ifndef Vc_MSVC
- Vc_INTRINSIC
- #endif
- const typename SimdArrayTraits<T, N>::storage_type1 &internal_data1(
- const SimdArray<T, N, V, M> &x)
- {
- return x.data1;
- }
- #if defined Vc_MSVC && defined Vc_IMPL_SSE && !defined Vc_IMPL_AVX
- template <>
- Vc_INTRINSIC SimdArray<double, 8>::SimdArray(fixed_size_simd<double, 4> &&x,
- fixed_size_simd<double, 4> &&y)
- : data0(x), data1(0)
- {
- data1 = y;
- }
- #endif
- namespace Detail
- {
- #define Vc_FIXED_OP(op) \
- template <class T, int N, \
- class = typename std::enable_if<fixed_size_simd<T, N>::is_atomic>::type> \
- fixed_size_simd<T, N> operator op(const fixed_size_simd<T, N> &a, \
- const fixed_size_simd<T, N> &b) \
- { \
- return {private_init, internal_data(a) op internal_data(b)}; \
- } \
- template <class T, int N, \
- class = typename std::enable_if<!fixed_size_simd<T, N>::is_atomic>::type, \
- class = T> \
- fixed_size_simd<T, N> operator op(const fixed_size_simd<T, N> &a, \
- const fixed_size_simd<T, N> &b) \
- { \
- return {internal_data0(a) op internal_data0(b), \
- internal_data1(a) op internal_data1(b)}; \
- }
- Vc_ALL_ARITHMETICS(Vc_FIXED_OP);
- Vc_ALL_BINARY(Vc_FIXED_OP);
- Vc_ALL_SHIFTS(Vc_FIXED_OP);
- #undef Vc_FIXED_OP
- #define Vc_FIXED_OP(op) \
- template <class T, int N, \
- class = typename std::enable_if<fixed_size_simd<T, N>::is_atomic>::type> \
- fixed_size_simd_mask<T, N> operator op(const fixed_size_simd<T, N> &a, \
- const fixed_size_simd<T, N> &b) \
- { \
- return {private_init, internal_data(a) op internal_data(b)}; \
- } \
- template <class T, int N, \
- class = typename std::enable_if<!fixed_size_simd<T, N>::is_atomic>::type, \
- class = T> \
- fixed_size_simd_mask<T, N> operator op(const fixed_size_simd<T, N> &a, \
- const fixed_size_simd<T, N> &b) \
- { \
- return {internal_data0(a) op internal_data0(b), \
- internal_data1(a) op internal_data1(b)}; \
- }
- Vc_ALL_COMPARES(Vc_FIXED_OP);
- #undef Vc_FIXED_OP
- }
- namespace result_vector_type_internal
- {
- template <typename T>
- using remove_cvref = typename std::remove_cv<typename std::remove_reference<T>::type>::type;
- template <typename T>
- using is_integer_larger_than_int = std::integral_constant<
- bool, std::is_integral<T>::value &&(sizeof(T) > sizeof(int) ||
- std::is_same<T, long>::value ||
- std::is_same<T, unsigned long>::value)>;
- template <
- typename L, typename R,
- std::size_t N = Traits::isSimdArray<L>::value ? Traits::simd_vector_size<L>::value
- : Traits::simd_vector_size<R>::value,
- bool = (Traits::isSimdArray<L>::value ||
- Traits::isSimdArray<R>::value) &&
- !(Traits::is_fixed_size_simd<L>::value &&
- Traits::is_fixed_size_simd<R>::value) &&
- ((std::is_arithmetic<remove_cvref<L>>::value &&
- !is_integer_larger_than_int<remove_cvref<L>>::value) ||
- (std::is_arithmetic<remove_cvref<R>>::value &&
- !is_integer_larger_than_int<remove_cvref<R>>::value) ||
- Traits::simd_vector_size<L>::value == Traits::simd_vector_size<R>::value)>
- struct evaluate;
- template <typename L, typename R, std::size_t N> struct evaluate<L, R, N, true>
- {
- private:
- using LScalar = Traits::entry_type_of<L>;
- using RScalar = Traits::entry_type_of<R>;
- template <bool B, typename T, typename F>
- using conditional = typename std::conditional<B, T, F>::type;
- public:
- using type = fixed_size_simd<
- conditional<(std::is_integral<LScalar>::value &&std::is_integral<RScalar>::value &&
- sizeof(LScalar) < sizeof(int) &&
- sizeof(RScalar) < sizeof(int)),
- conditional<(sizeof(LScalar) == sizeof(RScalar)),
- conditional<std::is_unsigned<LScalar>::value, LScalar, RScalar>,
- conditional<(sizeof(LScalar) > sizeof(RScalar)), LScalar, RScalar>>,
- decltype(std::declval<LScalar>() + std::declval<RScalar>())>,
- N>;
- };
- }
- template <typename L, typename R>
- using result_vector_type = typename result_vector_type_internal::evaluate<L, R>::type;
- #define Vc_BINARY_OPERATORS_(op_) \
- \
- template <typename L, typename R> \
- Vc_INTRINSIC result_vector_type<L, R> operator op_(L &&lhs, R &&rhs) \
- { \
- using Return = result_vector_type<L, R>; \
- return Vc::Detail::operator op_( \
- static_cast<const Return &>(std::forward<L>(lhs)), \
- static_cast<const Return &>(std::forward<R>(rhs))); \
- }
- Vc_ALL_ARITHMETICS(Vc_BINARY_OPERATORS_);
- Vc_ALL_BINARY(Vc_BINARY_OPERATORS_);
- #undef Vc_BINARY_OPERATORS_
- #define Vc_BINARY_OPERATORS_(op_) \
- \
- template <typename L, typename R> \
- Vc_INTRINSIC typename result_vector_type<L, R>::mask_type operator op_(L &&lhs, \
- R &&rhs) \
- { \
- using Promote = result_vector_type<L, R>; \
- return Promote(std::forward<L>(lhs)) op_ Promote(std::forward<R>(rhs)); \
- }
- Vc_ALL_COMPARES(Vc_BINARY_OPERATORS_);
- #undef Vc_BINARY_OPERATORS_
- #define Vc_FORWARD_UNARY_OPERATOR(name_) \
- \
- template <typename T, std::size_t N, typename V, std::size_t M> \
- inline fixed_size_simd<T, N> name_(const SimdArray<T, N, V, M> &x) \
- { \
- return fixed_size_simd<T, N>::fromOperation( \
- Common::Operations::Forward_##name_(), x); \
- } \
- template <class T, int N> \
- fixed_size_simd<T, N> name_(const fixed_size_simd<T, N> &x) \
- { \
- return fixed_size_simd<T, N>::fromOperation( \
- Common::Operations::Forward_##name_(), x); \
- } \
- Vc_NOTHING_EXPECTING_SEMICOLON
- #define Vc_FORWARD_UNARY_BOOL_OPERATOR(name_) \
- \
- template <typename T, std::size_t N, typename V, std::size_t M> \
- inline fixed_size_simd_mask<T, N> name_(const SimdArray<T, N, V, M> &x) \
- { \
- return fixed_size_simd_mask<T, N>::fromOperation( \
- Common::Operations::Forward_##name_(), x); \
- } \
- template <class T, int N> \
- fixed_size_simd_mask<T, N> name_(const fixed_size_simd<T, N> &x) \
- { \
- return fixed_size_simd_mask<T, N>::fromOperation( \
- Common::Operations::Forward_##name_(), x); \
- } \
- Vc_NOTHING_EXPECTING_SEMICOLON
- #define Vc_FORWARD_BINARY_OPERATOR(name_) \
- \
- template <typename T, std::size_t N, typename V, std::size_t M> \
- inline fixed_size_simd<T, N> name_(const SimdArray<T, N, V, M> &x, \
- const SimdArray<T, N, V, M> &y) \
- { \
- return fixed_size_simd<T, N>::fromOperation( \
- Common::Operations::Forward_##name_(), x, y); \
- } \
- Vc_NOTHING_EXPECTING_SEMICOLON
- Vc_FORWARD_UNARY_OPERATOR(abs);
- Vc_FORWARD_UNARY_OPERATOR(asin);
- Vc_FORWARD_UNARY_OPERATOR(atan);
- Vc_FORWARD_BINARY_OPERATOR(atan2);
- Vc_FORWARD_UNARY_OPERATOR(ceil);
- Vc_FORWARD_BINARY_OPERATOR(copysign);
- Vc_FORWARD_UNARY_OPERATOR(cos);
- Vc_FORWARD_UNARY_OPERATOR(exp);
- Vc_FORWARD_UNARY_OPERATOR(exponent);
- Vc_FORWARD_UNARY_OPERATOR(floor);
- template <typename T, std::size_t N>
- inline SimdArray<T, N> fma(const SimdArray<T, N> &a, const SimdArray<T, N> &b,
- const SimdArray<T, N> &c)
- {
- return SimdArray<T, N>::fromOperation(Common::Operations::Forward_fma(), a, b, c);
- }
- Vc_FORWARD_UNARY_BOOL_OPERATOR(isfinite);
- Vc_FORWARD_UNARY_BOOL_OPERATOR(isinf);
- Vc_FORWARD_UNARY_BOOL_OPERATOR(isnan);
- Vc_FORWARD_UNARY_BOOL_OPERATOR(isnegative);
- template <typename T, std::size_t N>
- inline SimdArray<T, N> frexp(const SimdArray<T, N> &x, SimdArray<int, N> *e)
- {
- return SimdArray<T, N>::fromOperation(Common::Operations::Forward_frexp(), x, e);
- }
- template <typename T, std::size_t N>
- inline SimdArray<T, N> ldexp(const SimdArray<T, N> &x, const SimdArray<int, N> &e)
- {
- return SimdArray<T, N>::fromOperation(Common::Operations::Forward_ldexp(), x, e);
- }
- Vc_FORWARD_UNARY_OPERATOR(log);
- Vc_FORWARD_UNARY_OPERATOR(log10);
- Vc_FORWARD_UNARY_OPERATOR(log2);
- Vc_FORWARD_UNARY_OPERATOR(reciprocal);
- Vc_FORWARD_UNARY_OPERATOR(round);
- Vc_FORWARD_UNARY_OPERATOR(rsqrt);
- Vc_FORWARD_UNARY_OPERATOR(sin);
- template <typename T, std::size_t N>
- void sincos(const SimdArray<T, N> &x, SimdArray<T, N> *sin, SimdArray<T, N> *cos)
- {
- SimdArray<T, N>::callOperation(Common::Operations::Forward_sincos(), x, sin, cos);
- }
- Vc_FORWARD_UNARY_OPERATOR(sqrt);
- Vc_FORWARD_UNARY_OPERATOR(trunc);
- Vc_FORWARD_BINARY_OPERATOR(min);
- Vc_FORWARD_BINARY_OPERATOR(max);
- #undef Vc_FORWARD_UNARY_OPERATOR
- #undef Vc_FORWARD_UNARY_BOOL_OPERATOR
- #undef Vc_FORWARD_BINARY_OPERATOR
- #ifdef Vc_MSVC
- #define Vc_DUMMY_ARG0 , int = 0
- #define Vc_DUMMY_ARG1 , long = 0
- #define Vc_DUMMY_ARG2 , short = 0
- #define Vc_DUMMY_ARG3 , char = '0'
- #define Vc_DUMMY_ARG4 , unsigned = 0u
- #define Vc_DUMMY_ARG5 , unsigned short = 0u
- #else
- #define Vc_DUMMY_ARG0
- #define Vc_DUMMY_ARG1
- #define Vc_DUMMY_ARG2
- #define Vc_DUMMY_ARG3
- #define Vc_DUMMY_ARG4
- #define Vc_DUMMY_ARG5
- #endif
- template <typename Return, std::size_t N, typename T, typename... From>
- Vc_INTRINSIC Vc_CONST enable_if<sizeof...(From) != 0, Return>
- simd_cast_impl_smaller_input(const From &... xs, const T &last)
- {
- Return r = simd_cast<Return>(xs...);
- for (size_t i = 0; i < N; ++i) {
- r[i + N * sizeof...(From)] = static_cast<typename Return::EntryType>(last[i]);
- }
- return r;
- }
- template <typename Return, std::size_t N, typename T>
- Vc_INTRINSIC Vc_CONST Return simd_cast_impl_smaller_input(const T &last)
- {
- Return r = Return();
- for (size_t i = 0; i < N; ++i) {
- r[i] = static_cast<typename Return::EntryType>(last[i]);
- }
- return r;
- }
- template <typename Return, std::size_t N, typename T, typename... From>
- Vc_INTRINSIC Vc_CONST enable_if<sizeof...(From) != 0, Return> simd_cast_impl_larger_input(
- const From &... xs, const T &last)
- {
- Return r = simd_cast<Return>(xs...);
- for (size_t i = N * sizeof...(From); i < Return::Size; ++i) {
- r[i] = static_cast<typename Return::EntryType>(last[i - N * sizeof...(From)]);
- }
- return r;
- }
- template <typename Return, std::size_t N, typename T>
- Vc_INTRINSIC Vc_CONST Return simd_cast_impl_larger_input(const T &last)
- {
- Return r = Return();
- for (size_t i = 0; i < Return::size(); ++i) {
- r[i] = static_cast<typename Return::EntryType>(last[i]);
- }
- return r;
- }
- template <typename Return, typename T, typename... From>
- Vc_INTRINSIC_L Vc_CONST_L Return
- simd_cast_without_last(const From &... xs, const T &) Vc_INTRINSIC_R Vc_CONST_R;
- template <typename... Ts> struct are_all_types_equal;
- template <typename T>
- struct are_all_types_equal<T> : public std::integral_constant<bool, true>
- {
- };
- template <typename T0, typename T1, typename... Ts>
- struct are_all_types_equal<T0, T1, Ts...>
- : public std::integral_constant<
- bool, std::is_same<T0, T1>::value && are_all_types_equal<T1, Ts...>::value>
- {
- };
- template <typename Return, typename... Ts>
- Vc_INTRINSIC Vc_CONST Return
- simd_cast_interleaved_argument_order(const Ts &... a, const Ts &... b);
- template <typename Return, std::size_t offset, typename From, typename... Froms>
- Vc_INTRINSIC Vc_CONST
- enable_if<(are_all_types_equal<From, Froms...>::value && offset == 0), Return>
- simd_cast_with_offset(const From &x, const Froms &... xs);
- template <typename Return, std::size_t offset, typename From>
- Vc_INTRINSIC Vc_CONST
- enable_if<(From::Size > offset && offset > 0 && offset % Return::Size == 0), Return>
- simd_cast_with_offset(const From &x);
- template <typename Return, std::size_t offset, typename From>
- Vc_INTRINSIC Vc_CONST
- enable_if<(From::Size > offset && offset > 0 && offset % Return::Size != 0 &&
- ((Traits::isSimdArray<Return>::value &&
- !Traits::isAtomicSimdArray<Return>::value) ||
- (Traits::isSimdMaskArray<Return>::value &&
- !Traits::isAtomicSimdMaskArray<Return>::value))),
- Return>
- simd_cast_with_offset(const From &x);
- template <typename Return, std::size_t offset, typename From>
- Vc_INTRINSIC Vc_CONST
- enable_if<(From::Size > offset && offset > 0 && offset % Return::Size != 0 &&
- ((Traits::isSimdArray<Return>::value &&
- Traits::isAtomicSimdArray<Return>::value) ||
- (Traits::isSimdMaskArray<Return>::value &&
- Traits::isAtomicSimdMaskArray<Return>::value))),
- Return>
- simd_cast_with_offset(const From &x);
- template <typename Return, std::size_t offset, typename From, typename... Froms>
- Vc_INTRINSIC Vc_CONST enable_if<
- (are_all_types_equal<From, Froms...>::value && From::Size <= offset), Return>
- simd_cast_with_offset(const From &, const Froms &... xs)
- {
- return simd_cast_with_offset<Return, offset - From::Size>(xs...);
- }
- template <typename Return, std::size_t offset, typename From>
- Vc_INTRINSIC Vc_CONST enable_if<(From::Size <= offset), Return> simd_cast_with_offset(
- const From &)
- {
- return Return(0);
- }
- template <typename T, typename... Ts> struct first_type_of_impl
- {
- using type = T;
- };
- template <typename... Ts> using first_type_of = typename first_type_of_impl<Ts...>::type;
- template <typename Return, typename From>
- Vc_INTRINSIC Vc_CONST Return simd_cast_drop_arguments(From x);
- template <typename Return, typename... Froms>
- Vc_INTRINSIC Vc_CONST
- enable_if<(are_all_types_equal<Froms...>::value &&
- sizeof...(Froms) * first_type_of<Froms...>::Size < Return::Size),
- Return>
- simd_cast_drop_arguments(Froms... xs, first_type_of<Froms...> x);
- template <typename Return, typename From, typename... Froms>
- Vc_INTRINSIC Vc_CONST enable_if<
- (are_all_types_equal<From, Froms...>::value &&
- (1 + sizeof...(Froms)) * From::Size >= Return::Size && sizeof...(Froms) != 0),
- Return>
- simd_cast_drop_arguments(Froms... xs, From x, From);
- template <typename Return, typename From>
- Vc_INTRINSIC Vc_CONST
- enable_if<(are_all_types_equal<From>::value && From::Size >= Return::Size), Return>
- simd_cast_drop_arguments(From x, From);
- namespace
- {
- #ifdef Vc_DEBUG_SIMD_CAST
- void debugDoNothing(const std::initializer_list<void *> &) {}
- template <typename T0, typename... Ts>
- inline void vc_debug_(const char *prefix, const char *suffix, const T0 &arg0,
- const Ts &... args)
- {
- std::cerr << prefix << arg0;
- debugDoNothing({&(std::cerr << ", " << args)...});
- std::cerr << suffix;
- }
- #else
- template <typename T0, typename... Ts>
- Vc_INTRINSIC void vc_debug_(const char *, const char *, const T0 &, const Ts &...)
- {
- }
- #endif
- }
- template <size_t A, size_t B>
- struct is_less : public std::integral_constant<bool, (A < B)> {
- };
- template <size_t N>
- struct is_power_of_2 : public std::integral_constant<bool, ((N - 1) & N) == 0> {
- };
- #define Vc_SIMDARRAY_CASTS(SimdArrayType_,NativeType_) \
- template <typename Return, typename T, typename A, typename... Froms> \
- Vc_INTRINSIC Vc_CONST enable_if< \
- (Traits::isAtomic##SimdArrayType_<Return>::value && \
- is_less<NativeType_<T, A>::Size * sizeof...(Froms), Return::Size>::value && \
- are_all_types_equal<NativeType_<T, A>, Froms...>::value && \
- !detail::is_fixed_size_abi<A>::value), \
- Return> \
- simd_cast(NativeType_<T, A> x, Froms... xs) \
- { \
- vc_debug_("simd_cast{1}(", ")\n", x, xs...); \
- return {private_init, simd_cast<typename Return::storage_type>(x, xs...)}; \
- } \
- template <typename Return, typename T, typename A, typename... Froms> \
- Vc_INTRINSIC Vc_CONST enable_if< \
- (Traits::isAtomic##SimdArrayType_<Return>::value && \
- !is_less<NativeType_<T, A>::Size * sizeof...(Froms), Return::Size>::value && \
- are_all_types_equal<NativeType_<T, A>, Froms...>::value && \
- !detail::is_fixed_size_abi<A>::value), \
- Return> \
- simd_cast(NativeType_<T, A> x, Froms... xs) \
- { \
- vc_debug_("simd_cast{2}(", ")\n", x, xs...); \
- return {simd_cast_without_last<Return, NativeType_<T, A>, Froms...>(x, xs...)}; \
- } \
- template <typename Return, typename T, typename A, typename... Froms> \
- Vc_INTRINSIC Vc_CONST \
- enable_if<(Traits::is##SimdArrayType_<Return>::value && \
- !Traits::isAtomic##SimdArrayType_<Return>::value && \
- is_less<Common::left_size<Return::Size>(), \
- NativeType_<T, A>::Size *(1 + sizeof...(Froms))>::value && \
- are_all_types_equal<NativeType_<T, A>, Froms...>::value && \
- !detail::is_fixed_size_abi<A>::value), \
- Return> \
- simd_cast(NativeType_<T, A> x, Froms... xs) \
- { \
- vc_debug_("simd_cast{3}(", ")\n", x, xs...); \
- using R0 = typename Return::storage_type0; \
- using R1 = typename Return::storage_type1; \
- return {simd_cast_drop_arguments<R0, Froms...>(x, xs...), \
- simd_cast_with_offset<R1, R0::Size>(x, xs...)}; \
- } \
- template <typename Return, typename T, typename A, typename... Froms> \
- Vc_INTRINSIC Vc_CONST \
- enable_if<(Traits::is##SimdArrayType_<Return>::value && \
- !Traits::isAtomic##SimdArrayType_<Return>::value && \
- !is_less<Common::left_size<Return::Size>(), \
- NativeType_<T, A>::Size *(1 + sizeof...(Froms))>::value && \
- are_all_types_equal<NativeType_<T, A>, Froms...>::value && \
- !detail::is_fixed_size_abi<A>::value), \
- Return> \
- simd_cast(NativeType_<T, A> x, Froms... xs) \
- { \
- vc_debug_("simd_cast{4}(", ")\n", x, xs...); \
- using R0 = typename Return::storage_type0; \
- using R1 = typename Return::storage_type1; \
- return {simd_cast<R0>(x, xs...), R1(0)}; \
- } \
- Vc_NOTHING_EXPECTING_SEMICOLON
- Vc_SIMDARRAY_CASTS(SimdArray, Vc::Vector);
- Vc_SIMDARRAY_CASTS(SimdMaskArray, Vc::Mask);
- #undef Vc_SIMDARRAY_CASTS
- #define Vc_SIMDARRAY_CASTS(SimdArrayType_,NativeType_) \
- \
- template <typename Return, int offset, typename T, typename A> \
- Vc_INTRINSIC Vc_CONST \
- enable_if<Traits::isAtomic##SimdArrayType_<Return>::value, Return> \
- simd_cast(NativeType_<T, A> x Vc_DUMMY_ARG0) \
- { \
- vc_debug_("simd_cast{offset, atomic}(", ")\n", offset, x); \
- return {private_init, simd_cast<typename Return::storage_type, offset>(x)}; \
- } \
- \
- template <typename Return, int offset, typename T, typename A> \
- Vc_INTRINSIC Vc_CONST \
- enable_if<(Traits::is##SimdArrayType_<Return>::value && \
- !Traits::isAtomic##SimdArrayType_<Return>::value && \
- Return::Size * offset + Common::left_size<Return::Size>() < \
- NativeType_<T, A>::Size), \
- Return> \
- simd_cast(NativeType_<T, A> x Vc_DUMMY_ARG1) \
- { \
- vc_debug_("simd_cast{offset, split Return}(", ")\n", offset, x); \
- using R0 = typename Return::storage_type0; \
- constexpr int entries_offset = offset * Return::Size; \
- constexpr int entries_offset_right = entries_offset + R0::Size; \
- return { \
- simd_cast_with_offset<typename Return::storage_type0, entries_offset>(x), \
- simd_cast_with_offset<typename Return::storage_type1, entries_offset_right>( \
- x)}; \
- } \
- \
- \
- template <typename Return, int offset, typename T, typename A> \
- Vc_INTRINSIC Vc_CONST \
- enable_if<(Traits::is##SimdArrayType_<Return>::value && \
- !Traits::isAtomic##SimdArrayType_<Return>::value && \
- Return::Size * offset + Common::left_size<Return::Size>() >= \
- NativeType_<T, A>::Size), \
- Return> \
- simd_cast(NativeType_<T, A> x Vc_DUMMY_ARG2) \
- { \
- vc_debug_("simd_cast{offset, R1::Zero}(", ")\n", offset, x); \
- using R0 = typename Return::storage_type0; \
- using R1 = typename Return::storage_type1; \
- constexpr int entries_offset = offset * Return::Size; \
- return {simd_cast_with_offset<R0, entries_offset>(x), R1(0)}; \
- } \
- Vc_NOTHING_EXPECTING_SEMICOLON
- Vc_SIMDARRAY_CASTS(SimdArray, Vc::Vector);
- Vc_SIMDARRAY_CASTS(SimdMaskArray, Vc::Mask);
- #undef Vc_SIMDARRAY_CASTS
- #define Vc_SIMDARRAY_CASTS(SimdArrayType_) \
- \
- template <typename Return, typename T, std::size_t N, typename V, typename... From> \
- Vc_INTRINSIC Vc_CONST \
- enable_if<(are_all_types_equal<SimdArrayType_<T, N, V, N>, From...>::value && \
- (sizeof...(From) == 0 || N * sizeof...(From) < Return::Size) && \
- !std::is_same<Return, SimdArrayType_<T, N, V, N>>::value), \
- Return> \
- simd_cast(const SimdArrayType_<T, N, V, N> &x0, const From &... xs) \
- { \
- vc_debug_("simd_cast{indivisible}(", ")\n", x0, xs...); \
- return simd_cast<Return>(internal_data(x0), internal_data(xs)...); \
- } \
- \
- template <typename Return, typename T, std::size_t N, typename V, typename... From> \
- Vc_INTRINSIC Vc_CONST \
- enable_if<(are_all_types_equal<SimdArrayType_<T, N, V, N>, From...>::value && \
- (sizeof...(From) > 0 && (N * sizeof...(From) >= Return::Size)) && \
- !std::is_same<Return, SimdArrayType_<T, N, V, N>>::value), \
- Return> \
- simd_cast(const SimdArrayType_<T, N, V, N> &x0, const From &... xs) \
- { \
- vc_debug_("simd_cast{indivisible2}(", ")\n", x0, xs...); \
- return simd_cast_without_last<Return, \
- typename SimdArrayType_<T, N, V, N>::storage_type, \
- typename From::storage_type...>( \
- internal_data(x0), internal_data(xs)...); \
- } \
- \
- template <typename Return, typename T, std::size_t N, typename V, std::size_t M, \
- typename... From> \
- Vc_INTRINSIC Vc_CONST enable_if< \
- (N != M && are_all_types_equal<SimdArrayType_<T, N, V, M>, From...>::value && \
- !std::is_same<Return, SimdArrayType_<T, N, V, M>>::value && \
- is_less<N * sizeof...(From), Return::Size>::value && is_power_of_2<N>::value), \
- Return> \
- simd_cast(const SimdArrayType_<T, N, V, M> &x0, const From &... xs) \
- { \
- vc_debug_("simd_cast{bisectable}(", ")\n", x0, xs...); \
- return simd_cast_interleaved_argument_order< \
- Return, typename SimdArrayType_<T, N, V, M>::storage_type0, \
- typename From::storage_type0...>(internal_data0(x0), internal_data0(xs)..., \
- internal_data1(x0), internal_data1(xs)...); \
- } \
- \
- template <typename Return, typename T, std::size_t N, typename V, std::size_t M, \
- typename... From> \
- Vc_INTRINSIC Vc_CONST enable_if< \
- (N != M && are_all_types_equal<SimdArrayType_<T, N, V, M>, From...>::value && \
- !is_less<N * sizeof...(From), Return::Size>::value && is_power_of_2<N>::value), \
- Return> \
- simd_cast(const SimdArrayType_<T, N, V, M> &x0, const From &... xs) \
- { \
- vc_debug_("simd_cast{bisectable2}(", ")\n", x0, xs...); \
- return simd_cast_without_last<Return, SimdArrayType_<T, N, V, M>, From...>( \
- x0, xs...); \
- } \
- \
- template <typename Return, typename T, std::size_t N, typename V, std::size_t M, \
- typename... From> \
- Vc_INTRINSIC Vc_CONST enable_if< \
- (N != M && are_all_types_equal<SimdArrayType_<T, N, V, M>, From...>::value && \
- N * (1 + sizeof...(From)) <= Return::Size && !is_power_of_2<N>::value), \
- Return> \
- simd_cast(const SimdArrayType_<T, N, V, M> &x0, const From &... xs) \
- { \
- vc_debug_("simd_cast{remaining}(", ")\n", x0, xs...); \
- return simd_cast_impl_smaller_input<Return, N, SimdArrayType_<T, N, V, M>, \
- From...>(x0, xs...); \
- } \
- \
- template <typename Return, typename T, std::size_t N, typename V, std::size_t M, \
- typename... From> \
- Vc_INTRINSIC Vc_CONST enable_if< \
- (N != M && are_all_types_equal<SimdArrayType_<T, N, V, M>, From...>::value && \
- N * (1 + sizeof...(From)) > Return::Size && !is_power_of_2<N>::value), \
- Return> \
- simd_cast(const SimdArrayType_<T, N, V, M> &x0, const From &... xs) \
- { \
- vc_debug_("simd_cast{remaining2}(", ")\n", x0, xs...); \
- return simd_cast_impl_larger_input<Return, N, SimdArrayType_<T, N, V, M>, \
- From...>(x0, xs...); \
- } \
- \
- template <typename Return, typename T, std::size_t N, typename V, std::size_t M> \
- Vc_INTRINSIC Vc_CONST \
- enable_if<(N != M && N >= 2 * Return::Size && is_power_of_2<N>::value), Return> \
- simd_cast(const SimdArrayType_<T, N, V, M> &x) \
- { \
- vc_debug_("simd_cast{single bisectable}(", ")\n", x); \
- return simd_cast<Return>(internal_data0(x)); \
- } \
- template <typename Return, typename T, std::size_t N, typename V, std::size_t M> \
- Vc_INTRINSIC Vc_CONST enable_if<(N != M && N > Return::Size && \
- N < 2 * Return::Size && is_power_of_2<N>::value), \
- Return> \
- simd_cast(const SimdArrayType_<T, N, V, M> &x) \
- { \
- vc_debug_("simd_cast{single bisectable2}(", ")\n", x); \
- return simd_cast<Return>(internal_data0(x), internal_data1(x)); \
- } \
- Vc_NOTHING_EXPECTING_SEMICOLON
- Vc_SIMDARRAY_CASTS(SimdArray);
- Vc_SIMDARRAY_CASTS(SimdMaskArray);
- #undef Vc_SIMDARRAY_CASTS
- template <class Return, class T, int N, class... Ts,
- class = enable_if<!std::is_same<Return, fixed_size_simd<T, N>>::value>>
- Vc_INTRINSIC Return simd_cast(const fixed_size_simd<T, N> &x, const Ts &... xs)
- {
- return simd_cast<Return>(static_cast<const SimdArray<T, N> &>(x),
- static_cast<const SimdArray<T, N> &>(xs)...);
- }
- template <class Return, class T, int N, class... Ts,
- class = enable_if<!std::is_same<Return, fixed_size_simd_mask<T, N>>::value>>
- Vc_INTRINSIC Return simd_cast(const fixed_size_simd_mask<T, N> &x, const Ts &... xs)
- {
- return simd_cast<Return>(static_cast<const SimdMaskArray<T, N> &>(x),
- static_cast<const SimdMaskArray<T, N> &>(xs)...);
- }
- #define Vc_SIMDARRAY_CASTS(SimdArrayType_) \
- \
- template <typename Return, int offset, typename T, std::size_t N, typename V, \
- std::size_t M> \
- Vc_INTRINSIC Vc_CONST enable_if<(offset == 0), Return> simd_cast( \
- const SimdArrayType_<T, N, V, M> &x Vc_DUMMY_ARG0) \
- { \
- vc_debug_("simd_cast{offset == 0}(", ")\n", offset, x); \
- return simd_cast<Return>(x); \
- } \
- \
- template <typename Return, int offset, typename T, std::size_t N, typename V> \
- Vc_INTRINSIC Vc_CONST enable_if<(offset != 0), Return> simd_cast( \
- const SimdArrayType_<T, N, V, N> &x Vc_DUMMY_ARG1) \
- { \
- vc_debug_("simd_cast{offset, forward}(", ")\n", offset, x); \
- return simd_cast<Return, offset>(internal_data(x)); \
- } \
- \
- template <typename Return, int offset, typename T, std::size_t N, typename V, \
- std::size_t M> \
- Vc_INTRINSIC Vc_CONST \
- enable_if<(N != M && offset * Return::Size >= Common::left_size<N>() && \
- offset != 0 && Common::left_size<N>() % Return::Size == 0), \
- Return> \
- simd_cast(const SimdArrayType_<T, N, V, M> &x Vc_DUMMY_ARG2) \
- { \
- vc_debug_("simd_cast{offset, right}(", ")\n", offset, x); \
- return simd_cast<Return, offset - Common::left_size<N>() / Return::Size>( \
- internal_data1(x)); \
- } \
- \
- template <typename Return, int offset, typename T, std::size_t N, typename V, \
- std::size_t M> \
- Vc_INTRINSIC Vc_CONST \
- enable_if<(N != M && offset * Return::Size >= Common::left_size<N>() && \
- offset != 0 && Common::left_size<N>() % Return::Size != 0), \
- Return> \
- simd_cast(const SimdArrayType_<T, N, V, M> &x Vc_DUMMY_ARG3) \
- { \
- vc_debug_("simd_cast{offset, right, nofit}(", ")\n", offset, x); \
- return simd_cast_with_offset<Return, \
- offset * Return::Size - Common::left_size<N>()>( \
- internal_data1(x)); \
- } \
- \
- template <typename Return, int offset, typename T, std::size_t N, typename V, \
- std::size_t M> \
- Vc_INTRINSIC Vc_CONST enable_if< \
- (N != M && \
- offset != 0 && (offset + 1) * Return::Size <= Common::left_size<N>()), \
- Return> \
- simd_cast(const SimdArrayType_<T, N, V, M> &x Vc_DUMMY_ARG4) \
- { \
- vc_debug_("simd_cast{offset, left}(", ")\n", offset, x); \
- return simd_cast<Return, offset>(internal_data0(x)); \
- } \
- \
- template <typename Return, int offset, typename T, std::size_t N, typename V, \
- std::size_t M> \
- Vc_INTRINSIC Vc_CONST \
- enable_if<(N != M && (offset * Return::Size < Common::left_size<N>()) && \
- offset != 0 && (offset + 1) * Return::Size > Common::left_size<N>()), \
- Return> \
- simd_cast(const SimdArrayType_<T, N, V, M> &x Vc_DUMMY_ARG5) \
- { \
- vc_debug_("simd_cast{offset, copy scalars}(", ")\n", offset, x); \
- using R = typename Return::EntryType; \
- Return r = Return(0); \
- for (std::size_t i = offset * Return::Size; \
- i < std::min(N, (offset + 1) * Return::Size); ++i) { \
- r[i - offset * Return::Size] = static_cast<R>(x[i]); \
- } \
- return r; \
- } \
- Vc_NOTHING_EXPECTING_SEMICOLON
- Vc_SIMDARRAY_CASTS(SimdArray);
- Vc_SIMDARRAY_CASTS(SimdMaskArray);
- #undef Vc_SIMDARRAY_CASTS
// simd_cast_drop_arguments: overload set that discards trailing input vectors
// until the remaining ones fit into the Return vector type, then forwards to
// simd_cast. Used when casting a list of vectors whose combined width exceeds
// the destination width.

// Base case: a single argument is casted directly.
template <typename Return, typename From>
Vc_INTRINSIC Vc_CONST Return simd_cast_drop_arguments(From x)
{
    return simd_cast<Return>(x);
}
// All arguments together are still narrower than Return: keep every argument.
template <typename Return, typename... Froms>
Vc_INTRINSIC Vc_CONST
    enable_if<(are_all_types_equal<Froms...>::value &&
               sizeof...(Froms) * first_type_of<Froms...>::Size < Return::Size),
              Return>
    simd_cast_drop_arguments(Froms... xs, first_type_of<Froms...> x)
{
    return simd_cast<Return>(xs..., x);
}
// Return is already filled without the last argument: drop it and recurse.
template <typename Return, typename From, typename... Froms>
Vc_INTRINSIC Vc_CONST enable_if<
    (are_all_types_equal<From, Froms...>::value &&
     (1 + sizeof...(Froms)) * From::Size >= Return::Size && sizeof...(Froms) != 0),
    Return>
    simd_cast_drop_arguments(Froms... xs, From x, From)
{
    return simd_cast_drop_arguments<Return, Froms...>(xs..., x);
}
// Two arguments and the first alone fills Return: drop the second.
template <typename Return, typename From>
Vc_INTRINSIC Vc_CONST
    enable_if<(are_all_types_equal<From>::value && From::Size >= Return::Size), Return>
    simd_cast_drop_arguments(From x, From)
{
    return simd_cast_drop_arguments<Return>(x);
}
// simd_cast_with_offset: casts the input starting at scalar index `offset`.
// The offset here is counted in scalars, unlike simd_cast<R, offset> where it
// is counted in multiples of Return::Size.

// offset is a multiple of Return::Size: forward to the offset-based simd_cast.
template <typename Return, std::size_t offset, typename From>
Vc_INTRINSIC Vc_CONST
    enable_if<(From::Size > offset && offset > 0 && offset % Return::Size == 0),
              Return> simd_cast_with_offset(const From &x)
{
    return simd_cast<Return, offset / Return::Size>(x);
}
// Misaligned offset, non-atomic SimdArray/SimdMaskArray return type:
// build the result from its two storage halves recursively.
template <typename Return, std::size_t offset, typename From>
Vc_INTRINSIC Vc_CONST
    enable_if<(From::Size > offset && offset > 0 && offset % Return::Size != 0 &&
               ((Traits::isSimdArray<Return>::value &&
                 !Traits::isAtomicSimdArray<Return>::value) ||
                (Traits::isSimdMaskArray<Return>::value &&
                 !Traits::isAtomicSimdMaskArray<Return>::value))),
              Return>
    simd_cast_with_offset(const From &x)
{
    using R0 = typename Return::storage_type0;
    using R1 = typename Return::storage_type1;
    return {simd_cast_with_offset<R0, offset>(x),
            simd_cast_with_offset<R1, offset + R0::Size>(x)};
}
// Misaligned offset, atomic return type: shift the remainder into place first,
// then do an aligned offset cast.
template <typename Return, std::size_t offset, typename From>
Vc_INTRINSIC Vc_CONST
    enable_if<(From::Size > offset && offset > 0 && offset % Return::Size != 0 &&
               ((Traits::isSimdArray<Return>::value &&
                 Traits::isAtomicSimdArray<Return>::value) ||
                (Traits::isSimdMaskArray<Return>::value &&
                 Traits::isAtomicSimdMaskArray<Return>::value))),
              Return>
    simd_cast_with_offset(const From &x)
{
    return simd_cast<Return, offset / Return::Size>(x.shifted(offset % Return::Size));
}
// offset == 0 with multiple arguments: plain simd_cast over all of them.
template <typename Return, std::size_t offset, typename From, typename... Froms>
Vc_INTRINSIC Vc_CONST
    enable_if<(are_all_types_equal<From, Froms...>::value && offset == 0), Return>
    simd_cast_with_offset(const From &x, const Froms &... xs)
{
    return simd_cast<Return>(x, xs...);
}
// Casts all arguments except the trailing one (the unnamed T parameter is
// dropped). Helper used to peel off excess inputs.
template <typename Return, typename T, typename... From>
Vc_INTRINSIC Vc_CONST Return simd_cast_without_last(const From &... xs, const T &)
{
    return simd_cast<Return>(xs...);
}
#ifdef Vc_MSVC
// MSVC needs explicit two-argument overloads (it cannot split the parameter
// packs of the generic extract_interleaved below when Ts is empty).
// Selects a0 for I == 0 ...
template <std::size_t I, typename T0>
Vc_INTRINSIC Vc_CONST enable_if<(I == 0), T0> extract_interleaved(const T0 &a0, const T0 &)
{
    return a0;
}
// ... and b0 for I == 1.
template <std::size_t I, typename T0>
Vc_INTRINSIC Vc_CONST enable_if<(I == 1), T0> extract_interleaved(const T0 &, const T0 &b0)
{
    return b0;
}
#endif
// extract_interleaved<I>(a0, a..., b0, b...): returns element I of the
// interleaved sequence {a0, b0, a1, b1, ...}. I == 0 selects a0, I == 1
// selects b0, larger I recurses with both packs shortened by one.
template <std::size_t I, typename T0, typename... Ts>
Vc_INTRINSIC Vc_CONST enable_if<(I == 0), T0> extract_interleaved(const T0 &a0,
                                                                  const Ts &...,
                                                                  const T0 &,
                                                                  const Ts &...)
{
    return a0;
}
template <std::size_t I, typename T0, typename... Ts>
Vc_INTRINSIC Vc_CONST enable_if<(I == 1), T0> extract_interleaved(const T0 &,
                                                                  const Ts &...,
                                                                  const T0 &b0,
                                                                  const Ts &...)
{
    return b0;
}
// Recursion: drop a0 and b0 and look for element I - 2 in the remainder.
template <std::size_t I, typename T0, typename... Ts>
Vc_INTRINSIC Vc_CONST enable_if<(I > 1), T0> extract_interleaved(const T0 &,
                                                                 const Ts &... a,
                                                                 const T0 &,
                                                                 const Ts &... b)
{
    return extract_interleaved<I - 2, Ts...>(a..., b...);
}
// Implementation detail: expands Indexes = 0..2N-1 so that the arguments are
// passed to simd_cast in interleaved order {a0, b0, a1, b1, ...}.
template <typename Return, typename... Ts, std::size_t... Indexes>
Vc_INTRINSIC Vc_CONST Return
simd_cast_interleaved_argument_order_1(index_sequence<Indexes...>, const Ts &... a,
                                       const Ts &... b)
{
    return simd_cast<Return>(extract_interleaved<Indexes, Ts...>(a..., b...)...);
}
// Casts the 2N inputs (given as two packs a... and b...) after reordering them
// into interleaved order.
template <typename Return, typename... Ts>
Vc_INTRINSIC Vc_CONST Return
simd_cast_interleaved_argument_order(const Ts &... a, const Ts &... b)
{
    using seq = make_index_sequence<sizeof...(Ts)*2>;
    return simd_cast_interleaved_argument_order_1<Return, Ts...>(seq(), a..., b...);
}
// Generates conditional_assign overloads for SimdArray: applies the compound
// assignment `op_` to `lhs` only where `mask` is true. One overload is
// instantiated per Operator enumerator below.
#define Vc_CONDITIONAL_ASSIGN(name_,op_)                                             \
template <Operator O, typename T, std::size_t N, typename V, size_t VN, typename M,  \
          typename U>                                                                \
Vc_INTRINSIC enable_if<O == Operator::name_, void> conditional_assign(               \
    SimdArray<T, N, V, VN> &lhs, M &&mask, U &&rhs)                                  \
{                                                                                    \
    lhs(mask) op_ rhs;                                                               \
}                                                                                    \
Vc_NOTHING_EXPECTING_SEMICOLON
Vc_CONDITIONAL_ASSIGN(         Assign,  =);
Vc_CONDITIONAL_ASSIGN(     PlusAssign, +=);
Vc_CONDITIONAL_ASSIGN(    MinusAssign, -=);
Vc_CONDITIONAL_ASSIGN( MultiplyAssign, *=);
Vc_CONDITIONAL_ASSIGN(   DivideAssign, /=);
Vc_CONDITIONAL_ASSIGN(RemainderAssign, %=);
Vc_CONDITIONAL_ASSIGN(      XorAssign, ^=);
Vc_CONDITIONAL_ASSIGN(      AndAssign, &=);
Vc_CONDITIONAL_ASSIGN(       OrAssign, |=);
Vc_CONDITIONAL_ASSIGN( LeftShiftAssign,<<=);
Vc_CONDITIONAL_ASSIGN(RightShiftAssign,>>=);
#undef Vc_CONDITIONAL_ASSIGN
// Generates conditional_assign overloads for the unary increment/decrement
// operators: `expr_` is evaluated on the masked lhs and its value returned.
#define Vc_CONDITIONAL_ASSIGN(name_,expr_)                                           \
template <Operator O, typename T, std::size_t N, typename V, size_t VN, typename M>  \
Vc_INTRINSIC enable_if<O == Operator::name_, SimdArray<T, N, V, VN>>                 \
conditional_assign(SimdArray<T, N, V, VN> &lhs, M &&mask)                            \
{                                                                                    \
    return expr_;                                                                    \
}                                                                                    \
Vc_NOTHING_EXPECTING_SEMICOLON
Vc_CONDITIONAL_ASSIGN(PostIncrement, lhs(mask)++);
Vc_CONDITIONAL_ASSIGN( PreIncrement, ++lhs(mask));
Vc_CONDITIONAL_ASSIGN(PostDecrement, lhs(mask)--);
Vc_CONDITIONAL_ASSIGN( PreDecrement, --lhs(mask));
#undef Vc_CONDITIONAL_ASSIGN
namespace Common
{
// 4x4 transpose for SimdArray whose storage is a single native vector (VN == N):
// unwrap to the native vectors and forward to the native transpose_impl.
template <typename T, size_t N, typename V>
inline void transpose_impl(
    TransposeTag<4, 4>, SimdArray<T, N, V, N> *Vc_RESTRICT r[],
    const TransposeProxy<SimdArray<T, N, V, N>, SimdArray<T, N, V, N>,
                         SimdArray<T, N, V, N>, SimdArray<T, N, V, N>> &proxy)
{
    V *Vc_RESTRICT r2[4] = {&internal_data(*r[0]), &internal_data(*r[1]),
                            &internal_data(*r[2]), &internal_data(*r[3])};
    transpose_impl(TransposeTag<4, 4>(), &r2[0],
                   TransposeProxy<V, V, V, V>{internal_data(std::get<0>(proxy.in)),
                                              internal_data(std::get<1>(proxy.in)),
                                              internal_data(std::get<2>(proxy.in)),
                                              internal_data(std::get<3>(proxy.in))});
}
// 2x4 case for SimdArray<T, 2> inputs producing SimdArray<T, 4> outputs: the
// result rows are assembled scalar-half by scalar-half via the storage
// accessors (internal_data0 = low half, internal_data1 = high half).
template <typename T, typename V>
inline void transpose_impl(
    TransposeTag<2, 4>, SimdArray<T, 4, V, 1> *Vc_RESTRICT r[],
    const TransposeProxy<SimdArray<T, 2, V, 1>, SimdArray<T, 2, V, 1>,
                         SimdArray<T, 2, V, 1>, SimdArray<T, 2, V, 1>> &proxy)
{
    auto &lo = *r[0];
    auto &hi = *r[1];
    internal_data0(internal_data0(lo)) = internal_data0(std::get<0>(proxy.in));
    internal_data1(internal_data0(lo)) = internal_data0(std::get<1>(proxy.in));
    internal_data0(internal_data1(lo)) = internal_data0(std::get<2>(proxy.in));
    internal_data1(internal_data1(lo)) = internal_data0(std::get<3>(proxy.in));
    internal_data0(internal_data0(hi)) = internal_data1(std::get<0>(proxy.in));
    internal_data1(internal_data0(hi)) = internal_data1(std::get<1>(proxy.in));
    internal_data0(internal_data1(hi)) = internal_data1(std::get<2>(proxy.in));
    internal_data1(internal_data1(hi)) = internal_data1(std::get<3>(proxy.in));
}
// 4x4 transpose for single-element SimdArray (N == 1): unwrap and forward.
template <typename T, typename V>
inline void transpose_impl(
    TransposeTag<4, 4>, SimdArray<T, 1, V, 1> *Vc_RESTRICT r[],
    const TransposeProxy<SimdArray<T, 1, V, 1>, SimdArray<T, 1, V, 1>,
                         SimdArray<T, 1, V, 1>, SimdArray<T, 1, V, 1>> &proxy)
{
    V *Vc_RESTRICT r2[4] = {&internal_data(*r[0]), &internal_data(*r[1]),
                            &internal_data(*r[2]), &internal_data(*r[3])};
    transpose_impl(TransposeTag<4, 4>(), &r2[0],
                   TransposeProxy<V, V, V, V>{internal_data(std::get<0>(proxy.in)),
                                              internal_data(std::get<1>(proxy.in)),
                                              internal_data(std::get<2>(proxy.in)),
                                              internal_data(std::get<3>(proxy.in))});
}
// 4x4 transpose for non-atomic SimdArray (VN == 1, N > 1): split the output
// rows and the inputs into low/high halves and do two 2x4 transposes.
template <typename T, size_t N, typename V>
inline void transpose_impl(
    TransposeTag<4, 4>, SimdArray<T, N, V, 1> *Vc_RESTRICT r[],
    const TransposeProxy<SimdArray<T, N, V, 1>, SimdArray<T, N, V, 1>,
                         SimdArray<T, N, V, 1>, SimdArray<T, N, V, 1>> &proxy)
{
    SimdArray<T, N, V, 1> *Vc_RESTRICT r0[4 / 2] = {r[0], r[1]};
    SimdArray<T, N, V, 1> *Vc_RESTRICT r1[4 / 2] = {r[2], r[3]};
    using H = SimdArray<T, 2>;
    transpose_impl(TransposeTag<2, 4>(), &r0[0],
                   TransposeProxy<H, H, H, H>{internal_data0(std::get<0>(proxy.in)),
                                              internal_data0(std::get<1>(proxy.in)),
                                              internal_data0(std::get<2>(proxy.in)),
                                              internal_data0(std::get<3>(proxy.in))});
    transpose_impl(TransposeTag<2, 4>(), &r1[0],
                   TransposeProxy<H, H, H, H>{internal_data1(std::get<0>(proxy.in)),
                                              internal_data1(std::get<1>(proxy.in)),
                                              internal_data1(std::get<2>(proxy.in)),
                                              internal_data1(std::get<3>(proxy.in))});
}
}
namespace Detail
{
// InterleaveImpl specialization for SimdArray backed by exactly one native
// vector (VN == N): unwrap the SimdArrays and delegate to the native
// vector's InterleaveImpl.
template <class T, size_t N, class V, size_t VSizeof>
struct InterleaveImpl<SimdArray<T, N, V, N>, N, VSizeof> {
    // Scatter the vectors vv... interleaved to data at the given indexes.
    template <class I, class... VV>
    static Vc_INTRINSIC void interleave(T *const data, const I &i, const VV &... vv)
    {
        InterleaveImpl<V, N, VSizeof>::interleave(data, i, internal_data(vv)...);
    }
    // Gather interleaved values from data at the given indexes into vv...
    template <class I, class... VV>
    static Vc_INTRINSIC void deinterleave(T const *const data, const I &i, VV &... vv)
    {
        InterleaveImpl<V, N, VSizeof>::deinterleave(data, i, internal_data(vv)...);
    }
};
}
- }
namespace std
{
// std::numeric_limits specialization for Vc::SimdArray: inherits all the
// scalar traits of T and overrides the value-returning functions so they
// broadcast the scalar limit into a full SimdArray.
template <typename T, size_t N, typename V, size_t VN>
struct numeric_limits<Vc::SimdArray<T, N, V, VN>> : public numeric_limits<T> {
private:
    using R = Vc::SimdArray<T, N, V, VN>;

public:
    static Vc_ALWAYS_INLINE Vc_CONST R max() noexcept { return numeric_limits<T>::max(); }
    static Vc_ALWAYS_INLINE Vc_CONST R min() noexcept { return numeric_limits<T>::min(); }
    static Vc_ALWAYS_INLINE Vc_CONST R lowest() noexcept
    {
        return numeric_limits<T>::lowest();
    }
    static Vc_ALWAYS_INLINE Vc_CONST R epsilon() noexcept
    {
        return numeric_limits<T>::epsilon();
    }
    static Vc_ALWAYS_INLINE Vc_CONST R round_error() noexcept
    {
        return numeric_limits<T>::round_error();
    }
    static Vc_ALWAYS_INLINE Vc_CONST R infinity() noexcept
    {
        return numeric_limits<T>::infinity();
    }
    static Vc_ALWAYS_INLINE Vc_CONST R quiet_NaN() noexcept
    {
        return numeric_limits<T>::quiet_NaN();
    }
    static Vc_ALWAYS_INLINE Vc_CONST R signaling_NaN() noexcept
    {
        return numeric_limits<T>::signaling_NaN();
    }
    static Vc_ALWAYS_INLINE Vc_CONST R denorm_min() noexcept
    {
        return numeric_limits<T>::denorm_min();
    }
};
}
- #endif
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace Detail
- {
// Overload resolution probe: yields the vector entry type U that a class type
// converts to; the exact-match overload is preferred when U == T.
template <typename T, typename Abi, typename U>
enable_if<!std::is_same<T, U>::value, U> is_convertible_to_any_vector(Vector<U, Abi>);
template <typename T, typename Abi> T is_convertible_to_any_vector(Vector<T, Abi>);
// FundamentalReturnType<T, U>: the arithmetic type resulting from mixing a
// vector of T with a scalar U, dispatched on integral-ness of both.
template <typename T, typename U, bool = std::is_integral<T>::value,
          bool = std::is_integral<U>::value>
struct FundamentalReturnType;
template <class T, class U>
using fundamental_return_t = typename FundamentalReturnType<T, U>::type;
// Both non-integral: pick the wider arithmetic type, otherwise keep T.
template <typename T, typename U> struct FundamentalReturnType<T, U, false, false> {
    using type = typename std::conditional<
        std::is_arithmetic<U>::value,
        typename std::conditional<(sizeof(T) < sizeof(U)), U, T>::type,
        T>::type;
};
// Integral T with non-integral U: the floating-point U wins if arithmetic.
template <typename T, typename U> struct FundamentalReturnType<T, U, true, false> {
    using type = typename std::conditional<
        std::is_arithmetic<U>::value, U,
        T>::type;
};
// Non-integral T with integral U: T (the floating-point side) wins.
template <typename T, typename U> struct FundamentalReturnType<T, U, false, true> {
    using type = T;
};
// Like std::make_signed, but passes bool through unchanged (make_signed<bool>
// is ill-formed).
template <typename T> struct my_make_signed : public std::make_signed<T> {
};
template <> struct my_make_signed<bool> {
    using type = bool;
};
// Determines which of two equally sized integral types has the higher
// conversion rank (long long > long > int > short > signed char), restoring
// unsignedness if either input was unsigned.
template <typename TT, typename UU>
struct higher_conversion_rank {
    template <typename A>
    using fix_sign =
        typename std::conditional<(std::is_unsigned<TT>::value ||
                                   std::is_unsigned<UU>::value),
                                  typename std::make_unsigned<A>::type, A>::type;
    using T = typename my_make_signed<TT>::type;
    using U = typename my_make_signed<UU>::type;
    // c<Test, Otherwise>: Test if either input is Test, else Otherwise.
    template <typename Test, typename Otherwise>
    using c = typename std::conditional<std::is_same<T, Test>::value ||
                                            std::is_same<U, Test>::value,
                                        Test, Otherwise>::type;
    using type = fix_sign<c<long long, c<long, c<int, c<short, c<signed char, void>>>>>>;
};
// Both integral: the wider type wins; on a size tie, the one with the higher
// conversion rank.
template <typename T, typename U> struct FundamentalReturnType<T, U, true, true> {
    template <bool B, class Then, class E>
    using c = typename std::conditional<B, Then, E>::type;
    using type =
        c<(sizeof(T) > sizeof(U)), T,
          c<(sizeof(T) < sizeof(U)), U, typename higher_conversion_rank<T, U>::type>>;
};
// ReturnTypeImpl<V, T, Tq>: computes the vector type resulting from a binary
// operator between vector V and operand T (Tq is T with cv/ref intact).
// The unspecialized template has no `type`, SFINAE-disabling the operator.
template <class V, class T, class Tq, class = void> struct ReturnTypeImpl {
};
// Vector op Vector (same Abi): vector of the fundamental return type.
template <class T, class U, class Abi, class Uq>
struct ReturnTypeImpl<Vector<T, Abi>, Vector<U, Abi>, Uq, void> {
    using type = Vc::Vector<fundamental_return_t<T, U>, Abi>;
};
// Vector op int: keeps the vector type unchanged.
template <class T, class Abi, class Uq>
struct ReturnTypeImpl<Vector<T, Abi>, int, Uq, void> {
    using type = Vc::Vector<T, Abi>;
};
// Vector op uint: integral entry types become unsigned, others are kept.
template <class T, class Abi, class Uq>
struct ReturnTypeImpl<Vector<T, Abi>, uint, Uq, void> {
    using type = Vc::Vector<
        typename std::conditional<std::is_integral<T>::value, std::make_unsigned<T>,
                                  std::enable_if<true, T>>::type::type,
        Abi>;
};
// Vector op other fundamental scalar: vector of the fundamental return type,
// if that type is a valid vector entry type.
template <class T, class U, class Abi, class Uq>
struct ReturnTypeImpl<
    Vector<T, Abi>, U, Uq,
    enable_if<!std::is_class<U>::value && !std::is_same<U, int>::value &&
                  !std::is_same<U, uint>::value &&
                  Traits::is_valid_vector_argument<fundamental_return_t<T, U>>::value,
              void>> {
    using type = Vc::Vector<fundamental_return_t<T, U>, Abi>;
};
// Vector op class type convertible to some Vector: use the entry type that
// the conversion probe deduces.
template <class T, class U, class Abi, class Uq>
struct ReturnTypeImpl<
    Vector<T, Abi>, U, Uq,
    enable_if<std::is_class<U>::value && !Traits::is_simd_vector<U>::value &&
                  Traits::is_valid_vector_argument<decltype(
                      is_convertible_to_any_vector<T, Abi>(std::declval<Uq>()))>::value,
              void>> {
    using type =
        Vc::Vector<fundamental_return_t<T, decltype(is_convertible_to_any_vector<T, Abi>(
                                             std::declval<Uq>()))>,
                   Abi>;
};
template <class V, class Tq, class T = remove_cvref_t<Tq>>
using ReturnType = typename ReturnTypeImpl<V, T, Tq>::type;
// Always-true trait; used to require that an expression type is well-formed.
template <class T> struct is_a_type : public std::true_type {
};
#ifdef Vc_ENABLE_FLOAT_BIT_OPERATORS
#define Vc_TEST_FOR_BUILTIN_OPERATOR(op_) true
#else
// Only enable the operator if the underlying scalar type supports it
// (e.g. excludes operator% and bitwise ops on floating-point entries).
#define Vc_TEST_FOR_BUILTIN_OPERATOR(op_)                                            \
    Detail::is_a_type<decltype(std::declval<typename R::value_type>()                \
                                   op_ std::declval<typename R::value_type>())>::value
#endif
- }
// Generates the three overloads of a binary arithmetic/bitwise operator:
// vector op_ scalar, scalar op_ vector, and the compound assignment form.
// Both operands are converted to the common ReturnType R before delegating
// to the Detail implementation.
#define Vc_GENERIC_OPERATOR(op_)                                                     \
    template <class T, class Abi, class U,                                           \
              class R = Detail::ReturnType<Vector<T, Abi>, U>>                       \
    Vc_ALWAYS_INLINE enable_if<Vc_TEST_FOR_BUILTIN_OPERATOR(op_) &&                  \
                                   std::is_convertible<Vector<T, Abi>, R>::value &&  \
                                   std::is_convertible<U, R>::value,                 \
                               R>                                                    \
    operator op_(Vector<T, Abi> x, U &&y)                                            \
    {                                                                                \
        return Detail::operator op_(R(x), R(std::forward<U>(y)));                    \
    }                                                                                \
    template <class T, class Abi, class U,                                           \
              class R = Detail::ReturnType<Vector<T, Abi>, U>>                       \
    Vc_ALWAYS_INLINE enable_if<Vc_TEST_FOR_BUILTIN_OPERATOR(op_) &&                  \
                                   !Traits::is_simd_vector<U>::value &&              \
                                   std::is_convertible<Vector<T, Abi>, R>::value &&  \
                                   std::is_convertible<U, R>::value,                 \
                               R>                                                    \
    operator op_(U &&x, Vector<T, Abi> y)                                            \
    {                                                                                \
        return Detail::operator op_(R(std::forward<U>(x)), R(y));                    \
    }                                                                                \
    template <class T, class Abi, class U,                                           \
              class R = Detail::ReturnType<Vector<T, Abi>, U>>                       \
    Vc_ALWAYS_INLINE enable_if<Vc_TEST_FOR_BUILTIN_OPERATOR(op_) &&                  \
                                   std::is_convertible<Vector<T, Abi>, R>::value &&  \
                                   std::is_convertible<U, R>::value,                 \
                               Vector<T, Abi> &>                                     \
    operator op_##=(Vector<T, Abi> &x, U &&y)                                        \
    {                                                                                \
        x = Detail::operator op_(R(x), R(std::forward<U>(y)));                       \
        return x;                                                                    \
    }
// Generates overloads for && and ||: operands are first normalized to masks
// via double negation (!!), then combined. Covers vector/vector (same and
// mixed entry types) and vector/scalar in both argument orders.
#define Vc_LOGICAL_OPERATOR(op_)                                                     \
    template <class T, class Abi>                                                    \
    Vc_ALWAYS_INLINE typename Vector<T, Abi>::Mask operator op_(Vector<T, Abi> x,    \
                                                                Vector<T, Abi> y)    \
    {                                                                                \
        return !!x op_ !!y;                                                          \
    }                                                                                \
    template <class T, class Abi, class U>                                           \
    Vc_ALWAYS_INLINE                                                                 \
        enable_if<std::is_convertible<Vector<T, Abi>, Vector<U, Abi>>::value &&      \
                      std::is_convertible<Vector<U, Abi>, Vector<T, Abi>>::value,    \
                  typename Detail::ReturnType<Vector<T, Abi>, Vector<U, Abi>>::Mask> \
        operator op_(Vector<T, Abi> x, Vector<U, Abi> y)                             \
    {                                                                                \
        return !!x op_ !!y;                                                          \
    }                                                                                \
    template <class T, class Abi, class U>                                           \
    Vc_ALWAYS_INLINE enable_if<std::is_same<bool, decltype(!std::declval<U>())>::value, \
                               typename Vector<T, Abi>::Mask>                        \
    operator op_(Vector<T, Abi> x, U &&y)                                            \
    {                                                                                \
        using M = typename Vector<T, Abi>::Mask;                                     \
        return !!x op_ M(!!std::forward<U>(y));                                      \
    }                                                                                \
    template <class T, class Abi, class U>                                           \
    Vc_ALWAYS_INLINE enable_if<std::is_same<bool, decltype(!std::declval<U>())>::value, \
                               typename Vector<T, Abi>::Mask>                        \
    operator op_(U &&x, Vector<T, Abi> y)                                            \
    {                                                                                \
        using M = typename Vector<T, Abi>::Mask;                                     \
        return M(!!std::forward<U>(x)) op_ !!y;                                      \
    }
// Generates the comparison operator overloads (==, !=, <, >, <=, >=): both
// operands are converted to the common vector type R and a mask is returned.
#define Vc_COMPARE_OPERATOR(op_)                                                     \
    template <class T, class Abi, class U,                                           \
              class R = Detail::ReturnType<Vector<T, Abi>, U>>                       \
    Vc_ALWAYS_INLINE enable_if<std::is_convertible<Vector<T, Abi>, R>::value &&      \
                                   std::is_convertible<U, R>::value,                 \
                               typename R::Mask>                                     \
    operator op_(Vector<T, Abi> x, U &&y)                                            \
    {                                                                                \
        return Detail::operator op_(R(x), R(std::forward<U>(y)));                    \
    }                                                                                \
    template <class T, class Abi, class U,                                           \
              class R = Detail::ReturnType<Vector<T, Abi>, U>>                       \
    Vc_ALWAYS_INLINE                                                                 \
        enable_if<!Traits::is_simd_vector_internal<remove_cvref_t<U>>::value &&      \
                      std::is_convertible<Vector<T, Abi>, R>::value &&               \
                      std::is_convertible<U, R>::value,                              \
                  typename R::Mask>                                                  \
        operator op_(U &&x, Vector<T, Abi> y)                                        \
    {                                                                                \
        return Detail::operator op_(R(std::forward<U>(x)), R(y));                    \
    }
// Instantiate the operator overloads for every operator in each category.
Vc_ALL_LOGICAL    (Vc_LOGICAL_OPERATOR);
Vc_ALL_BINARY     (Vc_GENERIC_OPERATOR);
Vc_ALL_ARITHMETICS(Vc_GENERIC_OPERATOR);
Vc_ALL_COMPARES   (Vc_COMPARE_OPERATOR);
#undef Vc_LOGICAL_OPERATOR
#undef Vc_GENERIC_OPERATOR
#undef Vc_COMPARE_OPERATOR
#undef Vc_INVALID_OPERATOR
- }
- #endif
- #ifndef VC_COMMON_ALIGNEDBASE_H_
- #define VC_COMMON_ALIGNEDBASE_H_
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace Detail
- {
// Compile-time maximum over one or more values of the same type T.
template <typename T> constexpr T max(T a) { return a; }
// Variadic step: fold the larger of the first two arguments into the
// recursion over the remaining ones.
template <typename T, typename... Ts> constexpr T max(T a, T b, Ts... tail)
{
    return max(a > b ? a : b, tail...);
}
- }
namespace Common
{
// Forward declarations for the aligned new/delete operators injected by
// Vc_FREE_STORE_OPERATORS_ALIGNED below.
template <std::size_t> Vc_INTRINSIC void *aligned_malloc(std::size_t);
Vc_ALWAYS_INLINE void free(void *);
}
// Empty base class that forces the given over-alignment on derived types and
// provides matching heap allocation operators.
template <std::size_t Alignment> struct alignas(Alignment) AlignedBase
{
    Vc_FREE_STORE_OPERATORS_ALIGNED(Alignment);
};
// Base class aligned to the largest alignof of all vector types.
using VectorAlignedBase = AlignedBase<
    Detail::max(alignof(Vector<float>), alignof(Vector<double>), alignof(Vector<ullong>),
                alignof(Vector<llong>), alignof(Vector<ulong>), alignof(Vector<long>),
                alignof(Vector<uint>), alignof(Vector<int>), alignof(Vector<ushort>),
                alignof(Vector<short>), alignof(Vector<uchar>), alignof(Vector<schar>))>;
template <typename V> using VectorAlignedBaseT = AlignedBase<alignof(V)>;
// Base class aligned to the largest MemoryAlignment (for aligned load/store)
// of all vector types.
using MemoryAlignedBase = AlignedBase<
    Detail::max(Vector<float>::MemoryAlignment, Vector<double>::MemoryAlignment,
                Vector<ullong>::MemoryAlignment, Vector<llong>::MemoryAlignment,
                Vector<ulong>::MemoryAlignment, Vector<long>::MemoryAlignment,
                Vector<uint>::MemoryAlignment, Vector<int>::MemoryAlignment,
                Vector<ushort>::MemoryAlignment, Vector<short>::MemoryAlignment,
                Vector<uchar>::MemoryAlignment, Vector<schar>::MemoryAlignment)>;
template <typename V> using MemoryAlignedBaseT = AlignedBase<V::MemoryAlignment>;
- }
- #endif
namespace Vc_VERSIONED_NAMESPACE {
// Convenience constants mirroring the alignment of the two base classes above.
constexpr std::size_t VectorAlignment = alignof(VectorAlignedBase);
constexpr std::size_t MemoryAlignment = alignof(MemoryAlignedBase);
}
- #define Vc_VECTOR_DECLARED_ 1
- #ifndef VC_SCALAR_DEINTERLEAVE_H_
- #define VC_SCALAR_DEINTERLEAVE_H_
- namespace Vc_VERSIONED_NAMESPACE
- {
namespace Detail
{
// Scalar-ABI deinterleave: with one element per vector, deinterleaving two
// vectors is just reading two consecutive scalars.
template <typename T, typename M, typename A>
Vc_ALWAYS_INLINE void deinterleave(Scalar::Vector<T> &a, Scalar::Vector<T> &b,
                                   const M *mem, A)
{
    a = mem[0];
    b = mem[1];
}
// Prefetching is a no-op for the scalar implementation.
Vc_ALWAYS_INLINE void prefetchForOneRead(const void *, VectorAbi::Scalar) {}
Vc_ALWAYS_INLINE void prefetchForModify(const void *, VectorAbi::Scalar) {}
Vc_ALWAYS_INLINE void prefetchClose(const void *, VectorAbi::Scalar) {}
Vc_ALWAYS_INLINE void prefetchMid(const void *, VectorAbi::Scalar) {}
Vc_ALWAYS_INLINE void prefetchFar(const void *, VectorAbi::Scalar) {}
}
- }
- #endif
- #ifndef VC_SCALAR_MATH_H_
- #define VC_SCALAR_MATH_H_
- #include <cstdlib>
- namespace Vc_VERSIONED_NAMESPACE
- {
- Vc_INTRINSIC Scalar::float_v copysign(Scalar::float_v mag, Scalar::float_v sign)
- {
- union {
- float f;
- unsigned int i;
- } value, s;
- value.f = mag.data();
- s.f = sign.data();
- value.i = (s.i & 0x80000000u) | (value.i & 0x7fffffffu);
- return Scalar::float_v{value.f};
- }
- Vc_INTRINSIC Vc_CONST Scalar::double_v copysign(Scalar::double_v mag,
- Scalar::double_v sign)
- {
- union {
- double f;
- unsigned long long i;
- } value, s;
- value.f = mag.data();
- s.f = sign.data();
- value.i = (s.i & 0x8000000000000000ull) | (value.i & 0x7fffffffffffffffull);
- return Scalar::double_v{value.f};
- }
// Generates min/max overloads for each scalar vector type, delegating to
// std::min/std::max on the wrapped scalar.
#define Vc_MINMAX(V)                                                                 \
    static Vc_ALWAYS_INLINE Scalar::V min(const Scalar::V &x, const Scalar::V &y)    \
    {                                                                                \
        return Scalar::V(std::min(x.data(), y.data()));                              \
    }                                                                                \
    static Vc_ALWAYS_INLINE Scalar::V max(const Scalar::V &x, const Scalar::V &y)    \
    {                                                                                \
        return Scalar::V(std::max(x.data(), y.data()));                              \
    }
Vc_ALL_VECTOR_TYPES(Vc_MINMAX);
#undef Vc_MINMAX
// Square root of the wrapped scalar.
template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> sqrt (const Scalar::Vector<T> &x)
{
    return Scalar::Vector<T>(std::sqrt(x.data()));
}
// Reciprocal square root: 1 / sqrt(x).
template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> rsqrt(const Scalar::Vector<T> &x)
{
    const typename Vector<T, VectorAbi::Scalar>::EntryType one = 1; return Scalar::Vector<T>(one / std::sqrt(x.data()));
}
// Absolute value; restricted to the entry types for which std::abs has a
// matching overload (double, float, short, int).
template <typename T,
          typename = enable_if<std::is_same<T, double>::value || std::is_same<T, float>::value ||
                               std::is_same<T, short>::value ||
                               std::is_same<T, int>::value>>
Vc_ALWAYS_INLINE Vc_PURE Scalar::Vector<T> abs(Scalar::Vector<T> x)
{
    return std::abs(x.data());
}
// Computes sine and cosine of x in one call. Uses the platform's combined
// sincos where available (GCC builtin or glibc sincosf/sincos); falls back to
// separate std::sin/std::cos on Windows and macOS, which lack sincosf.
template<typename T> static Vc_ALWAYS_INLINE void sincos(const Scalar::Vector<T> &x, Scalar::Vector<T> *sin, Scalar::Vector<T> *cos)
{
#if defined(_WIN32) || defined(__APPLE__)
    sin->data() = std::sin(x.data());
    cos->data() = std::cos(x.data());
#elif Vc_HAS_BUILTIN(__builtin_sincosf) || defined Vc_GCC
    __builtin_sincosf(x.data(), &sin->data(), &cos->data());
#else
    sincosf(x.data(), &sin->data(), &cos->data());
#endif
}
// double specialization: same dispatch with the double-precision functions.
template<> Vc_ALWAYS_INLINE void sincos(const Scalar::Vector<double> &x, Scalar::Vector<double> *sin, Scalar::Vector<double> *cos)
{
#if defined(_WIN32) || defined(__APPLE__)
    sin->data() = std::sin(x.data());
    cos->data() = std::cos(x.data());
#elif Vc_HAS_BUILTIN(__builtin_sincos) || defined Vc_GCC
    __builtin_sincos(x.data(), &sin->data(), &cos->data());
#else
    ::sincos(x.data(), &sin->data(), &cos->data());
#endif
}
// Thin wrappers forwarding each math function to its <cmath> counterpart on
// the single wrapped scalar.
template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> sin (const Scalar::Vector<T> &x)
{
    return Scalar::Vector<T>(std::sin(x.data()));
}
template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> asin (const Scalar::Vector<T> &x)
{
    return Scalar::Vector<T>(std::asin(x.data()));
}
template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> cos (const Scalar::Vector<T> &x)
{
    return Scalar::Vector<T>(std::cos(x.data()));
}
template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> log (const Scalar::Vector<T> &x)
{
    return Scalar::Vector<T>(std::log(x.data()));
}
template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> log10(const Scalar::Vector<T> &x)
{
    return Scalar::Vector<T>(std::log10(x.data()));
}
template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> log2(const Scalar::Vector<T> &x)
{
    return Scalar::Vector<T>(std::log2(x.data()));
}
template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> exp (const Scalar::Vector<T> &x)
{
    return Scalar::Vector<T>(std::exp(x.data()));
}
template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> atan (const Scalar::Vector<T> &x)
{
    return Scalar::Vector<T>(std::atan( x.data() ));
}
template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> atan2(const Scalar::Vector<T> &x, const Scalar::Vector<T> &y)
{
    return Scalar::Vector<T>(std::atan2( x.data(), y.data() ));
}
template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> trunc(const Scalar::Vector<T> &x)
{
    return std::trunc(x.data());
}
template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> floor(const Scalar::Vector<T> &x)
{
    return Scalar::Vector<T>(std::floor(x.data()));
}
template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> ceil(const Scalar::Vector<T> &x)
{
    return Scalar::Vector<T>(std::ceil(x.data()));
}
// Generic round is the identity; the float and double cases are handled by
// the explicit specializations further down (round half to even).
template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> round(const Scalar::Vector<T> &x)
{
    return x;
}
namespace
{
// Returns true iff x lies exactly halfway between two integers and the
// integer below it is even (i.e. x == 2k + 0.5). Used by the float/double
// round specializations to implement round-half-to-even.
template <typename T> bool _realIsEvenHalf(T x)
{
    const T half = T(0.5);
    const T evenFloor = std::floor(x * half) * T(2);
    return (x - evenFloor) == half;
}
}
// Round half to even (banker's rounding): round half away from zero via
// floor(x + 0.5), then subtract 1 again when x was exactly an "even half"
// (e.g. 2.5 -> 2, 3.5 -> 4).
template<> Vc_ALWAYS_INLINE Scalar::Vector<float> round(const Scalar::Vector<float> &x)
{
    return Scalar::float_v(std::floor(x.data() + 0.5f) - (_realIsEvenHalf(x.data()) ? 1.f : 0.f));
}
template<> Vc_ALWAYS_INLINE Scalar::Vector<double> round(const Scalar::Vector<double> &x)
{
    return Scalar::double_v(std::floor(x.data() + 0.5 ) - (_realIsEvenHalf(x.data()) ? 1. : 0. ));
}
// Multiplicative inverse: 1 / x.
template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> reciprocal(const Scalar::Vector<T> &x)
{
    const typename Vector<T, VectorAbi::Scalar>::EntryType one = 1; return Scalar::Vector<T>(one / x.data());
}
// <math.h> may define isfinite/isnan as macros; remove them so the function
// definitions below are not mangled.
#ifdef isfinite
#undef isfinite
#endif
#ifdef isnan
#undef isnan
#endif
// Floating-point classification, dispatched per compiler: old MSVC only has
// _finite/_isnan, old ICC needs the C versions, everyone else uses std::.
template<typename T> static Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Scalar>::Mask isfinite(const Scalar::Vector<T> &x)
{
    return typename Vector<T, VectorAbi::Scalar>::Mask(
#ifdef _MSC_VER
        !!_finite(x.data())
#elif defined(__INTEL_COMPILER) && __INTEL_COMPILER < 1500
        ::isfinite(x.data())
#else
        std::isfinite(x.data())
#endif
        );
}
template<typename T> Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Scalar>::Mask isinf(const Scalar::Vector<T> &x)
{
    return typename Vector<T, VectorAbi::Scalar>::Mask(std::isinf(x.data()));
}
template<typename T> static Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Scalar>::Mask isnan(const Scalar::Vector<T> &x)
{
    return typename Vector<T, VectorAbi::Scalar>::Mask(
#ifdef _MSC_VER
        !!_isnan(x.data())
#elif defined(__INTEL_COMPILER) && __INTEL_COMPILER < 1500
        ::isnan(x.data())
#else
        std::isnan(x.data())
#endif
        );
}
// frexp/ldexp wrappers: the exponent is carried in a one-element int
// SimdArray whose underlying scalar int is accessed via internal_data.
Vc_ALWAYS_INLINE Scalar::Vector<float> frexp(Scalar::Vector<float> x, SimdArray<int, 1, Scalar::Vector<int>, 1> *e) {
    return Scalar::float_v(std::frexp(x.data(), &internal_data(*e).data()));
}
Vc_ALWAYS_INLINE Scalar::Vector<double> frexp(Scalar::Vector<double> x, SimdArray<int, 1, Scalar::Vector<int>, 1> *e) {
    return Scalar::double_v(std::frexp(x.data(), &internal_data(*e).data()));
}
Vc_ALWAYS_INLINE Scalar::Vector<float> ldexp(Scalar::Vector<float> x, const SimdArray<int, 1, Scalar::Vector<int>, 1> &e) {
    return Scalar::float_v(std::ldexp(x.data(), internal_data(e).data()));
}
Vc_ALWAYS_INLINE Scalar::Vector<double> ldexp(Scalar::Vector<double> x, const SimdArray<int, 1, Scalar::Vector<int>, 1> &e) {
    return Scalar::double_v(std::ldexp(x.data(), internal_data(e).data()));
}
// Fused multiply-add a * b + c. Integral types use the plain expression
// (exact anyway); floating-point types use std::fma for a single rounding.
// Both branches must compile for every T, hence a runtime `if` on a
// compile-time constant rather than specialization.
template <typename T>
Vc_ALWAYS_INLINE Vector<T, VectorAbi::Scalar> fma(Vector<T, VectorAbi::Scalar> a,
                                                  Vector<T, VectorAbi::Scalar> b,
                                                  Vector<T, VectorAbi::Scalar> c)
{
    if (std::is_integral<T>::value) {
        return a * b + c;
    } else {
        return std::fma(a.data(), b.data(), c.data());
    }
}
- }
- #endif
- #ifndef Vc_SCALAR_SIMD_CAST_CALLER_TCC_
- #define Vc_SCALAR_SIMD_CAST_CALLER_TCC_
- namespace Vc_VERSIONED_NAMESPACE
- {
#if Vc_IS_VERSION_1
// Out-of-line definition of the explicit converting Mask constructor (Vc 1
// compatibility): delegates to simd_cast. Defined here because simd_cast is
// only fully declared after the Mask class definition.
template <typename T>
template <typename U>
Vc_INTRINSIC Mask<T, VectorAbi::Scalar>::Mask(
    U &&rhs, Common::enable_if_mask_converts_explicitly<T, U>)
    : Mask(simd_cast<Mask>(std::forward<U>(rhs)))
{
}
#endif
- }
- #endif
- #if defined(Vc_IMPL_SSE)
- #ifndef VC_SSE_DEINTERLEAVE_H_
- #define VC_SSE_DEINTERLEAVE_H_
- namespace Vc_VERSIONED_NAMESPACE
- {
namespace Detail
{
// Forward declarations of the SSE deinterleave entry points (defined after
// the SSE::deinterleave helpers below) and of the SSE prefetch functions.
template <typename A>
inline void deinterleave(SSE::float_v &, SSE::float_v &, const float *, A);
template <typename A>
inline void deinterleave(SSE::float_v &, SSE::float_v &, const short *, A);
template <typename A>
inline void deinterleave(SSE::float_v &, SSE::float_v &, const ushort *, A);
template <typename A>
inline void deinterleave(SSE::double_v &, SSE::double_v &, const double *, A);
template <typename A>
inline void deinterleave(SSE::int_v &, SSE::int_v &, const int *, A);
template <typename A>
inline void deinterleave(SSE::int_v &, SSE::int_v &, const short *, A);
template <typename A>
inline void deinterleave(SSE::uint_v &, SSE::uint_v &, const uint *, A);
template <typename A>
inline void deinterleave(SSE::uint_v &, SSE::uint_v &, const ushort *, A);
template <typename A>
inline void deinterleave(SSE::short_v &, SSE::short_v &, const short *, A);
template <typename A>
inline void deinterleave(SSE::ushort_v &, SSE::ushort_v &, const ushort *, A);
Vc_ALWAYS_INLINE_L void prefetchForOneRead(const void *addr, VectorAbi::Sse) Vc_ALWAYS_INLINE_R;
Vc_ALWAYS_INLINE_L void prefetchForModify(const void *addr, VectorAbi::Sse) Vc_ALWAYS_INLINE_R;
Vc_ALWAYS_INLINE_L void prefetchClose(const void *addr, VectorAbi::Sse) Vc_ALWAYS_INLINE_R;
Vc_ALWAYS_INLINE_L void prefetchMid(const void *addr, VectorAbi::Sse) Vc_ALWAYS_INLINE_R;
Vc_ALWAYS_INLINE_L void prefetchFar(const void *addr, VectorAbi::Sse) Vc_ALWAYS_INLINE_R;
}
- }
- namespace Vc_VERSIONED_NAMESPACE
- {
namespace SSE
{
// In-register deinterleave of two float vectors holding {a0 b0 a1 b1} and
// {a2 b2 a3 b3}: two unpack passes separate even-indexed elements into a and
// odd-indexed elements into b.
inline void deinterleave(Vector<float> &a, Vector<float> &b)
{
    const __m128 tmp0 = _mm_unpacklo_ps(a.data(), b.data());
    const __m128 tmp1 = _mm_unpackhi_ps(a.data(), b.data());
    a.data() = _mm_unpacklo_ps(tmp0, tmp1);
    b.data() = _mm_unpackhi_ps(tmp0, tmp1);
}
// Deinterleave eight packed 16-bit signed ints into two float vectors:
// even lanes (sign-extended via shift left + arithmetic shift right) go to a,
// odd lanes to b.
inline void deinterleave(Vector<float> &a, Vector<float> &b, Vector<short>::AsArg tmp)
{
    a.data() = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(tmp.data(), 16), 16));
    b.data() = _mm_cvtepi32_ps(_mm_srai_epi32(tmp.data(), 16));
}
// Same for unsigned 16-bit ints (zero-extension via logical shift).
inline void deinterleave(Vector<float> &a, Vector<float> &b, Vector<unsigned short>::AsArg tmp)
{
    a.data() = _mm_cvtepi32_ps(_mm_srli_epi32(_mm_slli_epi32(tmp.data(), 16), 16));
    b.data() = _mm_cvtepi32_ps(_mm_srli_epi32(tmp.data(), 16));
}
// Two doubles per vector: a single unpack pair swaps the middle elements.
inline void deinterleave(Vector<double> &a, Vector<double> &b)
{
    __m128d tmp = _mm_unpacklo_pd(a.data(), b.data());
    b.data() = _mm_unpackhi_pd(a.data(), b.data());
    a.data() = tmp;
}
// 32-bit integer variants: same two-pass unpack scheme as the float case.
inline void deinterleave(Vector<int> &a, Vector<int> &b)
{
    const __m128i tmp0 = _mm_unpacklo_epi32(a.data(), b.data());
    const __m128i tmp1 = _mm_unpackhi_epi32(a.data(), b.data());
    a.data() = _mm_unpacklo_epi32(tmp0, tmp1);
    b.data() = _mm_unpackhi_epi32(tmp0, tmp1);
}
inline void deinterleave(Vector<unsigned int> &a, Vector<unsigned int> &b)
{
    const __m128i tmp0 = _mm_unpacklo_epi32(a.data(), b.data());
    const __m128i tmp1 = _mm_unpackhi_epi32(a.data(), b.data());
    a.data() = _mm_unpacklo_epi32(tmp0, tmp1);
    b.data() = _mm_unpackhi_epi32(tmp0, tmp1);
}
// 16-bit variants need three unpack passes (eight elements per vector).
inline void deinterleave(Vector<short> &a, Vector<short> &b)
{
    __m128i tmp0 = _mm_unpacklo_epi16(a.data(), b.data());
    __m128i tmp1 = _mm_unpackhi_epi16(a.data(), b.data());
    __m128i tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);
    __m128i tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);
    a.data() = _mm_unpacklo_epi16(tmp2, tmp3);
    b.data() = _mm_unpackhi_epi16(tmp2, tmp3);
}
inline void deinterleave(Vector<unsigned short> &a, Vector<unsigned short> &b)
{
    __m128i tmp0 = _mm_unpacklo_epi16(a.data(), b.data());
    __m128i tmp1 = _mm_unpackhi_epi16(a.data(), b.data());
    __m128i tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);
    __m128i tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);
    a.data() = _mm_unpacklo_epi16(tmp2, tmp3);
    b.data() = _mm_unpackhi_epi16(tmp2, tmp3);
}
// Widen interleaved 16-bit ints directly into two int vectors: even lanes
// sign-extended to a, odd lanes to b.
inline void deinterleave(Vector<int> &a, Vector<int> &b, Vector<short>::AsArg tmp)
{
    a.data() = _mm_srai_epi32(_mm_slli_epi32(tmp.data(), 16), 16);
    b.data() = _mm_srai_epi32(tmp.data(), 16);
}
// Unsigned counterpart: zero-extension via logical shifts.
inline void deinterleave(Vector<unsigned int> &a, Vector<unsigned int> &b, Vector<unsigned short>::AsArg tmp)
{
    a.data() = _mm_srli_epi32(_mm_slli_epi32(tmp.data(), 16), 16);
    b.data() = _mm_srli_epi32(tmp.data(), 16);
}
}
- }
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
// Memory-based deinterleave: read 2 * V::Size interleaved values starting at
// m (with alignment policy A) and split them so that a receives the
// even-indexed and b the odd-indexed elements. Same-width sources load two
// vectors and transpose in registers; narrow 16-bit sources load once and
// widen via the converting SSE::deinterleave overloads.
template<typename A> inline void deinterleave(
    SSE::float_v &a, SSE::float_v &b, const float *m, A align)
{
    a.load(m, align);
    b.load(m + SSE::float_v::Size, align);
    Vc::SSE::deinterleave(a, b);
}
template<typename A> inline void deinterleave(
    SSE::float_v &a, SSE::float_v &b, const short *m, A align)
{
    // One packed load of 8 shorts, then sign-extend + convert to float.
    SSE::short_v tmp(m, align);
    Vc::SSE::deinterleave(a, b, tmp);
}
template<typename A> inline void deinterleave(
    SSE::float_v &a, SSE::float_v &b, const unsigned short *m, A align)
{
    // One packed load of 8 ushorts, then zero-extend + convert to float.
    SSE::ushort_v tmp(m, align);
    Vc::SSE::deinterleave(a, b, tmp);
}
template<typename A> inline void deinterleave(
    SSE::double_v &a, SSE::double_v &b, const double *m, A align)
{
    a.load(m, align);
    b.load(m + SSE::double_v::Size, align);
    Vc::SSE::deinterleave(a, b);
}
template<typename A> inline void deinterleave(
    SSE::int_v &a, SSE::int_v &b, const int *m, A align)
{
    a.load(m, align);
    b.load(m + SSE::int_v::Size, align);
    Vc::SSE::deinterleave(a, b);
}
template<typename A> inline void deinterleave(
    SSE::int_v &a, SSE::int_v &b, const short *m, A align)
{
    // Narrow source: load 8 shorts and sign-extend while splitting.
    SSE::short_v tmp(m, align);
    Vc::SSE::deinterleave(a, b, tmp);
}
template<typename A> inline void deinterleave(
    SSE::uint_v &a, SSE::uint_v &b, const unsigned int *m, A align)
{
    a.load(m, align);
    b.load(m + SSE::uint_v::Size, align);
    Vc::SSE::deinterleave(a, b);
}
template<typename A> inline void deinterleave(
    SSE::uint_v &a, SSE::uint_v &b, const unsigned short *m, A align)
{
    // Narrow source: load 8 ushorts and zero-extend while splitting.
    SSE::ushort_v tmp(m, align);
    Vc::SSE::deinterleave(a, b, tmp);
}
template<typename A> inline void deinterleave(
    SSE::short_v &a, SSE::short_v &b, const short *m, A align)
{
    a.load(m, align);
    b.load(m + SSE::short_v::Size, align);
    Vc::SSE::deinterleave(a, b);
}
template<typename A> inline void deinterleave(
    SSE::ushort_v &a, SSE::ushort_v &b, const unsigned short *m, A align)
{
    a.load(m, align);
    b.load(m + SSE::ushort_v::Size, align);
    Vc::SSE::deinterleave(a, b);
}
}
}
- #ifndef VC_SSE_PREFETCHES_TCC_
- #define VC_SSE_PREFETCHES_TCC_
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
// SSE prefetch hint wrappers. _mm_prefetch takes a non-const char pointer,
// hence the const_cast; the instruction only hints and never writes memory.
Vc_ALWAYS_INLINE void prefetchForOneRead(const void *addr, VectorAbi::Sse)
{
    // NTA: non-temporal hint, minimizes cache pollution for read-once data.
    _mm_prefetch(static_cast<char *>(const_cast<void *>(addr)), _MM_HINT_NTA);
}
Vc_ALWAYS_INLINE void prefetchClose(const void *addr, VectorAbi::Sse)
{
    // T0: prefetch into all levels of the cache hierarchy.
    _mm_prefetch(static_cast<char *>(const_cast<void *>(addr)), _MM_HINT_T0);
}
Vc_ALWAYS_INLINE void prefetchMid(const void *addr, VectorAbi::Sse)
{
    // T1: prefetch into L2 and higher.
    _mm_prefetch(static_cast<char *>(const_cast<void *>(addr)), _MM_HINT_T1);
}
Vc_ALWAYS_INLINE void prefetchFar(const void *addr, VectorAbi::Sse)
{
    // T2: prefetch into L3 and higher.
    _mm_prefetch(static_cast<char *>(const_cast<void *>(addr)), _MM_HINT_T2);
}
Vc_ALWAYS_INLINE void prefetchForModify(const void *addr, VectorAbi::Sse)
{
#ifdef __3dNOW__
    // PREFETCHW hints an intent to write (requests the line in a
    // writable state); only available with 3dNow!.
    _m_prefetchw(const_cast<void *>(addr));
#else
    _mm_prefetch(static_cast<char *>(const_cast<void *>(addr)), _MM_HINT_T0);
#endif
}
}
}
- #endif
- #endif
- #ifndef VC_SSE_MATH_H_
- #define VC_SSE_MATH_H_
- #ifndef VC_SSE_CONST_H_
- #define VC_SSE_CONST_H_
namespace Vc_VERSIONED_NAMESPACE
{
namespace SSE
{
// Typed accessors for the precomputed constant tables used by the SSE
// trigonometric and logarithm implementations. c_trig<T>::data and
// c_log<T>::d are flat element arrays; Stride (elements per 16-byte SSE
// vector) converts a table row index into an element offset.
template<typename T> struct Const
{
    typedef Vector<T> V;
    typedef Mask<T> M;
    enum Constants { Stride = 16 / sizeof(T) };
    // --- argument reduction and range constants (c_trig rows 0-11, 22-26) ---
    static Vc_ALWAYS_INLINE Vc_CONST V _pi_4() { return load(&c_trig<T>::data[0 * Stride]); }
    static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_hi() { return load(&c_trig<T>::data[1 * Stride]); }
    static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_rem1() { return load(&c_trig<T>::data[2 * Stride]); }
    static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_rem2() { return load(&c_trig<T>::data[3 * Stride]); }
    static Vc_ALWAYS_INLINE Vc_CONST V _1_16() { return load(&c_trig<T>::data[4 * Stride]); }
    static Vc_ALWAYS_INLINE Vc_CONST V _16() { return load(&c_trig<T>::data[5 * Stride]); }
    // --- atan polynomial coefficients and thresholds (rows 12-24) ---
    static Vc_ALWAYS_INLINE Vc_CONST V atanP(int i) { return load(&c_trig<T>::data[(12 + i) * Stride]); }
    static Vc_ALWAYS_INLINE Vc_CONST V atanQ(int i) { return load(&c_trig<T>::data[(17 + i) * Stride]); }
    static Vc_ALWAYS_INLINE Vc_CONST V atanThrsHi() { return load(&c_trig<T>::data[22 * Stride]); }
    static Vc_ALWAYS_INLINE Vc_CONST V atanThrsLo() { return load(&c_trig<T>::data[23 * Stride]); }
    static Vc_ALWAYS_INLINE Vc_CONST V _pi_2_rem() { return load(&c_trig<T>::data[24 * Stride]); }
    static Vc_ALWAYS_INLINE Vc_CONST V lossThreshold() { return load(&c_trig<T>::data[8 * Stride]); }
    static Vc_ALWAYS_INLINE Vc_CONST V _4_pi() { return load(&c_trig<T>::data[9 * Stride]); }
    static Vc_ALWAYS_INLINE Vc_CONST V _pi_2() { return load(&c_trig<T>::data[10 * Stride]); }
    static Vc_ALWAYS_INLINE Vc_CONST V _pi() { return load(&c_trig<T>::data[11 * Stride]); }
    // --- asin polynomial coefficients and input thresholds (rows 25-48) ---
    static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff0(int i) { return load(&c_trig<T>::data[(28 + i) * Stride]); }
    static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff1(int i) { return load(&c_trig<T>::data[(33 + i) * Stride]); }
    static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff2(int i) { return load(&c_trig<T>::data[(37 + i) * Stride]); }
    static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff3(int i) { return load(&c_trig<T>::data[(43 + i) * Stride]); }
    static Vc_ALWAYS_INLINE Vc_CONST V smallAsinInput() { return load(&c_trig<T>::data[25 * Stride]); }
    static Vc_ALWAYS_INLINE Vc_CONST V largeAsinInput() { return load(&c_trig<T>::data[26 * Stride]); }
    // --- logarithm constants (c_log rows) ---
    static Vc_ALWAYS_INLINE Vc_CONST M exponentMask() { return M(load(c_log<T>::d(1)).data()); }
    static Vc_ALWAYS_INLINE Vc_CONST V _1_2() { return load(c_log<T>::d(18)); }
    static Vc_ALWAYS_INLINE Vc_CONST V _1_sqrt2() { return load(c_log<T>::d(15)); }
    static Vc_ALWAYS_INLINE Vc_CONST V P(int i) { return load(c_log<T>::d(2 + i)); }
    static Vc_ALWAYS_INLINE Vc_CONST V Q(int i) { return load(c_log<T>::d(8 + i)); }
    static Vc_ALWAYS_INLINE Vc_CONST V min() { return load(c_log<T>::d(14)); }
    static Vc_ALWAYS_INLINE Vc_CONST V ln2_small() { return load(c_log<T>::d(17)); }
    static Vc_ALWAYS_INLINE Vc_CONST V ln2_large() { return load(c_log<T>::d(16)); }
    static Vc_ALWAYS_INLINE Vc_CONST V neginf() { return load(c_log<T>::d(13)); }
    static Vc_ALWAYS_INLINE Vc_CONST V log10_e() { return load(c_log<T>::d(19)); }
    static Vc_ALWAYS_INLINE Vc_CONST V log2_e() { return load(c_log<T>::d(20)); }
    static Vc_ALWAYS_INLINE_L Vc_CONST_L V highMask() Vc_ALWAYS_INLINE_R Vc_CONST_R;
    static Vc_ALWAYS_INLINE_L Vc_CONST_L V highMask(int bits) Vc_ALWAYS_INLINE_R Vc_CONST_R;
private:
    // Broadcast-load one table row into a vector.
    static Vc_ALWAYS_INLINE_L Vc_CONST_L V load(const T *mem) Vc_ALWAYS_INLINE_R Vc_CONST_R;
};
template<typename T> Vc_ALWAYS_INLINE Vc_CONST Vector<T> Const<T>::load(const T *mem) { return V(mem); }
// highMask(): per-element bit mask keeping the upper mantissa bits (used to
// split values into hi/lo parts for extended-precision arithmetic).
template <> Vc_ALWAYS_INLINE Vc_CONST Vector<float> Const<float>::highMask()
{
    return Vector<float>(reinterpret_cast<const float *>(&c_general::highMaskFloat));
}
template <> Vc_ALWAYS_INLINE Vc_CONST Vector<double> Const<double>::highMask()
{
    return Vector<double>(
        reinterpret_cast<const double *>(&c_general::highMaskDouble));
}
// highMask(bits): all-ones shifted left, i.e. each element keeps its top
// (element_bits - bits) bits set and the low `bits` bits cleared.
template <> Vc_ALWAYS_INLINE Vc_CONST Vector<float> Const<float>::highMask(int bits)
{
    return _mm_castsi128_ps(_mm_slli_epi32(_mm_setallone_si128(), bits));
}
template <> Vc_ALWAYS_INLINE Vc_CONST Vector<double> Const<double>::highMask(int bits)
{
    return _mm_castsi128_pd(_mm_slli_epi64(_mm_setallone_si128(), bits));
}
}
}
- #endif
- namespace Vc_VERSIONED_NAMESPACE
- {
- Vc_INTRINSIC Vc_CONST SSE::float_v copysign(SSE::float_v mag, SSE::float_v sign)
- {
- return _mm_or_ps(_mm_and_ps(sign.data(), SSE::_mm_setsignmask_ps()),
- _mm_and_ps(mag.data(), SSE::_mm_setabsmask_ps()));
- }
- Vc_INTRINSIC Vc_CONST SSE::double_v copysign(SSE::double_v mag, SSE::double_v sign)
- {
- return _mm_or_pd(_mm_and_pd(sign.data(), SSE::_mm_setsignmask_pd()),
- _mm_and_pd(mag.data(), SSE::_mm_setabsmask_pd()));
- }
// Vectorized frexp: decomposes v into a mantissa in +-[0.5, 1) (returned)
// and an integral exponent (*e) such that v == mantissa * 2^exponent.
inline SSE::double_v frexp(const SSE::double_v &v,
                           SimdArray<int, 2, Scalar::int_v, 1> *e)
{
    const __m128i exponentBits = SSE::Const<double>::exponentMask().dataI();
    const __m128i exponentPart = _mm_and_si128(_mm_castpd_si128(v.data()), exponentBits);
    // Shift the biased exponent down from bit 52; subtracting 0x3fe
    // (bias - 1) yields the frexp exponent. The 64-bit shift leaves the
    // results in the even 32-bit lanes (0 and 2).
    SSE::int_v exponent =
        _mm_sub_epi32(_mm_srli_epi64(exponentPart, 52), _mm_set1_epi32(0x3fe));
    // Set the exponent field to all-ones, then AND with frexpMask to force
    // the exponent of 0.5, keeping sign and mantissa.
    const __m128d exponentMaximized = _mm_or_pd(v.data(), _mm_castsi128_pd(exponentBits));
    SSE::double_v ret = _mm_and_pd(
        exponentMaximized,
        _mm_load_pd(reinterpret_cast<const double *>(&SSE::c_general::frexpMask[0])));
    SSE::double_m zeroMask = v == SSE::double_v::Zero();
    // NaN, +-Inf and +-0 pass through unchanged, with exponent forced to 0
    // for zero inputs.
    ret(isnan(v) || !isfinite(v) || zeroMask) = v;
    exponent.setZero(zeroMask.data());
    // Copy the two valid exponent lanes into the 2-element result array.
    (*e)[0] = exponent[0];
    (*e)[1] = exponent[2];
    return ret;
}
- inline SSE::float_v frexp(const SSE::float_v &v, SimdArray<int, 4, SSE::int_v, 4> *e)
- {
- const __m128i exponentBits = SSE::Const<float>::exponentMask().dataI();
- const __m128i exponentPart = _mm_and_si128(_mm_castps_si128(v.data()), exponentBits);
- internal_data(*e) =
- _mm_sub_epi32(_mm_srli_epi32(exponentPart, 23), _mm_set1_epi32(0x7e));
- const __m128 exponentMaximized = _mm_or_ps(v.data(), _mm_castsi128_ps(exponentBits));
- SSE::float_v ret =
- _mm_and_ps(exponentMaximized, _mm_castsi128_ps(_mm_set1_epi32(0xbf7fffffu)));
- ret(isnan(v) || !isfinite(v) || v == SSE::float_v::Zero()) = v;
- e->setZero(v == SSE::float_v::Zero());
- return ret;
- }
// Vectorized ldexp: returns v * 2^e by adding e directly into the biased
// exponent field of each element.
inline SSE::double_v ldexp(SSE::double_v::AsArg v,
                           const SimdArray<int, 2, Scalar::int_v, 1> &_e)
{
    // Place the two exponents into the even 32-bit lanes, aligned with the
    // two 64-bit elements of v.
    SSE::int_v e = _mm_setr_epi32(_e[0], 0, _e[1], 0);
    // Zero must stay zero: adding to the exponent of 0.0 would fabricate a
    // nonzero value.
    e.setZero((v == SSE::double_v::Zero()).dataI());
    const __m128i exponentBits = _mm_slli_epi64(e.data(), 52);
    // NOTE(review): no clamping — exponent overflow/underflow wraps through
    // the sign bit; presumably callers guarantee the result stays in range.
    return _mm_castsi128_pd(_mm_add_epi64(_mm_castpd_si128(v.data()), exponentBits));
}
// Vectorized ldexp: returns v * 2^e by adding e (shifted into the exponent
// field at bit 23) to the bit pattern of v.
inline SSE::float_v ldexp(SSE::float_v::AsArg v,
                          const SimdArray<int, 4, SSE::int_v, 4> &_e)
{
    SSE::int_v e = internal_data(_e);
    // Keep zero inputs exactly zero (see the double overload).
    e.setZero(simd_cast<SSE::int_m>(v == SSE::float_v::Zero()));
    return reinterpret_components_cast<SSE::float_v>(
        reinterpret_components_cast<SSE::int_v>(v) + (e << 23));
}
- #ifdef Vc_IMPL_SSE4_1
// SSE4.1 path: hardware rounding. 0x3 == _MM_FROUND_TO_ZERO (truncate).
inline SSE::double_v trunc(SSE::double_v::AsArg v) { return _mm_round_pd(v.data(), 0x3); }
inline SSE::float_v trunc(SSE::float_v::AsArg v) { return _mm_round_ps(v.data(), 0x3); }
inline SSE::double_v floor(SSE::double_v::AsArg v) { return _mm_floor_pd(v.data()); }
inline SSE::float_v floor(SSE::float_v::AsArg v) { return _mm_floor_ps(v.data()); }
inline SSE::double_v ceil(SSE::double_v::AsArg v) { return _mm_ceil_pd(v.data()); }
inline SSE::float_v ceil(SSE::float_v::AsArg v) { return _mm_ceil_ps(v.data()); }
- #else
// Pre-SSE4.1 trunc for float: round via int conversion, but keep the
// original value where the conversion would be wrong.
inline SSE::Vector<float> trunc(SSE::Vector<float> x)
{
    // cvttps truncates toward zero but is only valid for |x| < 2^31.
    const auto truncated = _mm_cvtepi32_ps(_mm_cvttps_epi32(x.data()));
    // Elements with a biased exponent below 2^23 (0x4b000000) can have a
    // fractional part; larger values (and Inf/NaN, whose exponent field is
    // larger still) are already integral and keep the original bits. The
    // signed compare is fine because the masked exponent is non-negative.
    const auto no_fractional_values = _mm_castsi128_ps(_mm_cmplt_epi32(
        _mm_and_si128(_mm_castps_si128(x.data()), _mm_set1_epi32(0x7f800000u)),
        _mm_set1_epi32(0x4b000000)));
    // Blend: truncated where a fraction was possible, original elsewhere.
    return _mm_or_ps(_mm_andnot_ps(no_fractional_values, x.data()),
                     _mm_and_ps(no_fractional_values, truncated));
}
// Pre-SSE4.1 trunc for double, using the classic add/subtract-2^52 trick.
inline SSE::Vector<double> trunc(SSE::Vector<double> x)
{
    const auto abs_x = Vc::abs(x).data();
    // 2^52: adding and subtracting it rounds |x| to an integer (values
    // >= 2^52 have no fractional bits to begin with).
    const auto min_no_fractional_bits =
        _mm_castsi128_pd(_mm_set1_epi64x(0x4330000000000000ull));
    __m128d truncated =
        _mm_sub_pd(_mm_add_pd(abs_x, min_no_fractional_bits), min_no_fractional_bits);
    // The trick rounds to nearest; subtract 1 where it rounded up so the
    // result is truncated toward zero.
    truncated = _mm_sub_pd(truncated,
                           _mm_and_pd(_mm_cmplt_pd(abs_x, truncated), _mm_set1_pd(1.)));
    // Reattach the original sign (also preserves -0.0).
    return _mm_or_pd(
        _mm_and_pd(_mm_castsi128_pd(_mm_set1_epi64x(0x8000000000000000ull)), x.data()),
        truncated);
}
- template <typename T> inline SSE::Vector<T> floor(SSE::Vector<T> x)
- {
- auto y = trunc(x);
- y(!(y == x) && x < 0) -= 1;
- return y;
- }
- template <typename T> inline SSE::Vector<T> ceil(SSE::Vector<T> x)
- {
- auto y = trunc(x);
- y(!(y == x || x < 0)) += 1;
- return y;
- }
- #endif
// Fused multiply-add: returns a * b + c (single rounding where supported).
template <typename T>
Vc_ALWAYS_INLINE Vector<T, VectorAbi::Sse> fma(Vector<T, VectorAbi::Sse> a,
                                               Vector<T, VectorAbi::Sse> b,
                                               Vector<T, VectorAbi::Sse> c)
{
    // VectorHelper<T>::fma apparently stores the result into its first
    // argument (taken by reference), which is why the local copy `a` is
    // returned — TODO(review): confirm against VectorHelper's signature.
    SSE::VectorHelper<T>::fma(a.data(), b.data(), c.data());
    return a;
}
- }
- #endif
- #ifndef Vc_SSE_SIMD_CAST_CALLER_TCC_
- #define Vc_SSE_SIMD_CAST_CALLER_TCC_
namespace Vc_VERSIONED_NAMESPACE
{
#if Vc_IS_VERSION_1
// Vc 1.x compatibility: explicit construction of an SSE Mask from a mask of
// a different type/width, delegating the conversion to simd_cast. The
// enable_if parameter restricts this to explicitly-convertible mask types.
template <typename T>
template <typename U>
Vc_INTRINSIC Mask<T, VectorAbi::Sse>::Mask(U &&rhs, Common::enable_if_mask_converts_explicitly<T, U>)
    : Mask(Vc::simd_cast<Mask>(std::forward<U>(rhs)))
{
}
#endif
}
- #endif
- #endif
- #if defined(Vc_IMPL_AVX)
- #ifndef VC_AVX_HELPERIMPL_H_
- #define VC_AVX_HELPERIMPL_H_
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
// Forward declarations of the AVX deinterleave implementations (defined
// below): one two-stream overload per supported element/source-type
// combination, plus generic 3- to 8-stream variants.
template <typename A>
inline void deinterleave(AVX2::float_v &, AVX2::float_v &, const float *, A);
template <typename A>
inline void deinterleave(AVX2::float_v &, AVX2::float_v &, const short *, A);
template <typename A>
inline void deinterleave(AVX2::float_v &, AVX2::float_v &, const ushort *, A);
template <typename A>
inline void deinterleave(AVX2::double_v &, AVX2::double_v &, const double *, A);
template <typename A>
inline void deinterleave(AVX2::int_v &, AVX2::int_v &, const int *, A);
template <typename A>
inline void deinterleave(AVX2::int_v &, AVX2::int_v &, const short *, A);
template <typename A>
inline void deinterleave(AVX2::uint_v &, AVX2::uint_v &, const uint *, A);
template <typename A>
inline void deinterleave(AVX2::uint_v &, AVX2::uint_v &, const ushort *, A);
template <typename A>
inline void deinterleave(AVX2::short_v &, AVX2::short_v &, const short *, A);
template <typename A>
inline void deinterleave(AVX2::ushort_v &, AVX2::ushort_v &, const ushort *, A);
// 3- to 8-stream deinterleave declarations (a..h receive strided elements).
template <typename T, typename M, typename A>
Vc_ALWAYS_INLINE_L void deinterleave(AVX2::Vector<T> &Vc_RESTRICT a,
                                     AVX2::Vector<T> &Vc_RESTRICT b,
                                     AVX2::Vector<T> &Vc_RESTRICT c,
                                     const M *Vc_RESTRICT memory,
                                     A align) Vc_ALWAYS_INLINE_R;
template <typename T, typename M, typename A>
Vc_ALWAYS_INLINE_L void deinterleave(AVX2::Vector<T> &Vc_RESTRICT a,
                                     AVX2::Vector<T> &Vc_RESTRICT b,
                                     AVX2::Vector<T> &Vc_RESTRICT c,
                                     AVX2::Vector<T> &Vc_RESTRICT d,
                                     const M *Vc_RESTRICT memory,
                                     A align) Vc_ALWAYS_INLINE_R;
template <typename T, typename M, typename A>
Vc_ALWAYS_INLINE_L void deinterleave(AVX2::Vector<T> &Vc_RESTRICT a,
                                     AVX2::Vector<T> &Vc_RESTRICT b,
                                     AVX2::Vector<T> &Vc_RESTRICT c,
                                     AVX2::Vector<T> &Vc_RESTRICT d,
                                     AVX2::Vector<T> &Vc_RESTRICT e,
                                     const M *Vc_RESTRICT memory,
                                     A align) Vc_ALWAYS_INLINE_R;
template <typename T, typename M, typename A>
Vc_ALWAYS_INLINE_L void deinterleave(
    AVX2::Vector<T> &Vc_RESTRICT a, AVX2::Vector<T> &Vc_RESTRICT b,
    AVX2::Vector<T> &Vc_RESTRICT c, AVX2::Vector<T> &Vc_RESTRICT d,
    AVX2::Vector<T> &Vc_RESTRICT e, AVX2::Vector<T> &Vc_RESTRICT f,
    const M *Vc_RESTRICT memory, A align) Vc_ALWAYS_INLINE_R;
template <typename T, typename M, typename A>
Vc_ALWAYS_INLINE_L void deinterleave(
    AVX2::Vector<T> &Vc_RESTRICT a, AVX2::Vector<T> &Vc_RESTRICT b,
    AVX2::Vector<T> &Vc_RESTRICT c, AVX2::Vector<T> &Vc_RESTRICT d,
    AVX2::Vector<T> &Vc_RESTRICT e, AVX2::Vector<T> &Vc_RESTRICT f,
    AVX2::Vector<T> &Vc_RESTRICT g, AVX2::Vector<T> &Vc_RESTRICT h,
    const M *Vc_RESTRICT memory, A align) Vc_ALWAYS_INLINE_R;
// The AVX prefetch overloads simply forward to the SSE implementations:
// prefetch instructions are independent of vector width.
Vc_ALWAYS_INLINE void prefetchForOneRead(const void *addr, VectorAbi::Avx)
{
    prefetchForOneRead(addr, VectorAbi::Sse());
}
Vc_ALWAYS_INLINE void prefetchForModify(const void *addr, VectorAbi::Avx)
{
    prefetchForModify(addr, VectorAbi::Sse());
}
Vc_ALWAYS_INLINE void prefetchClose(const void *addr, VectorAbi::Avx)
{
    prefetchClose(addr, VectorAbi::Sse());
}
Vc_ALWAYS_INLINE void prefetchMid(const void *addr, VectorAbi::Avx)
{
    prefetchMid(addr, VectorAbi::Sse());
}
Vc_ALWAYS_INLINE void prefetchFar(const void *addr, VectorAbi::Avx)
{
    prefetchFar(addr, VectorAbi::Sse());
}
}
}
- namespace Vc_VERSIONED_NAMESPACE
- {
namespace AVX2
{
// In-register deinterleave of 3 AVX vectors holding interleaved triples
// {a0 b0 c0 a1 b1 c1 ...}: afterwards a, b, c hold every third element.
inline void deinterleave(double_v &Vc_RESTRICT a, double_v &Vc_RESTRICT b, double_v &Vc_RESTRICT c)
{
    // First rearrange 128-bit lanes, then pick alternating 64-bit elements.
    const m256d tmp0 = Mem::shuffle128<X0, Y1>(a.data(), b.data());
    const m256d tmp1 = Mem::shuffle128<X1, Y0>(a.data(), c.data());
    const m256d tmp2 = Mem::shuffle128<X0, Y1>(b.data(), c.data());
    a.data() = Mem::shuffle<X0, Y1, X2, Y3>(tmp0, tmp1);
    b.data() = Mem::shuffle<X1, Y0, X3, Y2>(tmp0, tmp2);
    c.data() = Mem::shuffle<X0, Y1, X2, Y3>(tmp1, tmp2);
}
inline void deinterleave(float_v &Vc_RESTRICT a, float_v &Vc_RESTRICT b, float_v &Vc_RESTRICT c)
{
    // Blend-based gather of every third float, followed by an in-lane
    // permute that restores ascending order.
    const m256 ac0 = Mem::shuffle128<X0, Y0>(a.data(), c.data());
    const m256 ac1 = Mem::shuffle128<X1, Y1>(a.data(), c.data());
    m256 tmp0 = Mem::blend<X0, Y1, X2, X3, Y4, X5, X6, Y7>( ac0, b.data());
    tmp0 = Mem::blend<X0, X1, Y2, X3, X4, Y5, X6, X7>(tmp0, ac1);
    m256 tmp1 = Mem::blend<X0, X1, Y2, X3, X4, Y5, X6, X7>( ac0, b.data());
    tmp1 = Mem::blend<Y0, X1, X2, Y3, X4, X5, Y6, X7>(tmp1, ac1);
    m256 tmp2 = Mem::blend<Y0, X1, X2, Y3, X4, X5, Y6, X7>( ac0, b.data());
    tmp2 = Mem::blend<X0, Y1, X2, X3, Y4, X5, X6, Y7>(tmp2, ac1);
    a.data() = Mem::permute<X0, X3, X2, X1>(tmp0);
    b.data() = Mem::permute<X1, X0, X3, X2>(tmp1);
    c.data() = Mem::permute<X2, X1, X0, X3>(tmp2);
}
inline void deinterleave(int_v &Vc_RESTRICT a, int_v &Vc_RESTRICT b, int_v &Vc_RESTRICT c)
{
    // Same lane movements as for float; the bit patterns are just moved.
    deinterleave(reinterpret_cast<float_v &>(a), reinterpret_cast<float_v &>(b),
                 reinterpret_cast<float_v &>(c));
}
inline void deinterleave(uint_v &Vc_RESTRICT a, uint_v &Vc_RESTRICT b, uint_v &Vc_RESTRICT c)
{
    deinterleave(reinterpret_cast<float_v &>(a), reinterpret_cast<float_v &>(b),
                 reinterpret_cast<float_v &>(c));
}
// NOTE(review): no-op stub — the 3-stream deinterleave for short is not
// implemented here; the arguments are returned unchanged.
inline void deinterleave(Vector<short> &Vc_RESTRICT , Vector<short> &Vc_RESTRICT ,
                         Vector<short> &Vc_RESTRICT )
{
    return;
}
// NOTE(review): delegates to the short overload above, which is a stub, so
// this is effectively a no-op as well.
inline void deinterleave(Vector<unsigned short> &Vc_RESTRICT a, Vector<unsigned short> &Vc_RESTRICT b,
                         Vector<unsigned short> &Vc_RESTRICT c)
{
    deinterleave(reinterpret_cast<Vector<short> &>(a), reinterpret_cast<Vector<short> &>(b),
                 reinterpret_cast<Vector<short> &>(c));
}
// Two-stream deinterleave: a and b hold interleaved pairs; afterwards a has
// the even and b the odd elements.
inline void deinterleave(Vector<float> &a, Vector<float> &b)
{
    // Cross-lane 128-bit permutes first, then the in-lane 8x2 transpose.
    const m256 tmp0 = Reg::permute128<Y0, X0>(a.data(), b.data());
    const m256 tmp1 = Reg::permute128<Y1, X1>(a.data(), b.data());
    const m256 tmp2 = _mm256_unpacklo_ps(tmp0, tmp1);
    const m256 tmp3 = _mm256_unpackhi_ps(tmp0, tmp1);
    a.data() = _mm256_unpacklo_ps(tmp2, tmp3);
    b.data() = _mm256_unpackhi_ps(tmp2, tmp3);
}
inline void deinterleave(Vector<short> &a,
                         Vector<short> &b)
{
    // Lane interleave followed by three unpack rounds (16x2 transpose).
    auto v0 = Mem::shuffle128<X0, Y0>(a.data(), b.data());
    auto v1 = Mem::shuffle128<X1, Y1>(a.data(), b.data());
    auto v2 = AVX::unpacklo_epi16(v0, v1);
    auto v3 = AVX::unpackhi_epi16(v0, v1);
    v0 = AVX::unpacklo_epi16(v2, v3);
    v1 = AVX::unpackhi_epi16(v2, v3);
    a.data() = AVX::unpacklo_epi16(v0, v1);
    b.data() = AVX::unpackhi_epi16(v0, v1);
}
inline void deinterleave(Vector<ushort> &a, Vector<ushort> &b)
{
    // Identical network to the signed short overload above.
    auto v0 = Mem::shuffle128<X0, Y0>(a.data(), b.data());
    auto v1 = Mem::shuffle128<X1, Y1>(a.data(), b.data());
    auto v2 = AVX::unpacklo_epi16(v0, v1);
    auto v3 = AVX::unpackhi_epi16(v0, v1);
    v0 = AVX::unpacklo_epi16(v2, v3);
    v1 = AVX::unpackhi_epi16(v2, v3);
    a.data() = AVX::unpacklo_epi16(v0, v1);
    b.data() = AVX::unpackhi_epi16(v0, v1);
}
}
- namespace Detail
- {
// Load 2 * Size interleaved floats and split even/odd elements into a/b.
template <typename Flags>
inline void deinterleave(AVX2::float_v &a, AVX2::float_v &b, const float *m, Flags align)
{
    a.load(m, align);
    b.load(m + AVX2::float_v::Size, align);
    Vc::AVX2::deinterleave(a, b);
}
// Load 16 interleaved shorts once and widen: even 16-bit lanes are
// sign-extended via shift-left/arithmetic-shift-right, odd lanes via a
// plain arithmetic shift, per 128-bit half, then converted to float.
template <typename Flags>
inline void deinterleave(AVX2::float_v &a, AVX2::float_v &b, const short *m, Flags f)
{
    using namespace Vc::AVX2;
    const auto tmp = Detail::load32(m, f);
    a.data() =
        _mm256_cvtepi32_ps(concat(_mm_srai_epi32(_mm_slli_epi32(lo128(tmp), 16), 16),
                                  _mm_srai_epi32(_mm_slli_epi32(hi128(tmp), 16), 16)));
    b.data() = _mm256_cvtepi32_ps(
        concat(_mm_srai_epi32(lo128(tmp), 16), _mm_srai_epi32(hi128(tmp), 16)));
}
// Unsigned-short source: zero-extend instead of sign-extend. The 0xaa blend
// with zero clears the odd 16-bit halves (zero-extending the even lanes);
// the odd lanes use a logical right shift.
template <typename Flags>
inline void deinterleave(AVX2::float_v &a, AVX2::float_v &b, const unsigned short *m, Flags f)
{
    using namespace Vc::AVX2;
    const auto tmp = Detail::load32(m, f);
    a.data() = _mm256_cvtepi32_ps(
        concat(_mm_blend_epi16(lo128(tmp), _mm_setzero_si128(), 0xaa),
               _mm_blend_epi16(hi128(tmp), _mm_setzero_si128(), 0xaa)));
    b.data() = _mm256_cvtepi32_ps(
        concat(_mm_srli_epi32(lo128(tmp), 16), _mm_srli_epi32(hi128(tmp), 16)));
}
// Load 2 * Size interleaved doubles and split even/odd elements into a/b:
// lane shuffle first, then unpacklo/unpackhi within the lanes.
template <typename Flags>
inline void deinterleave(AVX2::double_v &a, AVX2::double_v &b, const double *m, Flags align)
{
    using namespace Vc::AVX2;
    a.load(m, align);
    b.load(m + AVX2::double_v::Size, align);
    m256d tmp0 = Mem::shuffle128<Vc::X0, Vc::Y0>(a.data(), b.data());
    m256d tmp1 = Mem::shuffle128<Vc::X1, Vc::Y1>(a.data(), b.data());
    a.data() = _mm256_unpacklo_pd(tmp0, tmp1);
    b.data() = _mm256_unpackhi_pd(tmp0, tmp1);
}
// Load 2 * Size interleaved ints and split even/odd elements into a/b.
// The shuffles are done through the float domain (bitwise-equivalent moves).
template <typename Flags>
inline void deinterleave(AVX2::int_v &a, AVX2::int_v &b, const int *m, Flags align)
{
    using namespace AVX;
    a.load(m, align);
    b.load(m + AVX2::int_v::Size, align);
    const m256 tmp0 = avx_cast<m256>(Mem::shuffle128<Vc::X0, Vc::Y0>(a.data(), b.data()));
    const m256 tmp1 = avx_cast<m256>(Mem::shuffle128<Vc::X1, Vc::Y1>(a.data(), b.data()));
    const m256 tmp2 = _mm256_unpacklo_ps(tmp0, tmp1);
    const m256 tmp3 = _mm256_unpackhi_ps(tmp0, tmp1);
    a.data() = avx_cast<m256i>(_mm256_unpacklo_ps(tmp2, tmp3));
    b.data() = avx_cast<m256i>(_mm256_unpackhi_ps(tmp2, tmp3));
}
// Load 16 interleaved shorts and sign-extend the even (a) and odd (b)
// 16-bit lanes into two int vectors, one 128-bit half at a time.
template <typename Flags>
inline void deinterleave(AVX2::int_v &a, AVX2::int_v &b, const short *m, Flags f)
{
    using namespace Vc::AVX;
    const AVX2::short_v tmp0(m, f);
    const m256i tmp = tmp0.data();
    a.data() = concat(
        _mm_srai_epi32(_mm_slli_epi32(lo128(tmp), 16), 16),
        _mm_srai_epi32(_mm_slli_epi32(hi128(tmp), 16), 16));
    b.data() = concat(
        _mm_srai_epi32(lo128(tmp), 16),
        _mm_srai_epi32(hi128(tmp), 16));
}
// Unsigned-int variant: identical lane movement as the signed overload —
// unpack/shuffle are sign-agnostic bit moves through the float domain.
template <typename Flags>
inline void deinterleave(AVX2::uint_v &a, AVX2::uint_v &b, const unsigned int *m, Flags align)
{
    using namespace AVX;
    a.load(m, align);
    b.load(m + AVX2::uint_v::Size, align);
    const m256 tmp0 = avx_cast<m256>(Mem::shuffle128<Vc::X0, Vc::Y0>(a.data(), b.data()));
    const m256 tmp1 = avx_cast<m256>(Mem::shuffle128<Vc::X1, Vc::Y1>(a.data(), b.data()));
    const m256 tmp2 = _mm256_unpacklo_ps(tmp0, tmp1);
    const m256 tmp3 = _mm256_unpackhi_ps(tmp0, tmp1);
    a.data() = avx_cast<m256i>(_mm256_unpacklo_ps(tmp2, tmp3));
    b.data() = avx_cast<m256i>(_mm256_unpackhi_ps(tmp2, tmp3));
}
- template <typename Flags>
- inline void deinterleave(AVX2::uint_v &a, AVX2::uint_v &b, const unsigned short *m, Flags f)
- {
- using namespace Vc::AVX;
- const AVX2::ushort_v tmp0(m, f);
- const m256i tmp = tmp0.data();
- a.data() = concat(
- _mm_srai_epi32(_mm_slli_epi32(lo128(tmp), 16), 16),
- _mm_srai_epi32(_mm_slli_epi32(hi128(tmp), 16), 16));
- b.data() = concat(
- _mm_srai_epi32(lo128(tmp), 16),
- _mm_srai_epi32(hi128(tmp), 16));
- }
// Load 2 * Size interleaved shorts and split even/odd elements into a/b.
template <typename Flags>
inline void deinterleave(AVX2::short_v &a, AVX2::short_v &b, const short *m, Flags align)
{
    a.load(m, align);
    b.load(m + AVX2::short_v::Size, align);
    Vc::AVX2::deinterleave(a, b);
}
// Load 2 * Size interleaved unsigned shorts and split even/odd into a/b.
template <typename Flags>
inline void deinterleave(AVX2::ushort_v &a, AVX2::ushort_v &b, const unsigned short *m, Flags align)
{
    a.load(m, align);
    b.load(m + AVX2::ushort_v::Size, align);
    Vc::AVX2::deinterleave(a, b);
}
// Generic 3-stream deinterleave: load three consecutive vectors of
// interleaved triples and separate them in registers. Note that the
// in-register AVX2::deinterleave for short is a no-op stub.
template <typename T, typename M, typename Flags>
Vc_ALWAYS_INLINE void deinterleave(AVX2::Vector<T> &Vc_RESTRICT a,
                                   AVX2::Vector<T> &Vc_RESTRICT b,
                                   AVX2::Vector<T> &Vc_RESTRICT c,
                                   const M *Vc_RESTRICT memory, Flags align)
{
    using V = AVX2::Vector<T>;
    a.load(&memory[0 * V::Size], align);
    b.load(&memory[1 * V::Size], align);
    c.load(&memory[2 * V::Size], align);
    Vc::AVX2::deinterleave(a, b, c);
}
- }
- }
- #endif
- #ifndef VC_AVX_MATH_H_
- #define VC_AVX_MATH_H_
- namespace Vc_VERSIONED_NAMESPACE
- {
#ifdef Vc_IMPL_AVX2
// Element-wise min/max for the AVX2 integer vector types.
Vc_ALWAYS_INLINE AVX2::int_v min(const AVX2::int_v &x, const AVX2::int_v &y) { return _mm256_min_epi32(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::uint_v min(const AVX2::uint_v &x, const AVX2::uint_v &y) { return _mm256_min_epu32(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::short_v min(const AVX2::short_v &x, const AVX2::short_v &y) { return _mm256_min_epi16(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::ushort_v min(const AVX2::ushort_v &x, const AVX2::ushort_v &y) { return _mm256_min_epu16(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::int_v max(const AVX2::int_v &x, const AVX2::int_v &y) { return _mm256_max_epi32(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::uint_v max(const AVX2::uint_v &x, const AVX2::uint_v &y) { return _mm256_max_epu32(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::short_v max(const AVX2::short_v &x, const AVX2::short_v &y) { return _mm256_max_epi16(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::ushort_v max(const AVX2::ushort_v &x, const AVX2::ushort_v &y) { return _mm256_max_epu16(x.data(), y.data()); }
#endif
// Floating-point min/max are available with plain AVX.
Vc_ALWAYS_INLINE AVX2::float_v min(const AVX2::float_v &x, const AVX2::float_v &y) { return _mm256_min_ps(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::double_v min(const AVX2::double_v &x, const AVX2::double_v &y) { return _mm256_min_pd(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::float_v max(const AVX2::float_v &x, const AVX2::float_v &y) { return _mm256_max_ps(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::double_v max(const AVX2::double_v &x, const AVX2::double_v &y) { return _mm256_max_pd(x.data(), y.data()); }
// Element-wise math wrappers delegating to the per-type AVX VectorHelper.
template <typename T>
Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> sqrt(const AVX2::Vector<T> &x)
{
    return AVX::VectorHelper<T>::sqrt(x.data());
}
// Approximate reciprocal square root (precision is ISA-defined).
template <typename T>
Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> rsqrt(const AVX2::Vector<T> &x)
{
    return AVX::VectorHelper<T>::rsqrt(x.data());
}
// Approximate reciprocal (precision is ISA-defined).
template <typename T>
Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> reciprocal(const AVX2::Vector<T> &x)
{
    return AVX::VectorHelper<T>::reciprocal(x.data());
}
template <typename T>
Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> round(const AVX2::Vector<T> &x)
{
    return AVX::VectorHelper<T>::round(x.data());
}
- Vc_INTRINSIC Vc_CONST AVX2::double_v abs(AVX2::double_v x)
- {
- return Detail::and_(x.data(), AVX::setabsmask_pd());
- }
- Vc_INTRINSIC Vc_CONST AVX2::float_v abs(AVX2::float_v x)
- {
- return Detail::and_(x.data(), AVX::setabsmask_ps());
- }
#ifdef Vc_IMPL_AVX2
// Integer absolute value via the dedicated AVX2 instructions. Note the
// usual caveat of these instructions: INT_MIN/SHRT_MIN map to themselves.
Vc_INTRINSIC Vc_CONST AVX2::int_v abs(AVX2::int_v x)
{
    return _mm256_abs_epi32(x.data());
}
Vc_INTRINSIC Vc_CONST AVX2::short_v abs(AVX2::short_v x)
{
    return _mm256_abs_epi16(x.data());
}
#endif
// x is finite iff both x and 0 * x are non-NaN: 0 * x is NaN exactly for
// x = +-Inf or NaN, and cmpord is true only where both operands are ordered.
Vc_ALWAYS_INLINE Vc_PURE AVX2::double_m isfinite(const AVX2::double_v &x)
{
    return AVX::cmpord_pd(x.data(), _mm256_mul_pd(Detail::zero<__m256d>(), x.data()));
}
Vc_ALWAYS_INLINE Vc_PURE AVX2::float_m isfinite(const AVX2::float_v &x)
{
    return AVX::cmpord_ps(x.data(), _mm256_mul_ps(Detail::zero<__m256>(), x.data()));
}
// x is +-Inf iff the bit pattern of |x| equals the exponent-mask constant
// (c_log<T>::d(1), all exponent bits set, mantissa zero — the encoding of
// +Inf). Integer equality avoids any NaN-compare pitfalls.
Vc_ALWAYS_INLINE Vc_PURE AVX2::double_m isinf(const AVX2::double_v &x)
{
    return _mm256_castsi256_pd(AVX::cmpeq_epi64(
        _mm256_castpd_si256(abs(x).data()),
        _mm256_castpd_si256(Detail::avx_broadcast(AVX::c_log<double>::d(1)))));
}
Vc_ALWAYS_INLINE Vc_PURE AVX2::float_m isinf(const AVX2::float_v &x)
{
    return _mm256_castsi256_ps(
        AVX::cmpeq_epi32(_mm256_castps_si256(abs(x).data()),
                         _mm256_castps_si256(Detail::avx_broadcast(AVX::c_log<float>::d(1)))));
}
// NaN is the only value that compares unordered with itself.
Vc_ALWAYS_INLINE Vc_PURE AVX2::double_m isnan(const AVX2::double_v &x)
{
    return AVX::cmpunord_pd(x.data(), x.data());
}
Vc_ALWAYS_INLINE Vc_PURE AVX2::float_m isnan(const AVX2::float_v &x)
{
    return AVX::cmpunord_ps(x.data(), x.data());
}
// Returns a value with the magnitude of `mag` and the sign of `sign`
// (vectorized std::copysign): sign bit from `sign` OR-ed with all
// non-sign bits of `mag`.
Vc_INTRINSIC Vc_CONST AVX2::float_v copysign(AVX2::float_v mag, AVX2::float_v sign)
{
    return _mm256_or_ps(_mm256_and_ps(sign.data(), AVX::setsignmask_ps()),
                        _mm256_and_ps(mag.data(), AVX::setabsmask_ps()));
}
Vc_INTRINSIC Vc_CONST AVX2::double_v copysign(AVX2::double_v::AsArg mag,
                                              AVX2::double_v::AsArg sign)
{
    return _mm256_or_pd(_mm256_and_pd(sign.data(), AVX::setsignmask_pd()),
                        _mm256_and_pd(mag.data(), AVX::setabsmask_pd()));
}
// Vectorized frexp: decomposes v into a mantissa in +-[0.5, 1) (returned)
// and an integral exponent (*e) such that v == mantissa * 2^exponent.
inline AVX2::double_v frexp(AVX2::double_v::AsArg v, SimdArray<int, 4> *e)
{
    const __m256d exponentBits = AVX::Const<double>::exponentMask().dataD();
    const __m256d exponentPart = _mm256_and_pd(v.data(), exponentBits);
    // Process the two 128-bit halves separately: shift the biased exponent
    // down from bit 52 and subtract 0x3fe (bias - 1).
    auto lo = AVX::avx_cast<__m128i>(AVX::lo128(exponentPart));
    auto hi = AVX::avx_cast<__m128i>(AVX::hi128(exponentPart));
    lo = _mm_sub_epi32(_mm_srli_epi64(lo, 52), _mm_set1_epi64x(0x3fe));
    hi = _mm_sub_epi32(_mm_srli_epi64(hi, 52), _mm_set1_epi64x(0x3fe));
    // Gather the four valid 32-bit lanes (0 and 2 of each half).
    SSE::int_v exponent = Mem::shuffle<X0, X2, Y0, Y2>(lo, hi);
    // Set the exponent field to all-ones, then mask down to the exponent
    // of 0.5, keeping sign and mantissa.
    const __m256d exponentMaximized = _mm256_or_pd(v.data(), exponentBits);
    AVX2::double_v ret =
        _mm256_and_pd(exponentMaximized,
                      _mm256_broadcast_sd(reinterpret_cast<const double *>(&AVX::c_general::frexpMask)));
    const double_m zeroMask = v == AVX2::double_v::Zero();
    // NaN, +-Inf and +-0 pass through unchanged, with exponent 0 for zero.
    ret(isnan(v) || !isfinite(v) || zeroMask) = v;
    exponent.setZero(simd_cast<SSE::int_m>(zeroMask));
    internal_data(*e) = exponent;
    return ret;
}
- #ifdef Vc_IMPL_AVX2
// 8-wide frexp over a pair of AVX double vectors: same decomposition as the
// 4-wide overload, with the exponents of both halves merged into one
// 8-element int vector.
inline SimdArray<double, 8> frexp(const SimdArray<double, 8> &v, SimdArray<int, 8> *e)
{
    const __m256d exponentBits = AVX::Const<double>::exponentMask().dataD();
    const __m256d w[2] = {internal_data(internal_data0(v)).data(),
                          internal_data(internal_data1(v)).data()};
    const __m256i exponentPart[2] = {
        _mm256_castpd_si256(_mm256_and_pd(w[0], exponentBits)),
        _mm256_castpd_si256(_mm256_and_pd(w[1], exponentBits))};
    // Shift the biased exponents down from bit 52 and remove bias - 1.
    const __m256i lo = _mm256_sub_epi32(_mm256_srli_epi64(exponentPart[0], 52),
                                        _mm256_set1_epi32(0x3fe));
    const __m256i hi = _mm256_sub_epi32(_mm256_srli_epi64(exponentPart[1], 52),
                                        _mm256_set1_epi32(0x3fe));
    // Interleave/unpack network compacts the eight valid 32-bit lanes
    // (even lanes of lo and hi) into one contiguous vector.
    const __m256i a = _mm256_unpacklo_epi32(lo, hi);
    const __m256i b = _mm256_unpackhi_epi32(lo, hi);
    const __m256i tmp = _mm256_unpacklo_epi32(a, b);
    const __m256i exponent =
        AVX::concat(_mm_unpacklo_epi64(AVX::lo128(tmp), AVX::hi128(tmp)),
                    _mm_unpackhi_epi64(AVX::lo128(tmp), AVX::hi128(tmp)));
    // Force exponent fields to all-ones, then mask down to exponent 0.5.
    const __m256d exponentMaximized[2] = {_mm256_or_pd(w[0], exponentBits),
                                          _mm256_or_pd(w[1], exponentBits)};
    const auto frexpMask =
        _mm256_broadcast_sd(reinterpret_cast<const double *>(&AVX::c_general::frexpMask));
    fixed_size_simd<double, 8> ret = {
        fixed_size_simd<double, 4>(
            AVX::double_v(_mm256_and_pd(exponentMaximized[0], frexpMask))),
        fixed_size_simd<double, 4>(
            AVX::double_v(_mm256_and_pd(exponentMaximized[1], frexpMask)))};
    const auto zeroMask = v == v.Zero();
    // NaN, +-Inf and +-0 pass through unchanged; zero lanes get exponent 0
    // via the andnot with the zero mask.
    ret(isnan(v) || !isfinite(v) || zeroMask) = v;
    internal_data(*e) =
        Detail::andnot_(simd_cast<AVX2::int_m>(zeroMask).dataI(), exponent);
    return ret;
}
- #endif
namespace Detail
{
// Extracts the unbiased binary exponent of each float from e, which holds
// the already-masked exponent bits (float bit patterns in a __m256).
// 23 is the float mantissa width; subtracting 0x7e (126) biases the result
// so that frexp's mantissa lies in [0.5, 1).
Vc_INTRINSIC AVX2::float_v::IndexType extractExponent(__m256 e)
{
    SimdArray<uint, float_v::Size> exponentPart;
    const auto ee = AVX::avx_cast<__m256i>(e);
#ifdef Vc_IMPL_AVX2
    exponentPart = AVX2::uint_v(ee);
#else
    // Without AVX2 integer vectors, operate on the two SSE halves.
    internal_data(internal_data0(exponentPart)) = AVX::lo128(ee);
    internal_data(internal_data1(exponentPart)) = AVX::hi128(ee);
#endif
    return (exponentPart >> 23) - 0x7e;
}
}
// frexp for eight packed floats: returns the mantissa in [0.5, 1) and
// stores each element's exponent into *e. NaN, infinities and zeros are
// returned unchanged, and the exponent for zero inputs is set to 0.
inline AVX2::float_v frexp(AVX2::float_v::AsArg v, SimdArray<int, 8> *e)
{
    using namespace Detail;
    using namespace AVX2;
    const __m256 exponentBits = Const<float>::exponentMask().data();
    *e = extractExponent(and_(v.data(), exponentBits));
    // Set all exponent bits, then AND with 0xbf7fffff: this forces the
    // 8-bit exponent field to 0x7e (the exponent of 0.5) while keeping the
    // sign bit and the mantissa.
    const __m256 exponentMaximized = or_(v.data(), exponentBits);
    AVX2::float_v ret = _mm256_and_ps(exponentMaximized, avx_cast<__m256>(set1_epi32(0xbf7fffffu)));
    ret(isnan(v) || !isfinite(v) || v == AVX2::float_v::Zero()) = v;
    e->setZero(simd_cast<decltype(*e == *e)>(v == AVX2::float_v::Zero()));
    return ret;
}
// ldexp for four packed doubles: computes v * 2^e by adding e directly to
// the biased exponent field of each double's bit pattern. e is zeroed for
// zero inputs first, so +-0 is preserved. No overflow handling of the
// exponent field is performed.
inline AVX2::double_v ldexp(AVX2::double_v::AsArg v, const SimdArray<int, 4> &_e)
{
    SSE::int_v e = internal_data(_e);
    e.setZero(simd_cast<SSE::int_m>(v == AVX2::double_v::Zero()));
    // Widen the four 32-bit exponents to 64-bit lanes and shift them into
    // the double's exponent position (bit 52 upwards).
    const __m256i exponentBits =
        AVX::concat(_mm_slli_epi64(_mm_unpacklo_epi32(e.data(), e.data()), 52),
                    _mm_slli_epi64(_mm_unpackhi_epi32(e.data(), e.data()), 52));
    return AVX::avx_cast<__m256d>(
        AVX::add_epi64(AVX::avx_cast<__m256i>(v.data()), exponentBits));
}
// ldexp for eight packed floats: computes v * 2^e by shifting e into the
// float exponent position (bit 23) and adding it to v's bit pattern. e is
// zeroed for zero inputs first, so +-0 is preserved.
inline AVX2::float_v ldexp(AVX2::float_v::AsArg v, SimdArray<int, 8> e)
{
    e.setZero(simd_cast<decltype(e == e)>(v == AVX2::float_v::Zero()));
    e <<= 23;
#ifdef Vc_IMPL_AVX2
    // With AVX2 the 8 ints live in a single 256-bit vector; add the two
    // 128-bit halves to the corresponding halves of v.
    return {AVX::avx_cast<__m256>(
        AVX::concat(_mm_add_epi32(AVX::avx_cast<__m128i>(AVX::lo128(v.data())),
                                  AVX::lo128(internal_data(e).data())),
                    _mm_add_epi32(AVX::avx_cast<__m128i>(AVX::hi128(v.data())),
                                  AVX::hi128(internal_data(e).data()))))};
#else
    // Without AVX2 the SimdArray is stored as two SSE int vectors.
    return {AVX::avx_cast<__m256>(
        AVX::concat(_mm_add_epi32(AVX::avx_cast<__m128i>(AVX::lo128(v.data())),
                                  internal_data(internal_data0(e)).data()),
                    _mm_add_epi32(AVX::avx_cast<__m128i>(AVX::hi128(v.data())),
                                  internal_data(internal_data1(e)).data())))};
#endif
}
- Vc_ALWAYS_INLINE AVX2::float_v trunc(AVX2::float_v::AsArg v)
- {
- return _mm256_round_ps(v.data(), 0x3);
- }
- Vc_ALWAYS_INLINE AVX2::double_v trunc(AVX2::double_v::AsArg v)
- {
- return _mm256_round_pd(v.data(), 0x3);
- }
- Vc_ALWAYS_INLINE AVX2::float_v floor(AVX2::float_v::AsArg v)
- {
- return _mm256_floor_ps(v.data());
- }
- Vc_ALWAYS_INLINE AVX2::double_v floor(AVX2::double_v::AsArg v)
- {
- return _mm256_floor_pd(v.data());
- }
- Vc_ALWAYS_INLINE AVX2::float_v ceil(AVX2::float_v::AsArg v)
- {
- return _mm256_ceil_ps(v.data());
- }
- Vc_ALWAYS_INLINE AVX2::double_v ceil(AVX2::double_v::AsArg v)
- {
- return _mm256_ceil_pd(v.data());
- }
// Per-lane a * b + c for AVX vectors, forwarded to Detail::fma for the
// element type T (which uses a fused instruction where available —
// NOTE(review): assumed, confirm against Detail::fma's definition).
template <typename T>
Vc_ALWAYS_INLINE Vector<T, VectorAbi::Avx> fma(Vector<T, VectorAbi::Avx> a,
                                               Vector<T, VectorAbi::Avx> b,
                                               Vector<T, VectorAbi::Avx> c)
{
    return Detail::fma(a.data(), b.data(), c.data(), T());
}
- }
- #endif
- #ifndef Vc_AVX_SIMD_CAST_CALLER_TCC_
- #define Vc_AVX_SIMD_CAST_CALLER_TCC_
- namespace Vc_VERSIONED_NAMESPACE
- {
- #if Vc_IS_VERSION_1
// Vc 1.x compatibility: generic converting constructor that funnels any
// compatible source type through simd_cast into this vector.
template <typename T>
template <typename U, typename>
Vc_INTRINSIC Vector<T, VectorAbi::Avx>::Vector(U &&x)
    : d(simd_cast<Vector>(std::forward<U>(x)).data())
{
}
// Vc 1.x compatibility: explicit converting mask constructor, enabled only
// for mask types that convert explicitly.
template <typename T>
template <typename U>
Vc_INTRINSIC Mask<T, VectorAbi::Avx>::Mask(U &&rhs,
                                           Common::enable_if_mask_converts_explicitly<T, U>)
    : Mask(simd_cast<Mask>(std::forward<U>(rhs)))
{
}
- #endif
- }
- #endif
- #endif
- #ifndef VC_COMMON_MATH_H_
- #define VC_COMMON_MATH_H_
- #define Vc_COMMON_MATH_H_INTERNAL 1
- #ifndef VC_COMMON_TRIGONOMETRIC_H_
- #define VC_COMMON_TRIGONOMETRIC_H_
#ifdef Vc_HAVE_LIBMVEC
// Declarations of glibc's libmvec vectorized sin/cos. The names follow the
// x86-64 vector function ABI mangling: 'b' = SSE, 'd' = AVX2 variant,
// N<k> = lane count, trailing 'f' = single precision.
extern "C" {
__m128 _ZGVbN4v_sinf(__m128);
__m128d _ZGVbN2v_sin(__m128d);
__m128 _ZGVbN4v_cosf(__m128);
__m128d _ZGVbN2v_cos(__m128d);
__m256 _ZGVdN8v_sinf(__m256);
__m256d _ZGVdN4v_sin(__m256d);
__m256 _ZGVdN8v_cosf(__m256);
__m256d _ZGVdN4v_cos(__m256d);
}
#endif
- namespace Vc_VERSIONED_NAMESPACE
- {
namespace Detail
{
// Maps an implementation enum to the one whose trigonometric code is
// actually compiled; SSE4.2 reuses the SSE4.1 implementation.
template<Vc::Implementation Impl> struct MapImpl { enum Dummy { Value = Impl }; };
template<> struct MapImpl<Vc::SSE42Impl> { enum Dummy { Value = MapImpl<Vc::SSE41Impl>::Value }; };
// Implementation tag used to select the trigonometric specialization,
// augmented with XOP/FMA4 instruction flags when both are enabled.
template<Vc::Implementation Impl> using TrigonometricImplementation =
ImplementationT<MapImpl<Impl>::Value
#if defined(Vc_IMPL_XOP) && defined(Vc_IMPL_FMA4)
+ Vc::XopInstructions
+ Vc::Fma4Instructions
#endif
>;
}
namespace Common
{
// Interface of the trigonometric implementations. The member templates are
// only declared here; the definitions are provided per implementation tag
// (compiled separately for each instruction set).
template<typename Impl> struct Trigonometric
{
    template<typename T> static T sin(const T &_x);
    template<typename T> static T cos(const T &_x);
    // Computes sine and cosine of _x in one pass into *_sin / *_cos.
    template<typename T> static void sincos(const T &_x, T *_sin, T *_cos);
    template<typename T> static T asin (const T &_x);
    template<typename T> static T atan (const T &_x);
    // Two-argument arctangent of y/x, quadrant-aware.
    template<typename T> static T atan2(const T &y, const T &x);
};
}
- #if defined Vc_IMPL_SSE || defined DOXYGEN
namespace Detail
{
// Picks the trigonometric implementation matching the ABI: SSE4.2 code for
// SSE vectors, AVX code for AVX vectors, scalar otherwise.
template <typename T, typename Abi>
using Trig = Common::Trigonometric<Detail::TrigonometricImplementation<
    (std::is_same<Abi, VectorAbi::Sse>::value
         ? SSE42Impl
         : std::is_same<Abi, VectorAbi::Avx>::value ? AVXImpl : ScalarImpl)>>;
}
#ifdef Vc_HAVE_LIBMVEC
// Overload-resolution shims mapping each register width onto the matching
// libmvec entry point.
Vc_INTRINSIC __m128 sin_dispatch(__m128 x) { return ::_ZGVbN4v_sinf(x); }
Vc_INTRINSIC __m128d sin_dispatch(__m128d x) { return ::_ZGVbN2v_sin (x); }
Vc_INTRINSIC __m128 cos_dispatch(__m128 x) { return ::_ZGVbN4v_cosf(x); }
Vc_INTRINSIC __m128d cos_dispatch(__m128d x) { return ::_ZGVbN2v_cos (x); }
#ifdef Vc_IMPL_AVX
Vc_INTRINSIC __m256 sin_dispatch(__m256 x) { return ::_ZGVdN8v_sinf(x); }
Vc_INTRINSIC __m256d sin_dispatch(__m256d x) { return ::_ZGVdN4v_sin (x); }
Vc_INTRINSIC __m256 cos_dispatch(__m256 x) { return ::_ZGVdN8v_cosf(x); }
Vc_INTRINSIC __m256d cos_dispatch(__m256d x) { return ::_ZGVdN4v_cos (x); }
#endif
// With libmvec available, sin/cos forward to glibc's vectorized routines.
template <typename T, typename Abi>
Vc_INTRINSIC Vector<T, detail::not_fixed_size_abi<Abi>> sin(const Vector<T, Abi> &x)
{
    return sin_dispatch(x.data());
}
template <typename T, typename Abi>
Vc_INTRINSIC Vector<T, detail::not_fixed_size_abi<Abi>> cos(const Vector<T, Abi> &x)
{
    return cos_dispatch(x.data());
}
#else
// Without libmvec, use Vc's own per-implementation trigonometric code.
template <typename T, typename Abi>
Vc_INTRINSIC Vector<T, detail::not_fixed_size_abi<Abi>> sin(const Vector<T, Abi> &x)
{
    return Detail::Trig<T, Abi>::sin(x);
}
template <typename T, typename Abi>
Vc_INTRINSIC Vector<T, detail::not_fixed_size_abi<Abi>> cos(const Vector<T, Abi> &x)
{
    return Detail::Trig<T, Abi>::cos(x);
}
#endif
// Per-lane arcsine, forwarded to the selected implementation.
template <typename T, typename Abi>
Vc_INTRINSIC Vector<T, detail::not_fixed_size_abi<Abi>> asin(const Vector<T, Abi> &x)
{
    return Detail::Trig<T, Abi>::asin(x);
}
// Per-lane arctangent.
template <typename T, typename Abi>
Vc_INTRINSIC Vector<T, detail::not_fixed_size_abi<Abi>> atan(const Vector<T, Abi> &x)
{
    return Detail::Trig<T, Abi>::atan(x);
}
// Per-lane quadrant-aware arctangent of y/x.
template <typename T, typename Abi>
Vc_INTRINSIC Vector<T, detail::not_fixed_size_abi<Abi>> atan2(const Vector<T, Abi> &y,
                                                              const Vector<T, Abi> &x)
{
    return Detail::Trig<T, Abi>::atan2(y, x);
}
// Computes sine and cosine of x in one pass; results go to *sin and *cos.
// (The not_fixed_size_abi guard on the sin parameter keeps fixed-size
// vectors out of overload resolution.)
template <typename T, typename Abi>
Vc_INTRINSIC void sincos(const Vector<T, Abi> &x,
                         Vector<T, detail::not_fixed_size_abi<Abi>> *sin,
                         Vector<T, Abi> *cos)
{
    Detail::Trig<T, Abi>::sincos(x, sin, cos);
}
- #endif
- }
- #endif
- #ifndef VC_COMMON_CONST_H_
- #define VC_COMMON_CONST_H_
- #include <type_traits>
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace Detail
- {
// Compile-time 2^exponent as a double. The bool tag selects the recursion
// direction: true = negative exponent (multiply by 0.5), false =
// non-negative (multiply by 2.0). Specializations at 0 and +-32/+-64
// terminate and shortcut the recursion to keep instantiation depth small.
template <int exponent> constexpr double exponentToFloat(std::integral_constant<bool, true>);
template <int exponent> constexpr double exponentToFloat(std::integral_constant<bool, false>);
template <> constexpr double exponentToFloat<0>(std::integral_constant<bool, true>)
{
    return 1.;
}
template <> constexpr double exponentToFloat<0>(std::integral_constant<bool, false>)
{
    return 1.;
}
template <> constexpr double exponentToFloat<-32>(std::integral_constant<bool, true>)
{
    return 1. / (65536. * 65536.); // 2^-32
}
template <> constexpr double exponentToFloat<32>(std::integral_constant<bool, false>)
{
    return 65536. * 65536.; // 2^32
}
template <> constexpr double exponentToFloat<-64>(std::integral_constant<bool, true>)
{
    return 1. / (65536. * 65536. * 65536. * 65536.); // 2^-64
}
template <> constexpr double exponentToFloat<64>(std::integral_constant<bool, false>)
{
    return 65536. * 65536. * 65536. * 65536.; // 2^64
}
template <int exponent>
constexpr double exponentToFloat(std::integral_constant<bool, false> negative)
{
    return exponentToFloat<exponent - 1>(negative) * 2.0;
}
template <int exponent>
constexpr double exponentToFloat(std::integral_constant<bool, true> negative)
{
    return exponentToFloat<exponent + 1>(negative) * 0.5;
}
// Builds sign * 1.mantissa * 2^exponent at compile time: the 52-bit
// mantissa gets the implicit leading one OR'ed in and is normalized by
// dividing by 2^52.
template <int sign, unsigned long long mantissa, int exponent> constexpr double doubleConstant()
{
    return (static_cast<double>((mantissa & 0x000fffffffffffffull) | 0x0010000000000000ull) /
            0x0010000000000000ull) *
           exponentToFloat<exponent>(std::integral_constant<bool, (exponent < 0)>()) * sign;
}
// Same as doubleConstant, for float: 23-bit mantissa, implicit bit 2^23.
template <int sign, unsigned int mantissa, int exponent> constexpr float floatConstant()
{
    return (static_cast<float>((mantissa & 0x007fffffu) | 0x00800000u) / 0x00800000u) *
           static_cast<float>(
               exponentToFloat<exponent>(std::integral_constant<bool, (exponent < 0)>())) *
           sign;
}
- }
- }
- #endif
- namespace Vc_VERSIONED_NAMESPACE
- {
- template <class T, class Abi>
- SimdArray<int, Vector<T, Abi>::size()> fpclassify(const Vector<T, Abi> &x)
- {
- return SimdArray<int, Vector<T, Abi>::size()>(
- [&](std::size_t i) { return std::fpclassify(x[i]); });
- }
- template <class T, size_t N> SimdArray<int, N> fpclassify(const SimdArray<T, N> &x)
- {
- return SimdArray<int, N>([&](std::size_t i) { return std::fpclassify(x[i]); });
- }
- #ifdef Vc_IMPL_SSE
- #ifdef Vc_COMMON_MATH_H_INTERNAL
// Selects which logarithm LogImpl computes: natural, base-10 or base-2.
enum LogarithmBase {
    BaseE, Base10, Base2
};
namespace Detail
{
// Selects the constant table matching the vector ABI: AVX constants for
// AVX vectors, SSE constants otherwise.
template <typename T, typename Abi>
using Const = typename std::conditional<std::is_same<Abi, VectorAbi::Avx>::value,
                                        AVX::Const<T>, SSE::Const<T>>::type;
// Shared implementation of log/log10/log2; Base selects the final scaling.
template<LogarithmBase Base>
struct LogImpl
{
    // Polynomial series for log(1 + x) on the reduced argument, combined
    // with exponent * ln(2) (split into a large and a small part for
    // precision). On return x holds the final logarithm.
    template<typename T, typename Abi> static Vc_ALWAYS_INLINE void log_series(Vector<T, Abi> &Vc_RESTRICT x, typename Vector<T, Abi>::AsArg exponent) {
        typedef Vector<T, Abi> V;
        typedef Detail::Const<T, Abi> C;
        const V x2 = x * x;
#ifdef Vc_LOG_ILP
        // Variant with more instruction-level parallelism: three partial
        // Horner chains evaluated independently.
        V y2 = (C::P(6) * x2 + C::P(7) * x) + C::P(8);
        V y0 = (C::P(0) * x2 + C::P(1) * x) + C::P(2);
        V y1 = (C::P(3) * x2 + C::P(4) * x) + C::P(5);
        const V x3 = x2 * x;
        const V x6 = x3 * x3;
        const V x9 = x6 * x3;
        V y = (y0 * x9 + y1 * x6) + y2 * x3;
#elif defined Vc_LOG_ILP2
        // Fully expanded powers — maximal ILP at the cost of more
        // multiplications.
        const V x3 = x2 * x;
        const V x4 = x2 * x2;
        const V x5 = x2 * x3;
        const V x6 = x3 * x3;
        const V x7 = x4 * x3;
        const V x8 = x4 * x4;
        const V x9 = x5 * x4;
        const V x10 = x5 * x5;
        const V x11 = x5 * x6;
        V y = C::P(0) * x11 + C::P(1) * x10 + C::P(2) * x9 + C::P(3) * x8 + C::P(4) * x7
            + C::P(5) * x6 + C::P(6) * x5 + C::P(7) * x4 + C::P(8) * x3;
#else
        // Default: plain Horner evaluation of the degree-8 polynomial.
        V y = C::P(0);
        Vc::Common::unrolled_loop<int, 1, 9>([&](int i) { y = y * x + C::P(i); });
        y *= x * x2;
#endif
        switch (Base) {
        case BaseE:
            // ln(v) = series + x - x^2/2 + exponent * ln(2)
            y += exponent * C::ln2_small();
            y -= x2 * C::_1_2();
            x += y;
            x += exponent * C::ln2_large();
            break;
        case Base10:
            // log10(v) = ln(v) * log10(e)
            y += exponent * C::ln2_small();
            y -= x2 * C::_1_2();
            x += y;
            x += exponent * C::ln2_large();
            x *= C::log10_e();
            break;
        case Base2:
        {
            // log2(v) = (x + series) * log2(e) + exponent
            const V x_ = x;
            x *= C::log2_e();
            y *= C::log2_e();
            y -= x_ * x * C::_1_2();
            x += y;
            x += exponent;
            break;
        }
        }
    }
    // Double-precision series: a rational (P/Q) approximation instead of a
    // single polynomial.
    template <typename Abi>
    static Vc_ALWAYS_INLINE void log_series(Vector<double, Abi> &Vc_RESTRICT x,
                                            typename Vector<double, Abi>::AsArg exponent)
    {
        typedef Vector<double, Abi> V;
        typedef Detail::Const<double, Abi> C;
        const V x2 = x * x;
        V y = C::P(0);
        V y2 = C::Q(0) + x;
        // Evaluate numerator P and denominator Q with one fused loop.
        Vc::Common::unrolled_loop<int, 1, 5>([&](int i) {
            y = y * x + C::P(i);
            y2 = y2 * x + C::Q(i);
        });
        y2 = x / y2;
        y = y * x + C::P(5);
        y = x2 * y * y2;
        switch (Base) {
        case BaseE:
            y += exponent * C::ln2_small();
            y -= x2 * C::_1_2();
            x += y;
            x += exponent * C::ln2_large();
            break;
        case Base10:
            y += exponent * C::ln2_small();
            y -= x2 * C::_1_2();
            x += y;
            x += exponent * C::ln2_large();
            x *= C::log10_e();
            break;
        case Base2:
        {
            const V x_ = x;
            x *= C::log2_e();
            y *= C::log2_e();
            y -= x_ * x * C::_1_2();
            x += y;
            x += exponent;
            break;
        }
        }
    }
    // Entry point. Steps:
    // 1. record masks for invalid (< 0) and zero inputs (fixed up last),
    // 2. rescale denormals by 2^54 so exponent extraction works,
    // 3. split x into exponent and a mantissa near 1 (doubling mantissas
    //    below 1/sqrt(2) keeps the series argument in a symmetric range),
    // 4. run the series, then patch in NaN for x<0 and -inf for x==0.
    template <typename T, typename Abi, typename V = Vector<T, Abi>>
    static inline Vector<T, Abi> calc(V _x)
    {
        typedef typename V::Mask M;
        typedef Detail::Const<T, Abi> C;
        V x(_x);
        const M invalidMask = x < V::Zero();
        const M infinityMask = x == V::Zero();
        const M denormal = x <= C::min();
        x(denormal) *= V(Vc::Detail::doubleConstant<1, 0, 54>()); // 2^54
        V exponent = Detail::exponent(x.data());
        exponent(denormal) -= 54; // undo the 2^54 prescaling
        // Clear the exponent field and set it to that of 0.5 — mantissa in
        // [0.5, 1).
        x.setZero(C::exponentMask());
        x = Detail::operator|(x,
            C::_1_2());
        const M smallX = x < C::_1_sqrt2();
        x(smallX) += x;
        x -= V::One();
        exponent(!smallX) += V::One();
        log_series(x, exponent);
        x.setQnan(invalidMask);
        x(infinityMask) = C::neginf();
        return x;
    }
};
}
// Per-lane natural logarithm.
template <typename T, typename Abi>
Vc_INTRINSIC Vc_CONST Vector<T, detail::not_fixed_size_abi<Abi>> log(
    const Vector<T, Abi> &x)
{
    return Detail::LogImpl<BaseE>::calc<T, Abi>(x);
}
// Per-lane base-10 logarithm.
template <typename T, typename Abi>
Vc_INTRINSIC Vc_CONST Vector<T, detail::not_fixed_size_abi<Abi>> log10(
    const Vector<T, Abi> &x)
{
    return Detail::LogImpl<Base10>::calc<T, Abi>(x);
}
// Per-lane base-2 logarithm.
template <typename T, typename Abi>
Vc_INTRINSIC Vc_CONST Vector<T, detail::not_fixed_size_abi<Abi>> log2(
    const Vector<T, Abi> &x)
{
    return Detail::LogImpl<Base2>::calc<T, Abi>(x);
}
- #endif
- #ifdef Vc_COMMON_MATH_H_INTERNAL
// Constants used by the single-precision exp() below.
constexpr float log2_e = 1.44269504088896341f;      // log2(e)
constexpr float MAXLOGF = 88.72283905206835f;       // overflow threshold: exp(x) > FLT_MAX above this
constexpr float MINLOGF = -103.278929903431851103f; // underflow threshold: exp(x) flushes to 0 below this
constexpr float MAXNUMF = 3.4028234663852885981170418348451692544e38f; // largest finite float
// Per-lane single-precision exp for SSE/AVX vectors.
// Range reduction: x = n*ln(2) + r with n = round(x * log2(e)), then a
// degree-6 polynomial approximates exp(r), and ldexp scales by 2^n.
template <typename Abi, typename = enable_if<std::is_same<Abi, VectorAbi::Sse>::value ||
                                             std::is_same<Abi, VectorAbi::Avx>::value>>
inline Vector<float, detail::not_fixed_size_abi<Abi>> exp(Vector<float, Abi> x)
{
    using V = Vector<float, Abi>;
    typedef typename V::Mask M;
    typedef Detail::Const<float, Abi> C;
    const M overflow = x > MAXLOGF;
    const M underflow = x < MINLOGF;
    // n = nearest integer to x * log2(e); floor(v + 0.5) rounds to nearest.
    V z = floor(C::log2_e() * x + 0.5f);
    const auto n = static_cast<Vc::SimdArray<int, V::Size>>(z);
    // Subtract n*ln(2) in two parts (large + small) for extra precision.
    x -= z * C::ln2_large();
    x -= z * C::ln2_small();
    // Polynomial approximation of exp on the reduced interval.
    z = ((((( 1.9875691500E-4f * x
    + 1.3981999507E-3f) * x
    + 8.3334519073E-3f) * x
    + 4.1665795894E-2f) * x
    + 1.6666665459E-1f) * x
    + 5.0000001201E-1f) * (x * x)
    + x
    + 1.0f;
    // Scale by 2^n, then fix up the out-of-range lanes.
    x = ldexp(z, n);
    x(overflow) = std::numeric_limits<typename V::EntryType>::infinity();
    x.setZero(underflow);
    return x;
}
- #endif
- #ifdef Vc_IMPL_AVX
// Double-precision exp for AVX vectors. Same scheme as the float version:
// reduce by multiples of ln(2), approximate exp on the reduced interval
// with a rational (P/Q) approximation, rescale with ldexp.
inline AVX::double_v exp(AVX::double_v _x)
{
    AVX::Vector<double> x = _x;
    typedef AVX::Vector<double> V;
    typedef V::Mask M;
    typedef AVX::Const<double> C;
    // +-1.0fff...p9 ~= +-709.78, the threshold beyond which exp(x)
    // overflows/underflows a double (ln(DBL_MAX)).
    const M overflow = x > Vc::Detail::doubleConstant< 1, 0x0006232bdd7abcd2ull, 9>();
    const M underflow = x < Vc::Detail::doubleConstant<-1, 0x0006232bdd7abcd2ull, 9>();
    // n = nearest integer to x * log2(e).
    V px = floor(C::log2_e() * x + 0.5);
    __m128i tmp = _mm256_cvttpd_epi32(px.data());
    const SimdArray<int, V::Size> n = SSE::int_v{tmp};
    // Subtract n*ln(2) in two parts for precision.
    x -= px * C::ln2_large();
    x -= px * C::ln2_small();
    // Coefficients of the rational approximation exp(x) ~= 1 + 2x*P(x^2) /
    // (Q(x^2) - x*P(x^2)).
    const double P[] = {
        Vc::Detail::doubleConstant<1, 0x000089cdd5e44be8ull, -13>(),
        Vc::Detail::doubleConstant<1, 0x000f06d10cca2c7eull, -6>(),
        Vc::Detail::doubleConstant<1, 0x0000000000000000ull, 0>()
    };
    const double Q[] = {
        Vc::Detail::doubleConstant<1, 0x00092eb6bc365fa0ull, -19>(),
        Vc::Detail::doubleConstant<1, 0x0004ae39b508b6c0ull, -9>(),
        Vc::Detail::doubleConstant<1, 0x000d17099887e074ull, -3>(),
        Vc::Detail::doubleConstant<1, 0x0000000000000000ull, 1>()
    };
    const V x2 = x * x;
    px = x * ((P[0] * x2 + P[1]) * x2 + P[2]);
    x = px / ((((Q[0] * x2 + Q[1]) * x2 + Q[2]) * x2 + Q[3]) - px);
    x = V::One() + 2.0 * x;
    // Scale by 2^n and fix up out-of-range lanes.
    x = ldexp(x, n);
    x(overflow) = std::numeric_limits<double>::infinity();
    x.setZero(underflow);
    return x;
}
- #endif
// Double-precision exp for SSE vectors; identical algorithm to the AVX
// overload above (range reduction by ln(2), rational approximation,
// rescale by 2^n).
inline SSE::double_v exp(SSE::double_v::AsArg _x) {
    SSE::Vector<double> x = _x;
    typedef SSE::Vector<double> V;
    typedef V::Mask M;
    typedef SSE::Const<double> C;
    // ~+-709.78 = ln(DBL_MAX): overflow/underflow thresholds.
    const M overflow = x > Vc::Detail::doubleConstant< 1, 0x0006232bdd7abcd2ull, 9>();
    const M underflow = x < Vc::Detail::doubleConstant<-1, 0x0006232bdd7abcd2ull, 9>();
    V px = floor(C::log2_e() * x + 0.5);
    SimdArray<int, V::Size> n;
    // cvttpd produces two ints in the low 64 bits; store exactly those.
    _mm_storel_epi64(reinterpret_cast<__m128i *>(&n), _mm_cvttpd_epi32(px.data()));
    x -= px * C::ln2_large();
    x -= px * C::ln2_small();
    // Rational approximation coefficients (see AVX overload).
    const double P[] = {
        Vc::Detail::doubleConstant<1, 0x000089cdd5e44be8ull, -13>(),
        Vc::Detail::doubleConstant<1, 0x000f06d10cca2c7eull, -6>(),
        Vc::Detail::doubleConstant<1, 0x0000000000000000ull, 0>()
    };
    const double Q[] = {
        Vc::Detail::doubleConstant<1, 0x00092eb6bc365fa0ull, -19>(),
        Vc::Detail::doubleConstant<1, 0x0004ae39b508b6c0ull, -9>(),
        Vc::Detail::doubleConstant<1, 0x000d17099887e074ull, -3>(),
        Vc::Detail::doubleConstant<1, 0x0000000000000000ull, 1>()
    };
    const V x2 = x * x;
    px = x * ((P[0] * x2 + P[1]) * x2 + P[2]);
    x = px / ((((Q[0] * x2 + Q[1]) * x2 + Q[2]) * x2 + Q[3]) - px);
    x = V::One() + 2.0 * x;
    x = ldexp(x, n);
    x(overflow) = std::numeric_limits<double>::infinity();
    x.setZero(underflow);
    return x;
}
- #endif
- }
- #undef Vc_COMMON_MATH_H_INTERNAL
- #endif
- #ifdef isfinite
- #undef isfinite
- #endif
- #ifdef isnan
- #undef isnan
- #endif
- #ifndef VC_COMMON_VECTORTUPLE_H_
- #define VC_COMMON_VECTORTUPLE_H_
- namespace Vc_VERSIONED_NAMESPACE
- {
- namespace Common
- {
// Forward declaration; the definition lives with the interleaved-memory
// support code.
template<size_t StructSize, typename V, typename I, bool Readonly = true> struct InterleavedMemoryReadAccess;
// Holds Length non-owning pointers to vectors, as built by Vc::tie (or the
// deprecated comma operator). Used as the left-hand side of deinterleaving
// loads, transposes and gather assignments.
template <int Length, typename V> class VectorReferenceArray
{
    typedef typename V::EntryType T;
    typedef V &Vc_RESTRICT Reference;
    // The referenced vectors; pointers are never null and never owned.
    std::array<V * Vc_RESTRICT, Length> r;
    typedef make_index_sequence<Length> IndexSequence;
    // Builds a new, one-longer reference array with a appended.
    template <typename VV, std::size_t... Indexes>
    constexpr VectorReferenceArray<Length + 1, VV> appendOneReference(
        VV &a, index_sequence<Indexes...>) const
    {
        return {*r[Indexes]..., a};
    }
    // Expands the stored references into a deinterleave call on access.
    template <typename A, std::size_t... Indexes>
    Vc_INTRINSIC void callDeinterleave(const A &access, index_sequence<Indexes...>) const
    {
        access.deinterleave(*r[Indexes]...);
    }
public:
    // Constructs from exactly Length vector lvalues, storing their
    // addresses.
    template <typename... Us, typename = enable_if<(sizeof...(Us) == Length)>>
    constexpr VectorReferenceArray(Us &&... args)
        : r{{std::addressof(std::forward<Us>(args))...}}
    {
    }
    // Deprecated comma-chaining (mutable variant): (a, b, c) builds the
    // reference array incrementally. Rvalue-qualified so only freshly built
    // arrays can be extended.
    template <typename VV, typename = enable_if<!std::is_const<V>::value &&
                                                std::is_same<VV, V>::value>>
    Vc_DEPRECATED("build the tuple with Vc::tie instead") constexpr VectorReferenceArray<
        Length + 1, V>
    operator,(VV &a) const &&
    {
        return appendOneReference(a, IndexSequence());
    }
    // Deprecated comma-chaining, const variant.
    Vc_DEPRECATED("build the tuple with Vc::tie instead") constexpr VectorReferenceArray<
        Length + 1, const V>
    operator,(const V &a) const &&
    {
        return appendOneReference(a, IndexSequence());
    }
    // Deinterleaving assignment from interleaved memory; only allowed when
    // there are at least as many struct members as target vectors.
    template <size_t StructSize, typename I, bool RO>
    Vc_ALWAYS_INLINE enable_if<(Length <= StructSize), void> operator=(
        const InterleavedMemoryReadAccess<StructSize, V, I, RO> &access) &&
    {
        callDeinterleave(access, IndexSequence());
    }
    // Requesting more vectors than the struct has members is a compile
    // error.
    template <size_t StructSize, typename I, bool RO>
    enable_if<(Length > StructSize), void> operator=(
        const InterleavedMemoryReadAccess<StructSize, V, I, RO> &access) && =
        delete;
    // Assignment from a transpose expression.
    template <typename... Inputs> void operator=(TransposeProxy<Inputs...> &&proxy) &&
    {
        transpose_impl(TransposeTag<Length, sizeof...(Inputs)>(), &r[0], proxy);
    }
    // Assignment from a subscript/gather expression: deinterleaves directly
    // from the gather's address/index arguments.
    template <typename T, typename IndexVector, typename Scale, bool Flag>
    void operator=(SubscriptOperation<T, IndexVector, Scale, Flag> &&sub) &&
    {
        const auto &args = std::move(sub).gatherArguments();
        Common::InterleavedMemoryReadAccess<1, V, Traits::decay<decltype(args.indexes)>>
            deinterleaver(args.address, args.indexes);
        callDeinterleave(deinterleaver, IndexSequence());
    }
    // Access to the i-th referenced vector.
    Vc_ALWAYS_INLINE Reference operator[](std::size_t i) { return *r[i]; }
};
- }
// Deprecated: starts a vector reference tuple via the comma operator,
// e.g. (a, b) — superseded by Vc::tie.
template <typename T, typename Abi>
Vc_DEPRECATED("build the tuple with Vc::tie instead")
constexpr Common::VectorReferenceArray<2, Vc::Vector<T, Abi>>
operator,(Vc::Vector<T, Abi> &a, Vc::Vector<T, Abi> &b)
{
    return {a, b};
}
// Const variant of the deprecated comma operator.
template <typename T, typename Abi>
Vc_DEPRECATED("build the tuple with Vc::tie instead")
constexpr Common::VectorReferenceArray<2, const Vc::Vector<T, Abi>>
operator,(const Vc::Vector<T, Abi> &a, const Vc::Vector<T, Abi> &b)
{
    return {a, b};
}
// Builds a VectorReferenceArray over the given vectors, analogous to
// std::tie: the result references (does not copy) its arguments and can be
// the target of deinterleaving assignments.
template <typename V, typename... Vs>
constexpr Common::VectorReferenceArray<sizeof...(Vs) + 1,
                                       typename std::remove_reference<V>::type>
tie(V &&a, Vs &&... b)
{
    return {std::forward<V>(a), std::forward<Vs>(b)...};
}
- }
- #endif
- #ifndef VC_COMMON_IIF_H_
- #define VC_COMMON_IIF_H_
- #ifndef VC_TYPE_TRAITS_
- #define VC_TYPE_TRAITS_
- #include <type_traits>
- namespace Vc_VERSIONED_NAMESPACE
- {
// Re-export the type traits into the Vc namespace.
using Traits::is_simd_mask;
using Traits::is_simd_vector;
using Traits::is_integral;
using Traits::is_floating_point;
using Traits::is_arithmetic;
using Traits::is_signed;
using Traits::is_unsigned;
// memory_alignment<T>::value: the alignment Vc requires for aligned
// loads/stores of T. Defaults to alignof(T); the short vector types
// override it with their MemoryAlignment constant.
template<typename T>
struct memory_alignment : public std::integral_constant<size_t, alignof(T)> {};
template<> struct memory_alignment<short_v> : public std::integral_constant<size_t, short_v::MemoryAlignment> {};
template<> struct memory_alignment<ushort_v> : public std::integral_constant<size_t, ushort_v::MemoryAlignment> {};
- }
- #endif
- namespace Vc_VERSIONED_NAMESPACE
- {
// Vectorized conditional: returns a vector holding trueValue in the lanes
// where condition is set and falseValue elsewhere.
template <typename Mask, typename T>
Vc_ALWAYS_INLINE enable_if<is_simd_mask<Mask>::value && is_simd_vector<T>::value, T> iif(
    const Mask &condition, const T &trueValue, const T &falseValue)
{
    T result(falseValue);
    Vc::where(condition) | result = trueValue;
    return result;
}
// Combining a SIMD mask with non-vector values is deliberately disallowed.
template <typename Mask, typename T>
enable_if<is_simd_mask<Mask>::value && !is_simd_vector<T>::value, T> iif(
    const Mask &, const T &, const T &) = delete;
// Scalar overload of iif: equivalent to the conditional operator, provided
// so generic code can use iif() uniformly with bools and SIMD masks.
template<typename T> constexpr T iif (bool condition, const T &trueValue, const T &falseValue)
{
    return !condition ? falseValue : trueValue;
}
- }
- #endif
- #ifndef Vc_NO_STD_FUNCTIONS
namespace std
{
// Inject the Vc math overloads into namespace std so that qualified
// std::sin(v) etc. finds them for Vc vector types. Guarded by
// Vc_NO_STD_FUNCTIONS (see the surrounding #ifndef).
using Vc::min;
using Vc::max;
using Vc::abs;
using Vc::asin;
using Vc::atan;
using Vc::atan2;
using Vc::ceil;
using Vc::cos;
using Vc::exp;
using Vc::fma;
using Vc::trunc;
using Vc::floor;
using Vc::frexp;
using Vc::ldexp;
using Vc::log;
using Vc::log10;
using Vc::log2;
using Vc::round;
using Vc::sin;
using Vc::sqrt;
using Vc::isfinite;
using Vc::isnan;
}
- #endif
- Vc_RESET_DIAGNOSTICS
- #endif
|